Source code for swh.model.discovery

# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Primitives for finding unknown content efficiently."""

from __future__ import annotations

from collections import namedtuple
import itertools
import logging
import random
from typing import (
    Any,
    Callable,
    Iterable,
    List,
    Mapping,
    NamedTuple,
    Optional,
    Set,
    Union,
)

from typing_extensions import Protocol, runtime_checkable

from .from_disk import model
from .model import Sha1Git

logger = logging.getLogger(__name__)

# Maximum amount when sampling from the undecided set of directory entries
SAMPLE_SIZE = 1000

# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
    "Sample", ["contents", "skipped_contents", "directories"]
)



[docs]
@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
    """Interface used in discovery code to abstract over ways of connecting to
    the SWH archive (direct storage, web API, etc.) for all methods needed by
    discovery algorithms."""

    contents: List[model.Content]
    skipped_contents: List[model.SkippedContent]
    directories: List[model.Directory]

    def __init__(
        self,
        contents: List[model.Content],
        skipped_contents: List[model.SkippedContent],
        directories: List[model.Directory],
    ) -> None:
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories


[docs]
    def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List content missing from the archive by sha1"""



[docs]
    def skipped_content_missing(
        self, skipped_contents: List[Sha1Git]
    ) -> Iterable[Sha1Git]:
        """List skipped content missing from the archive by sha1"""



[docs]
    def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List directories missing from the archive by sha1"""





[docs]
class BaseDiscoveryGraph:
    """Creates the base structures and methods needed for discovery algorithms.
    Subclasses should override ``get_sample`` to affect how the discovery is made.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.
    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """

    def __init__(
        self,
        contents,
        skipped_contents,
        directories,
        update_info_callback: Optional[Callable[[Any, bool], None]] = None,
    ):
        self._all_contents: Mapping[
            Sha1Git, Union[model.Content, model.SkippedContent]
        ] = {}
        self._undecided_directories: Set[Sha1Git] = set()
        self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
        self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
        self.undecided: Set[Sha1Git] = set()

        for content in itertools.chain(contents, skipped_contents):
            self.undecided.add(content.sha1_git)
            self._all_contents[content.sha1_git] = content

        for directory in directories:
            self.undecided.add(directory.id)
            self._undecided_directories.add(directory.id)
            self._children[directory.id] = {c.target for c in directory.entries}
            for child in directory.entries:
                self._parents.setdefault(child.target, set()).add(directory.id)

        self.undecided |= self._undecided_directories
        self.known: Set[Sha1Git] = set()
        self.unknown: Set[Sha1Git] = set()
        self._update_info_callback = update_info_callback
        self._sha1_to_obj = {}
        for content in itertools.chain(contents, skipped_contents):
            self._sha1_to_obj[content.sha1_git] = content
        for directory in directories:
            self._sha1_to_obj[directory.id] = directory


[docs]
    def mark_known(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as known in the SWH archive"""
        self._mark_entries(entries, self._children, self.known)



[docs]
    def mark_unknown(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as unknown in the SWH archive"""
        self._mark_entries(entries, self._parents, self.unknown)


    def _mark_entries(
        self,
        entries: Iterable[Sha1Git],
        transitive_mapping: Mapping[Any, Any],
        target_set: Set[Any],
    ):
        """Use Merkle graph properties to mark a directory entry as known or unknown.

        If an entry is known, then all of its descendants are known. If it's
        unknown, then all of its ancestors are unknown.

        - ``entries``: directory entries to mark along with their ancestors/descendants
          where applicable.
        - ``transitive_mapping``: mapping from an entry to the next entries to mark
          in the hierarchy, if any.
        - ``target_set``: set where marked entries will be added.

        """
        callback = self._update_info_callback
        to_process = set(entries)
        while to_process:
            current = to_process.pop()
            target_set.add(current)
            new = current in self.undecided
            self.undecided.discard(current)
            self._undecided_directories.discard(current)
            next_entries = transitive_mapping.get(current, set()) & self.undecided
            to_process.update(next_entries)
            if new and callback is not None:
                obj = self._sha1_to_obj[current]
                callback(obj, current in self.known)


[docs]
    def get_sample(
        self,
    ) -> Sample:
        """Return a three-tuple of samples from the undecided sets of contents,
        skipped contents and directories respectively.
        These samples will be queried against the storage which will tell us
        which are known."""
        raise NotImplementedError()



[docs]
    def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
        """Given a three-tuple of samples, ask the archive which are known or
        unknown and mark them as such."""

        methods = (
            archive.content_missing,
            archive.skipped_content_missing,
            archive.directory_missing,
        )

        for sample_per_type, method in zip(sample, methods):
            if not sample_per_type:
                continue
            known = set(sample_per_type)
            unknown = set(method(list(sample_per_type)))
            known -= unknown

            self.mark_known(known)
            self.mark_unknown(unknown)





[docs]
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
    """Use a random sampling using only directories.

    This allows us to find a statistically good spread of entries in the graph
    with a smaller population than using all types of entries. When there are
    no more directories, only contents or skipped contents are undecided if any
    are left: we send them directly to the storage since they should be few and
    their structure flat."""


[docs]
    def get_sample(self) -> Sample:
        if self._undecided_directories:
            if len(self._undecided_directories) <= SAMPLE_SIZE:
                return Sample(
                    contents=set(),
                    skipped_contents=set(),
                    directories=set(self._undecided_directories),
                )
            sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
            directories = {o for o in sample}
            return Sample(
                contents=set(), skipped_contents=set(), directories=directories
            )

        contents = set()
        skipped_contents = set()

        for sha1 in self.undecided:
            obj = self._all_contents[sha1]
            obj_type = obj.object_type
            if obj_type == model.Content.object_type:
                contents.add(sha1)
            elif obj_type == model.SkippedContent.object_type:
                skipped_contents.add(sha1)
            else:
                raise TypeError(f"Unexpected object type {obj_type}")

        return Sample(
            contents=contents, skipped_contents=skipped_contents, directories=set()
        )





[docs]
def filter_known_objects(
    archive: ArchiveDiscoveryInterface,
    update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
    """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
    to only return those that are unknown to the SWH archive using a discovery
    algorithm.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.
    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """
    contents = archive.contents
    skipped_contents = archive.skipped_contents
    directories = archive.directories

    contents_count = len(contents)
    skipped_contents_count = len(skipped_contents)
    directories_count = len(directories)

    graph = RandomDirSamplingDiscoveryGraph(
        contents,
        skipped_contents,
        directories,
        update_info_callback=update_info_callback,
    )

    while graph.undecided:
        sample = graph.get_sample()
        graph.do_query(archive, sample)

    contents = [c for c in contents if c.sha1_git in graph.unknown]
    skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
    directories = [c for c in directories if c.id in graph.unknown]

    logger.debug(
        "Filtered out %d contents, %d skipped contents and %d directories",
        contents_count - len(contents),
        skipped_contents_count - len(skipped_contents),
        directories_count - len(directories),
    )

    return (contents, skipped_contents, directories)