Source code for swh.storage.proxies.buffer

# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from functools import partial
import logging
from typing import Dict, Iterable, Mapping, Sequence, Tuple, cast
import warnings

from typing_extensions import Literal

from swh.core.utils import grouper
from swh.model.model import (
    BaseModel,
    Content,
    Directory,
    RawExtrinsicMetadata,
    Release,
    Revision,
    SkippedContent,
)
from swh.storage import get_storage
from swh.storage.interface import StorageInterface

logger = logging.getLogger(__name__)

LObjectType = Literal[
    "raw_extrinsic_metadata",
    "content",
    "skipped_content",
    "directory",
    "revision",
    "release",
    "snapshot",
    "extid",
]

# In the order objects are flushed
OBJECT_TYPES: Tuple[LObjectType, ...] = (
    "raw_extrinsic_metadata",
    "content",
    "skipped_content",
    "directory",
    "revision",
    "release",
    "snapshot",
    "extid",
)

DEFAULT_BUFFER_THRESHOLDS: Dict[str, int] = {
    "content": 10000,
    "content_bytes": 100 * 1024 * 1024,
    "skipped_content": 10000,
    "directory": 25000,
    "directory_entries": 200000,
    "raw_extrinsic_metadata": 1000,
    "revision": 100000,
    "revision_parents": 200000,
    "revision_bytes": 100 * 1024 * 1024,
    "release": 100000,
    "release_bytes": 100 * 1024 * 1024,
    "snapshot": 25000,
    "extid": 10000,
}
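
# Illustration (not part of the module): thresholds supplied through the
# ``min_batch_size`` setting only need to name the values they override;
# ``BufferingProxyStorage.__init__`` (below) overlays them on these defaults.
# The values used here are arbitrary examples:
#
#     overrides = {"content": 1000, "directory": 5000}
#     thresholds = {**DEFAULT_BUFFER_THRESHOLDS, **overrides}
#     thresholds["content"]    # 1000 (overridden)
#     thresholds["extid"]      # 10000 (default kept)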


def estimate_revision_size(revision: Revision) -> int:
    """Estimate the size of a revision, by summing the size of variable length
    fields"""
    s = 20 * len(revision.parents)

    if revision.message:
        s += len(revision.message)
    if revision.author is not None:
        s += len(revision.author.fullname)
    if revision.committer is not None:
        s += len(revision.committer.fullname)
    s += sum(len(h) + len(v) for h, v in revision.extra_headers)
    return s

def estimate_release_size(release: Release) -> int:
    """Estimate the size of a release, by summing the size of variable length
    fields"""
    s = 0
    if release.message:
        s += len(release.message)
    if release.author:
        s += len(release.author.fullname)
    return s

class BufferingProxyStorage:
    """
    Storage implementation in charge of accumulating objects prior to
    writing them to the "main" storage.

    When the number of objects of any given type exceeds the configured
    threshold, objects of this type (and of other types, see the 'Flush
    order' section below) are flushed to the backend.

    Deduplicates values based on a tuple of keys depending on the object type.

    Sample configuration use case for buffering storage:

    .. code-block:: yaml

        storage:
          cls: buffer
          args:
            storage:
              cls: remote
              args: http://storage.internal.staging.swh.network:5002/
            min_batch_size:
              content: 10000
              content_bytes: 100000000
              skipped_content: 10000
              directory: 5000
              directory_entries: 100000
              raw_extrinsic_metadata: 1000
              revision: 1000
              revision_parents: 2000
              revision_bytes: 100000000
              release: 10000
              release_bytes: 100000000
              snapshot: 5000

    Flush order
    ===========

    In order not to create holes when the process crashes (e.g. by adding a
    revision but crashing before adding its root directory), objects are
    always flushed in reverse topological order of their types:

    {flush_order}

    This guarantees not to create holes between objects of different types.
    However, holes may still be created between objects of the same type
    when using a backend storage which inserts neither sequentially nor
    transactionally, such as the :mod:`Cassandra <swh.storage.cassandra>`
    backend or the :mod:`tenacious <swh.storage.proxies.tenacious>` proxy,
    so this is mostly best effort, and relies on both of these inserting
    objects **mostly** in sequential order.
    """.format(
        flush_order="\n    ".join("#. " + object_type for object_type in OBJECT_TYPES)
    )

    def __init__(self, storage: Mapping, min_batch_size: Mapping = {}):
        self.storage: StorageInterface = get_storage(**storage)
        self._buffer_thresholds = {**DEFAULT_BUFFER_THRESHOLDS, **min_batch_size}

        self._objects: Dict[LObjectType, Dict[Tuple[str, ...], BaseModel]] = {
            k: {} for k in OBJECT_TYPES
        }
        self._sizes: Dict[LObjectType, int] = {k: 0 for k in OBJECT_TYPES}
        self._directory_entries: int = 0
        self._revision_parents: int = 0

    def __del__(self):
        if not hasattr(self, "_sizes"):
            return
        leftovers = False
        for object_type in OBJECT_TYPES:
            if self._sizes[object_type] != 0:
                leftovers = True
                break
        if not leftovers:
            return

        # Attempt to flush nonetheless, to minimize data loss
        flush_failed = False
        try:
            self.flush()
        except Exception:
            flush_failed = True

        warnings.warn(
            "Some objects were still present in the memory of the buffering "
            "proxy during shutdown. "
            f"{'**They are now probably lost!** ' if flush_failed else ''}"
            "A call to `storage.flush()` is probably missing."
        )

    def __getattr__(self, key: str):
        # Route generic ``<object_type>_add`` methods (e.g. ``snapshot_add``)
        # to object_add; proxy every other attribute to the backend storage.
        if key.endswith("_add"):
            object_type = key.rsplit("_", 1)[0]
            if object_type in OBJECT_TYPES:
                return partial(
                    self.object_add,
                    object_type=object_type,
                    keys=["id"],
                )
        if key == "storage":
            # Guard against infinite recursion when self.storage is not set yet
            raise AttributeError(key)
        return getattr(self.storage, key)

    def content_add(self, contents: Sequence[Content]) -> Dict[str, int]:
        """Push contents to be written to the storage into the buffer.

        The following policies apply:

        - if the buffer's number-of-contents threshold is hit, flush contents
          to the storage;

        - otherwise, if the total size of buffered contents hits the
          ``content_bytes`` threshold, flush contents to the storage.

        """
        stats = self.object_add(
            contents,
            object_type="content",
            keys=["sha1", "sha1_git", "sha256", "blake2s256"],
        )
        if not stats:
            # We did not flush based on number of objects; check total size
            self._sizes["content"] += sum(c.length for c in contents)
            if self._sizes["content"] >= self._buffer_thresholds["content_bytes"]:
                return self.flush(["raw_extrinsic_metadata", "content"])

        return stats

    def skipped_content_add(
        self, contents: Sequence[SkippedContent]
    ) -> Dict[str, int]:
        return self.object_add(
            contents,
            object_type="skipped_content",
            keys=["sha1", "sha1_git", "sha256", "blake2s256"],
        )

    def directory_add(self, directories: Sequence[Directory]) -> Dict[str, int]:
        stats = self.object_add(directories, object_type="directory", keys=["id"])

        if not stats:
            # We did not flush based on number of objects; check the number of entries
            self._directory_entries += sum(len(d.entries) for d in directories)
            if self._directory_entries >= self._buffer_thresholds["directory_entries"]:
                return self.flush(["raw_extrinsic_metadata", "content", "directory"])

        return stats

    def revision_add(self, revisions: Sequence[Revision]) -> Dict[str, int]:
        stats = self.object_add(revisions, object_type="revision", keys=["id"])

        if not stats:
            # We did not flush based on number of objects; check the number of
            # parents and estimated size
            self._revision_parents += sum(len(r.parents) for r in revisions)
            self._sizes["revision"] += sum(
                estimate_revision_size(r) for r in revisions
            )
            if (
                self._revision_parents >= self._buffer_thresholds["revision_parents"]
                or self._sizes["revision"] >= self._buffer_thresholds["revision_bytes"]
            ):
                return self.flush(
                    ["raw_extrinsic_metadata", "content", "directory", "revision"]
                )

        return stats

    def release_add(self, releases: Sequence[Release]) -> Dict[str, int]:
        stats = self.object_add(releases, object_type="release", keys=["id"])

        if not stats:
            # We did not flush based on number of objects; check the estimated size
            self._sizes["release"] += sum(estimate_release_size(r) for r in releases)
            if self._sizes["release"] >= self._buffer_thresholds["release_bytes"]:
                return self.flush(
                    [
                        "raw_extrinsic_metadata",
                        "content",
                        "directory",
                        "revision",
                        "release",
                    ]
                )

        return stats

    def object_add(
        self,
        objects: Sequence[BaseModel],
        *,
        object_type: LObjectType,
        keys: Iterable[str],
    ) -> Dict[str, int]:
        """Push objects to write to the storage into the buffer. Flushes the
        buffer to the storage if the threshold is hit.

        """
        buffer_ = self._objects[object_type]
        for obj in objects:
            obj_key = tuple(getattr(obj, key) for key in keys)
            buffer_[obj_key] = obj

        if len(buffer_) >= self._buffer_thresholds[object_type]:
            return self.flush()

        return {}

    def flush(
        self, object_types: Sequence[LObjectType] = OBJECT_TYPES
    ) -> Dict[str, int]:
        summary: Dict[str, int] = {}

        def update_summary(stats):
            for k, v in stats.items():
                summary[k] = v + summary.get(k, 0)

        for object_type in object_types:
            buffer_ = self._objects[object_type]
            if not buffer_:
                continue

            if logger.isEnabledFor(logging.DEBUG):
                log = "Flushing %s objects of type %s"
                log_args = [len(buffer_), object_type]

                if object_type == "content":
                    log += " (%s bytes)"
                    log_args.append(
                        sum(cast(Content, c).length for c in buffer_.values())
                    )
                elif object_type == "directory":
                    log += " (%s entries)"
                    log_args.append(
                        sum(len(cast(Directory, d).entries) for d in buffer_.values())
                    )
                elif object_type == "revision":
                    log += " (%s parents, %s estimated bytes)"
                    log_args.extend(
                        (
                            sum(
                                len(cast(Revision, r).parents)
                                for r in buffer_.values()
                            ),
                            sum(
                                estimate_revision_size(cast(Revision, r))
                                for r in buffer_.values()
                            ),
                        )
                    )
                elif object_type == "release":
                    log += " (%s estimated bytes)"
                    log_args.append(
                        sum(
                            estimate_release_size(cast(Release, r))
                            for r in buffer_.values()
                        )
                    )
                elif object_type == "raw_extrinsic_metadata":
                    log += " (%s estimated bytes)"
                    log_args.append(
                        sum(
                            len(cast(RawExtrinsicMetadata, r).metadata)
                            for r in buffer_.values()
                        )
                    )

                logger.debug(log, *log_args)

            batches = grouper(buffer_.values(), n=self._buffer_thresholds[object_type])
            for batch in batches:
                add_fn = getattr(self.storage, "%s_add" % object_type)
                stats = add_fn(list(batch))
                update_summary(stats)

        # Flush underlying storage
        stats = self.storage.flush(object_types)
        update_summary(stats)

        self.clear_buffers(object_types)

        return summary

    def clear_buffers(
        self, object_types: Sequence[LObjectType] = OBJECT_TYPES
    ) -> None:
        """Clear objects from the current buffer.

        WARNING: data that has not been flushed to storage will be lost when
        this method is called. This should only be called when `flush` fails
        and you want to continue your processing.

        """
        for object_type in object_types:
            buffer_ = self._objects[object_type]
            buffer_.clear()
            self._sizes[object_type] = 0
            if object_type == "directory":
                self._directory_entries = 0
            elif object_type == "revision":
                self._revision_parents = 0

        self.storage.clear_buffers(object_types)
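

# Usage sketch (illustration only, not part of the module): wiring the proxy
# up programmatically rather than through YAML configuration. The "memory"
# backend and the deliberately tiny threshold below are assumptions chosen
# for demonstration.
#
#     from swh.model.model import Content
#     from swh.storage.proxies.buffer import BufferingProxyStorage
#
#     storage = BufferingProxyStorage(
#         storage={"cls": "memory"},
#         min_batch_size={"content": 2},
#     )
#     storage.content_add([Content.from_data(b"foo")])  # buffered, returns {}
#     storage.content_add([Content.from_data(b"bar")])  # threshold hit, flushed
#     storage.flush()  # always flush explicitly before the process exits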