swh.storage.storage module

swh.storage.storage.EMPTY_SNAPSHOT_ID = b'\x1a\x88\x93\xe6\xa8oDN\x8b\xe8\xe7\xbd\xa6\xcb4\xfb\x175\xa0\x0e'

Identifier for the empty snapshot

swh.storage.storage.VALIDATION_EXCEPTIONS = (<class 'psycopg2.errors.CheckViolation'>, <class 'psycopg2.IntegrityError'>, <class 'psycopg2.errors.InvalidTextRepresentation'>, <class 'psycopg2.errors.NotNullViolation'>, <class 'psycopg2.errors.NumericValueOutOfRange'>, <class 'psycopg2.errors.UndefinedFunction'>)

Exceptions raised by PostgreSQL when validation of the arguments fails.

swh.storage.storage.convert_validation_exceptions()[source]

Catches postgresql errors related to invalid arguments, and re-raises a StorageArgumentException.

class swh.storage.storage.Storage(db, objstorage, min_pool_conns=1, max_pool_conns=10, journal_writer=None)[source]

Bases: object

SWH storage proxy, encompassing DB and object storage

__init__(db, objstorage, min_pool_conns=1, max_pool_conns=10, journal_writer=None)[source]
Parameters
  • db – either a libpq connection string, or a psycopg2 connection

  • objstorage – the object storage to use (originally documented as obj_root: path to the root of the object storage)

get_db()[source]
put_db(db)[source]
db()[source]
check_config(*, check_write)[source]
_content_unique_key(hash, db)[source]

Given a hash (tuple or dict), return a unique key from the aggregation of keys.

_content_add_metadata(db, cur, content)[source]

Add content to the postgresql database but not the object storage.

content_add(content: Iterable[swh.model.model.Content]) → Dict[source]
content_update(content, keys=[])[source]
content_add_metadata(content: Iterable[swh.model.model.Content]) → Dict[source]
content_get(content)[source]
content_get_range(start, end, limit=1000)[source]
content_get_partition(partition_id: int, nb_partitions: int, limit: int = 1000, page_token: str = None)[source]
content_get_metadata(contents: List[bytes]) → Dict[bytes, List[Dict]][source]
content_missing(content, key_hash='sha1')[source]
content_missing_per_sha1(contents)[source]
content_missing_per_sha1_git(contents)[source]
content_find(content)[source]
content_get_random()[source]
static _skipped_content_normalize(d)[source]
static _skipped_content_validate(d)[source]

Sanity checks on status / reason / length that PostgreSQL doesn’t enforce.

_skipped_content_add_metadata(db, cur, content: Iterable[swh.model.model.SkippedContent])[source]
skipped_content_add(content: Iterable[swh.model.model.SkippedContent]) → Dict[source]
skipped_content_missing(contents)[source]
directory_add(directories: Iterable[swh.model.model.Directory]) → Dict[source]
directory_missing(directories)[source]
directory_ls(directory, recursive=False)[source]
directory_entry_get_by_path(directory, paths)[source]
directory_get_random()[source]
revision_add(revisions: Iterable[swh.model.model.Revision]) → Dict[source]
revision_missing(revisions)[source]
revision_get(revisions)[source]
revision_log(revisions, limit=None)[source]
revision_shortlog(revisions, limit=None)[source]
revision_get_random()[source]
release_add(releases: Iterable[swh.model.model.Release]) → Dict[source]
release_missing(releases)[source]
release_get(releases)[source]
release_get_random()[source]
snapshot_add(snapshots: Iterable[swh.model.model.Snapshot]) → Dict[source]
snapshot_missing(snapshots)[source]
snapshot_get(snapshot_id)[source]
snapshot_get_by_origin_visit(origin, visit)[source]
snapshot_get_latest(origin, allowed_statuses=None)[source]
snapshot_count_branches(snapshot_id)[source]
snapshot_get_branches(snapshot_id, branches_from=b'', branches_count=1000, target_types=None)[source]
snapshot_get_random()[source]
origin_visit_add(origin, date, type) → Optional[Dict[str, Union[str, int]]][source]
origin_visit_update(origin: str, visit_id: int, status: Optional[str] = None, metadata: Optional[Dict] = None, snapshot: Optional[bytes] = None)[source]
origin_visit_upsert(visits)[source]
origin_visit_get(origin, last_visit=None, limit=None)[source]
origin_visit_find_by_date(origin, visit_date)[source]
origin_visit_get_by(origin, visit)[source]
origin_visit_get_latest(origin, allowed_statuses=None, require_snapshot=False)[source]
origin_visit_get_random(type: str) → Optional[Dict[str, Any]][source]
object_find_by_sha1_git(ids)[source]
origin_get(origins)[source]
origin_get_by_sha1(sha1s)[source]
origin_get_range(origin_from=1, origin_count=100)[source]
origin_list(page_token: Optional[str] = None, limit: int = 100) → dict[source]
origin_count(url_pattern, regexp=False, with_visit=False)[source]
origin_add(origins: Iterable[swh.model.model.Origin]) → List[Dict][source]
origin_add_one(origin: swh.model.model.Origin) → str[source]
stat_counters()[source]
refresh_stat_counters()[source]
origin_metadata_add(origin_url, ts, provider, tool, metadata)[source]
origin_metadata_get_by(origin_url, provider_type=None)[source]
tool_add(tools)[source]
tool_get(tool)[source]
metadata_provider_add(provider_name, provider_type, provider_url, metadata)[source]
metadata_provider_get(provider_id)[source]
__dict__ = mappingproxy({'__module__': 'swh.storage.storage', '__doc__': 'SWH storage proxy, encompassing DB and object storage\n\n ', '__init__': <function Storage.__init__>, 'get_db': <function Storage.get_db>, 'put_db': <function Storage.put_db>, 'db': <function Storage.db>, 'check_config': <function Storage.check_config>, '_content_unique_key': <function Storage._content_unique_key>, '_content_add_metadata': <function Storage._content_add_metadata>, 'content_add': <function Storage.content_add>, 'content_update': <function Storage.content_update>, 'content_add_metadata': <function Storage.content_add_metadata>, 'content_get': <function Storage.content_get>, 'content_get_range': <function Storage.content_get_range>, 'content_get_partition': <function Storage.content_get_partition>, 'content_get_metadata': <function Storage.content_get_metadata>, 'content_missing': <function Storage.content_missing>, 'content_missing_per_sha1': <function Storage.content_missing_per_sha1>, 'content_missing_per_sha1_git': <function Storage.content_missing_per_sha1_git>, 'content_find': <function Storage.content_find>, 'content_get_random': <function Storage.content_get_random>, '_skipped_content_normalize': <staticmethod object>, '_skipped_content_validate': <staticmethod object>, '_skipped_content_add_metadata': <function Storage._skipped_content_add_metadata>, 'skipped_content_add': <function Storage.skipped_content_add>, 'skipped_content_missing': <function Storage.skipped_content_missing>, 'directory_add': <function Storage.directory_add>, 'directory_missing': <function Storage.directory_missing>, 'directory_ls': <function Storage.directory_ls>, 'directory_entry_get_by_path': <function Storage.directory_entry_get_by_path>, 'directory_get_random': <function Storage.directory_get_random>, 'revision_add': <function Storage.revision_add>, 'revision_missing': <function Storage.revision_missing>, 'revision_get': <function Storage.revision_get>, 'revision_log': <function 
Storage.revision_log>, 'revision_shortlog': <function Storage.revision_shortlog>, 'revision_get_random': <function Storage.revision_get_random>, 'release_add': <function Storage.release_add>, 'release_missing': <function Storage.release_missing>, 'release_get': <function Storage.release_get>, 'release_get_random': <function Storage.release_get_random>, 'snapshot_add': <function Storage.snapshot_add>, 'snapshot_missing': <function Storage.snapshot_missing>, 'snapshot_get': <function Storage.snapshot_get>, 'snapshot_get_by_origin_visit': <function Storage.snapshot_get_by_origin_visit>, 'snapshot_get_latest': <function Storage.snapshot_get_latest>, 'snapshot_count_branches': <function Storage.snapshot_count_branches>, 'snapshot_get_branches': <function Storage.snapshot_get_branches>, 'snapshot_get_random': <function Storage.snapshot_get_random>, 'origin_visit_add': <function Storage.origin_visit_add>, 'origin_visit_update': <function Storage.origin_visit_update>, 'origin_visit_upsert': <function Storage.origin_visit_upsert>, 'origin_visit_get': <function Storage.origin_visit_get>, 'origin_visit_find_by_date': <function Storage.origin_visit_find_by_date>, 'origin_visit_get_by': <function Storage.origin_visit_get_by>, 'origin_visit_get_latest': <function Storage.origin_visit_get_latest>, 'origin_visit_get_random': <function Storage.origin_visit_get_random>, 'object_find_by_sha1_git': <function Storage.object_find_by_sha1_git>, 'origin_get': <function Storage.origin_get>, 'origin_get_by_sha1': <function Storage.origin_get_by_sha1>, 'origin_get_range': <function Storage.origin_get_range>, 'origin_list': <function Storage.origin_list>, 'origin_search': <function Storage.origin_search>, 'origin_count': <function Storage.origin_count>, 'origin_add': <function Storage.origin_add>, 'origin_add_one': <function Storage.origin_add_one>, 'stat_counters': <function Storage.stat_counters>, 'refresh_stat_counters': <function Storage.refresh_stat_counters>, 'origin_metadata_add': 
<function Storage.origin_metadata_add>, 'origin_metadata_get_by': <function Storage.origin_metadata_get_by>, 'tool_add': <function Storage.tool_add>, 'tool_get': <function Storage.tool_get>, 'metadata_provider_add': <function Storage.metadata_provider_add>, 'metadata_provider_get': <function Storage.metadata_provider_get>, 'metadata_provider_get_by': <function Storage.metadata_provider_get_by>, 'diff_directories': <function Storage.diff_directories>, 'diff_revisions': <function Storage.diff_revisions>, 'diff_revision': <function Storage.diff_revision>, '__dict__': <attribute '__dict__' of 'Storage' objects>, '__weakref__': <attribute '__weakref__' of 'Storage' objects>})
__module__ = 'swh.storage.storage'
__weakref__

list of weak references to the object (if defined)

metadata_provider_get_by(provider)[source]
diff_directories(from_dir, to_dir, track_renaming=False)[source]
diff_revisions(from_rev, to_rev, track_renaming=False)[source]
diff_revision(revision, track_renaming=False)[source]