swh.storage.cassandra package#
Submodules#
- swh.storage.cassandra.common module
- swh.storage.cassandra.converters module
- swh.storage.cassandra.cql module
PARTITION_KEY_RESTRICTION_MAX_SIZE
get_execution_profiles()
create_keyspace()
CqlRunner
CqlRunner.MAX_RETRIES
CqlRunner.content_add_prepare()
CqlRunner.content_get_from_pk()
CqlRunner.content_missing_from_all_hashes()
CqlRunner.content_get_from_tokens()
CqlRunner.content_get_random()
CqlRunner.content_get_token_range()
CqlRunner.content_delete()
CqlRunner.content_index_add_one()
CqlRunner.content_get_tokens_from_single_algo()
CqlRunner.skipped_content_add_prepare()
CqlRunner.skipped_content_get_from_pk()
CqlRunner.skipped_content_get_from_token()
CqlRunner.skipped_content_delete()
CqlRunner.skipped_content_index_add_one()
CqlRunner.skipped_content_get_tokens_from_single_hash()
CqlRunner.directory_missing()
CqlRunner.directory_add_one()
CqlRunner.directory_get_random()
CqlRunner.directory_get()
CqlRunner.directory_get_token_range()
CqlRunner.directory_delete()
CqlRunner.directory_entry_add_one()
CqlRunner.directory_entry_add_concurrent()
CqlRunner.directory_entry_add_batch()
CqlRunner.directory_entry_get()
CqlRunner.directory_entry_get_from_name()
CqlRunner.directory_entry_delete()
CqlRunner.revision_missing()
CqlRunner.revision_add_one()
CqlRunner.revision_get_ids()
CqlRunner.revision_get()
CqlRunner.revision_get_random()
CqlRunner.revision_get_token_range()
CqlRunner.revision_delete()
CqlRunner.revision_parent_add_one()
CqlRunner.revision_parent_get()
CqlRunner.revision_parent_delete()
CqlRunner.release_missing()
CqlRunner.release_add_one()
CqlRunner.release_get()
CqlRunner.release_get_random()
CqlRunner.release_get_token_range()
CqlRunner.release_delete()
CqlRunner.snapshot_missing()
CqlRunner.snapshot_add_one()
CqlRunner.snapshot_get_random()
CqlRunner.snapshot_get_token_range()
CqlRunner.snapshot_delete()
CqlRunner.snapshot_branch_add_one()
CqlRunner.snapshot_count_branches_from_name()
CqlRunner.snapshot_count_branches_before_name()
CqlRunner.snapshot_count_branches()
CqlRunner.snapshot_branch_get_from_name()
CqlRunner.snapshot_branch_get_range()
CqlRunner.snapshot_branch_get()
CqlRunner.snapshot_branch_delete()
CqlRunner.origin_add_one()
CqlRunner.origin_get_by_sha1()
CqlRunner.origin_get_by_url()
CqlRunner.origin_list()
CqlRunner.origin_iter_all()
CqlRunner.origin_bump_next_visit_id()
CqlRunner.origin_generate_unique_visit_id()
CqlRunner.origin_delete()
CqlRunner.origin_visit_get()
CqlRunner.origin_visit_add_one()
CqlRunner.origin_visit_get_one()
CqlRunner.origin_visit_iter_all()
CqlRunner.origin_visit_iter()
CqlRunner.origin_visit_delete()
CqlRunner.origin_visit_status_get_range()
CqlRunner.origin_visit_status_get_all_range()
CqlRunner.origin_visit_status_add_one()
CqlRunner.origin_visit_status_get_latest()
CqlRunner.origin_visit_status_get()
CqlRunner.origin_snapshot_get_all()
CqlRunner.origin_visit_status_delete()
CqlRunner.raw_extrinsic_metadata_by_id_add()
CqlRunner.raw_extrinsic_metadata_get_by_ids()
CqlRunner.raw_extrinsic_metadata_by_id_delete()
CqlRunner.raw_extrinsic_metadata_add()
CqlRunner.raw_extrinsic_metadata_get_after_date()
CqlRunner.raw_extrinsic_metadata_get_after_date_and_id()
CqlRunner.raw_extrinsic_metadata_get()
CqlRunner.raw_extrinsic_metadata_get_authorities()
CqlRunner.raw_extrinsic_metadata_delete()
CqlRunner.metadata_authority_add()
CqlRunner.metadata_authority_get()
CqlRunner.metadata_fetcher_add()
CqlRunner.metadata_fetcher_get()
CqlRunner.extid_add_prepare()
CqlRunner.extid_get_from_pk()
CqlRunner.extid_get_from_token()
CqlRunner.extid_get_from_token_and_extid_version()
CqlRunner.extid_get_from_extid()
CqlRunner.extid_get_from_extid_and_version()
CqlRunner.extid_get_from_target()
CqlRunner.extid_delete()
CqlRunner.extid_index_add_one()
CqlRunner.extid_delete_from_by_target_table()
CqlRunner.object_reference_add_concurrent()
CqlRunner.object_reference_get()
CqlRunner.stat_counters()
CqlRunner.check_read()
- swh.storage.cassandra.model module
MAGIC_NULL_PK
content_index_table_name()
BaseRow
ContentRow
SkippedContentRow
SkippedContentRow.TABLE
SkippedContentRow.PARTITION_KEY
SkippedContentRow.sha1
SkippedContentRow.sha1_git
SkippedContentRow.sha256
SkippedContentRow.blake2s256
SkippedContentRow.length
SkippedContentRow.ctime
SkippedContentRow.status
SkippedContentRow.reason
SkippedContentRow.origin
SkippedContentRow.from_dict()
DirectoryRow
DirectoryEntryRow
RevisionRow
RevisionParentRow
ReleaseRow
SnapshotRow
SnapshotBranchRow
OriginVisitRow
OriginVisitStatusRow
OriginVisitStatusRow.TABLE
OriginVisitStatusRow.PARTITION_KEY
OriginVisitStatusRow.CLUSTERING_KEY
OriginVisitStatusRow.origin
OriginVisitStatusRow.visit
OriginVisitStatusRow.date
OriginVisitStatusRow.type
OriginVisitStatusRow.status
OriginVisitStatusRow.metadata
OriginVisitStatusRow.snapshot
OriginVisitStatusRow.from_dict()
OriginRow
MetadataAuthorityRow
MetadataFetcherRow
RawExtrinsicMetadataRow
RawExtrinsicMetadataRow.TABLE
RawExtrinsicMetadataRow.PARTITION_KEY
RawExtrinsicMetadataRow.CLUSTERING_KEY
RawExtrinsicMetadataRow.id
RawExtrinsicMetadataRow.type
RawExtrinsicMetadataRow.target
RawExtrinsicMetadataRow.authority_type
RawExtrinsicMetadataRow.authority_url
RawExtrinsicMetadataRow.discovery_date
RawExtrinsicMetadataRow.fetcher_name
RawExtrinsicMetadataRow.fetcher_version
RawExtrinsicMetadataRow.format
RawExtrinsicMetadataRow.metadata
RawExtrinsicMetadataRow.origin
RawExtrinsicMetadataRow.visit
RawExtrinsicMetadataRow.snapshot
RawExtrinsicMetadataRow.release
RawExtrinsicMetadataRow.revision
RawExtrinsicMetadataRow.path
RawExtrinsicMetadataRow.directory
RawExtrinsicMetadataByIdRow
ObjectCountRow
ExtIDRow
ExtIDByTargetRow
ObjectReferenceRow
- swh.storage.cassandra.schema module
- swh.storage.cassandra.storage module
CassandraStorage
CassandraStorage.hosts
CassandraStorage.keyspace
CassandraStorage.port
CassandraStorage.check_config()
CassandraStorage.content_add()
CassandraStorage.content_update()
CassandraStorage.content_add_metadata()
CassandraStorage.content_get_data()
CassandraStorage.content_get_partition()
CassandraStorage.content_get()
CassandraStorage.content_find()
CassandraStorage.content_missing()
CassandraStorage.content_missing_per_sha1()
CassandraStorage.content_missing_per_sha1_git()
CassandraStorage.content_get_random()
CassandraStorage.skipped_content_add()
CassandraStorage.skipped_content_find()
CassandraStorage.skipped_content_missing()
CassandraStorage.directory_add()
CassandraStorage.directory_missing()
CassandraStorage.directory_entry_get_by_path()
CassandraStorage.directory_ls()
CassandraStorage.directory_get_entries()
CassandraStorage.directory_get_raw_manifest()
CassandraStorage.directory_get_random()
CassandraStorage.directory_get_id_partition()
CassandraStorage.revision_add()
CassandraStorage.revision_missing()
CassandraStorage.revision_get()
CassandraStorage.revision_get_partition()
CassandraStorage.revision_log()
CassandraStorage.revision_shortlog()
CassandraStorage.revision_get_random()
CassandraStorage.release_add()
CassandraStorage.release_missing()
CassandraStorage.release_get()
CassandraStorage.release_get_partition()
CassandraStorage.release_get_random()
CassandraStorage.snapshot_add()
CassandraStorage.snapshot_missing()
CassandraStorage.snapshot_get()
CassandraStorage.snapshot_get_id_partition()
CassandraStorage.snapshot_count_branches()
CassandraStorage.snapshot_get_branches()
CassandraStorage.snapshot_get_random()
CassandraStorage.snapshot_branch_get_by_name()
CassandraStorage.origin_get()
CassandraStorage.origin_get_one()
CassandraStorage.origin_get_by_sha1()
CassandraStorage.origin_list()
CassandraStorage.origin_search()
CassandraStorage.origin_count()
CassandraStorage.origin_snapshot_get_all()
CassandraStorage.origin_add()
CassandraStorage.origin_visit_add()
CassandraStorage.origin_visit_status_add()
CassandraStorage.origin_visit_get()
CassandraStorage.origin_visit_get_with_statuses()
CassandraStorage.origin_visit_status_get()
CassandraStorage.origin_visit_find_by_date()
CassandraStorage.origin_visit_get_by()
CassandraStorage.origin_visit_get_latest()
CassandraStorage.origin_visit_status_get_latest()
CassandraStorage.origin_visit_status_get_random()
CassandraStorage.object_find_by_sha1_git()
CassandraStorage.stat_counters()
CassandraStorage.refresh_stat_counters()
CassandraStorage.raw_extrinsic_metadata_add()
CassandraStorage.raw_extrinsic_metadata_get()
CassandraStorage.raw_extrinsic_metadata_get_by_ids()
CassandraStorage.raw_extrinsic_metadata_get_authorities()
CassandraStorage.metadata_fetcher_add()
CassandraStorage.metadata_fetcher_get()
CassandraStorage.metadata_authority_add()
CassandraStorage.metadata_authority_get()
CassandraStorage.extid_add()
CassandraStorage.extid_get_from_extid()
CassandraStorage.extid_get_from_target()
CassandraStorage.object_find_recent_references()
CassandraStorage.object_references_add()
CassandraStorage.object_delete()
CassandraStorage.extid_delete_for_target()
CassandraStorage.clear_buffers()
CassandraStorage.flush()
Module contents#
- swh.storage.cassandra.create_keyspace(hosts: List[str], keyspace: str, port: int = 9042, *, durable_writes=True, auth_provider: Dict | None = None)[source]#
- class swh.storage.cassandra.CassandraStorage(hosts, keyspace, objstorage=None, port=9042, journal_writer=None, allow_overwrite=False, consistency_level='ONE', directory_entries_insert_algo='one-by-one', auth_provider: Dict | None = None)[source]#
Bases:
object
A backend of swh-storage backed by Cassandra
- Parameters:
hosts – Seed Cassandra nodes, to start connecting to the cluster
keyspace – Name of the Cassandra database to use
objstorage – Passed as argument to
ObjStorage
; if unset, use a NoopObjStorage
port – Cassandra port
journal_writer – Passed as argument to
JournalWriter
allow_overwrite – Whether
*_add
functions will check if an object already exists in the database before sending it in an INSERT. False is the default as it is more efficient when there is a moderately high probability the object is already known, but True can be useful to overwrite existing objects (e.g. when applying a schema update), or when the database is known to be mostly empty. Note that a False value does not guarantee there won’t be any overwrite.
consistency_level – The default read/write consistency to use
directory_entries_insert_algo – Must be one of:
* one-by-one: naive, one INSERT per directory entry, serialized
* concurrent: one INSERT per directory entry, concurrent
* batch: using UNLOGGED BATCH to insert many entries in a few statements
auth_provider –
An optional dict describing the authentication provider to use. Must contain at least a
cls
entry and the parameters to pass to the constructor. For example:
auth_provider:
  cls: cassandra.auth.PlainTextAuthProvider
  username: myusername
  password: mypassword
- content_get_partition(partition_id: int, nb_partitions: int, page_token: str | None = None, limit: int = 1000) PagedResult[Content, str] [source]#
- skipped_content_find(content: HashDict) List[SkippedContent] [source]#
- directory_get_entries(directory_id: bytes, page_token: bytes | None = None, limit: int = 1000) PagedResult[DirectoryEntry, str] | None [source]#
- directory_get_id_partition(partition_id: int, nb_partitions: int, page_token: str | None = None, limit: int = 1000) PagedResult[bytes, str] [source]#
- revision_get(revision_ids: List[bytes], ignore_displayname: bool = False) List[Revision | None] [source]#
- revision_get_partition(partition_id: int, nb_partitions: int, page_token: str | None = None, limit: int = 1000) PagedResult[Revision, str] [source]#
- revision_log(revisions: List[bytes], ignore_displayname: bool = False, limit: int | None = None) Iterable[Dict[str, Any] | None] [source]#
- revision_shortlog(revisions: List[bytes], limit: int | None = None) Iterable[Tuple[bytes, Tuple[bytes, ...]] | None] [source]#
- release_get_partition(partition_id: int, nb_partitions: int, page_token: str | None = None, limit: int = 1000) PagedResult[Release, str] [source]#
- snapshot_get_id_partition(partition_id: int, nb_partitions: int, page_token: str | None = None, limit: int = 1000) PagedResult[bytes, str] [source]#
- snapshot_count_branches(snapshot_id: bytes, branch_name_exclude_prefix: bytes | None = None) Dict[str | None, int] | None [source]#
- snapshot_get_branches(snapshot_id: bytes, branches_from: bytes = b'', branches_count: int = 1000, target_types: List[str] | None = None, branch_name_include_substring: bytes | None = None, branch_name_exclude_prefix: bytes | None = None) PartialBranches | None [source]#
- snapshot_branch_get_by_name(snapshot_id: bytes, branch_name: bytes, follow_alias_chain: bool = True, max_alias_chain_length: int = 100) SnapshotBranchByNameResponse | None [source]#
- origin_get_one(origin_url: str) Origin | None [source]#
Given an origin url, return the origin if it exists, None otherwise
- origin_search(url_pattern: str, page_token: str | None = None, limit: int = 50, regexp: bool = False, with_visit: bool = False, visit_types: List[str] | None = None) PagedResult[Origin, str] [source]#
- origin_visit_add(visits: List[OriginVisit]) Iterable[OriginVisit] [source]#
- origin_visit_get(origin: str, page_token: str | None = None, order: ListOrder = ListOrder.ASC, limit: int = 10) PagedResult[OriginVisit, str] [source]#
- origin_visit_get_with_statuses(origin: str, allowed_statuses: List[str] | None = None, require_snapshot: bool = False, page_token: str | None = None, order: ListOrder = ListOrder.ASC, limit: int = 10) PagedResult[OriginVisitWithStatuses, str] [source]#
- origin_visit_status_get(origin: str, visit: int, page_token: str | None = None, order: ListOrder = ListOrder.ASC, limit: int = 10) PagedResult[OriginVisitStatus, str] [source]#
- origin_visit_find_by_date(origin: str, visit_date: datetime, type: str | None = None) OriginVisit | None [source]#
- origin_visit_get_latest(origin: str, type: str | None = None, allowed_statuses: List[str] | None = None, require_snapshot: bool = False) OriginVisit | None [source]#
- origin_visit_status_get_latest(origin_url: str, visit: int, allowed_statuses: List[str] | None = None, require_snapshot: bool = False) OriginVisitStatus | None [source]#
- origin_visit_status_get_random(type: str) OriginVisitStatus | None [source]#
- raw_extrinsic_metadata_add(metadata: List[RawExtrinsicMetadata]) Dict[str, int] [source]#
Add extrinsic metadata on objects (contents, directories, …).
- raw_extrinsic_metadata_get(target: ExtendedSWHID, authority: MetadataAuthority, after: datetime | None = None, page_token: bytes | None = None, limit: int = 1000) PagedResult[RawExtrinsicMetadata, str] [source]#
- raw_extrinsic_metadata_get_authorities(target: ExtendedSWHID) List[MetadataAuthority] [source]#
- metadata_authority_get(type: MetadataAuthorityType, url: str) MetadataAuthority | None [source]#
- extid_get_from_extid(id_type: str, ids: List[bytes], version: int | None = None) List[ExtID] [source]#
- extid_get_from_target(target_type: ObjectType, ids: List[bytes], extid_type: str | None = None, extid_version: int | None = None) List[ExtID] [source]#
- object_find_recent_references(target_swhid: ExtendedSWHID, limit: int) List[ExtendedSWHID] [source]#
- object_delete(swhids: List[ExtendedSWHID]) Dict[str, int] [source]#
Delete objects from the storage
All skipped content objects matching the given SWHID will be removed, including those that have the same SWHID due to hash collisions.
Origin objects are removed alongside their associated origin visit and origin visit status objects.
- Parameters:
swhids – list of SWHID of the objects to remove
- Returns:
number of objects removed. Details of each key:
- content:delete
Number of content objects removed
- content:delete:bytes
Sum of the removed contents’ data length
- skipped_content:delete
Number of skipped content objects removed
- directory:delete
Number of directory objects removed
- revision:delete
Number of revision objects removed
- release:delete
Number of release objects removed
- snapshot:delete
Number of snapshot objects removed
- origin:delete
Number of origin objects removed
- origin_visit:delete
Number of origin visit objects removed
- origin_visit_status:delete
Number of origin visit status objects removed
- ori_metadata:delete
Number of raw extrinsic metadata objects targeting an origin that have been removed
- snp_metadata:delete
Number of raw extrinsic metadata objects targeting a snapshot that have been removed
- rev_metadata:delete
Number of raw extrinsic metadata objects targeting a revision that have been removed
- rel_metadata:delete
Number of raw extrinsic metadata objects targeting a release that have been removed
- dir_metadata:delete
Number of raw extrinsic metadata objects targeting a directory that have been removed
- cnt_metadata:delete
Number of raw extrinsic metadata objects targeting a content that have been removed
- emd_metadata:delete
Number of raw extrinsic metadata objects targeting a raw extrinsic metadata object that have been removed
- Return type:
Dict[str, int]
- extid_delete_for_target(target_swhids: List[CoreSWHID]) Dict[str, int] [source]#
Delete ExtID objects from the storage
- Parameters:
target_swhids – list of SWHIDs targeted by the ExtID objects to remove
- Returns:
extid:delete: Number of ExtID objects removed
- Return type:
Summary dict with the following keys and associated values