swh.storage.cassandra.cql module#

swh.storage.cassandra.cql.PARTITION_KEY_RESTRICTION_MAX_SIZE = 100#

Maximum number of restrictions in a single query. Usually this is a very low number (e.g. SELECT … FROM … WHERE x=?), but some queries can request arbitrarily many (e.g. SELECT … FROM … WHERE x IN ?).

This can cause performance issues, as the node receiving the query needs to coordinate with other nodes to get the complete results. See <scylladb/scylla#4797> for details and rationale.

swh.storage.cassandra.cql.get_execution_profiles(consistency_level: str = 'ONE') → Dict[object, ExecutionProfile][source]#

swh.storage.cassandra.cql.create_keyspace(cql_runner: CqlRunner, *, durable_writes=True) → None[source]#

swh.storage.cassandra.cql.create_table(cql_runner: CqlRunner, table_name: str) → None[source]#

swh.storage.cassandra.cql.mark_all_migrations_completed(cql_runner: CqlRunner) → None[source]#

class swh.storage.cassandra.cql.CqlRunner(hosts: List[str], keyspace: str, port: int, consistency_level: str, auth_provider: Dict | None = None, table_options: Dict[str, str] | None = None, register_user_types: bool = True)[source]#

Bases: object

Class managing prepared statements and building queries to be sent to Cassandra.

register_user_types()[source]#

MAX_RETRIES = 3#

execute_with_retries(statement, args: Sequence | None) → ResultSet[source]#

execute_many_with_retries(statement, args_list: Sequence[Tuple]) → Iterable[Dict[str, Any]][source]#

execute_many_statements_with_retries(statements_and_parameters: Sequence[Tuple[Any, Tuple]]) → Iterable[Dict[str, Any]][source]#

migration_add_one(migration: MigrationRow, *, statement) → None[source]#

migration_add_concurrent(migrations: List[MigrationRow], *, statement) → None[source]#

migration_get(migration_ids, *, statement) → Iterable[MigrationRow][source]#

migration_list(*, statement) → Iterable[MigrationRow][source]#

content_add_prepare(content: ContentRow, *, statement) → Tuple[int, Callable[[], None]][source]#: Prepares insertion of a Content to the main ‘content’ table. Returns a token (to be used in secondary tables), and a function to be called to perform the insertion in the main table.

content_get_from_pk(content_hashes: TotalHashDict, *, statement) → ContentRow | None[source]#

content_missing_from_all_hashes(contents_hashes: List[TotalHashDict]) → Iterator[TotalHashDict][source]#

content_get_from_tokens(tokens, *, statement) → Iterable[ContentRow][source]#

content_get_random(*, statement) → ContentRow | None[source]#

content_get_token_range(start: int, end: int, limit: int, *, statement) → Iterator[Tuple[int, ContentRow]][source]#: Returns an iterable of (token, row)

content_delete(content_hashes: TotalHashDict, *, statement) → None[source]#

content_index_add_one(algo: str, content: Content, token: int) → None[source]#: Adds a row mapping content[algo] to the token of the Content in the main ‘content’ table.

content_get_tokens_from_single_algo(algo: str, hashes: List[bytes]) → Iterable[int][source]#

skipped_content_add_prepare(content, *, statement) → Tuple[int, Callable[[], None]][source]#: Prepares insertion of a Content to the main ‘skipped_content’ table. Returns a token (to be used in secondary tables), and a function to be called to perform the insertion in the main table.

skipped_content_get_from_pk(content_hashes: Dict[str, bytes], *, statement) → SkippedContentRow | None[source]#

skipped_content_get_from_token(token, *, statement) → Iterable[SkippedContentRow][source]#

skipped_content_delete(skipped_content_hashes: TotalHashDict, *, statement) → None[source]#

skipped_content_index_add_one(algo: str, content: SkippedContent, token: int) → None[source]#: Adds a row mapping content[algo] to the token of the SkippedContent in the main ‘skipped_content’ table.

skipped_content_get_tokens_from_single_hash(algo: str, hash_: bytes) → Iterable[int][source]#

directory_missing(ids: List[bytes], *, statement) → List[bytes][source]#

directory_add_one(directory: DirectoryRow, *, statement) → None[source]#: Called after all calls to directory_entry_add_one, to commit/finalize the directory.

directory_get_random(*, statement) → DirectoryRow | None[source]#

directory_get(directory_ids: List[bytes], *, statement) → Iterable[DirectoryRow][source]#: Return fields from the main directory table (e.g. raw_manifest, but not entries)

directory_get_token_range(start: int, end: int, limit: int, *, statement) → Iterator[Tuple[int, DirectoryRow]][source]#: Returns an iterable of (token, row)

directory_delete(directory_id: bytes, *, statement) → None[source]#

directory_entry_add_one(entry: DirectoryEntryRow, *, statement) → None[source]#

directory_entry_add_concurrent(entries: List[DirectoryEntryRow], *, statement) → None[source]#

directory_entry_add_batch(entries: List[DirectoryEntryRow], *, statement) → None[source]#

directory_entry_get(directory_ids, *, statement) → Iterable[DirectoryEntryRow][source]#

directory_entry_get_from_name(directory_id: bytes, from_: bytes, limit: int, *, statement) → Iterable[DirectoryEntryRow][source]#

directory_entry_delete(directory_id: bytes, *, statement) → None[source]#

revision_missing(ids: List[bytes], *, statement) → List[bytes][source]#

revision_add_one(revision: RevisionRow, *, statement) → None[source]#

revision_get_ids(revision_ids, *, statement) → Iterable[bytes][source]#

revision_get(revision_ids: List[bytes], *, statement) → Iterable[RevisionRow][source]#

revision_get_random(*, statement) → RevisionRow | None[source]#

revision_get_token_range(start: int, end: int, limit: int, *, statement) → Iterator[Tuple[int, RevisionRow]][source]#: Returns an iterable of (token, row)

revision_delete(revision_id: bytes, *, statement) → None[source]#

revision_parent_add_one(revision_parent: RevisionParentRow, *, statement) → None[source]#

revision_parent_get(revision_id: bytes, *, statement) → Iterable[bytes][source]#

revision_parent_delete(revision_id: bytes, *, statement) → None[source]#

release_missing(ids: List[bytes], *, statement) → List[bytes][source]#

release_add_one(release: ReleaseRow, *, statement) → None[source]#

release_get(release_ids: List[bytes], *, statement) → Iterable[ReleaseRow][source]#

release_get_random(*, statement) → ReleaseRow | None[source]#

release_get_token_range(start: int, end: int, limit: int, *, statement) → Iterator[Tuple[int, ReleaseRow]][source]#: Returns an iterable of (token, row)

release_delete(release_id: bytes, *, statement) → None[source]#

snapshot_missing(ids: List[bytes], *, statement) → List[bytes][source]#

snapshot_add_one(snapshot: SnapshotRow, *, statement) → None[source]#

snapshot_get_random(*, statement) → SnapshotRow | None[source]#

snapshot_get_token_range(start: int, end: int, limit: int, *, statement) → Iterator[Tuple[int, SnapshotRow]][source]#: Returns an iterable of (token, row)

snapshot_delete(snapshot_id: bytes, *, statement) → None[source]#

snapshot_branch_add_one(branch: SnapshotBranchRow, *, statement) → None[source]#

snapshot_count_branches_from_name(snapshot_id: bytes, from_: bytes, *, statement) → Dict[str | None, int][source]#

snapshot_count_branches_before_name(snapshot_id: bytes, before: bytes, *, statement) → Dict[str | None, int][source]#

snapshot_count_branches(snapshot_id: bytes, branch_name_exclude_prefix: bytes | None = None) → Dict[str | None, int][source]#: Returns a dictionary from type names to the number of branches of that type.

snapshot_branch_get_from_name(snapshot_id: bytes, from_: bytes, limit: int, *, statement) → Iterable[SnapshotBranchRow][source]#

snapshot_branch_get_range(snapshot_id: bytes, from_: bytes, before: bytes, limit: int, *, statement) → Iterable[SnapshotBranchRow][source]#

snapshot_branch_get(snapshot_id: bytes, from_: bytes, limit: int, branch_name_exclude_prefix: bytes | None = None) → Iterable[SnapshotBranchRow][source]#

snapshot_branch_delete(snapshot_id: bytes, *, statement) → None[source]#

origin_add_one(origin: OriginRow, *, statement) → None[source]#

origin_get_by_sha1(sha1: bytes, *, statement) → Iterable[OriginRow][source]#

origin_get_by_url(url: str) → Iterable[OriginRow][source]#

origin_list(start_token: int, limit: int, *, statement) → Iterable[Tuple[int, OriginRow]][source]#: Returns an iterable of (token, origin)

origin_iter_all(*, statement) → Iterable[OriginRow][source]#

origin_bump_next_visit_id(origin_url: str, visit_id: int, *, statement) → None[source]#

origin_generate_unique_visit_id(origin_url: str, *, statement) → int[source]#

origin_delete(sha1: bytes, *, statement) → None[source]#

origin_visit_get(origin_url: str, last_visit: int | None, limit: int, order: ListOrder, *, statements) → Iterable[OriginVisitRow][source]#

origin_visit_add_one(visit: OriginVisitRow, *, statement) → None[source]#

origin_visit_get_one(origin_url: str, visit_id: int, *, statement) → OriginVisitRow | None[source]#

origin_visit_iter_all(origin_url: str, *, statement) → Iterable[OriginVisitRow][source]#: Returns an iterator on visits for a given origin, ordered by descending visit id.

origin_visit_iter(start_token: int) → Iterator[OriginVisitRow][source]#: Returns all origin visits in order from this token, and wraps around the token space.

origin_visit_delete(origin_url: str, *, statement) → None[source]#

origin_visit_status_get_range(origin: str, visit: int, date_from: datetime | None, limit: int, order: ListOrder, *, statements) → Iterable[OriginVisitStatusRow][source]#

origin_visit_status_get_all_range(origin_url: str, visit_from: int, visit_to: int, *, statement) → Iterable[OriginVisitStatusRow][source]#

origin_visit_status_add_one(visit_update: OriginVisitStatusRow, *, statement) → None[source]#

origin_visit_status_get_latest(origin: str, visit: int) → OriginVisitStatusRow | None[source]#: Given an origin visit id, return its latest origin_visit_status

origin_visit_status_get(origin: str, visit: int, *, statement) → Iterator[OriginVisitStatusRow][source]#: Return all origin visit statuses for a given visit

origin_snapshot_get_all(origin: str, *, statement) → Iterable[bytes][source]#

origin_visit_status_delete(origin_url: str, *, statement) → None[source]#

raw_extrinsic_metadata_by_id_add(row, *, statement)[source]#

raw_extrinsic_metadata_get_by_ids(ids: List[bytes], *, statement) → Iterable[RawExtrinsicMetadataByIdRow][source]#

raw_extrinsic_metadata_by_id_delete(emd_id, *, statement)[source]#

raw_extrinsic_metadata_add(raw_extrinsic_metadata, *, statement)[source]#

raw_extrinsic_metadata_get_after_date(target: str, authority_type: str, authority_url: str, after: datetime, *, statement) → Iterable[RawExtrinsicMetadataRow][source]#

raw_extrinsic_metadata_get_after_date_and_id(target: str, authority_type: str, authority_url: str, after_date: datetime, after_id: bytes, *, statement) → Iterable[RawExtrinsicMetadataRow][source]#

raw_extrinsic_metadata_get(target: str, authority_type: str, authority_url: str, *, statement) → Iterable[RawExtrinsicMetadataRow][source]#

raw_extrinsic_metadata_get_authorities(target: str, *, statement) → Iterable[Tuple[str, str]][source]#

raw_extrinsic_metadata_delete(target, authority_type, authority_url, discovery_date, emd_id, *, statement)[source]#

metadata_authority_add(authority: MetadataAuthorityRow, *, statement)[source]#

metadata_authority_get(type, url, *, statement) → MetadataAuthorityRow | None[source]#

metadata_fetcher_add(fetcher, *, statement)[source]#

metadata_fetcher_get(name, version, *, statement) → MetadataFetcherRow | None[source]#

extid_add_prepare(extid: ExtIDRow, *, statement) → Tuple[int, Callable[[], None]][source]#

extid_get_from_pk(extid_type: str, extid: bytes, extid_version: int, target: CoreSWHID, *, statement) → ExtIDRow | None[source]#

extid_get_from_token(token: int, *, statement) → Iterable[ExtIDRow][source]#

extid_get_from_token_and_extid_version(token: int, extid_version: int, *, statement) → Iterable[ExtIDRow][source]#

extid_get_from_extid(extid_type: str, extid: bytes, *, statement) → Iterable[ExtIDRow][source]#

extid_get_from_extid_and_version(extid_type: str, extid: bytes, extid_version: int, *, statement) → Iterable[ExtIDRow][source]#

extid_get_from_target(target_type: str, target: bytes, extid_type: str | None = None, extid_version: int | None = None) → Iterable[ExtIDRow][source]#

extid_delete(extid_type: str, extid: bytes, extid_version: int, target_type: str, target: bytes, *, statement) → None[source]#

extid_index_add_one(row: ExtIDByTargetRow, *, statement) → None[source]#: Adds a row mapping extid[target_type, target] to the token of the ExtID in the main ‘extid’ table.

extid_delete_from_by_target_table(target_type: str, target: bytes, *, statement) → None[source]#

object_reference_add_concurrent(entries: List[ObjectReferenceRow], *, statement) → None[source]#

object_reference_get(target: bytes, target_type: str, limit: int) → Iterable[ObjectReferenceRow][source]#

object_references_list_tables(*, statement) → List[ObjectReferencesTableRow][source]#

List existing tables of the object_references table, ordered from the oldest to the most recent.

Its result is cached per-CqlRunner instance for an hour, to avoid a round-trip on every object write.

object_references_create_table(date: Tuple[int, int], *, statement) → Tuple[date, date][source]#: Create the table of the object_references table for the given ISO year and week.

object_references_drop_table(year: int, week: int) → None[source]#: Delete the table of the object_references table for the given ISO year and week.

stat_counters() → Iterable[ObjectCountRow][source]#

check_read(*, statement)[source]#

swh.storage.cassandra.cql module#

This Page