swh.storage.cassandra.model module
Classes representing tables in the Cassandra database.
They are very close to classes found in swh.model.model, but most of
them are subtly different:
Large objects are split into other classes (e.g. RevisionRow has no
‘parents’ field, because parents are stored in a different table,
represented by RevisionParentRow)
They have a “cols” field, which returns the list of column names
of the table
They only use types that map directly to Cassandra’s schema (i.e. no enums)
Therefore, this model doesn’t reuse swh.model.model, except for types
that can be mapped to UDTs (Person and TimestampWithTimezone).
Fields may have dataclasses metadata keys:
fk
if the existence of a corresponding row in a different table is almost guaranteed
(up to loaders not crashing and eventual-consistency settling down) and
points_to
if they are a Merkle-DAG link to another object (which is more likely
to be missing).
This is used by swh.storage.cassandra.diagram.dot_diagram()
.
-
swh.storage.cassandra.model.MAGIC_NULL_PK = b'<null>'
NULLs (or all-empty blobs) are not allowed in primary keys; instead we use a
special value that can’t possibly be a valid hash.
-
swh.storage.cassandra.model.content_index_table_name(algo: str, skipped_content: bool) → str[source]
Given an algorithm name, returns the name of one of the ‘content_by_*’
and ‘skipped_content_by_*’ tables that serve as index for the ‘content’
and ‘skipped_content’ tables based on this algorithm’s hashes.
For now it is a simple substitution, but future versions may append a version
number to it, if needed for schema updates.
-
class swh.storage.cassandra.model.BaseRow[source]
Bases: object
-
TABLE: ClassVar[str]
-
PARTITION_KEY: ClassVar[Tuple[str, ...]]
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()
-
classmethod denullify_clustering_key(ck: Tuple) → Tuple[source]
If this class has Optional fields used as a clustering key, this replaces
such values in the given clustering key so it is suitable for sorting purposes
-
classmethod from_dict(d: Dict[str, Any]) → T[source]
-
classmethod cols() → List[str][source]
-
to_dict() → Dict[str, Any][source]
-
class swh.storage.cassandra.model.ContentRow(sha1: bytes, sha1_git: bytes, sha256: bytes, blake2s256: bytes, length: int, ctime: datetime.datetime | None, status: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'content'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha256',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'blake2s256')
-
sha1: bytes
-
sha1_git: bytes
-
sha256: bytes
-
blake2s256: bytes
-
length: int
-
ctime: datetime | None
creation time, i.e. time of (first) injection into the storage
-
status: str
-
class swh.storage.cassandra.model.SkippedContentRow(sha1: bytes | None, sha1_git: bytes | None, sha256: bytes | None, blake2s256: bytes | None, length: int | None, ctime: datetime.datetime | None, status: str, reason: str, origin: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'skipped_content'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'sha256', 'blake2s256')
-
sha1: bytes | None
-
sha1_git: bytes | None
-
sha256: bytes | None
-
blake2s256: bytes | None
-
length: int | None
-
ctime: datetime | None
creation time, i.e. time of (first) injection into the storage
-
status: str
-
reason: str
-
origin: str
-
classmethod denullify_clustering_key(ck: Tuple) → Tuple[source]
If this class has Optional fields used as a clustering key, this replaces
such values in the given clustering key so it is suitable for sorting purposes
-
classmethod from_dict(d: Dict[str, Any]) → SkippedContentRow[source]
-
class swh.storage.cassandra.model.DirectoryRow(id: bytes, raw_manifest: bytes | None)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'directory'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
id: bytes
-
raw_manifest: bytes | None
NULL if the object can be rebuilt from (sorted) entries
-
class swh.storage.cassandra.model.DirectoryEntryRow(directory_id: bytes, name: bytes, target: bytes, perms: int, type: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'directory_entry'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('directory_id',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
-
directory_id: bytes
-
name: bytes
path name, relative to containing dir
-
target: bytes
-
perms: int
unix-like permissions
-
type: str
target type
-
class swh.storage.cassandra.model.RevisionRow(id: bytes, date: swh.model.model.TimestampWithTimezone | None, committer_date: swh.model.model.TimestampWithTimezone | None, type: str, directory: bytes, message: bytes, author: swh.model.model.Person, committer: swh.model.model.Person, synthetic: bool, metadata: str, extra_headers: dict, raw_manifest: bytes | None)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'revision'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
id: bytes
-
date: TimestampWithTimezone | None
-
committer_date: TimestampWithTimezone | None
-
type: str
-
directory: bytes
source code “root” directory
-
message: bytes
-
author: Person
-
committer: Person
-
synthetic: bool
true iff revision has been created by Software Heritage
-
metadata: str
extra metadata as JSON (tarball checksums, etc.)
-
extra_headers: dict
extra commit information as (tuple(key, value), …)
-
raw_manifest: bytes | None
NULL if the object can be rebuilt from other cells and revision_parent.
-
class swh.storage.cassandra.model.RevisionParentRow(id: bytes, parent_rank: int, parent_id: bytes)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'revision_parent'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('parent_rank',)
-
id: bytes
-
parent_rank: int
parent position in merge commits, 0-based
-
parent_id: bytes
-
class swh.storage.cassandra.model.ReleaseRow(id: bytes, target_type: str, target: bytes, date: swh.model.model.TimestampWithTimezone, name: bytes, message: bytes, author: swh.model.model.Person, synthetic: bool, raw_manifest: bytes | None)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'release'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
id: bytes
-
target_type: str
-
target: bytes
-
date: TimestampWithTimezone
-
name: bytes
-
message: bytes
-
author: Person
-
synthetic: bool
true iff release has been created by Software Heritage
-
raw_manifest: bytes | None
NULL if the object can be rebuilt from other cells
-
class swh.storage.cassandra.model.SnapshotRow(id: bytes)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'snapshot'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
id: bytes
-
class swh.storage.cassandra.model.SnapshotBranchRow(snapshot_id: bytes, name: bytes, target_type: str | None, target: bytes | None)[source]
Bases: BaseRow
For a given snapshot_id, branches are sorted by their name,
allowing easy pagination.
-
TABLE: ClassVar[str] = 'snapshot_branch'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('snapshot_id',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
-
snapshot_id: bytes
-
name: bytes
-
target_type: str | None
-
target: bytes | None
-
class swh.storage.cassandra.model.OriginVisitRow(origin: str, visit: int, date: datetime.datetime, type: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'origin_visit'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit',)
-
origin: str
-
visit: int
-
date: datetime
-
type: str
-
class swh.storage.cassandra.model.OriginVisitStatusRow(origin: str, visit: int, date: datetime.datetime, type: str, status: str, metadata: str, snapshot: bytes)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'origin_visit_status'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit', 'date')
-
origin: str
-
visit: int
-
date: datetime
-
type: str
-
status: str
-
metadata: str
-
snapshot: bytes
-
classmethod from_dict(d: Dict[str, Any]) → T[source]
-
class swh.storage.cassandra.model.OriginRow(sha1: bytes, url: str, next_visit_id: int)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'origin'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1',)
-
sha1: bytes
-
url: str
-
next_visit_id: int
We need integer visit ids for compatibility with the pgsql
storage, so we’re using lightweight transactions with this trick:
https://stackoverflow.com/a/29391877/539465
-
class swh.storage.cassandra.model.MetadataAuthorityRow(url: str, type: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'metadata_authority'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('url',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('type',)
-
url: str
-
type: str
-
class swh.storage.cassandra.model.MetadataFetcherRow(name: str, version: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'metadata_fetcher'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('name',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('version',)
-
name: str
-
version: str
-
class swh.storage.cassandra.model.RawExtrinsicMetadataRow(id: bytes, type: str, target: str, authority_type: str, authority_url: str, discovery_date: datetime, fetcher_name: str, fetcher_version: str, format: str, metadata: bytes, origin: str | None, visit: int | None, snapshot: str | None, release: str | None, revision: str | None, path: bytes | None, directory: str | None)[source]
Bases: BaseRow
An explanation is in order for the primary key:
Intuitively, the primary key should only be ‘id’, because two metadata
entries are the same iff the id is the same; and ‘id’ is used for
deduplication.
However, we also want to query by
(target, authority_type, authority_url, discovery_date)
The naive solution to this would be an extra table, to use as index;
but it means 1. extra code to keep them in sync 2. overhead when writing
3. overhead + random reads (instead of linear) when reading.
Therefore, we use a single table for both, by adding the column
we want to query with before the id.
It solves both a) the query/order issues and b) the uniqueness issue because:
adding the id at the end of the primary key does not change the rows’ order:
for two different rows, id1 != id2, so
(target1, …, date1) < (target2, …, date2)
<=> (target1, …, date1, id1) < (target2, …, date2, id2)
the id is a hash of all the columns, so:
rows are the same
<=> id1 == id2
<=> (target1, …, date1, id1) == (target2, …, date2, id2)
-
TABLE: ClassVar[str] = 'raw_extrinsic_metadata'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('authority_type', 'authority_url', 'discovery_date', 'id')
-
id: bytes
-
type: str
-
target: str
-
authority_type: str
-
authority_url: str
-
discovery_date: datetime
-
fetcher_name: str
-
fetcher_version: str
-
format: str
-
metadata: bytes
-
origin: str | None
-
visit: int | None
-
snapshot: str | None
-
release: str | None
-
revision: str | None
-
path: bytes | None
-
directory: str | None
-
class swh.storage.cassandra.model.RawExtrinsicMetadataByIdRow(id: bytes, target: str, authority_type: str, authority_url: str)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'raw_extrinsic_metadata_by_id'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()
-
id: bytes
-
target: str
-
authority_type: str
-
authority_url: str
-
class swh.storage.cassandra.model.ObjectCountRow(partition_key: int, object_type: str, count: int)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'object_count'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('partition_key',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('object_type',)
-
partition_key: int
-
object_type: str
-
count: int
-
class swh.storage.cassandra.model.ExtIDRow(extid_type: str, extid: bytes, extid_version: int, target_type: str, target: bytes)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'extid'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('extid_type', 'extid')
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('extid_version', 'target_type', 'target')
-
extid_type: str
-
extid: bytes
-
extid_version: int
-
target_type: str
-
target: bytes
-
class swh.storage.cassandra.model.ExtIDByTargetRow(target_type: str, target: bytes, target_token: int)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'extid_by_target'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('target_token',)
-
target_type: str
-
target: bytes
-
target_token: int
value of token(pk) on the “primary” table
-
class swh.storage.cassandra.model.ObjectReferenceRow(target_type: str, target: bytes, source_type: str, source: bytes)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'object_references'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('source_type', 'source')
-
target_type: str
-
target: bytes
-
source_type: str
-
source: bytes
-
class swh.storage.cassandra.model.ObjectReferencesTableRow(pk: int, name: str, year: int, week: int, start: cassandra.util.Date, end: cassandra.util.Date)[source]
Bases: BaseRow
-
TABLE: ClassVar[str] = 'object_references_table'
-
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('pk',)
-
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)
-
pk: int
always zero, puts everything in the same Cassandra partition for faster querying
-
name: str
-
year: int
ISO year.
-
week: int
ISO week.
-
start: Date
-
end: Date