swh.storage.cassandra.model module#

Classes representing tables in the Cassandra database.

They are very close to classes found in swh.model.model, but most of them are subtly different:

  • Large objects are split into other classes (e.g. RevisionRow has no ‘parents’ field, because parents are stored in a different table, represented by RevisionParentRow)

  • They have a “cols” field, which returns the list of column names of the table

  • They only use types that map directly to Cassandra’s schema (i.e. no enums)

Therefore, this model doesn’t reuse swh.model.model, except for types that can be mapped to UDTs (Person and TimestampWithTimezone).

swh.storage.cassandra.model.MAGIC_NULL_PK = b'<null>'#

NULLs (or all-empty blobs) are not allowed in primary keys; instead we use a special value that can’t possibly be a valid hash.

swh.storage.cassandra.model.content_index_table_name(algo: str, skipped_content: bool) str[source]#

Given an algorithm name, returns the name of one of the ‘content_by_*’ and ‘skipped_content_by_*’ tables that serve as index for the ‘content’ and ‘skipped_content’ tables based on this algorithm’s hashes.

For now it is a simple substitution, but future versions may append a version number to it, if needed for schema updates.

class swh.storage.cassandra.model.BaseRow[source]#

Bases: object

TABLE: ClassVar[str]#
PARTITION_KEY: ClassVar[Tuple[str, ...]]#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()#
classmethod from_dict(d: Dict[str, Any]) T[source]#
classmethod cols() List[str][source]#
to_dict() Dict[str, Any][source]#
class swh.storage.cassandra.model.ContentRow(sha1: bytes, sha1_git: bytes, sha256: bytes, blake2s256: bytes, length: int, ctime: datetime.datetime, status: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'content'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha256',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'blake2s256')#
sha1: bytes#
sha1_git: bytes#
sha256: bytes#
blake2s256: bytes#
length: int#
ctime: datetime#

creation time, i.e. time of (first) injection into the storage

status: str#
class swh.storage.cassandra.model.SkippedContentRow(sha1: bytes | None, sha1_git: bytes | None, sha256: bytes | None, blake2s256: bytes | None, length: int | None, ctime: datetime.datetime | None, status: str, reason: str, origin: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'skipped_content'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1', 'sha1_git', 'sha256', 'blake2s256')#
sha1: bytes | None#
sha1_git: bytes | None#
sha256: bytes | None#
blake2s256: bytes | None#
length: int | None#
ctime: datetime | None#

creation time, i.e. time of (first) injection into the storage

status: str#
reason: str#
origin: str#
classmethod from_dict(d: Dict[str, Any]) SkippedContentRow[source]#
class swh.storage.cassandra.model.DirectoryRow(id: bytes, raw_manifest: bytes | None)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'directory'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
id: bytes#
raw_manifest: bytes | None#

NULL if the object can be rebuilt from (sorted) entries

class swh.storage.cassandra.model.DirectoryEntryRow(directory_id: bytes, name: bytes, target: bytes, perms: int, type: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'directory_entry'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('directory_id',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)#
directory_id: bytes#
name: bytes#

path name, relative to containing dir

target: bytes#
perms: int#

unix-like permissions

type: str#

target type

class swh.storage.cassandra.model.RevisionRow(id: bytes, date: swh.model.model.TimestampWithTimezone | None, committer_date: swh.model.model.TimestampWithTimezone | None, type: str, directory: bytes, message: bytes, author: swh.model.model.Person, committer: swh.model.model.Person, synthetic: bool, metadata: str, extra_headers: dict, raw_manifest: bytes | None)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'revision'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
id: bytes#
date: TimestampWithTimezone | None#
committer_date: TimestampWithTimezone | None#
type: str#
directory: bytes#

source code “root” directory

message: bytes#
author: Person#
committer: Person#
synthetic: bool#

true iff revision has been created by Software Heritage

metadata: str#

extra metadata as JSON (tarball checksums, etc…)

extra_headers: dict#

extra commit information as (tuple(key, value), …)

raw_manifest: bytes | None#

NULL if the object can be rebuilt from other cells and revision_parent.

class swh.storage.cassandra.model.RevisionParentRow(id: bytes, parent_rank: int, parent_id: bytes)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'revision_parent'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('parent_rank',)#
id: bytes#
parent_rank: int#

parent position in merge commits, 0-based

parent_id: bytes#
class swh.storage.cassandra.model.ReleaseRow(id: bytes, target_type: str, target: bytes, date: swh.model.model.TimestampWithTimezone, name: bytes, message: bytes, author: swh.model.model.Person, synthetic: bool, raw_manifest: bytes | None)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'release'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
id: bytes#
target_type: str#
target: bytes#
date: TimestampWithTimezone#
name: bytes#
message: bytes#
author: Person#
synthetic: bool#

true iff release has been created by Software Heritage

raw_manifest: bytes | None#

NULL if the object can be rebuilt from other cells

class swh.storage.cassandra.model.SnapshotRow(id: bytes)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'snapshot'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
id: bytes#
class swh.storage.cassandra.model.SnapshotBranchRow(snapshot_id: bytes, name: bytes, target_type: str | None, target: bytes | None)[source]#

Bases: BaseRow

For a given snapshot_id, branches are sorted by their name, allowing easy pagination.

TABLE: ClassVar[str] = 'snapshot_branch'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('snapshot_id',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('name',)#
snapshot_id: bytes#
name: bytes#
target_type: str | None#
target: bytes | None#
class swh.storage.cassandra.model.OriginVisitRow(origin: str, visit: int, date: datetime.datetime, type: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'origin_visit'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit',)#
origin: str#
visit: int#
date: datetime#
type: str#
class swh.storage.cassandra.model.OriginVisitStatusRow(origin: str, visit: int, date: datetime.datetime, type: str, status: str, metadata: str, snapshot: bytes)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'origin_visit_status'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('origin',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('visit', 'date')#
origin: str#
visit: int#
date: datetime#
type: str#
status: str#
metadata: str#
snapshot: bytes#
classmethod from_dict(d: Dict[str, Any]) T[source]#
class swh.storage.cassandra.model.OriginRow(sha1: bytes, url: str, next_visit_id: int)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'origin'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('sha1',)#
sha1: bytes#
url: str#
next_visit_id: int#

We need integer visit ids for compatibility with the pgsql storage, so we’re using lightweight transactions with this trick: https://stackoverflow.com/a/29391877/539465

class swh.storage.cassandra.model.MetadataAuthorityRow(url: str, type: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'metadata_authority'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('url',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('type',)#
url: str#
type: str#
class swh.storage.cassandra.model.MetadataFetcherRow(name: str, version: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'metadata_fetcher'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('name',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('version',)#
name: str#
version: str#
class swh.storage.cassandra.model.RawExtrinsicMetadataRow(id: bytes, type: str, target: str, authority_type: str, authority_url: str, discovery_date: datetime, fetcher_name: str, fetcher_version: str, format: str, metadata: bytes, origin: str | None, visit: int | None, snapshot: str | None, release: str | None, revision: str | None, path: bytes | None, directory: str | None)[source]#

Bases: BaseRow

An explanation is in order for the primary key:

Intuitively, the primary key should only be ‘id’, because two metadata entries are the same iff the id is the same; and ‘id’ is used for deduplication.

However, we also want to query by (target, authority_type, authority_url, discovery_date). The naive solution to this would be an extra table, to use as an index; but that would mean 1. extra code to keep them in sync 2. overhead when writing 3. overhead + random reads (instead of linear) when reading.

Therefore, we use a single table for both, by adding the column we want to query with before the id. It solves both a) the query/order issues and b) the uniqueness issue because:

  1. adding the id at the end of the primary key does not change the rows’ order: for two different rows, id1 != id2, so (target1, …, date1) < (target2, …, date2) <=> (target1, …, date1, id1) < (target2, …, date2, id2)

  2. the id is a hash of all the columns, so: rows are the same <=> id1 == id2 <=> (target1, …, date1, id1) == (target2, …, date2, id2)

TABLE: ClassVar[str] = 'raw_extrinsic_metadata'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('authority_type', 'authority_url', 'discovery_date', 'id')#
id: bytes#
type: str#
target: str#
authority_type: str#
authority_url: str#
discovery_date: datetime#
fetcher_name: str#
fetcher_version: str#
format: str#
metadata: bytes#
origin: str | None#
visit: int | None#
snapshot: str | None#
release: str | None#
revision: str | None#
path: bytes | None#
directory: str | None#
class swh.storage.cassandra.model.RawExtrinsicMetadataByIdRow(id: bytes, target: str, authority_type: str, authority_url: str)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'raw_extrinsic_metadata_by_id'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('id',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ()#
id: bytes#
target: str#
authority_type: str#
authority_url: str#
class swh.storage.cassandra.model.ObjectCountRow(partition_key: int, object_type: str, count: int)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'object_count'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('partition_key',)#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('object_type',)#
partition_key: int#
object_type: str#
count: int#
class swh.storage.cassandra.model.ExtIDRow(extid_type: str, extid: bytes, extid_version: int, target_type: str, target: bytes)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'extid'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('extid_type', 'extid')#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('extid_version', 'target_type', 'target')#
extid_type: str#
extid: bytes#
extid_version: int#
target_type: str#
target: bytes#
class swh.storage.cassandra.model.ExtIDByTargetRow(target_type: str, target: bytes, target_token: int)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'extid_by_target'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('target_token',)#
target_type: str#
target: bytes#
target_token: int#

value of token(pk) on the “primary” table

class swh.storage.cassandra.model.ObjectReferenceRow(target_type: str, target: bytes, source_type: str, source: bytes)[source]#

Bases: BaseRow

TABLE: ClassVar[str] = 'object_references'#
PARTITION_KEY: ClassVar[Tuple[str, ...]] = ('target_type', 'target')#
CLUSTERING_KEY: ClassVar[Tuple[str, ...]] = ('source_type', 'source')#
target_type: str#
target: bytes#
source_type: str#
source: bytes#