Source code for swh.storage.algos.swhid
# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from typing import Iterable, Tuple, TypeVar
from swh.model.swhids import (
CoreSWHID,
ExtendedObjectType,
ExtendedSWHID,
ObjectType,
QualifiedSWHID,
)
from swh.storage.interface import StorageInterface
T = TypeVar("T", CoreSWHID, ExtendedSWHID, CoreSWHID | ExtendedSWHID)
[docs]
def known_swhids(storage: StorageInterface, swhids: Iterable[T]) -> set[T]:
"""Query the storage to check if ``swhids`` exist.
We group SWHIDs by type and then call the corresponding storage ``_missing`` method
(directory_missing, snapshot_missing, etc.) and switch the results as we want to
know what exists, not what's missing.
As the storage does not use values from the qualifier (snapshot, etc.) we exclude
``QualifiedSWHID`` from this method because
``known_swhids(storage, swh:1:cnt:1234567890;visit=swh:1:snp:098654321)``
could return that `swh:1:cnt:1234567890` exists even if `swh:1:snp:098654321`
doesn't.
Args:
storage: a ``StorageInterface``
swhid: a list of SWHIDs
Raises:
TypeError: received a ``QualifiedSWHID`` in the ``swhids`` list
Returns:
A set of SWHIDs found in the storage
"""
grouped_swhids: dict[ObjectType | ExtendedObjectType, list[T]] = defaultdict(list)
missing: set[Tuple[ObjectType | ExtendedObjectType, bytes]] = set()
for swhid in swhids:
if isinstance(swhid, QualifiedSWHID):
raise TypeError(
f"This method can't properly handle QualifiedSWHID like {swhid} "
"but only CoreSWHID or ExtendedSWHID"
)
grouped_swhids[swhid.object_type].append(swhid)
for object_type, objects in grouped_swhids.items():
if object_type == ObjectType.CONTENT:
storage_missing_method = storage.content_missing_per_sha1_git
else:
storage_missing_method = getattr(
storage, f"{object_type.name.lower()}_missing"
)
missing |= set(
(object_type, object_id)
for object_id in storage_missing_method(
[swhid.object_id for swhid in objects]
)
)
return {
swhid for swhid in swhids if (swhid.object_type, swhid.object_id) not in missing
}
[docs]
def swhid_is_known(storage: StorageInterface, swhid: CoreSWHID | ExtendedSWHID) -> bool:
"""Query the storage to check if ``swhid`` exists.
A wrapper for :meth:`known_swhids` but for a single SWHID.
Args:
storage: a ``StorageInterface``
swhid: a SWHID
Returns:
:const:`True` if ``swhid`` exists in the storage
"""
return swhid in known_swhids(storage, [swhid])