Source code for swh.web.utils.identifiers

# Copyright (C) 2020-2024  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict, Iterable, List, Mapping, Optional
from urllib.parse import quote, unquote

from typing_extensions import TypedDict

from swh.model.exceptions import ValidationError
from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID
from swh.web.utils import archive, reverse
from swh.web.utils.exc import BadInputExc
from swh.web.utils.typing import SnapshotContext, SWHIDContext, SWHIDInfo, SWHObjectInfo


def parse_object_type(object_type: str) -> ObjectType:
    """
    Convert the textual name of a swh object type into its enum member.

    The lookup is case-insensitive: the provided name is upper-cased before
    being matched against the member names of ``ObjectType``.

    Args:
        object_type: name of the swh object type

    Returns:
        the ``ObjectType`` member whose name matches

    Raises:
        BadInputExc: if the name does not match any ``ObjectType`` member
    """
    try:
        return ObjectType[object_type.upper()]
    except KeyError:
        valid_types = ", ".join(variant.name.lower() for variant in ObjectType)
        # the KeyError only echoes the upper-cased name; the message below
        # already carries that information, so the chain is suppressed
        raise BadInputExc(
            f"Invalid swh object type! Valid types are {valid_types}; not {object_type}"
        ) from None
def gen_swhid(
    object_type: ObjectType,
    object_id: str,
    scheme_version: int = 1,
    metadata: Optional[SWHIDContext] = None,
) -> str:
    """
    Returns the SoftWare Hash IDentifier for a swh object based on:

        * the object type
        * the object id
        * the SWHID scheme version
        * optional qualifiers

    Args:
        object_type: the swh object type
            (content/directory/release/revision/snapshot)
        object_id: the swh object id (hexadecimal representation
            of its hash value)
        scheme_version: the scheme version of the SWHIDs
        metadata: optional qualifiers, forwarded as keyword arguments to
            ``QualifiedSWHID``; no qualifier is set when omitted

    Returns:
        the SWHID of the object

    Raises:
        BadInputExc: if the provided parameters do not enable to
            generate a valid identifier
    """
    try:
        decoded_object_id = hash_to_bytes(object_id)
        obj_swhid = str(
            QualifiedSWHID(
                object_type=object_type,
                object_id=decoded_object_id,
                scheme_version=scheme_version,
                # a None sentinel replaces the former shared ``{}`` default
                **(metadata or {}),
            )
        )
    except (ValidationError, KeyError, ValueError) as e:
        # keep the low-level error attached as the explicit cause
        raise BadInputExc("Invalid object (%s) for SWHID. %s" % (object_id, e)) from e
    else:
        return obj_swhid
class ResolvedSWHID(TypedDict):
    """Result of resolving a SWHID: the parsed identifier and its browse URL."""

    #: parsed SWHID with context
    swhid_parsed: QualifiedSWHID
    #: URL to browse object according to SWHID context (may be ``None``)
    browse_url: Optional[str]
def resolve_swhid(
    swhid: str, query_params: Optional[Mapping[str, str]] = None
) -> ResolvedSWHID:
    """
    Try to resolve a SoftWare Hash IDentifier into an url for
    browsing the targeted object.

    The SWHID is parsed with :func:`get_qualified_swhid`, then its qualifiers
    are turned into query parameters of the browse url:

        * ``origin`` is unquoted and looked up in the archive, the ``url`` of
          the lookup result is sent as ``origin_url``
        * ``path`` (when not empty and not ``/``) is sent as ``path``, after
          being adjusted according to the anchor and the object type
        * ``visit`` is sent as ``snapshot``
        * ``anchor`` is sent as ``revision`` or ``release`` when a visit is
          set, otherwise it may replace the object that gets browsed
        * ``lines`` becomes a ``#L<start>`` or ``#L<start>-L<end>`` url
          fragment, only when the SWHID targets a content

    Args:
        swhid: a SoftWare Hash IDentifier
        query_params: optional dict filled with
            query parameters to append to the browse url

    Returns:
        a dict with the following keys:

            * **swhid_parsed**: the parsed identifier
            * **browse_url**: the url for browsing the targeted object,
              ``None`` if no url arguments could be computed for the
              object type

    Raises:
        BadInputExc: if the SWHID cannot be parsed or if its visit qualifier
            is not a snapshot SWHID
    """
    swhid_parsed = get_qualified_swhid(swhid)
    object_type = swhid_parsed.object_type
    object_id = swhid_parsed.object_id
    browse_url = None
    url_args = {}
    fragment = ""
    # evaluated now because object_type may be replaced by REVISION below
    process_lines = object_type == ObjectType.CONTENT
    # copy the caller mapping so that it is never modified
    query_dict: Dict[str, str] = dict(query_params or {})

    if swhid_parsed.origin:
        origin_url = unquote(swhid_parsed.origin)
        origin_url = archive.lookup_origin(origin_url)["url"]
        query_dict["origin_url"] = origin_url

    if swhid_parsed.path and swhid_parsed.path != b"/":
        query_dict["path"] = swhid_parsed.path.decode("utf8", errors="replace")
        if swhid_parsed.anchor:
            # root directory the path is relative to, left empty when it cannot
            # be derived from the anchor
            directory = b""
            if swhid_parsed.anchor.object_type == ObjectType.DIRECTORY:
                directory = swhid_parsed.anchor.object_id
            elif swhid_parsed.anchor.object_type == ObjectType.REVISION:
                revision = archive.lookup_revision(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                directory = revision["directory"]
            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release["target_type"] == ObjectType.REVISION.name.lower():
                    revision = archive.lookup_revision(release["target"])
                    directory = revision["directory"]
                elif release["target_type"] == ObjectType.DIRECTORY.name.lower():
                    directory = release["target"]
            if object_type == ObjectType.CONTENT:
                if (
                    not swhid_parsed.origin
                    and swhid_parsed.anchor.object_type != ObjectType.REVISION
                ):
                    # when no origin or revision context, content objects need to have
                    # their path prefixed by root directory id for breadcrumbs display
                    query_dict["path"] = hash_to_hex(directory) + query_dict["path"]
                elif query_dict["path"] is not None:
                    # remove leading slash from SWHID content path
                    query_dict["path"] = query_dict["path"].lstrip("/")
            elif object_type == ObjectType.DIRECTORY and query_dict["path"] is not None:
                # the directory is browsed from the root derived from the anchor
                object_id = directory
                # remove leading and trailing slashes from SWHID directory path
                query_dict["path"] = query_dict["path"].strip("/")

    # snapshot context
    if swhid_parsed.visit:
        if swhid_parsed.visit.object_type != ObjectType.SNAPSHOT:
            raise BadInputExc("Visit must be a snapshot SWHID.")
        query_dict["snapshot"] = hash_to_hex(swhid_parsed.visit.object_id)

        if swhid_parsed.anchor:
            if (
                swhid_parsed.anchor.object_type == ObjectType.REVISION
                and object_type != ObjectType.REVISION
            ):
                query_dict["revision"] = hash_to_hex(swhid_parsed.anchor.object_id)

            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release:
                    query_dict["release"] = release["name"]

    # browsing content or directory without snapshot context
    elif (
        object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY)
        and swhid_parsed.anchor
    ):
        if swhid_parsed.anchor.object_type == ObjectType.REVISION:
            # anchor revision, objects are browsed from its view
            object_type = ObjectType.REVISION
            object_id = swhid_parsed.anchor.object_id
        elif (
            object_type == ObjectType.DIRECTORY
            and swhid_parsed.anchor.object_type == ObjectType.DIRECTORY
        ):
            # a directory is browsed from its root
            object_id = swhid_parsed.anchor.object_id

    # url arguments expected by the browse view of the (possibly replaced) object type
    if object_type == ObjectType.CONTENT:
        url_args["query_string"] = f"sha1_git:{hash_to_hex(object_id)}"
    elif object_type in (ObjectType.DIRECTORY, ObjectType.RELEASE, ObjectType.REVISION):
        url_args["sha1_git"] = hash_to_hex(object_id)
    elif object_type == ObjectType.SNAPSHOT:
        url_args["snapshot_id"] = hash_to_hex(object_id)

    if swhid_parsed.lines and process_lines:
        lines = swhid_parsed.lines
        fragment += "#L" + str(lines[0])
        # the end of the range is only appended when it is set
        if lines[1]:
            fragment += "-L" + str(lines[1])

    # browse_url stays None when no url argument was computed above
    if url_args:
        browse_url = (
            reverse(
                f"browse-{object_type.name.lower()}",
                url_args=url_args,
                query_params=query_dict,
            )
            + fragment
        )

    return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url)
def get_qualified_swhid(swhid: str) -> QualifiedSWHID:
    """Parse a qualified SWHID in a lenient way.

    A superset of what a strict parsing allows is accepted: the core part of
    the identifier may be badly capitalized and white spaces in its
    qualifiers may be left unquoted.

    Args:
        swhid: a SoftWare Hash IDentifier.

    Raises:
        BadInputExc: if the provided SWHID cannot be parsed.

    Return:
        A parsed SWHID.
    """
    core_part, separator, qualifier_part = swhid.partition(";")
    # lower-case the core part only, upper-case characters in it make the
    # parsing fail; quoted white spaces of the qualifiers might have been
    # unquoted when the SWHID was passed as URL argument so quote them back
    normalized = core_part.lower() + separator + qualifier_part.replace(" ", "%20")
    try:
        return QualifiedSWHID.from_string(normalized)
    except ValidationError as ve:
        raise BadInputExc("Error when parsing identifier: %s" % " ".join(ve.messages))
def parse_core_swhid(swhid: str) -> CoreSWHID:
    """Strictly parse a core SWHID (a SWHID without qualifiers).

    Args:
        swhid: a SoftWare Hash IDentifier.

    Raises:
        BadInputExc: if the provided SWHID cannot be parsed, whether the
            parser reported a ``ValidationError`` or a ``ValueError``.

    Return:
        A parsed SWHID.
    """
    try:
        parsed_swhid = CoreSWHID.from_string(swhid)
    except ValidationError as ve:
        raise BadInputExc(f"Error when parsing identifier: {' '.join(ve.messages)}")
    except ValueError as e:
        raise BadInputExc(f"Error when parsing identifier: {e}")
    else:
        return parsed_swhid
def group_swhids(
    swhids: Iterable[CoreSWHID],
) -> Dict[ObjectType, List[bytes]]:
    """
    Dispatch SoftWare Hash IDentifiers into per object type buckets.

    Args:
        swhids: an iterable of SoftWare Hash IDentifier objects

    Returns:
        A dictionary with:
            keys: object types (content, directory, revision, release and
                snapshot are always present, possibly with an empty list)
            values: object hashes
    """
    grouped: Dict[ObjectType, List[bytes]] = {
        kind: []
        for kind in (
            ObjectType.CONTENT,
            ObjectType.DIRECTORY,
            ObjectType.REVISION,
            ObjectType.RELEASE,
            ObjectType.SNAPSHOT,
        )
    }
    for swhid in swhids:
        grouped[swhid.object_type].append(hash_to_bytes(swhid.object_id))
    return grouped
def get_swhids_info(
    swh_objects: Iterable[SWHObjectInfo],
    snapshot_context: Optional[SnapshotContext] = None,
    extra_context: Optional[Mapping[str, Any]] = None,
) -> List[SWHIDInfo]:
    """
    Returns a list of dict containing info related to SWHIDs of objects.

    For each object, a core SWHID and its browse url are generated. When
    contextual info is available, a SWHID with qualifiers is generated too:

        * ``origin``: quoted url found in ``snapshot_context["origin_info"]``
        * ``visit``: SWHID of the snapshot from ``snapshot_context``, for
          objects that are not snapshots
        * ``anchor`` (contents and directories only), first match wins:
          release then revision from ``snapshot_context``, then
          ``extra_context["revision"]``, then
          ``extra_context["root_directory"]`` (not used for a directory
          that is its own root)
        * ``path``: quoted ``extra_context["path"]`` (``/`` when empty), with
          ``extra_context["filename"]`` appended for contents; not set for
          a directory whose path is ``/``

    An object with an empty ``object_id`` gets a placeholder entry with empty
    SWHID strings and no context.

    Args:
        swh_objects: an iterable of dict describing archived objects
        snapshot_context: optional dict parameter describing the snapshot in
            which the objects have been found
        extra_context: optional dict filled with extra contextual info about
            the objects

    Returns:
        a list of dict containing SWHIDs info
    """
    swhids_info = []
    for swh_object in swh_objects:
        if not swh_object["object_id"]:
            # no identifier can be generated, emit a placeholder entry
            swhids_info.append(
                SWHIDInfo(
                    object_type=swh_object["object_type"],
                    object_id="",
                    swhid="",
                    swhid_url="",
                    context={},
                    swhid_with_context=None,
                    swhid_with_context_url=None,
                )
            )
            continue
        object_type = swh_object["object_type"]
        object_id = swh_object["object_id"]
        swhid_context: SWHIDContext = {}
        if snapshot_context:
            if snapshot_context["origin_info"] is not None:
                swhid_context["origin"] = quote(
                    snapshot_context["origin_info"]["url"], safe="/?:@&"
                )
            # NOTE(review): assumed to be independent of origin_info being
            # set; confirm the nesting level
            if object_type != ObjectType.SNAPSHOT:
                swhid_context["visit"] = gen_swhid(
                    ObjectType.SNAPSHOT, snapshot_context["snapshot_id"]
                )
            if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY):
                # a release anchor takes precedence over a revision one
                if snapshot_context["release_id"] is not None:
                    swhid_context["anchor"] = gen_swhid(
                        ObjectType.RELEASE, snapshot_context["release_id"]
                    )
                elif snapshot_context["revision_id"] is not None:
                    swhid_context["anchor"] = gen_swhid(
                        ObjectType.REVISION, snapshot_context["revision_id"]
                    )

        if object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY):
            # extra context only provides an anchor when the snapshot context
            # did not set one
            if (
                extra_context
                and "revision" in extra_context
                and extra_context["revision"]
                and "anchor" not in swhid_context
            ):
                swhid_context["anchor"] = gen_swhid(
                    ObjectType.REVISION, extra_context["revision"]
                )
            elif (
                extra_context
                and "root_directory" in extra_context
                and extra_context["root_directory"]
                and "anchor" not in swhid_context
                and (
                    # a root directory is not anchored to itself
                    object_type != ObjectType.DIRECTORY
                    or extra_context["root_directory"] != object_id
                )
            ):
                swhid_context["anchor"] = gen_swhid(
                    ObjectType.DIRECTORY, extra_context["root_directory"]
                )
            # NOTE(review): path handling assumed to apply to contents and
            # directories only; confirm the nesting level
            path = None
            if extra_context and "path" in extra_context:
                path = extra_context["path"] or "/"
                if "filename" in extra_context and object_type == ObjectType.CONTENT:
                    path += extra_context["filename"]
                # no path qualifier for a directory located at the root
                if object_type == ObjectType.DIRECTORY and path == "/":
                    path = None
            if path:
                swhid_context["path"] = quote(path, safe="/?:@&")

        swhid = gen_swhid(object_type, object_id)
        swhid_url = reverse("browse-swhid", url_args={"swhid": swhid})
        swhid_with_context = None
        swhid_with_context_url = None
        if swhid_context:
            swhid_with_context = gen_swhid(
                object_type, object_id, metadata=swhid_context
            )
            # ":;=/%" are kept as is when quoting the SWHID for the url
            swhid_with_context_url = reverse(
                "browse-swhid",
                url_args={"swhid": quote(swhid_with_context, safe=":;=/%")},
            )

        swhids_info.append(
            SWHIDInfo(
                object_type=object_type,
                object_id=object_id,
                swhid=swhid,
                swhid_url=swhid_url,
                context=swhid_context,
                swhid_with_context=swhid_with_context,
                swhid_with_context_url=swhid_with_context_url,
            )
        )
    return swhids_info