Source code for swh.web.api.views.raw

# Copyright (C) 2022-2024  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information

from django.http import HttpResponse
from rest_framework.request import Request

from swh.model import model
from swh.model.git_objects import (
    content_git_object,
    directory_git_object,
    release_git_object,
    revision_git_object,
    snapshot_git_object,
)
from swh.model.hashutil import hash_to_hex
from swh.model.swhids import ObjectType
from swh.storage.algos.directory import directory_get
from swh.storage.algos.snapshot import snapshot_get_all_branches
from swh.web import config
from swh.web.api.apidoc import api_doc, format_docstring
from swh.web.api.apiurls import api_route
from swh.web.utils.exc import NotFoundExc
from swh.web.utils.identifiers import parse_core_swhid


@api_route(
    "/raw/<swhid:swhid>/",
    "api-1-raw-object",
    throttle_scope="swh_raw_object",
)
@api_doc("/raw/", category="Archive")
@format_docstring()
def api_raw_object(request: Request, swhid: str):
    """
    .. http:get:: /api/1/raw/(swhid)/

        Get the object corresponding to the SWHID in raw form.

        This endpoint exposes the internal representation (see the
        ``*_git_object`` functions in :mod:`swh.model.git_objects`), and
        so can be used to fetch a binary blob which hashes to the same
        identifier.

        :param string swhid: the object's SWHID

        :resheader Content-Type: application/octet-stream

        :statuscode 200: no error
        :statuscode 404: the requested object cannot be found in the archive

        **Example:**

        .. parsed-literal::

            :swh_web_api:`raw/swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a`
    """
    # NOTE(review): the docstring above is handed to the ``api_doc`` and
    # ``format_docstring`` decorators; it presumably feeds the generated API
    # documentation, so its wording and layout are kept as-is -- confirm
    # before editing it (in particular before adding curly braces).
    parsed_swhid = parse_core_swhid(swhid)
    object_id = parsed_swhid.object_id
    object_type = parsed_swhid.object_type

    storage = config.storage()

    def not_found():
        # Built lazily so that every branch reports a missing object with
        # the exact same message.
        return NotFoundExc(f"Object with id {swhid} not found.")

    if object_type == ObjectType.CONTENT:
        results = storage.content_find({"sha1_git": object_id})
        if not results:
            raise not_found()
        cnt = results[0]
        # `cnt.with_data()` unfortunately doesn't seem to work.
        if cnt.data is None:
            data = storage.content_get_data({"sha1": cnt.sha1})
            if data is None:
                # The content was found by `content_find` but its bytes could
                # not be retrieved. This used to be an `assert`, which is
                # stripped under `python -O`; raise explicitly so the failure
                # is always detected and reported as a missing object.
                raise NotFoundExc(f"Content {hash_to_hex(cnt.sha1)} ceased to exist")
            d = cnt.to_dict()
            d["data"] = data
            cnt = model.Content.from_dict(d)
        result = content_git_object(cnt)

    elif object_type == ObjectType.DIRECTORY:
        dir_ = directory_get(storage, object_id)
        if dir_ is None:
            raise not_found()
        result = directory_git_object(dir_)

    elif object_type == ObjectType.REVISION:
        rev = storage.revision_get([object_id])[0]
        if rev is None:
            raise not_found()
        result = revision_git_object(rev)

    elif object_type == ObjectType.RELEASE:
        rel = storage.release_get([object_id])[0]
        if rel is None:
            raise not_found()
        result = release_git_object(rel)

    elif object_type == ObjectType.SNAPSHOT:
        snp = snapshot_get_all_branches(storage, object_id)
        if snp is None:
            raise not_found()
        result = snapshot_git_object(snp)

    else:
        raise ValueError(f"Unexpected object type variant: {object_type}")

    # Serve the serialized object as a binary download whose file name is
    # derived from the SWHID (colons replaced by underscores).
    response = HttpResponse(result, content_type="application/octet-stream")
    filename = swhid.replace(":", "_") + "_raw"
    response["Content-disposition"] = f"attachment; filename={filename}"

    return response