# Copyright (C) 2015-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import io
import os
from typing import Optional
from django.http import FileResponse
from rest_framework import serializers
from rest_framework.request import Request
from swh.web.api import utils
from swh.web.api.apidoc import api_doc, format_docstring
from swh.web.api.apiurls import api_route
from swh.web.api.views.utils import api_lookup
from swh.web.utils import archive
from swh.web.utils.exc import NotFoundExc
class ContentRawQuerySerializer(serializers.Serializer):
    """Serializer for the query parameters of the raw content endpoint.

    It declares a single optional ``filename`` character field, bounded to
    1-255 characters, whose default value is the empty string.
    """

    filename = serializers.CharField(
        required=False,
        min_length=1,
        max_length=255,
        default="",
    )
# Endpoint handler giving the detected MIME type and encoding of a content
# object. ``q`` is the "[(hash_type):](hash)" string described in the text below.
# NOTE(review): the text following the signature reads as the endpoint's reST
# docstring but has no enclosing triple quotes here — confirm the delimiters
# (and the body indentation) are present in the real file.
@api_doc("/content/filetype/", category="Metadata")
def api_content_filetype(request: Request, q: str):
.. http:get:: /api/1/content/[(hash_type):](hash)/filetype/
Get information about the detected MIME type of a content object.
:param string hash_type: optional parameter specifying which hashing algorithm
has been used to compute the content checksum. It can be either ``sha1``,
``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not
provided, it is assumed that the hashing algorithm used is ``sha1``.
:param string hash: hexadecimal representation of the checksum value computed
with the specified hashing algorithm.
:>json object content_url: link to
:http:get:`/api/1/content/[(hash_type):](hash)/` for getting information
about the content
:>json string encoding: the detected content encoding
:>json string id: the **sha1** identifier of the content
:>json string mimetype: the detected MIME type of the content
:>json object tool: information about the tool used to detect the content
:statuscode 200: no error
:statuscode 400: an invalid **hash_type** or **hash** has been provided
:statuscode 404: requested content cannot be found in the archive
.. parsed-literal::
# Delegate to the shared ``api_lookup`` helper; the not-found message embeds
# the caller-supplied ``q``.
# NOTE(review): only ``notfound_msg`` is shown for this call and it is never
# closed — confirm the lookup function, its other arguments and the closing
# parenthesis.
return api_lookup(
notfound_msg="No filetype information found for content {}.".format(q),
# Endpoint handler for the programming language detected in a content object.
# Per its own description below, the endpoint currently returns no data.
# ``q`` is the "[(hash_type):](hash)" string described in the text below.
# NOTE(review): the text following the signature reads as the endpoint's reST
# docstring but has no enclosing triple quotes here, and the ``tool`` field
# description stops mid-sentence — confirm against the real file.
@api_doc("/content/language/", category="Metadata")
def api_content_language(request: Request, q: str):
.. http:get:: /api/1/content/[(hash_type):](hash)/language/
Get information about the programming language used in a content object.
Note: this endpoint currently returns no data.
:param string hash_type: optional parameter specifying which hashing algorithm
has been used to compute the content checksum. It can be either ``sha1``,
``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not
provided, it is assumed that the hashing algorithm used is ``sha1``.
:param string hash: hexadecimal representation of the checksum value computed
with the specified hashing algorithm.
:>json object content_url: link to
:http:get:`/api/1/content/[(hash_type):](hash)/` for getting information
about the content
:>json string id: the **sha1** identifier of the content
:>json string lang: the detected programming language if any
:>json object tool: information about the tool used to detect the programming
:statuscode 200: no error
:statuscode 400: an invalid **hash_type** or **hash** has been provided
:statuscode 404: requested content cannot be found in the archive
.. parsed-literal::
# Delegate to the shared ``api_lookup`` helper; the not-found message embeds
# the caller-supplied ``q``.
# NOTE(review): only ``notfound_msg`` is shown for this call and it is never
# closed — confirm the lookup function, its other arguments and the closing
# parenthesis.
return api_lookup(
notfound_msg="No language information found for content {}.".format(q),
# Endpoint handler for the license information detected in a content object.
# ``q`` is the "[(hash_type):](hash)" string described in the text below.
# NOTE(review): the text following the signature reads as the endpoint's reST
# docstring but has no enclosing triple quotes here — confirm the delimiters
# (and the body indentation) are present in the real file.
@api_doc("/content/license/", category="Metadata")
def api_content_license(request: Request, q: str):
.. http:get:: /api/1/content/[(hash_type):](hash)/license/
Get information about the license of a content object.
:param string hash_type: optional parameter specifying which hashing algorithm
has been used to compute the content checksum. It can be either ``sha1``,
``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not
provided, it is assumed that the hashing algorithm used is ``sha1``.
:param string hash: hexadecimal representation of the checksum value computed
with the specified hashing algorithm.
:>json object content_url: link to
:http:get:`/api/1/content/[(hash_type):](hash)/` for getting information
about the content
:>json string id: the **sha1** identifier of the content
:>json array licenses: array of strings containing the detected license names
:>json object tool: information about the tool used to detect the license
:statuscode 200: no error
:statuscode 400: an invalid **hash_type** or **hash** has been provided
:statuscode 404: requested content cannot be found in the archive
.. parsed-literal::
# Delegate to the shared ``api_lookup`` helper; the not-found message embeds
# the caller-supplied ``q``.
# NOTE(review): only ``notfound_msg`` is shown for this call and it is never
# closed — confirm the lookup function, its other arguments and the closing
# parenthesis.
return api_lookup(
notfound_msg="No license information found for content {}.".format(q),
# Endpoint handler returning the raw bytes of a content object through a
# Django ``FileResponse``. ``q`` is the "[(hash_type):](hash)" string described
# below; ``validated_query_params`` carries the ``filename`` query parameter.
# Raises ``NotFoundExc`` when the content lookup yields a falsy result.
# NOTE(review): the text following the signature reads as the endpoint's reST
# docstring but has no enclosing triple quotes here, and the ``filename`` query
# description stops mid-sentence — confirm against the real file.
@api_doc("/content/raw/", category="Archive")
def api_content_raw(request: Request, q: str, validated_query_params: dict[str, str]):
.. http:get:: /api/1/content/[(hash_type):](hash)/raw/
Get the raw content of a content object (aka a "blob"), as a byte sequence.
:param string hash_type: optional parameter specifying which hashing algorithm
has been used to compute the content checksum. It can be either ``sha1``,
``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not
provided, it is assumed that the hashing algorithm used is ``sha1``.
:param string hash: hexadecimal representation of the checksum value computed
with the specified hashing algorithm.
:query string filename: if provided, the downloaded content will get that
:resheader Content-Type: application/octet-stream
:statuscode 200: no error
:statuscode 400: an invalid **hash_type** or **hash** has been provided
:statuscode 404: requested content cannot be found in the archive
.. parsed-literal::
# Look the content up together with its data; a falsy result is reported to the
# client as "not found".
content = archive.lookup_content(q, with_data=True)
if not content:
raise NotFoundExc("Content %s is not found." % q)
# Use the requested filename when it is non-empty; otherwise derive one from
# ``q``, replacing ":" with "_".
# NOTE(review): the ``replace(`` call is not closed here — confirm.
filename = validated_query_params["filename"] or "content_%s_raw" % q.replace(
":", "_"
# NOTE(review): only the first ``FileResponse`` argument is shown and the call is
# never closed; ``filename`` is not visibly used — confirm the remaining
# arguments (presumably the attachment name and content type).
return FileResponse(
io.BytesIO(content["data"]), # not copied, as this is never modified
@api_route(r"/content/known/search/", "api-1-content-known", methods=["POST"])
@api_route(r"/content/known/(?P<q>(?!search).+)/", "api-1-content-known")
@api_doc("/content/known/", category="Archive", tags=["hidden"])
def api_check_content_known(request: Request, q: Optional[str] = None):
    """
    .. http:get:: /api/1/content/known/(sha1)[,(sha1), ...,(sha1)]/

        Check whether some content(s) (aka "blob(s)") is present in the archive
        based on its **sha1** checksum.

        :param string sha1: hexadecimal representation of the **sha1** checksum value
            for the content to check existence. Multiple values can be provided
            separated by ','.

        :>json array search_res: array holding the search result for each provided
            **sha1**
        :>json object search_stats: some statistics regarding the number of **sha1**
            provided and the percentage of those found in the archive

        :statuscode 200: no error
        :statuscode 400: an invalid **sha1** has been provided

        .. parsed-literal::

            /api/1/content/known/(sha1)[,(sha1), ...,(sha1)]/
    """
    search_stats = {"nbfiles": 0, "pct": 0}
    search_res = None
    queries = []

    if q:
        # GET: many hashes passed as comma-separated values in the URL.
        queries = [{"filename": None, "sha1": sha1} for sha1 in q.split(",")]
    elif request.method == "POST":
        # POST: many hashes submitted as form fields. A field named "q" carries
        # a bare hash; any other field name is treated as the filename of the
        # hash it holds. Fields without a value are ignored.
        for key, value in request.data.items():
            if value is None:
                continue
            if key == "q" and len(value) > 0:
                queries.append({"filename": None, "sha1": value})
            elif value != "":
                queries.append({"filename": key, "sha1": value})

    if queries:
        lookup = archive.lookup_multiple_hashes(queries)
        nb_queries = len(queries)
        nb_found = 0
        search_res = []
        for entry in lookup:
            found = entry["found"]
            item = {"sha1": entry["sha1"], "found": found}
            # Only echo the filename back when one was supplied.
            if "filename" in entry and entry["filename"]:
                item["filename"] = entry["filename"]
            # Each per-hash result must be collected, otherwise ``search_res``
            # would always come back empty.
            search_res.append(item)
            if found:
                nb_found += 1
        # Statistics are derived once, after the single pass over the results.
        search_stats["nbfiles"] = nb_queries
        search_stats["pct"] = int((nb_found / nb_queries) * 100)

    return {"search_res": search_res, "search_stats": search_stats}
# Endpoint handler returning the metadata of a content object: checksums,
# length and links to the related raw/filetype/language/license endpoints.
# ``q`` is the "[(hash_type):](hash)" string described in the text below.
# NOTE(review): the route line just below appears without its enclosing
# ``@api_route(`` opener, and the text following the signature has no enclosing
# triple quotes; several "link to" entries also lack their target — confirm
# against the real file.
r"/content/(?P<q>[0-9a-z_:]*[0-9a-f]+)/", "api-1-content", checksum_args=["q"]
@api_doc("/content/", category="Archive")
def api_content_metadata(request: Request, q: str):
.. http:get:: /api/1/content/[(hash_type):](hash)/
Get information about a content (aka a "blob") object.
In the archive, a content object is identified based on checksum
values computed using various hashing algorithms.
:param string hash_type: optional parameter specifying which hashing algorithm
has been used to compute the content checksum. It can be either ``sha1``,
``sha1_git``, ``sha256`` or ``blake2s256``. If that parameter is not
provided, it is assumed that the hashing algorithm used is ``sha1``.
:param string hash: hexadecimal representation of the checksum value computed
with the specified hashing algorithm.
:>json object checksums: object holding the computed checksum values for the
requested content
:>json string data_url: link to
for downloading the content raw bytes
:>json string filetype_url: link to
for getting information about the content MIME type
:>json string language_url: link to
for getting information about the programming language used in the content
:>json number length: length of the content in bytes
:>json string license_url: link to
for getting information about the license of the content
:statuscode 200: no error
:statuscode 400: an invalid **hash_type** or **hash** has been provided
:statuscode 404: requested content cannot be found in the archive
.. parsed-literal::
# Delegate to the shared ``api_lookup`` helper. The result is post-processed by
# ``utils.enrich_content`` with ``query_string`` pre-bound to ``q``.
# NOTE(review): the lookup function and any arguments before ``notfound_msg``
# are not shown, and the call is never closed — confirm.
return api_lookup(
notfound_msg="Content with {} not found.".format(q),
enrich_fn=functools.partial(utils.enrich_content, query_string=q),