Source code for swh.web.api.views.graph
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from distutils.util import strtobool
import json
from typing import Dict, Iterator, Union
from urllib.parse import unquote, urlparse, urlunparse
import requests
from django.http import QueryDict
from django.http.response import StreamingHttpResponse
from rest_framework.decorators import renderer_classes
from rest_framework.renderers import JSONRenderer
from rest_framework.request import Request
from rest_framework.response import Response
from swh.model.hashutil import hash_to_hex
from swh.model.model import Sha1Git
from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from swh.web.api.apidoc import api_doc
from swh.web.api.apiurls import api_route
from swh.web.api.renderers import PlainTextRenderer
from swh.web.config import SWH_WEB_INTERNAL_SERVER_NAMES, get_config
from swh.web.utils import archive
API_GRAPH_PERM = "swh.web.api.graph"
def _resolve_origin_swhid(swhid: str, origin_urls: Dict[Sha1Git, str]) -> str:
"""
Resolve origin url from its swhid sha1 representation.
"""
parsed_swhid = ExtendedSWHID.from_string(swhid)
if parsed_swhid.object_type == ExtendedObjectType.ORIGIN:
if parsed_swhid.object_id in origin_urls:
return origin_urls[parsed_swhid.object_id]
else:
origin_info = list(
archive.lookup_origins_by_sha1s([hash_to_hex(parsed_swhid.object_id)])
)[0]
assert origin_info is not None
origin_urls[parsed_swhid.object_id] = origin_info["url"]
return origin_info["url"]
else:
return swhid
def _resolve_origin_swhids_in_graph_response(
response: requests.Response,
) -> Iterator[bytes]:
"""
Resolve origin urls from their swhid sha1 representations in graph service
responses.
"""
content_type = response.headers["Content-Type"]
origin_urls: Dict[Sha1Git, str] = {}
if content_type == "application/x-ndjson":
for line in response.iter_lines():
swhids = json.loads(line.decode("utf-8"))
processed_line = []
for swhid in swhids:
processed_line.append(_resolve_origin_swhid(swhid, origin_urls))
yield (json.dumps(processed_line) + "\n").encode()
elif content_type == "text/plain":
for line in response.iter_lines():
if not line:
continue
processed_line = []
swhids = line.decode("utf-8").split(" ")
for swhid in swhids:
processed_line.append(_resolve_origin_swhid(swhid, origin_urls))
yield (" ".join(processed_line) + "\n").encode()
else:
for line in response.iter_lines():
yield line + b"\n"
[docs]
@api_route(r"/graph/", "api-1-graph-doc")
@api_doc("/graph/", category="Miscellaneous")
def api_graph(request: Request) -> None:
"""
.. http:get:: /api/1/graph/(graph_query)/
Provide fast access to the graph representation of the Software Heritage
archive.
That endpoint acts as a proxy for the `Software Heritage Graph service
<https://docs.softwareheritage.org/devel/swh-graph/index.html>`_.
It provides fast access to the `graph representation
<https://docs.softwareheritage.org/devel/swh-model/data-model.html#data-structure>`_
of the Software Heritage archive.
For more details please refer to the `Graph RPC API documentation
<https://docs.softwareheritage.org/devel/swh-graph/api.html>`_.
.. warning::
That endpoint is not publicly available and requires authentication and
special user permission in order to be able to request it.
:param string graph_query: query to forward to the Software Heritage Graph
archive (see its `documentation
<https://docs.softwareheritage.org/devel/swh-graph/api.html>`_)
:query boolean resolve_origins: extra parameter defined by that proxy enabling
to resolve origin urls from their sha1 representations
:statuscode 200: no error
:statuscode 400: an invalid graph query has been provided
:statuscode 404: provided graph node cannot be found
**Examples:**
.. parsed-literal::
:swh_web_api:`graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323/`
:swh_web_api:`graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35/`
:swh_web_api:`graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
:swh_web_api:`graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
"""
return None
[docs]
@api_route(r"/graph/(?P<graph_query>.+)/", "api-1-graph")
@renderer_classes([JSONRenderer, PlainTextRenderer])
def api_graph_proxy(
request: Request, graph_query: str
) -> Union[Response, StreamingHttpResponse]:
if request.get_host() not in SWH_WEB_INTERNAL_SERVER_NAMES:
if not bool(request.user and request.user.is_authenticated):
return Response("Authentication credentials were not provided.", status=401)
if not request.user.has_perm(API_GRAPH_PERM):
return Response(
"You do not have permission to perform this action.", status=403
)
graph_config = get_config()["graph"]
graph_query = unquote(graph_query)
graph_query_url = graph_config["server_url"]
graph_query_url += graph_query
parsed_url = urlparse(graph_query_url)
query_dict = QueryDict(parsed_url.query, mutable=True)
query_dict.update(request.GET)
# clamp max_edges query parameter according to authentication
if request.user.is_staff:
max_edges = graph_config["max_edges"]["staff"]
elif request.user.is_authenticated:
max_edges = graph_config["max_edges"]["user"]
else:
max_edges = graph_config["max_edges"]["anonymous"]
query_dict["max_edges"] = min(
max_edges, int(query_dict.get("max_edges", max_edges + 1))
)
if query_dict:
graph_query_url = urlunparse(
parsed_url._replace(query=query_dict.urlencode(safe="/;:"))
)
response = requests.get(graph_query_url, stream=True)
if response.status_code != 200:
return Response(
response.content,
status=response.status_code,
content_type=response.headers["Content-Type"],
)
# graph stats and counter endpoint responses are not streamed
if response.headers.get("Transfer-Encoding") != "chunked":
return Response(
response.json(),
status=response.status_code,
content_type=response.headers["Content-Type"],
)
# other endpoint responses are streamed
else:
resolve_origins = strtobool(request.GET.get("resolve_origins", "false"))
if response.status_code == 200 and resolve_origins:
response_stream = _resolve_origin_swhids_in_graph_response(response)
else:
response_stream = map(lambda line: line + b"\n", response.iter_lines())
return StreamingHttpResponse(
response_stream,
status=response.status_code,
content_type=response.headers["Content-Type"],
)