# Source code for swh.indexer.codemeta

# Copyright (C) 2018-2025  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import collections
import csv
from functools import lru_cache, partial
import itertools
import json
import os.path
import re
from typing import Any, Dict, List, Set, TextIO, Tuple

from pyld import jsonld
from pyld.documentloader.requests import requests_document_loader
import rdflib

import swh.indexer
from swh.indexer.namespaces import ACTIVITYSTREAMS, CODEMETA, FORGEFED, SCHEMA, XSD

# Directory of data files (crosswalk table, JSON-LD contexts) bundled with
# the swh.indexer package.
_DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), "data")

# CSV table mapping ecosystem-specific metadata terms (npm, PyPI, ...) to
# canonical CodeMeta terms; parsed by read_crosstable() below.
CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, "codemeta", "crosswalk.csv")

# Local copies of the CodeMeta v2/v3 JSON-LD context documents, used so the
# contexts never need to be fetched over the network.
CODEMETA_V2_CONTEXT_PATH = os.path.join(_DATA_DIR, "codemeta", "codemeta-2.0.jsonld")
CODEMETA_V3_CONTEXT_PATH = os.path.join(_DATA_DIR, "codemeta", "codemeta-3.0.jsonld")


# Load both CodeMeta JSON-LD contexts from the bundled files at import time.
with open(CODEMETA_V2_CONTEXT_PATH) as fdv2, open(CODEMETA_V3_CONTEXT_PATH) as fdv3:
    CODEMETA_V2_CONTEXT = json.load(fdv2)
    CODEMETA_V3_CONTEXT = json.load(fdv3)

# Local copy of the schema.org JSON-LD context, served by _document_loader()
# below in place of a network fetch.
with open(os.path.join(_DATA_DIR, "schema.org", "schemaorgcontext.jsonld")) as fd:
    _SCHEMA_DOT_ORG_CONTEXT = json.load(fd)

# Pre-processed ("active") CodeMeta v2 context, built once so that
# make_absolute_uri() can resolve short term names to absolute IRIs without
# re-processing the context on every call.
_EMPTY_PROCESSED_CONTEXT: Any = {"mappings": {}}
_PROCESSED_CODEMETA_CONTEXT = jsonld.JsonLdProcessor().process_context(
    _EMPTY_PROCESSED_CONTEXT, CODEMETA_V2_CONTEXT, None
)

# Canonical URL of the CodeMeta v2 JSON-LD context.
CODEMETA_V2_CONTEXT_URL = "https://doi.org/10.5063/schema/codemeta-2.0"
# Other URLs seen in the wild for the same v2 context; _document_loader()
# treats them as equivalent to the canonical one.
CODEMETA_V2_ALTERNATE_CONTEXT_URLS = {
    "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
    "https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld",
    "https://doi.org/doi:10.5063/schema/codemeta-2.0",
    "http://purl.org/codemeta/2.0",
}

# Canonical URL of the CodeMeta v3 JSON-LD context.
CODEMETA_V3_CONTEXT_URL = "https://w3id.org/codemeta/3.0"
# Alternate URL for the v3 context, handled like the canonical one.
CODEMETA_V3_ALTERNATE_CONTEXT_URLS = {
    "https://raw.githubusercontent.com/codemeta/codemeta/3.0/codemeta.jsonld"
}

# Properties skipped by read_crosstable() when building the term set and
# crosswalk dictionaries.
PROPERTY_BLACKLIST = {
    # CodeMeta properties that we cannot properly represent.
    SCHEMA.softwareRequirements,
    CODEMETA.softwareSuggestions,
    # Duplicate of 'author'
    SCHEMA.creator,
}

# Separator between multiple ecosystem terms in a single crosswalk-table cell
# (comma or slash, with optional surrounding whitespace).
_codemeta_field_separator = re.compile(r"\s*[,/]\s*")


@lru_cache(maxsize=128)
def _requests_document_loader(url):
    """Fetch a remote JSON-LD document using pyld's requests-based loader.

    Results are memoized per URL so repeated resolutions of the same context
    do not trigger repeated HTTP requests.
    """
    fetch = requests_document_loader()
    return fetch(url)


def make_absolute_uri(local_name):
    """Parses codemeta.jsonld, and returns the @id of terms it defines.

    >>> make_absolute_uri("name")
    'http://schema.org/name'
    >>> make_absolute_uri("downloadUrl")
    'http://schema.org/downloadUrl'
    >>> make_absolute_uri("referencePublication")
    'https://codemeta.github.io/terms/referencePublication'
    """
    absolute_uri = jsonld.JsonLdProcessor.get_context_value(
        _PROCESSED_CODEMETA_CONTEXT, local_name, "@id"
    )
    # Every known term must resolve either to a JSON-LD keyword ("@...") or
    # into the CodeMeta / schema.org vocabularies.
    assert absolute_uri.startswith(("@", CODEMETA, SCHEMA)), (local_name, absolute_uri)
    return absolute_uri
def read_crosstable(fd: TextIO) -> Tuple[Set[str], Dict[str, Dict[str, rdflib.URIRef]]]:
    """
    Given a file-like object to a `CodeMeta crosswalk table` (either the main
    cross-table with all columns, or an auxiliary table with just the CodeMeta
    column and one ecosystem-specific table); returns a list of all CodeMeta
    terms, and a dictionary ``{ecosystem: {ecosystem_term: codemeta_term}}``

    .. _CodeMeta crosswalk table: https://codemeta.github.io/crosswalk/
    """
    rows = csv.reader(fd)
    try:
        columns = next(rows)
    except StopIteration:
        raise ValueError("empty file")

    # Every column other than the four descriptive ones names a metadata
    # source (an ecosystem: npm, PyPI, ...).
    data_sources = set(columns) - {"Parent Type", "Property", "Type", "Description"}
    codemeta_translation: Dict[str, Dict[str, rdflib.URIRef]] = {
        source: {} for source in data_sources
    }
    terms = set()

    for row in rows:
        # The "Property" column holds the canonical CodeMeta short name.
        property_name = dict(zip(columns, row))["Property"]
        if not property_name:
            continue
        canonical_name = make_absolute_uri(property_name)
        if rdflib.URIRef(canonical_name) in PROPERTY_BLACKLIST:
            continue
        terms.add(canonical_name)

        for column, cell in zip(columns, row):
            if column not in data_sources:
                # Skip the Parent Type / Property / Type / Description cells.
                continue
            # A single cell may list several source-specific terms, separated
            # by commas or slashes.
            for source_term in _codemeta_field_separator.split(cell):
                source_term = source_term.strip()
                if source_term:
                    codemeta_translation[column][source_term] = rdflib.URIRef(
                        canonical_name
                    )

    return (terms, codemeta_translation)
# Parse the bundled crosswalk table once at import time.
with open(CROSSWALK_TABLE_PATH) as fd:
    (CODEMETA_TERMS, CROSSWALK_TABLE) = read_crosstable(fd)


def _document_loader(url, options=None, resolve_unknown_context_url: bool = False):
    """Document loader for pyld.

    Reads the local codemeta.jsonld file instead of fetching it
    from the Internet every single time.

    Args:
        url: JSON-LD context URL to resolve
        options: passed by pyld; unused here
        resolve_unknown_context_url: if :const:`True`, unknown context URLs
            are fetched over HTTP instead of raising an exception

    Raises:
        Exception: if the URL is unknown (and ``resolve_unknown_context_url``
            is :const:`False`), or if the CodeMeta namespace URI was passed
            instead of a context URL.
    """
    # Known context URLs are matched case-insensitively and ignoring any
    # trailing slash. Normalizing once here also applies the trailing-slash
    # handling to the v2 alternate URLs, which the previous version only
    # applied to the v3 ones.
    normalized_url = url.lower().rstrip("/")

    if (
        normalized_url == CODEMETA_V2_CONTEXT_URL.lower()
        or normalized_url in CODEMETA_V2_ALTERNATE_CONTEXT_URLS
    ):
        return {
            "contextUrl": None,
            "documentUrl": url,
            "document": CODEMETA_V2_CONTEXT,
        }
    elif (
        normalized_url == CODEMETA_V3_CONTEXT_URL.lower()
        or normalized_url in CODEMETA_V3_ALTERNATE_CONTEXT_URLS
    ):
        return {
            "contextUrl": None,
            "documentUrl": url,
            "document": CODEMETA_V3_CONTEXT,
        }
    elif url == CODEMETA:
        raise Exception(
            "{} is CodeMeta's URI, use {} as context url".format(
                CODEMETA, CODEMETA_V2_CONTEXT_URL
            )
        )
    elif normalized_url in ("http://schema.org", "https://schema.org"):
        return {
            "contextUrl": None,
            "documentUrl": url,
            "document": _SCHEMA_DOT_ORG_CONTEXT,
        }
    else:
        if resolve_unknown_context_url:
            # Best-effort remote fetch; disabled by default because it can
            # send requests to arbitrary URLs.
            return _requests_document_loader(url)
        else:
            raise Exception(f"Unknown context URL: {url}")
def compact(
    doc: Dict[str, Any], forgefed: bool, resolve_unknown_context_url: bool = False
) -> Dict[str, Any]:
    """Same as `pyld.jsonld.compact`, but in the context of CodeMeta.

    Args:
        doc: parsed ``codemeta.json`` file
        forgefed: Whether to add ForgeFed and ActivityStreams as compact
            URIs. This is typically used for extrinsic metadata documents,
            which frequently use properties from these namespaces.
        resolve_unknown_context_url: if :const:`True` unknown JSON-LD context
            URL will be fetched using ``requests`` instead of raising an
            exception, :const:`False` by default as it can lead sending
            requests to arbitrary URLs so use with caution

    Returns:
        A compacted JSON-LD document.
    """
    loader = partial(
        _document_loader,
        resolve_unknown_context_url=resolve_unknown_context_url,
    )
    contexts: List[Any] = [CODEMETA_V2_CONTEXT_URL]
    if forgefed:
        # Expose ForgeFed / ActivityStreams / XSD terms as compact URIs.
        contexts.append(
            {"as": str(ACTIVITYSTREAMS), "forge": str(FORGEFED), "xsd": str(XSD)}
        )
    return jsonld.compact(doc, contexts, options={"documentLoader": loader})
def expand(
    doc: Dict[str, Any], resolve_unknown_context_url: bool = False
) -> Dict[str, Any]:
    """Same as `pyld.jsonld.expand`, but in the context of CodeMeta.

    Args:
        doc: parsed ``codemeta.json`` file
        resolve_unknown_context_url: if :const:`True` unknown JSON-LD context
            URL will be fetched using ``requests`` instead of raising an
            exception, :const:`False` by default as it can lead sending
            requests to arbitrary URLs so use with caution

    Returns:
        An expanded JSON-LD document.
    """
    loader = partial(
        _document_loader,
        resolve_unknown_context_url=resolve_unknown_context_url,
    )
    return jsonld.expand(doc, options={"documentLoader": loader})
def merge_documents(documents):
    """Takes a list of metadata dicts, each generated from a different
    metadata file, and merges them.

    Removes duplicates, if any.
    """
    expanded_docs = list(itertools.chain.from_iterable(map(expand, documents)))

    merged_document = collections.defaultdict(list)
    for document in expanded_docs:
        for key, values in document.items():
            if key == "@id":
                # @id does not get expanded to a list
                value = values

                # Only one @id is allowed, move it to sameAs
                if "@id" not in merged_document:
                    merged_document["@id"] = value
                elif value != merged_document["@id"]:
                    if value not in merged_document[SCHEMA.sameAs]:
                        merged_document[SCHEMA.sameAs].append(value)
            else:
                for value in values:
                    if isinstance(value, dict) and set(value) == {"@list"}:
                        # Value is of the form {'@list': [item1, item2]}
                        # instead of the usual [item1, item2].
                        # We need to merge the inner lists (and mostly
                        # preserve order).
                        target = merged_document.setdefault(key, {"@list": []})
                        for item in value["@list"]:
                            # target must be of the form
                            # {'@list': [item1, item2]}; as it is the same
                            # type as value, which is an @list.
                            if item not in target["@list"]:
                                target["@list"].append(item)
                    elif value not in merged_document[key]:
                        merged_document[key].append(value)

    # XXX: we should set forgefed=True when merging extrinsic_metadata
    # documents. However, this function is only used to merge multiple files
    # of the same directory (which is only for intrinsic-metadata), so it is
    # not an issue for now.
    return compact(merged_document, forgefed=False)