Source code for swh.indexer.metadata_dictionary.npm

# Copyright (C) 2018-2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re

from rdflib import RDF, BNode, Graph, Literal, URIRef

from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA

from .base import JsonMapping, SingleFileIntrinsicMapping
from .utils import add_list, add_url_if_valid, prettyprint_graph  # noqa

# Base IRI of the SPDX license list; NpmMapping.normalize_license() appends
# a license identifier to it to build the IRI of a single license.
SPDX = URIRef("https://spdx.org/licenses/")



[docs]
class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
    """
    Dedicated class for NPM (``package.json``) mapping and translation.

    Each ``normalize_<field>`` / ``translate_<field>`` method below handles one
    ``package.json`` field.
    NOTE(review): these methods are presumably looked up by field name by the
    base mapping classes; confirm in ``.base``.
    """

    # NOTE(review): presumably the identifier this mapping is known by -- confirm
    # against the base classes.
    name = "npm"
    # Crosswalk table entry for NodeJS.
    mapping = CROSSWALK_TABLE["NodeJS"]
    # Name (as bytes) of the metadata file this mapping is about.
    filename = b"package.json"
    # NOTE(review): presumably the fields the base class copies as plain strings
    # and as URIs, respectively -- confirm in ``.base``.
    string_fields = ["name", "version", "description", "email"]
    uri_fields = ["homepage"]

    # URL templates used by normalize_repository() to expand "<prefix>:<path>"
    # shorthands; "%s" receives the part after the prefix. The "github" entry is
    # also used for values that have no prefix at all.
    _schema_shortcuts = {
        "github": "git+https://github.com/%s.git",
        "gist": "git+https://gist.github.com/%s.git",
        "gitlab": "git+https://gitlab.com/%s.git",
        # Bitbucket supports both hg and git, and the shortcut does not
        # tell which one to use.
        # 'bitbucket': 'https://bitbucket.org/',
    }


[docs]
    def normalize_repository(self, d):
        """Turn the ``repository`` field into a repository URI.

        https://docs.npmjs.com/cli/v11/configuring-npm/package-json#repository

        Accepted forms:

        * an object whose ``type`` and ``url`` are both strings, rendered as
          ``<type>+<url>``;
        * a string containing ``://``, used as is;
        * a ``<prefix>:<path>`` shorthand whose prefix is one of
          ``_schema_shortcuts``;
        * a bare ``<path>``, expanded with the ``github`` shortcut.

        Anything else yields ``None``.

        >>> NpmMapping().normalize_repository({
        ...     'type': 'git',
        ...     'url': 'https://example.org/foo.git'
        ... })
        rdflib.term.URIRef('git+https://example.org/foo.git')
        >>> NpmMapping().normalize_repository(
        ...     'gitlab:foo/bar')
        rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
        >>> NpmMapping().normalize_repository(
        ...     'foo/bar')
        rdflib.term.URIRef('git+https://github.com/foo/bar.git')
        """
        if isinstance(d, str):
            if "://" in d:
                # Already a complete URL.
                return URIRef(d)

            prefix, separator, path = d.partition(":")
            if not separator:
                # No prefix at all: the shorthand designates a GitHub repository.
                return URIRef(self._schema_shortcuts["github"] % d)

            template = self._schema_shortcuts.get(prefix)
            if template is None:
                # Unknown (or deliberately unsupported) hosting shortcut.
                return None
            return URIRef(template % path)

        if not isinstance(d, dict):
            return None

        has_type = isinstance(d.get("type"), str)
        has_url = isinstance(d.get("url"), str)
        if has_type and has_url:
            return URIRef("{type}+{url}".format(**d))
        return None



[docs]
    def normalize_bugs(self, d):
        """Extract the issue tracker URL from the ``bugs`` field.

        https://docs.npmjs.com/cli/v11/configuring-npm/package-json#bugs

        ``d`` is either the URL itself (a string) or an object whose ``url``
        key holds it; any other key of the object (e.g. ``email``) is ignored.

        Returns the URL as a ``URIRef``, or ``None`` when ``d`` holds no
        non-empty URL string.

        >>> NpmMapping().normalize_bugs({
        ...     'url': 'https://example.org/bugs/',
        ...     'email': 'bugs@example.org'
        ... })
        rdflib.term.URIRef('https://example.org/bugs/')
        >>> NpmMapping().normalize_bugs(
        ...     'https://example.org/bugs/')
        rdflib.term.URIRef('https://example.org/bugs/')
        >>> NpmMapping().normalize_bugs({'email': 'bugs@example.org'}) is None
        True
        >>> NpmMapping().normalize_bugs(None) is None
        True
        """
        url = d.get("url") if isinstance(d, dict) else d

        if not isinstance(url, str) or not url:
            # Nothing usable. Return None, like the other normalizers of this
            # class, rather than URIRef(""): an empty IRI is a relative
            # reference, so it would assert a meaningless issue tracker.
            return None

        return URIRef(url)


    # Matches the one-line "people" notation ``Name <email> (url)``.
    # The ``<email>`` and ``(url)`` parts are each optional, must be preceded by
    # at least one space when present, and leading/trailing spaces are ignored.
    # The pieces are exposed as the named groups "name", "email" and "url".
    _parse_author = re.compile(
        r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
    )


[docs]
    def translate_author(self, graph: Graph, root, d):
        r"""Add the ``author`` field to ``graph`` as a ``schema:Person``.

        https://docs.npmjs.com/cli/v11/configuring-npm/package-json#people-fields-author-contributors

        ``d`` is either an object with optional ``name``, ``email`` and ``url``
        keys, or the equivalent one-line string ``Name <email> (url)``.
        The person is attached to ``root`` as a one-element ``schema:author``
        list. Name and email are only added when they are non-empty strings;
        the URL is handed to ``add_url_if_valid``.

        When ``d`` is neither an object nor a parsable string, ``graph`` is
        left untouched.

        >>> root = URIRef("http://example.org/test-software")
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https://example.org/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root,
        ...     'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
        ... )
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https:\\\\example.invalid/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe"
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, 42)
        >>> len(graph)
        0
        """  # noqa
        if isinstance(d, dict):
            name = d.get("name")
            email = d.get("email")
            url = d.get("url")
        elif isinstance(d, str):
            match = self._parse_author.match(d)
            if not match:
                return None
            name, email, url = match.group("name", "email", "url")
        else:
            return None

        # Create the Person node only now that the input is known to be usable,
        # so that the early returns above never leave an orphan blank node
        # (typed schema:Person but attached to nothing) in the graph.
        author = BNode()
        graph.add((author, RDF.type, SCHEMA.Person))

        if name and isinstance(name, str):
            graph.add((author, SCHEMA.name, Literal(name)))
        if email and isinstance(email, str):
            graph.add((author, SCHEMA.email, Literal(email)))
        add_url_if_valid(graph, author, SCHEMA.url, url)

        add_list(graph, root, SCHEMA.author, [author])



[docs]
    def normalize_description(self, description):
        r"""Return ``description`` as a literal, repairing a common UTF-16 mix-up.

        Try to re-decode ``description`` as UTF-16, as this is a somewhat common
        mistake that causes issues in the database because of null bytes in JSON.

        A string is considered affected when it starts with two unicode
        replacement characters and contains a null character. It is then
        re-decoded as UTF-16 (little-endian first, then big-endian), a leading
        ``"# "`` is removed, and trailing whitespace is stripped.

        Returns ``None`` when ``description`` is not a string, or when an
        affected string cannot be recovered into text free of null characters.

        >>> NpmMapping().normalize_description("foo bar")
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     # invalid UTF-16 and meaningless UTF-8:
        ...     "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(
        ...     # ditto (it looks like little-endian at first)
        ...     "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(
        ...     # valid UTF-16, but the decoded text still has null characters
        ...     "\ufffd\ufffd#\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(None) is None
        True
        """
        if not isinstance(description, str):
            return None

        # XXX: if this function ever needs to support more cases, consider
        # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
        replacement_prefix = "\ufffd\ufffd"
        if not (description.startswith(replacement_prefix) and "\x00" in description):
            return Literal(description)

        # 2 unicode replacement characters followed by '# ' encoded as UTF-16
        # is a common mistake, which indicates a README.md was saved as UTF-16,
        # and some NPM tool opened it as UTF-8 and used the first line as
        # description.

        # Strip the two unicode replacement characters
        prefix_length = len(replacement_prefix.encode())
        description_bytes = description.encode()[prefix_length:]

        if description_bytes.startswith(b"\x00"):
            # The first code unit cannot be little-endian ASCII
            encodings = ("utf-16be",)
        else:
            # try UTF-16 little-endian (the most common) first
            encodings = ("utf-16le", "utf-16be")

        for encoding in encodings:
            try:
                decoded = description_bytes.decode(encoding)
            except UnicodeDecodeError:
                continue
            if "\x00" in decoded:
                # The bytes happen to be valid UTF-16, but the result is still
                # unusable: treat it like a decoding failure.
                continue

            if decoded.startswith("# "):
                decoded = decoded[2:]
            return Literal(decoded.rstrip())

        # Every attempt failed to recover the description: discard it entirely
        # because the current indexer storage backend (postgresql) cannot
        # store zero bytes in JSON columns.
        return None



[docs]
    def normalize_license(self, s):
        """Translate the ``license`` field into SPDX license IRI(s).

        https://docs.npmjs.com/cli/v11/configuring-npm/package-json#license

        Returns:

        * a single ``URIRef`` for a lone license identifier;
        * a list of ``URIRef`` for an expression made only of ``OR``
          alternatives (alternatives that cannot be translated are left out);
        * ``None`` for anything else: non-string values, empty strings,
          ``SEE LICENSE IN <filename>``, expressions using ``AND`` or ``WITH``,
          and other free-form text.

        >>> NpmMapping().normalize_license('MIT')
        rdflib.term.URIRef('https://spdx.org/licenses/MIT')
        >>> licenses = NpmMapping().normalize_license('(MIT OR Apache-2.0)')
        >>> [str(lic) for lic in licenses]
        ['https://spdx.org/licenses/MIT', 'https://spdx.org/licenses/Apache-2.0']
        >>> NpmMapping().normalize_license('MIT AND Apache-2.0') is None
        True
        >>> NpmMapping().normalize_license('SEE LICENSE IN LICENSE.txt') is None
        True
        >>> NpmMapping().normalize_license('') is None
        True
        """
        if not isinstance(s, str):
            return None

        if s.startswith("SEE LICENSE IN "):
            # Very common pattern, because it is an example in the specification.
            # It is followed by the filename; and the indexer architecture currently
            # does not allow accessing that from metadata mappings.
            # (Plus, an hypothetical license mapping would eventually pick it up)
            return None

        # Remove parentheses from the string
        s = s.replace("(", "").replace(")", "")

        if not s:
            # Nothing left to identify a license with; SPDX + "" would be the
            # IRI of the license list itself, not of a license.
            return None

        if " " not in s:
            return SPDX + s

        # Either an SPDX expression, or unusable data.
        # Only expressions made solely of OR alternatives are extracted;
        # licenses combined with the AND or WITH operators are ignored.
        if " OR " in s and " AND " not in s and " WITH " not in s:
            alternatives = (self.normalize_license(x) for x in s.split(" OR "))
            # Leave out alternatives that could not be translated, so the
            # returned list only ever contains IRIs (never None).
            return [uri for uri in alternatives if uri is not None]

        return None



[docs]
    def normalize_keywords(self, lst):
        """Convert the ``keywords`` field into a list of literals.

        https://docs.npmjs.com/cli/v11/configuring-npm/package-json#keywords

        Items that are not strings are skipped; a value that is not a list
        yields ``None``.

        >>> NpmMapping().normalize_keywords(['foo', 'bar'])
        [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
        """
        if not isinstance(lst, list):
            return None

        string_items = (item for item in lst if isinstance(item, str))
        return [Literal(keyword) for keyword in string_items]