Source code for swh.indexer.metadata_dictionary.maven

# Copyright (C) 2018-2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import posixpath
from typing import Any, Dict

from rdflib import Graph, Literal

from swh.indexer.codemeta import CROSSWALK_TABLE
from swh.indexer.namespaces import SCHEMA

from .base import SingleFileIntrinsicMapping, XmlMapping
from .utils import add_url_if_valid, prettyprint_graph  # noqa


class MavenMapping(XmlMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for Maven (pom.xml) mapping and translation
    """

    name = "maven"
    filename = b"pom.xml"
    mapping = CROSSWALK_TABLE["Java (Maven)"]
    string_fields = ["name", "version", "description", "email"]

    # Repository used by parse_repositories() when the document has no
    # (or an empty) "repositories" entry.
    _default_repository = {"url": "https://repo.maven.apache.org/maven2/"}

    def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]:
        """Translate the content of the ``project`` entry of ``d``.

        A missing or empty ``project`` entry is translated as an empty dict.
        """
        return super()._translate_dict(d.get("project") or {})

    def extra_translation(self, graph: Graph, root, d):
        """Run the Maven-specific translation steps: repository parsing."""
        self.parse_repositories(graph, root, d)

    def parse_repositories(self, graph: Graph, root, d):
        """https://maven.apache.org/pom.html#Repositories

        When ``d`` has no (or an empty) ``repositories`` entry, the default
        repository is used. When the entry is a dict, its ``repository``
        value (a single item or a list of items) is handed item by item to
        :meth:`parse_repository`. Any other non-empty value is ignored.

        >>> import rdflib
        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <repositories>
        ...   <repository>
        ...     <id>codehausSnapshots</id>
        ...     <name>Codehaus Snapshots</name>
        ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
        ...     <layout>default</layout>
        ...   </repository>
        ... </repositories>
        ... ''')
        >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
        """
        repositories = d.get("repositories")
        if not repositories:
            self.parse_repository(graph, root, d, self._default_repository)
        elif isinstance(repositories, dict):
            repositories = repositories.get("repository") or []
            # A single <repository> child is not wrapped in a list.
            if not isinstance(repositories, list):
                repositories = [repositories]
            for repo in repositories:
                self.parse_repository(graph, root, d, repo)

    def parse_repository(self, graph: Graph, root, d, repo):
        """Derive a ``SCHEMA.codeRepository`` URL from one repository entry.

        The candidate URL is the repository ``url`` joined with the
        dot-separated parts of ``d["groupId"]`` and with ``d["artifactId"]``.
        It is passed to ``add_url_if_valid``.

        Nothing is done when ``repo`` is not a dict, when its ``layout`` is
        not ``"default"``, when the url, ``groupId`` or ``artifactId`` is not
        a string, or when the candidate URL contains ``${``.
        """
        if not isinstance(repo, dict):
            return
        if repo.get("layout", "default") != "default":
            return  # TODO ?

        url = repo.get("url")
        group_id = d.get("groupId")
        artifact_id = d.get("artifactId")
        if not (
            isinstance(url, str)
            and isinstance(group_id, str)
            and isinstance(artifact_id, str)
        ):
            return

        # URLs always use "/" separators, so join with posixpath instead of
        # the platform-dependent os.path (which uses "\\" on Windows).
        repo_url = posixpath.join(url, *group_id.split("."), artifact_id)
        if "${" in repo_url:
            # Often used as templating in pom.xml files collected from VCSs
            return
        add_url_if_valid(graph, root, SCHEMA.codeRepository, repo_url)

    def normalize_groupId(self, id_):
        """https://maven.apache.org/pom.html#Maven_Coordinates

        Returns a ``Literal`` wrapping ``id_`` when it is a string, and
        ``None`` otherwise.

        >>> MavenMapping().normalize_groupId('org.example')
        rdflib.term.Literal('org.example')
        """
        if isinstance(id_, str):
            return Literal(id_)

    def translate_licenses(self, graph, root, licenses):
        """https://maven.apache.org/pom.html#Licenses

        >>> import xmltodict
        >>> import json
        >>> from rdflib import URIRef
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> print(json.dumps(d, indent=4))
        {
            "licenses": {
                "license": {
                    "name": "Apache License, Version 2.0",
                    "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
                }
            }
        }
        >>> graph = Graph()
        >>> root = URIRef("http://example.org/test-software")
        >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/license": {
                "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
            }
        }

        or, if there is more than one license:

        >>> import xmltodict
        >>> from pprint import pprint
        >>> d = xmltodict.parse('''
        ... <licenses>
        ...   <license>
        ...     <name>Apache License, Version 2.0</name>
        ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
        ...   </license>
        ...   <license>
        ...     <name>MIT License</name>
        ...     <url>https://opensource.org/licenses/MIT</url>
        ...   </license>
        ... </licenses>
        ... ''')
        >>> graph = Graph()
        >>> root = URIRef("http://example.org/test-software")
        >>> MavenMapping().translate_licenses(graph, root, d["licenses"])
        >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
        {(rdflib.term.URIRef('http://example.org/test-software'),
          rdflib.term.URIRef('http://schema.org/license'),
          rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
         (rdflib.term.URIRef('http://example.org/test-software'),
          rdflib.term.URIRef('http://schema.org/license'),
          rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}
        """
        if not isinstance(licenses, dict):
            return
        licenses = licenses.get("license")
        # A single <license> child is a dict rather than a list of dicts.
        if isinstance(licenses, dict):
            licenses = [licenses]
        elif not isinstance(licenses, list):
            return
        for license_entry in licenses:
            if isinstance(license_entry, dict):
                add_url_if_valid(graph, root, SCHEMA.license, license_entry.get("url"))