swh.indexer.metadata_dictionary.maven module#

class swh.indexer.metadata_dictionary.maven.MavenMapping(log_suffix='')[source]#

Bases: XmlMapping, SingleFileIntrinsicMapping

Dedicated class for Maven (pom.xml) mapping and translation.

name = 'maven'#
filename: bytes | Pattern[bytes] = b'pom.xml'#
mapping = {'ciManagement': rdflib.term.URIRef('https://codemeta.github.io/terms/contIntegration'), 'description': rdflib.term.URIRef('http://schema.org/description'), 'groupId': rdflib.term.URIRef('http://schema.org/identifier'), 'issueManagement': rdflib.term.URIRef('https://codemeta.github.io/terms/issueTracker'), 'license': rdflib.term.URIRef('http://schema.org/license'), 'name': rdflib.term.URIRef('http://schema.org/name'), 'repositories': rdflib.term.URIRef('http://schema.org/codeRepository'), 'scm': rdflib.term.URIRef('http://schema.org/codeRepository'), 'version': rdflib.term.URIRef('http://schema.org/version')}#
string_fields: List[str] = ['name', 'version', 'description', 'email']#

List of fields that are simple strings, and don’t need any normalization.

extra_translation(graph: Graph, root, d)[source]#

Called at the end of the translation process, and may add arbitrary triples to graph based on the input dictionary (passed as d).

parse_repositories(graph: Graph, root, d)[source]#

https://maven.apache.org/pom.html#Repositories

>>> import rdflib
>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <repositories>
...   <repository>
...     <id>codehausSnapshots</id>
...     <name>Codehaus Snapshots</name>
...     <url>http://snapshots.maven.codehaus.org/maven2</url>
...     <layout>default</layout>
...   </repository>
... </repositories>
... ''')
>>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d)
parse_repository(graph: Graph, root, d, repo)[source]#
normalize_groupId(id_)[source]#

https://maven.apache.org/pom.html#Maven_Coordinates

>>> MavenMapping().normalize_groupId('org.example')
rdflib.term.Literal('org.example')
translate_licenses(graph, root, licenses)[source]#

https://maven.apache.org/pom.html#Licenses

>>> import xmltodict
>>> import json
>>> from rdflib import URIRef
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
... </licenses>
... ''')
>>> print(json.dumps(d, indent=4))
{
    "licenses": {
        "license": {
            "name": "Apache License, Version 2.0",
            "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
        }
    }
}
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> MavenMapping().translate_licenses(graph, root, d["licenses"])
>>> prettyprint_graph(graph, root)
{
    "@id": ...,
    "http://schema.org/license": {
        "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt"
    }
}

or, if there is more than one license:

>>> import xmltodict
>>> from pprint import pprint
>>> d = xmltodict.parse('''
... <licenses>
...   <license>
...     <name>Apache License, Version 2.0</name>
...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
...   </license>
...   <license>
...     <name>MIT License</name>
...     <url>https://opensource.org/licenses/MIT</url>
...   </license>
... </licenses>
... ''')
>>> graph = Graph()
>>> root = URIRef("http://example.org/test-software")
>>> MavenMapping().translate_licenses(graph, root, d["licenses"])
>>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None))))
{(rdflib.term.URIRef('http://example.org/test-software'),
  rdflib.term.URIRef('http://schema.org/license'),
  rdflib.term.URIRef('https://opensource.org/licenses/MIT')),
 (rdflib.term.URIRef('http://example.org/test-software'),
  rdflib.term.URIRef('http://schema.org/license'),
  rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))}