# Source code for swh.indexer.bibtex

# Copyright (C) 2023  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import collections
import json
import sys
from typing import Any, Dict, List
import uuid

from pybtex.database import Entry, Person
import rdflib

from swh.indexer.codemeta import compact, expand
from swh.indexer.namespaces import SCHEMA, SPDX_LICENSES

TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"


def _date_fields(date: str) -> Dict[str, str]:
    """Derive the BibTeX ``year`` and ``month`` fields from a date string.

    The value is expected to look like an ISO 8601 date or date-time
    (``YYYY``, ``YYYY-MM``, ``YYYY-MM-DD`` or ``YYYY-MM-DDThh:mm:ss...``).
    Only the components that are actually present and numeric are returned,
    so an incomplete or unparsable value yields fewer (possibly zero) fields
    instead of raising.

    Args:
        date: the date value, as a string (or ``str`` subclass).

    Returns:
        A dict with a ``"year"`` key and, when available, a ``"month"`` key;
        both values are the digit strings found in ``date``.
    """
    # Drop any time component first, so that a negative UTC offset such as
    # "-05:00" does not contribute an extra "-"-separated part.
    date_part = str(date).strip().split("T", 1)[0]
    parts = date_part.split("-")

    result: Dict[str, str] = {}
    if parts[0].isdigit():
        result["year"] = parts[0]
        if len(parts) > 1 and parts[1].isdigit():
            result["month"] = parts[1]
    return result


def codemeta_to_bibtex(doc: Dict[str, Any]) -> str:
    """Render a CodeMeta document as a single BibTeX entry.

    The document is passed through ``compact`` and ``expand``, loaded as
    JSON-LD into an rdflib graph, and the properties of its root node are
    mapped as follows:

    * ``schema:description`` -> ``abstract`` field
    * ``schema:author`` (ordered list) -> ``author`` persons
    * ``schema:affiliation`` of authors, editors and publishers ->
      ``organization`` persons
    * first available of ``schema:datePublished``, ``schema:dateCreated``,
      ``schema:dateModified`` -> ``date`` field, plus ``year`` and ``month``
      when they can be extracted from it
    * identifiers (the document ``id``, slash-free string values of its
      ``identifier`` key, and ``schema:identifier`` objects) -> entry key
      (first one without a ``/``), ``doi`` field (values starting with
      ``https://doi.org/``) and ``hal_id`` field (values starting with
      ``hal-``)
    * ``schema:editor`` / ``schema:publisher`` -> ``editor`` / ``publisher``
      persons
    * ``schema:downloadUrl`` -> ``file`` field
    * ``schema:license`` -> ``license`` persons, with the SPDX namespace
      prefix and a trailing ``.html`` removed
    * ``schema:codeRepository`` -> ``repository`` field
    * ``schema:name`` -> ``title`` field
    * ``schema:url`` -> ``url`` field
    * ``schema:softwareVersion``, else ``schema:version`` -> ``version`` field

    The entry type is ``softwareversion`` when a version was found and
    ``software`` otherwise; the key falls back to ``REPLACEME`` when no
    suitable identifier exists.

    Args:
        doc: the CodeMeta document, as a JSON-like dictionary.

    Returns:
        The entry serialized by pybtex with ``bib_format="bibtex"``.
    """
    doc = compact(doc, False)

    identifiers = []

    if "id" in doc:
        identifiers.append(doc["id"])
    else:
        # No id: give the root node a fresh blank-node label so it can be
        # looked up in the graph below.
        doc["id"] = f"_:{uuid.uuid4()}"

    id_: rdflib.term.Node
    if doc["id"].startswith("_:"):
        id_ = rdflib.term.BNode(doc["id"][2:])
    else:
        # using a base in case the id is not an absolute URI
        id_ = rdflib.term.URIRef(doc["id"], base=TMP_ROOT_URI_PREFIX)
        doc["id"] = str(id_)

    # workaround for https://github.com/codemeta/codemeta/pull/322
    if "identifier" in doc:
        raw_identifiers = doc["identifier"]
        if not isinstance(raw_identifiers, list):
            raw_identifiers = [raw_identifiers]
        identifiers.extend(
            identifier
            for identifier in raw_identifiers
            if isinstance(identifier, str) and "/" not in identifier
        )

    doc = expand(doc)
    g = rdflib.Graph().parse(data=json.dumps(doc), format="json-ld")

    persons: Dict[str, List[Person]] = collections.defaultdict(list)
    fields: Dict[str, Any] = {}

    def first_object(predicate: rdflib.term.Node) -> Any:
        """Return the first object of ``(id_, predicate, *)``, or ``None``."""
        for _, _, obj in g.triples((id_, predicate, None)):
            return obj
        return None

    def add_person(person_list: List[Person], person_id: rdflib.term.Node) -> None:
        """Append every ``schema:name`` of ``person_id`` not already listed."""
        for _, _, name in g.triples((person_id, SCHEMA.name, None)):
            if (person_id, rdflib.RDF.type, SCHEMA.Organization) in g:
                # prevent interpreting the name as "Firstname Lastname" and
                # reformatting it to "Lastname, Firstname"
                person = Person(last=name)
            else:
                person = Person(name)
            if person not in person_list:
                person_list.append(person)

    def add_affiliations(person: rdflib.term.Node) -> None:
        """Record the affiliations of ``person`` as ``organization`` persons."""
        for _, _, organization in g.triples((person, SCHEMA.affiliation, None)):
            add_person(persons["organization"], organization)

    # abstract
    description = first_object(SCHEMA.description)
    if description is not None:
        fields["abstract"] = description

    # authors, which are an ordered list
    for _, _, author_list in g.triples((id_, SCHEMA.author, None)):
        for author in rdflib.collection.Collection(g, author_list):
            add_person(persons["author"], author)
            add_affiliations(author)

    # date: the first property that has a value wins
    for date_predicate in (
        SCHEMA.datePublished,
        SCHEMA.dateCreated,
        SCHEMA.dateModified,
    ):
        date = first_object(date_predicate)
        if date is not None:
            fields["date"] = date
            # Tolerates partial dates and date-times instead of raising on
            # anything that is not exactly "YYYY-MM-DD".
            fields.update(_date_fields(date))
            break

    # identifier, doi, hal_id
    entry_key = None
    for _, _, identifier in g.triples((id_, SCHEMA.identifier, None)):
        identifiers.append(identifier)
    for identifier in identifiers:
        if entry_key is None and "/" not in identifier:
            # Avoid URLs
            entry_key = identifier
        if identifier.startswith("https://doi.org/"):
            fields["doi"] = identifier
        if identifier.startswith("hal-"):
            fields["hal_id"] = identifier

    # editor
    for _, _, editor in g.triples((id_, SCHEMA.editor, None)):
        add_person(persons["editor"], editor)
        add_affiliations(editor)

    # file
    download_url = first_object(SCHEMA.downloadUrl)
    if download_url is not None:
        fields["file"] = download_url

    # license (represented by "Person" as it's the only way to make pybtex format
    # them as a list)
    spdx_prefix = str(SPDX_LICENSES)
    for _, _, license_node in g.triples((id_, SCHEMA.license, None)):
        license_name = str(license_node)
        if license_name.startswith(spdx_prefix):
            license_name = license_name[len(spdx_prefix) :]
            if license_name.endswith(".html"):
                license_name = license_name[:-5]
        persons["license"].append(Person(last=license_name))

    # publisher
    for _, _, publisher in g.triples((id_, SCHEMA.publisher, None)):
        add_person(persons["publisher"], publisher)
        add_affiliations(publisher)

    # repository
    code_repository = first_object(SCHEMA.codeRepository)
    if code_repository is not None:
        fields["repository"] = code_repository

    # title
    title = first_object(SCHEMA.name)
    if title is not None:
        fields["title"] = title

    # url
    url = first_object(SCHEMA.url)
    if url is not None:
        fields["url"] = url

    # version
    software_version = first_object(SCHEMA.softwareVersion)
    if software_version is not None:
        fields["version"] = software_version
    else:
        # every schema:version value is assigned in turn, so the last one
        # yielded by the graph is the one that is kept
        for _, _, version in g.triples((id_, SCHEMA.version, None)):
            fields["version"] = version

    entry_type = "softwareversion" if "version" in fields else "software"
    entry = Entry(
        entry_type,
        persons=persons,
        fields=fields,
    )
    entry.key = entry_key or "REPLACEME"
    return entry.to_string(bib_format="bibtex")
if __name__ == "__main__":
    # Command-line usage: every argument is either the path of a JSON file
    # holding a CodeMeta document, or "-" to read the document from standard
    # input. One BibTeX entry is printed per argument, in order.
    for filename in sys.argv[1:]:
        if filename == "-":
            print(codemeta_to_bibtex(json.load(sys.stdin)))
        else:
            # JSON exchanged between systems is UTF-8 encoded; decode files
            # explicitly instead of relying on the locale's default encoding.
            with open(filename, encoding="utf-8") as f:
                print(codemeta_to_bibtex(json.load(f)))