Source code for swh.indexer.citation.bibtex

# Copyright (C) 2023-2026  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import calendar
import collections
import json
import secrets
import sys
from typing import Any, Dict, List, Optional

import iso8601
from pybtex.database import Entry, Person
from pybtex.database.output.bibtex import Writer
from pybtex.plugin import register_plugin

from swh.indexer.citation.codemeta_data import CodeMetaData, CodeMetaPerson
from swh.indexer.citation.exceptions import CitationError
from swh.indexer.namespaces import SPDX_LICENSES
from swh.model.swhids import ObjectType

MACRO_PREFIX = "macro" + secrets.token_urlsafe(16).replace("_", "")


[docs] class BibTeXWithMacroWriter(Writer):
[docs] def quote(self, s): r""" >>> w = BibTeXWithMacroWriter() >>> print(w.quote(f'{MACRO_PREFIX}:jan')) jan """ if s.startswith(f"{MACRO_PREFIX}:"): return s[len(MACRO_PREFIX) + 1 :] return super().quote(s)
register_plugin("pybtex.database.output", "bibtex_with_macro", BibTeXWithMacroWriter)
[docs] def codemeta_person_to_pybtex_person(person: CodeMetaPerson) -> Optional[Person]: pybtex_person = Person() for name in person.names: if person.is_organization: # prevent interpreting the name as "Firstname Lastname" and reformatting # it to "Lastname, Firstname" pybtex_person.last_names.append(name) else: pybtex_person = Person(name) for given_name in person.given_names: pybtex_person.first_names.append(given_name) for family_name in person.family_names: pybtex_person.last_names.append(family_name) if not str(pybtex_person): return None return pybtex_person
[docs] def codemeta_data_to_bibtex( codemeta_data: CodeMetaData, ) -> str: swhid = codemeta_data.swhid persons: Dict[str, List[Person]] = collections.defaultdict(list) fields: Dict[str, Any] = {} if codemeta_data.description: fields["abstract"] = codemeta_data.description for author in codemeta_data.author or []: pybtex_person = codemeta_person_to_pybtex_person(author) if pybtex_person and pybtex_person not in persons["author"]: persons["author"].append(pybtex_person) for affiliation in codemeta_data.affiliation or []: pybtex_person = codemeta_person_to_pybtex_person(affiliation) if pybtex_person and pybtex_person not in persons["organization"]: persons["organization"].append(pybtex_person) date = ( codemeta_data.datePublished or codemeta_data.dateCreated or codemeta_data.dateModified ) if date: fields["date"] = date try: parsed_date = iso8601.parse_date(date) fields["year"] = str(parsed_date.year) fields["month"] = ( f"{MACRO_PREFIX}:{calendar.month_abbr[parsed_date.month].lower()}" ) except iso8601.ParseError: pass entry_key = None for identifier in codemeta_data.identifier or []: if entry_key is None and "/" not in identifier: # Avoid URLs entry_key = identifier if identifier.startswith("https://doi.org/"): fields["doi"] = identifier if identifier.startswith("hal-"): fields["hal_id"] = identifier for editor in codemeta_data.editor or []: pybtex_person = codemeta_person_to_pybtex_person(editor) if pybtex_person and pybtex_person not in persons["editor"]: persons["editor"].append(pybtex_person) if codemeta_data.downloadUrl: fields["file"] = codemeta_data.downloadUrl # license (represented by "Person" as it's the only way to make pybtex format # them as a list) for license in codemeta_data.license or []: if license.startswith(str(SPDX_LICENSES)): license_name = license[len(str(SPDX_LICENSES)) :] if license_name.endswith(".html"): license_name = license_name[:-5] persons["license"].append(Person(last=license_name)) for publisher in codemeta_data.publisher or []: pybtex_person = codemeta_person_to_pybtex_person(publisher) if pybtex_person and pybtex_person not in persons["publisher"]: persons["publisher"].append(pybtex_person) if codemeta_data.codeRepository: fields["repository"] = codemeta_data.codeRepository if codemeta_data.name: fields["title"] = codemeta_data.name if codemeta_data.url: fields["url"] = codemeta_data.url version = codemeta_data.softwareVersion or codemeta_data.version if version: fields["version"] = version if not fields: raise CitationError( "No BibTex fields could be extracted from citation metadata file " "(codemeta.json or citation.cff), please check its content is valid." ) # entry_type if swhid: fields["swhid"] = str(swhid) if swhid.object_type == ObjectType.SNAPSHOT: entry_type = "software" elif swhid.object_type == ObjectType.CONTENT: entry_type = "codefragment" else: entry_type = "softwareversion" if entry_key is None: entry_key = f"swh-{swhid.object_type.value}-{swhid.object_id.hex()[:7]}" if swhid.lines: line_start, line_end = swhid.lines if line_start: entry_key += f"-L{line_start}" if line_end: entry_key += f"-L{line_end}" elif "version" in fields: entry_type = "softwareversion" else: entry_type = "software" entry = Entry( entry_type, persons=persons, fields=fields, ) entry.key = entry_key or "REPLACEME" return entry.to_string(bib_format="bibtex_with_macro")
if __name__ == "__main__": from swh.indexer.citation import CitationFormat, codemeta_to_citation for filename in sys.argv[1:]: if filename == "-": print(codemeta_to_citation(json.load(sys.stdin), CitationFormat.BIBTEX)) else: with open(filename) as f: print(codemeta_to_citation(json.load(f), CitationFormat.BIBTEX))