swh.indexer.metadata_dictionary.base module#

swh.indexer.metadata_dictionary.base.TMP_ROOT_URI_PREFIX = 'https://www.softwareheritage.org/schema/2022/indexer/tmp-node/'#

Prefix used to generate temporary URIs for root nodes being translated.

class swh.indexer.metadata_dictionary.base.DirectoryLsEntry[source]#

Bases: TypedDict

target: bytes#
sha1: bytes | None#
name: bytes#
type: str#
swh.indexer.metadata_dictionary.base.produce_terms(*uris: str) Callable[[TTranslateCallable], TTranslateCallable][source]#

Returns a decorator that marks the decorated function as adding the given terms to the translated_metadata dict

class swh.indexer.metadata_dictionary.base.BaseMapping(log_suffix='')[source]#

Bases: object

Base class for BaseExtrinsicMapping and BaseIntrinsicMapping, not to be inherited directly.

property name#

A name of this mapping, used as an identifier in the indexer storage.

translate(raw_content: bytes) Dict | None[source]#

Translates content by parsing content from a bytestring containing mapping-specific data and translating with the appropriate mapping to JSON-LD using the Codemeta and ForgeFed vocabularies.

Parameters:

raw_content – raw content to translate

Returns:

translated metadata in JSON friendly form needed for the content if parseable, None otherwise.

normalize_translation(metadata: Dict[str, Any]) Dict[str, Any][source]#
class swh.indexer.metadata_dictionary.base.BaseExtrinsicMapping(log_suffix='')[source]#

Bases: BaseMapping

Base class for extrinsic_metadata mappings to inherit from

To implement a new mapping:

  • inherit this class

  • override translate function

classmethod extrinsic_metadata_formats() Tuple[str, ...][source]#

Returns the list of extrinsic metadata formats which can be translated by this mapping

normalize_translation(metadata: Dict[str, Any]) Dict[str, Any][source]#
class swh.indexer.metadata_dictionary.base.BaseIntrinsicMapping(log_suffix='')[source]#

Bases: BaseMapping

Base class for intrinsic-metadata mappings to inherit from

To implement a new mapping:

  • inherit this class

  • override translate function

classmethod detect_metadata_files(file_entries: List[DirectoryLsEntry]) List[bytes][source]#

Returns the sha1 hashes of files which can be translated by this mapping

normalize_translation(metadata: Dict[str, Any]) Dict[str, Any][source]#
class swh.indexer.metadata_dictionary.base.SingleFileIntrinsicMapping(log_suffix='')[source]#

Bases: BaseIntrinsicMapping

Base class for all intrinsic metadata mappings that use a single file as input.

filename: bytes | Pattern[bytes]#
classmethod detect_metadata_files(file_entries: List[DirectoryLsEntry]) List[bytes][source]#

Returns the sha1 hashes of files which can be translated by this mapping

class swh.indexer.metadata_dictionary.base.DictMapping(log_suffix='')[source]#

Bases: BaseMapping

Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).

string_fields: List[str] = []#

List of fields that are simple strings, and don’t need any normalization.

date_fields: List[str] = []#

//schema.org/Date

Type:

List of fields that are strings that should be typed as http

uri_fields: List[str] = []#

List of fields that are simple URIs, and don’t need any normalization.

property mapping#

A translation dict to map dict keys into a canonical name.

classmethod supported_terms()[source]#
get_root_uri(content_dict: Dict) URIRef[source]#

Returns an URI for the SoftwareSourceCode or Repository being described.

The default implementation uses a temporary URI that is stripped before normalization by _translate_dict().

sanitize(graph: Graph) None[source]#
extra_translation(graph: Graph, root: Node, d: Dict[str, Any]) None[source]#

Called at the end of the translation process, and may add arbitrary triples to graph based on the input dictionary (passed as d).

class swh.indexer.metadata_dictionary.base.JsonMapping(log_suffix='')[source]#

Bases: DictMapping

Base class for all mappings that use JSON data as input.

translate(raw_content: bytes) Dict | None[source]#

Translates content by parsing content from a bytestring containing mapping-specific data and translating with the appropriate mapping to JSON-LD using the Codemeta and ForgeFed vocabularies.

Parameters:

raw_content – raw content to translate

Returns:

translated metadata in JSON friendly form needed for the content if parseable, None otherwise.

class swh.indexer.metadata_dictionary.base.XmlMapping(log_suffix='')[source]#

Bases: DictMapping

Base class for all mappings that use XML data as input.

translate(raw_content: bytes) Dict | None[source]#

Translates content by parsing content from a bytestring containing mapping-specific data and translating with the appropriate mapping to JSON-LD using the Codemeta and ForgeFed vocabularies.

Parameters:

raw_content – raw content to translate

Returns:

translated metadata in JSON friendly form needed for the content if parseable, None otherwise.

class swh.indexer.metadata_dictionary.base.SafeLoader(stream)[source]#

Bases: SafeLoader

Initialize the scanner.

yaml_implicit_resolvers = {'': [('tag:yaml.org,2002:null', re.compile('^(?: ~\n                    |null|Null|NULL\n                    | )$', re.VERBOSE))], '!': [('tag:yaml.org,2002:yaml', re.compile('^(?:!|&|\\*)$'))], '&': [('tag:yaml.org,2002:yaml', re.compile('^(?:!|&|\\*)$'))], '*': [('tag:yaml.org,2002:yaml', re.compile('^(?:!|&|\\*)$'))], '+': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '-': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '.': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE))], '0': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '1': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '2': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '3': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '4': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '5': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '6': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '7': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '8': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '9': [('tag:yaml.org,2002:float', re.compile('^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+][0-9]+)?\n                    |\\.[0-9][0-9_]*(?:[eE][-+][0-9]+)?\n                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*\n                   , re.VERBOSE)), ('tag:yaml.org,2002:int', re.compile('^(?:[-+]?0b[0-1_]+\n                    |[-+]?0[0-7_]+\n                    |[-+]?(?:0|[1-9][0-9_]*)\n                    |[-+]?0x[0-9a-fA-F_]+\n                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9]), re.VERBOSE))], '<': [('tag:yaml.org,2002:merge', re.compile('^(?:<<)$'))], '=': [('tag:yaml.org,2002:value', re.compile('^(?:=)$'))], 'F': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'N': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE)), ('tag:yaml.org,2002:null', re.compile('^(?: ~\n                    |null|Null|NULL\n                    | )$', re.VERBOSE))], 'O': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'T': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'Y': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'f': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'n': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE)), ('tag:yaml.org,2002:null', re.compile('^(?: ~\n                    |null|Null|NULL\n                    | )$', re.VERBOSE))], 'o': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 't': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], 'y': [('tag:yaml.org,2002:bool', re.compile('^(?:yes|Yes|YES|no|No|NO\n                    |true|True|TRUE|false|False|FALSE\n                    |on|On|ON|off|Off|OFF)$', re.VERBOSE))], '~': [('tag:yaml.org,2002:null', re.compile('^(?: ~\n                    |null|Null|NULL\n                    | )$', re.VERBOSE))]}#
class swh.indexer.metadata_dictionary.base.YamlMapping(log_suffix='')[source]#

Bases: DictMapping

Base class for all mappings that use Yaml data as input.

translate(raw_content: bytes) Dict[str, str] | None[source]#

Translates content by parsing content from a bytestring containing mapping-specific data and translating with the appropriate mapping to JSON-LD using the Codemeta and ForgeFed vocabularies.

Parameters:

raw_content – raw content to translate

Returns:

translated metadata in JSON friendly form needed for the content if parseable, None otherwise.