# Source code for swh.deposit.loader.checks

# Copyright (C) 2017-2025  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Functional Metadata checks:

Mandatory fields:
- 'author'
- 'name' or 'title'

Suggested fields:
- metadata-provenance

"""

import dataclasses
import functools
import importlib.resources
import re
from typing import Dict, Iterator, Optional, Tuple, cast
import urllib
from xml.etree import ElementTree

import xmlschema

from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance

# Summary used in the error detail when none of the alternatives of a mandatory
# field group is present in the metadata.
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
# NOTE(review): not referenced by the code visible in this module; presumably
# kept for other modules or tests that import it -- confirm before removing.
INVALID_DATE_FORMAT = "Invalid date format"

# Summary used in the (non-fatal) detail entry emitted when a suggested field
# is absent.
SUGGESTED_FIELDS_MISSING = "Suggested fields are missing"
# Field name reported when no metadata provenance could be parsed from the
# deposited document.
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"

# Summary used when a <codemeta:affiliation> element carries no usable name.
AFFILIATION_NO_NAME = "Reason: affiliation does not have a <codemeta:name> element"

# Local names (without namespace) of the elements accepted in the Atom
# namespace; any other Atom-namespaced element found in a deposit is reported
# as an error by check_metadata().
# List taken from https://datatracker.ietf.org/doc/html/rfc4287
ATOM_ELEMENTS = [
    "name",
    "uri",
    "email",
    # specifically not allowing this one, because clients are supposed to send one
    # entry at a time:
    # "feed",
    "entry",
    # ditto:
    # "content",
    "author",
    "category",
    "contributor",
    "generator",
    "icon",
    "id",
    "link",
    "logo",
    "published",
    "rights",
    "source",
    "subtitle",
    "summary",
    "title",
    "updated",
]

# Copy of the Codemeta 2.0 JSON-LD context, taken from
# https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld
#
# Within this module only the *keys* are used: check_metadata() treats them as
# the set of local names that are acceptable in the Codemeta namespace (and as
# a hint that an unknown Atom element may simply be in the wrong namespace).
CODEMETA2_CONTEXT = {
    "type": "@type",
    "id": "@id",
    "schema": "http://schema.org/",
    "codemeta": "https://codemeta.github.io/terms/",
    "Organization": {"@id": "schema:Organization"},
    "Person": {"@id": "schema:Person"},
    "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"},
    "SoftwareApplication": {"@id": "schema:SoftwareApplication"},
    "Text": {"@id": "schema:Text"},
    "URL": {"@id": "schema:URL"},
    "address": {"@id": "schema:address"},
    "affiliation": {"@id": "schema:affiliation"},
    "applicationCategory": {"@id": "schema:applicationCategory", "@type": "@id"},
    "applicationSubCategory": {"@id": "schema:applicationSubCategory", "@type": "@id"},
    "citation": {"@id": "schema:citation"},
    "codeRepository": {"@id": "schema:codeRepository", "@type": "@id"},
    "contributor": {"@id": "schema:contributor"},
    "copyrightHolder": {"@id": "schema:copyrightHolder"},
    "copyrightYear": {"@id": "schema:copyrightYear"},
    "creator": {"@id": "schema:creator"},
    "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date"},
    "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date"},
    "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date"},
    "description": {"@id": "schema:description"},
    "downloadUrl": {"@id": "schema:downloadUrl", "@type": "@id"},
    "email": {"@id": "schema:email"},
    "editor": {"@id": "schema:editor"},
    "encoding": {"@id": "schema:encoding"},
    "familyName": {"@id": "schema:familyName"},
    "fileFormat": {"@id": "schema:fileFormat", "@type": "@id"},
    "fileSize": {"@id": "schema:fileSize"},
    "funder": {"@id": "schema:funder"},
    "givenName": {"@id": "schema:givenName"},
    "hasPart": {"@id": "schema:hasPart"},
    "identifier": {"@id": "schema:identifier", "@type": "@id"},
    "installUrl": {"@id": "schema:installUrl", "@type": "@id"},
    "isAccessibleForFree": {"@id": "schema:isAccessibleForFree"},
    "isPartOf": {"@id": "schema:isPartOf"},
    "keywords": {"@id": "schema:keywords"},
    "license": {"@id": "schema:license", "@type": "@id"},
    "memoryRequirements": {"@id": "schema:memoryRequirements", "@type": "@id"},
    "name": {"@id": "schema:name"},
    "operatingSystem": {"@id": "schema:operatingSystem"},
    "permissions": {"@id": "schema:permissions"},
    "position": {"@id": "schema:position"},
    "processorRequirements": {"@id": "schema:processorRequirements"},
    "producer": {"@id": "schema:producer"},
    "programmingLanguage": {"@id": "schema:programmingLanguage"},
    "provider": {"@id": "schema:provider"},
    "publisher": {"@id": "schema:publisher"},
    "relatedLink": {"@id": "schema:relatedLink", "@type": "@id"},
    "releaseNotes": {"@id": "schema:releaseNotes", "@type": "@id"},
    "runtimePlatform": {"@id": "schema:runtimePlatform"},
    "sameAs": {"@id": "schema:sameAs", "@type": "@id"},
    "softwareHelp": {"@id": "schema:softwareHelp"},
    "softwareRequirements": {"@id": "schema:softwareRequirements", "@type": "@id"},
    "softwareVersion": {"@id": "schema:softwareVersion"},
    "sponsor": {"@id": "schema:sponsor"},
    "storageRequirements": {"@id": "schema:storageRequirements", "@type": "@id"},
    "supportingData": {"@id": "schema:supportingData"},
    "targetProduct": {"@id": "schema:targetProduct"},
    "url": {"@id": "schema:url", "@type": "@id"},
    "version": {"@id": "schema:version"},
    "author": {"@id": "schema:author", "@container": "@list"},
    "softwareSuggestions": {"@id": "codemeta:softwareSuggestions", "@type": "@id"},
    "contIntegration": {"@id": "codemeta:contIntegration", "@type": "@id"},
    "buildInstructions": {"@id": "codemeta:buildInstructions", "@type": "@id"},
    "developmentStatus": {"@id": "codemeta:developmentStatus", "@type": "@id"},
    "embargoDate": {"@id": "codemeta:embargoDate", "@type": "schema:Date"},
    "funding": {"@id": "codemeta:funding"},
    "readme": {"@id": "codemeta:readme", "@type": "@id"},
    "issueTracker": {"@id": "codemeta:issueTracker", "@type": "@id"},
    "referencePublication": {"@id": "codemeta:referencePublication", "@type": "@id"},
    "maintainer": {"@id": "codemeta:maintainer"},
}


def extra_validator(
    element: ElementTree.Element,
    xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Iterator[xmlschema.XMLSchemaValidationError]:
    """Run the checks that cannot be expressed purely in XML Schema.

    For now, the only extra check is that URIs are absolute.

    Args:
        element: the XML element being validated
        xsd_element: the schema element ``element`` is validated against; only
            the name of its type is inspected here

    Yields:
        one validation error per problem found by
        :func:`absolute_uri_validator`; nothing for elements of any other type
    """
    xsd_type = xsd_element.type.name

    if xsd_type == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType":
        # Made-up type, that allows both absolute URIs and HAL-IDs: values
        # starting like a HAL-ID are accepted without any URI check.
        if re.match("hal-[0-9]+", element.text or ""):
            return
    elif xsd_type != "{http://www.w3.org/2001/XMLSchema}anyURI":
        # Not a URI-like type: there is nothing extra to check.
        return

    # Check the URI is absolute.
    # This could technically be implemented in the schema like this:
    #     <xsd:simpleType name="URL">
    #       <xsd:restriction base="xsd:anyURI">
    #         <!-- https://datatracker.ietf.org/doc/html/rfc2396#section-3.1 -->
    #         <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
    #       </xsd:restriction>
    #     </xsd:simpleType>
    # However, this would give an unreadable error, so we implement it here
    # in Python instead.
    yield from absolute_uri_validator(element, xsd_element)
def absolute_uri_validator(
    element: ElementTree.Element,
    xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Iterator[xmlschema.XMLSchemaValidationError]:
    """Check that the text of ``element`` is an absolute URI.

    A URI is accepted when it parses, has both a scheme and a network location,
    and its network location contains no space.

    Args:
        element: the XML element whose text is checked
        xsd_element: the schema element, only used to build the errors

    Yields:
        at most one :class:`xmlschema.XMLSchemaValidationError` describing why
        the text was rejected; nothing when the URI is acceptable
    """
    # Imported explicitly: a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule, so ``urllib.parse`` would only resolve if
    # some other module happened to import it first.
    from urllib.parse import urlparse

    text = element.text
    try:
        # An element without text is parsed as the empty string, which is
        # rejected below as "not an absolute URI"; this avoids depending on how
        # urlparse() treats a non-string argument.
        url = urlparse(text or "")
    except ValueError:
        yield xmlschema.XMLSchemaValidationError(
            xsd_element,
            element,
            f"{text!r} is not a valid URI",
        )
        return

    if not url.scheme or not url.netloc:
        yield xmlschema.XMLSchemaValidationError(
            xsd_element,
            element,
            f"{text!r} is not an absolute URI",
        )
    elif " " in url.netloc:
        # urllib is a little too permissive...
        yield xmlschema.XMLSchemaValidationError(
            xsd_element,
            element,
            f"{text!r} is not a valid URI",
        )
@dataclasses.dataclass
class Schemas:
    """Bundle of the compiled XML schemas used to validate deposit metadata."""

    # Schema used by check_metadata() to validate the <swh:deposit> element.
    swh: xmlschema.XMLSchema11
    # Schema used by check_metadata() to validate the Codemeta elements.
    codemeta: xmlschema.XMLSchema11
@functools.lru_cache(maxsize=1)
def schemas() -> Schemas:
    """Load the ``swh`` and ``codemeta`` XSD files shipped in ``swh.deposit``.

    The result is memoized, so the XSD files are only read and compiled on the
    first call.

    Returns:
        a :class:`Schemas` holding both compiled schemas
    """
    package_files = importlib.resources.files("swh.deposit")

    def compile_schema(name) -> xmlschema.XMLSchema11:
        # as_file() gives a real filesystem path even if the package is not
        # installed as plain files.
        resource = package_files.joinpath(f"xsd/{name}.xsd")
        with importlib.resources.as_file(resource) as xsd_file:
            return xmlschema.XMLSchema11(xsd_file.as_posix())

    swh_schema = compile_schema("swh")
    codemeta_schema = compile_schema("codemeta")
    return Schemas(swh=swh_schema, codemeta=codemeta_schema)
def _validate_with_extra_checks(
    schema: xmlschema.XMLSchema11, element: ElementTree.Element
) -> None:
    """Validate ``element`` against ``schema``, also running extra_validator()."""
    schema.validate(
        element,
        extra_validator=cast(
            # ExtraValidatorType is a callable with "SchemaType" as second
            # argument, but extra_validator() is actually passed Xsd11Element
            # as second argument
            # https://github.com/sissaschool/xmlschema/issues/291
            xmlschema.aliases.ExtraValidatorType,
            extra_validator,
        ),
    )


def _affiliation_has_name(affiliation: ElementTree.Element) -> bool:
    """Tell whether a <codemeta:affiliation> element provides a non-empty name."""
    if len(affiliation) > 0:
        # Complex content (Organization form): we want to make sure there is
        # at least a non-empty <codemeta:name> child.
        return bool(affiliation.findtext("codemeta:name", namespaces=NAMESPACES))
    # Simple content (Text form): the text itself is the name, so it must not
    # be missing or blank.
    return affiliation.text is not None and bool(affiliation.text.strip())


def _unknown_element_errors(metadata: ElementTree.Element) -> Iterator[Dict]:
    """Yield an error detail for each Atom/Codemeta element with an unknown name."""
    for element in metadata.iter():
        tag = element.tag
        if not isinstance(tag, str):
            # Comments and processing instructions have a non-string tag.
            continue

        if tag.startswith("{http://www.w3.org/2005/Atom}"):
            local_name = tag.split("}", 1)[1]
            if local_name in ATOM_ELEMENTS:
                continue
            if local_name == "external_identifier":
                summary = (
                    "<external_identifier> is not supported anymore, "
                    "<swh:create_origin> or <swh:add_to_origin> should be used "
                    "instead."
                )
            elif local_name in CODEMETA2_CONTEXT:
                # Probably confused the two namespaces, display a nicer error
                summary = (
                    f"{local_name} is not a valid Atom element. "
                    "However, it would be a valid a Codemeta term; make sure "
                    "namespaces are not swapped"
                )
            else:
                summary = (
                    f"{local_name} is not a valid Atom element, "
                    "see https://datatracker.ietf.org/doc/html/rfc4287"
                )
        elif tag.startswith("{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}"):
            local_name = tag.split("}", 1)[1]
            if local_name in CODEMETA2_CONTEXT:
                continue
            # The name must be looked up among the *Atom* elements here:
            # testing CODEMETA2_CONTEXT again could never succeed, which made
            # the "namespaces are swapped" hint unreachable.
            if local_name in ATOM_ELEMENTS:
                # Probably confused the two namespaces, display a nicer error
                summary = (
                    f"{local_name} is not a valid Codemeta 2.0 term. "
                    "However, it would be a valid Atom element; make sure "
                    "namespaces are not swapped"
                )
            else:
                summary = (
                    f"{local_name} is not a valid Codemeta 2.0 term, "
                    "see "
                    "https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld"
                )
        else:
            # Other namespaces are not checked.
            continue

        yield {"fields": [local_name], "summary": summary}


def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
    """Check deposit metadata for mandatory fields and schema conformance.

    The checks are, in order: the root element must be an Atom ``entry``; at
    least one name/title and one author element must be present; the
    ``<swh:deposit>`` element (if it has content) and the Codemeta elements
    must validate against their XSD; affiliations must have a name; and every
    element in the Atom or Codemeta namespace must have a known name.

    Args:
        metadata: root XML element of the deposited metadata document

    Returns:
        tuple (status, error_detail):
          - (True, None) if metadata are ok and suggested fields are also present
          - (True, <detailed-error>) if metadata are ok but some suggestions are
            missing
          - (False, <detailed-error>) otherwise.

    """
    if metadata.tag != "{http://www.w3.org/2005/Atom}entry":
        return False, {
            "metadata": [
                {
                    "fields": ["atom:entry"],
                    "summary": (
                        "Root element should be {http://www.w3.org/2005/Atom}entry, "
                        f"but it is {metadata.tag}"
                    ),
                }
            ]
        }

    # at least one element of each group below is mandatory
    mandatory_alternatives = (
        ("atom:name", "atom:title", "codemeta:name"),
        ("atom:author", "codemeta:author"),
    )
    mandatory_result = [
        " or ".join(names)
        for names in mandatory_alternatives
        if not any(
            metadata.find(name, namespaces=NAMESPACES) is not None for name in names
        )
    ]

    # provenance metadata is optional: its absence is only reported as a
    # suggestion, alongside any error or on its own.
    suggested_fields = []
    if parse_swh_metadata_provenance(metadata) is None:
        suggested_fields = [
            {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]}
        ]

    if mandatory_result:
        detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}]
        return False, {"metadata": detail + suggested_fields}

    deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
    # Explicit form of the former ``if deposit_elt:`` -- the truth value of an
    # Element is deprecated, and was False for an element without children.
    # NOTE(review): a childless <swh:deposit> is therefore still not validated;
    # confirm whether that is intended.
    if deposit_elt is not None and len(deposit_elt) > 0:
        try:
            _validate_with_extra_checks(schemas().swh, deposit_elt)
        except xmlschema.exceptions.XMLSchemaException as e:
            return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}

    detail = []
    codemeta_schema = schemas().codemeta
    for child in metadata:
        # NOTE(review): this is a substring match, kept as in the original code;
        # confirm whether strict equality was intended.
        schema_element = next(
            (elt for elt in codemeta_schema.root_elements if child.tag in elt.name),
            None,
        )
        if schema_element is None:
            # Tag is not specified in the schema, don't validate it
            continue

        try:
            _validate_with_extra_checks(codemeta_schema, child)
        except xmlschema.exceptions.XMLSchemaException as e:
            detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
            continue

        # Manually validate <codemeta:affiliation>. Unfortunately, this cannot be
        # validated by codemeta.xsd, because Codemeta has conflicting requirements:
        # 1. https://codemeta.github.io/terms/ requires it to be Text (represented
        #    by simple content), but
        # 2. https://doi.org/10.5063/SCHEMA/CODEMETA-2.0 requires it to be an
        #    Organization (represented by complex content)
        # And this is (legitimately) not representable in XML Schema.
        #
        # See https://github.com/codemeta/codemeta/pull/239 for a discussion about
        # this issue.
        for affiliation in child.findall("codemeta:affiliation", namespaces=NAMESPACES):
            if not _affiliation_has_name(affiliation):
                detail.append(
                    {
                        "fields": [schema_element.prefixed_name],
                        "summary": AFFILIATION_NO_NAME,
                    }
                )
                # Report at most one affiliation error per element.
                break

    detail.extend(_unknown_element_errors(metadata))

    if detail:
        return False, {"metadata": detail + suggested_fields}

    if suggested_fields:
        # it's fine but warn about missing suggested fields
        return True, {"metadata": suggested_fields}

    return True, None