Source code for swh.deposit.loader.checks
# Copyright (C) 2017-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Functional Metadata checks:
Mandatory fields:
- 'author'
- 'name' or 'title'
Suggested fields:
- metadata-provenance
"""
import dataclasses
import functools
import importlib.resources
import re
from typing import Dict, Iterator, Optional, Tuple, cast
import urllib
from xml.etree import ElementTree
import xmlschema
from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
INVALID_DATE_FORMAT = "Invalid date format"
SUGGESTED_FIELDS_MISSING = "Suggested fields are missing"
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
AFFILIATION_NO_NAME = "Reason: affiliation does not have a <codemeta:name> element"
# from https://datatracker.ietf.org/doc/html/rfc4287
ATOM_ELEMENTS = [
"name",
"uri",
"email",
# specifically not allowing this one, because clients are supposed to send one
# entry at a time:
# "feed",
"entry",
# ditto:
# "content",
"author",
"category",
"contributor",
"generator",
"icon",
"id",
"link",
"logo",
"published",
"rights",
"source",
"subtitle",
"summary",
"title",
"updated",
]
# from https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld
CODEMETA2_CONTEXT = {
"type": "@type",
"id": "@id",
"schema": "http://schema.org/",
"codemeta": "https://codemeta.github.io/terms/",
"Organization": {"@id": "schema:Organization"},
"Person": {"@id": "schema:Person"},
"SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"},
"SoftwareApplication": {"@id": "schema:SoftwareApplication"},
"Text": {"@id": "schema:Text"},
"URL": {"@id": "schema:URL"},
"address": {"@id": "schema:address"},
"affiliation": {"@id": "schema:affiliation"},
"applicationCategory": {"@id": "schema:applicationCategory", "@type": "@id"},
"applicationSubCategory": {"@id": "schema:applicationSubCategory", "@type": "@id"},
"citation": {"@id": "schema:citation"},
"codeRepository": {"@id": "schema:codeRepository", "@type": "@id"},
"contributor": {"@id": "schema:contributor"},
"copyrightHolder": {"@id": "schema:copyrightHolder"},
"copyrightYear": {"@id": "schema:copyrightYear"},
"creator": {"@id": "schema:creator"},
"dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date"},
"dateModified": {"@id": "schema:dateModified", "@type": "schema:Date"},
"datePublished": {"@id": "schema:datePublished", "@type": "schema:Date"},
"description": {"@id": "schema:description"},
"downloadUrl": {"@id": "schema:downloadUrl", "@type": "@id"},
"email": {"@id": "schema:email"},
"editor": {"@id": "schema:editor"},
"encoding": {"@id": "schema:encoding"},
"familyName": {"@id": "schema:familyName"},
"fileFormat": {"@id": "schema:fileFormat", "@type": "@id"},
"fileSize": {"@id": "schema:fileSize"},
"funder": {"@id": "schema:funder"},
"givenName": {"@id": "schema:givenName"},
"hasPart": {"@id": "schema:hasPart"},
"identifier": {"@id": "schema:identifier", "@type": "@id"},
"installUrl": {"@id": "schema:installUrl", "@type": "@id"},
"isAccessibleForFree": {"@id": "schema:isAccessibleForFree"},
"isPartOf": {"@id": "schema:isPartOf"},
"keywords": {"@id": "schema:keywords"},
"license": {"@id": "schema:license", "@type": "@id"},
"memoryRequirements": {"@id": "schema:memoryRequirements", "@type": "@id"},
"name": {"@id": "schema:name"},
"operatingSystem": {"@id": "schema:operatingSystem"},
"permissions": {"@id": "schema:permissions"},
"position": {"@id": "schema:position"},
"processorRequirements": {"@id": "schema:processorRequirements"},
"producer": {"@id": "schema:producer"},
"programmingLanguage": {"@id": "schema:programmingLanguage"},
"provider": {"@id": "schema:provider"},
"publisher": {"@id": "schema:publisher"},
"relatedLink": {"@id": "schema:relatedLink", "@type": "@id"},
"releaseNotes": {"@id": "schema:releaseNotes", "@type": "@id"},
"runtimePlatform": {"@id": "schema:runtimePlatform"},
"sameAs": {"@id": "schema:sameAs", "@type": "@id"},
"softwareHelp": {"@id": "schema:softwareHelp"},
"softwareRequirements": {"@id": "schema:softwareRequirements", "@type": "@id"},
"softwareVersion": {"@id": "schema:softwareVersion"},
"sponsor": {"@id": "schema:sponsor"},
"storageRequirements": {"@id": "schema:storageRequirements", "@type": "@id"},
"supportingData": {"@id": "schema:supportingData"},
"targetProduct": {"@id": "schema:targetProduct"},
"url": {"@id": "schema:url", "@type": "@id"},
"version": {"@id": "schema:version"},
"author": {"@id": "schema:author", "@container": "@list"},
"softwareSuggestions": {"@id": "codemeta:softwareSuggestions", "@type": "@id"},
"contIntegration": {"@id": "codemeta:contIntegration", "@type": "@id"},
"buildInstructions": {"@id": "codemeta:buildInstructions", "@type": "@id"},
"developmentStatus": {"@id": "codemeta:developmentStatus", "@type": "@id"},
"embargoDate": {"@id": "codemeta:embargoDate", "@type": "schema:Date"},
"funding": {"@id": "codemeta:funding"},
"readme": {"@id": "codemeta:readme", "@type": "@id"},
"issueTracker": {"@id": "codemeta:issueTracker", "@type": "@id"},
"referencePublication": {"@id": "codemeta:referencePublication", "@type": "@id"},
"maintainer": {"@id": "codemeta:maintainer"},
}
[docs]
def extra_validator(
element: ElementTree.Element,
xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Iterator[xmlschema.XMLSchemaValidationError]:
"""Performs extra checks on Atom elements that cannot be implemented purely
within XML Schema.
For now, this only checks URIs are absolute."""
type_name = xsd_element.type.name
if type_name == "{http://www.w3.org/2001/XMLSchema}anyURI":
# Check their URI is absolute.
# This could technically be implemented in the schema like this:
# <xsd:simpleType name="URL">
# <xsd:restriction base="xsd:anyURI">
# <!-- https://datatracker.ietf.org/doc/html/rfc2396#section-3.1 -->
# <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
# </xsd:restriction>
# </xsd:simpleType>
# However, this would give an unreadable error, so we implement it here
# in Python instead.
yield from absolute_uri_validator(element, xsd_element)
elif type_name == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType":
# Made-up type, that allows both absolute URIs and HAL-IDs
if not re.match("hal-[0-9]+", element.text or ""):
yield from absolute_uri_validator(element, xsd_element)
[docs]
def absolute_uri_validator(
element: ElementTree.Element,
xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Iterator[xmlschema.XMLSchemaValidationError]:
try:
url = urllib.parse.urlparse(element.text)
except ValueError:
yield xmlschema.XMLSchemaValidationError(
xsd_element,
element,
f"{element.text!r} is not a valid URI",
)
else:
if not url.scheme or not url.netloc:
yield xmlschema.XMLSchemaValidationError(
xsd_element,
element,
f"{element.text!r} is not an absolute URI",
)
elif " " in url.netloc:
# urllib is a little too permissive...
yield xmlschema.XMLSchemaValidationError(
xsd_element,
element,
f"{element.text!r} is not a valid URI",
)
[docs]
@dataclasses.dataclass
class Schemas:
swh: xmlschema.XMLSchema11
codemeta: xmlschema.XMLSchema11
[docs]
@functools.lru_cache(1)
def schemas() -> Schemas:
def load_xsd(name) -> xmlschema.XMLSchema11:
xsd_path = importlib.resources.files("swh.deposit").joinpath(f"xsd/{name}.xsd")
with importlib.resources.as_file(xsd_path) as xsd:
return xmlschema.XMLSchema11(xsd.as_posix())
return Schemas(swh=load_xsd("swh"), codemeta=load_xsd("codemeta"))
[docs]
def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
"""Check metadata for mandatory field presence and date format.
Args:
metadata: Metadata dictionary to check
Returns:
tuple (status, error_detail):
- (True, None) if metadata are ok and suggested fields are also present
- (True, <detailed-error>) if metadata are ok but some suggestions are missing
- (False, <detailed-error>) otherwise.
"""
if metadata.tag != "{http://www.w3.org/2005/Atom}entry":
return False, {
"metadata": [
{
"fields": ["atom:entry"],
"summary": (
"Root element should be {http://www.w3.org/2005/Atom}entry, "
f"but it is {metadata.tag}"
),
}
]
}
suggested_fields = []
# at least one value per couple below is mandatory
alternate_fields = {
("atom:name", "atom:title", "codemeta:name"): False,
("atom:author", "codemeta:author"): False,
}
for possible_names in alternate_fields:
for possible_name in possible_names:
if metadata.find(possible_name, namespaces=NAMESPACES) is not None:
alternate_fields[possible_names] = True
continue
mandatory_result = [" or ".join(k) for k, v in alternate_fields.items() if not v]
# provenance metadata is optional
provenance_meta = parse_swh_metadata_provenance(metadata)
if provenance_meta is None:
suggested_fields = [
{"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]}
]
if mandatory_result:
detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}]
return False, {"metadata": detail + suggested_fields}
deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
if deposit_elt:
try:
schemas().swh.validate(
deposit_elt,
extra_validator=cast(
# ExtraValidatorType is a callable with "SchemaType" as second
# argument, but extra_validator() is actually passed Xsd11Element
# as second argument
# https://github.com/sissaschool/xmlschema/issues/291
xmlschema.aliases.ExtraValidatorType,
extra_validator,
),
)
except xmlschema.exceptions.XMLSchemaException as e:
return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}
detail = []
for child in metadata:
for schema_element in schemas().codemeta.root_elements:
if child.tag in schema_element.name:
break
else:
# Tag is not specified in the schema, don't validate it
continue
try:
schemas().codemeta.validate(
child,
extra_validator=cast(
# ExtraValidatorType is a callable with "SchemaType" as second
# argument, but extra_validator() is actually passed Xsd11Element
# as second argument
# https://github.com/sissaschool/xmlschema/issues/291
xmlschema.aliases.ExtraValidatorType,
extra_validator,
),
)
except xmlschema.exceptions.XMLSchemaException as e:
detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
else:
# Manually validate <codemeta:affiliation>. Unfortunately, this cannot be
# validated by codemeta.xsd, because Codemeta has conflicting requirements:
# 1. https://codemeta.github.io/terms/ requires it to be Text (represented
# by simple content), but
# 2. https://doi.org/10.5063/SCHEMA/CODEMETA-2.0 requires it to be an
# Organization (represented by complex content)
# And this is (legitimately) not representable in XML Schema.
#
# See https://github.com/codemeta/codemeta/pull/239 for a discussion about
# this issue.
for affiliation in child.findall(
"codemeta:affiliation", namespaces=NAMESPACES
):
if len(affiliation) > 0:
# This is a complex element (as required by
# https://codemeta.github.io/terms/), then we want to make sure
# there is at least a name.
if not affiliation.findtext("codemeta:name", namespaces=NAMESPACES):
detail.append(
{
"fields": [schema_element.prefixed_name],
"summary": AFFILIATION_NO_NAME,
}
)
break
else:
# This is a simple element (as required by
# https://doi.org/10.5063/SCHEMA/CODEMETA-2.0)
if affiliation.text is None or not affiliation.text.strip():
# Completely empty element
detail.append(
{
"fields": [schema_element.prefixed_name],
"summary": AFFILIATION_NO_NAME,
}
)
break
for element in metadata.iter():
if element.tag.startswith("{http://www.w3.org/2005/Atom}"):
_, local_name = element.tag.split("}", 1)
if local_name not in ATOM_ELEMENTS:
if local_name == "external_identifier":
detail.append(
{
"fields": [local_name],
"summary": "<external_identifier> is not supported anymore, "
"<swh:create_origin> or <swh:add_to_origin> should be used "
"instead.",
}
)
elif local_name in CODEMETA2_CONTEXT:
# Probably confused the two namespaces, display a nicer error
detail.append(
{
"fields": [local_name],
"summary": f"{local_name} is not a valid Atom element. "
"However, it would be a valid a Codemeta term; make sure "
"namespaces are not swapped",
}
)
else:
detail.append(
{
"fields": [local_name],
"summary": f"{local_name} is not a valid Atom element, "
"see https://datatracker.ietf.org/doc/html/rfc4287",
}
)
elif element.tag.startswith("{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}"):
_, local_name = element.tag.split("}", 1)
if local_name not in CODEMETA2_CONTEXT:
if local_name in CODEMETA2_CONTEXT:
# Probably confused the two namespaces, display a nicer error
detail.append(
{
"fields": [local_name],
"summary": f"{local_name} is not a valid Codemeta 2.0 term. "
"However, it would be a valid Atom element; make sure "
"namespaces are not swapped",
}
)
else:
detail.append(
{
"fields": [local_name],
"summary": f"{local_name} is not a valid Codemeta 2.0 term, "
"see "
"https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld",
}
)
if detail:
return False, {"metadata": detail + suggested_fields}
if suggested_fields: # it's fine but warn about missing suggested fields
return True, {"metadata": suggested_fields}
return True, None