Source code for swh.deposit.api.checks

# Copyright (C) 2017-2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Functional Metadata checks:

Mandatory fields:
- 'author'
- 'name' or 'title'

Suggested fields:
- metadata-provenance

"""

import dataclasses
import functools
import re
from typing import Dict, Iterator, Optional, Tuple, cast
import urllib
from xml.etree import ElementTree

import pkg_resources
import xmlschema

from swh.deposit.errors import FORBIDDEN, DepositError
from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance

# Summaries placed in the "summary" key of the detail entries returned by
# check_metadata() when a mandatory field is absent.
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
# NOTE(review): not referenced by the code visible in this module; presumably
# used by callers elsewhere -- confirm before removing.
INVALID_DATE_FORMAT = "Invalid date format"

# Summary and field name reported by check_metadata() when the (optional)
# metadata provenance cannot be found in the deposited metadata.
SUGGESTED_FIELDS_MISSING = "Suggested fields are missing"
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"

# Summary reported by check_metadata() for a <codemeta:affiliation> that has
# neither a non-empty <codemeta:name> child nor non-blank text.
AFFILIATION_NO_NAME = "Reason: affiliation does not have a <codemeta:name> element"

# Local names of the elements of the Atom namespace that are accepted in a
# deposit; check_metadata() reports any other element found in that namespace
# as invalid.
# from https://datatracker.ietf.org/doc/html/rfc4287
ATOM_ELEMENTS = [
    "name",
    "uri",
    "email",
    # specifically not allowing this one, because clients are supposed to send one
    # entry at a time:
    # "feed",
    "entry",
    # ditto:
    # "content",
    "author",
    "category",
    "contributor",
    "generator",
    "icon",
    "id",
    "link",
    "logo",
    "published",
    "rights",
    "source",
    "subtitle",
    "summary",
    "title",
    "updated",
]

# JSON-LD context of Codemeta 2.0. In the code visible in this module only its
# keys are used: check_metadata() treats them as the set of valid local names
# for elements of the Codemeta namespace.
# from https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld
CODEMETA2_CONTEXT = {
    "type": "@type",
    "id": "@id",
    "schema": "http://schema.org/",
    "codemeta": "https://codemeta.github.io/terms/",
    "Organization": {"@id": "schema:Organization"},
    "Person": {"@id": "schema:Person"},
    "SoftwareSourceCode": {"@id": "schema:SoftwareSourceCode"},
    "SoftwareApplication": {"@id": "schema:SoftwareApplication"},
    "Text": {"@id": "schema:Text"},
    "URL": {"@id": "schema:URL"},
    "address": {"@id": "schema:address"},
    "affiliation": {"@id": "schema:affiliation"},
    "applicationCategory": {"@id": "schema:applicationCategory", "@type": "@id"},
    "applicationSubCategory": {"@id": "schema:applicationSubCategory", "@type": "@id"},
    "citation": {"@id": "schema:citation"},
    "codeRepository": {"@id": "schema:codeRepository", "@type": "@id"},
    "contributor": {"@id": "schema:contributor"},
    "copyrightHolder": {"@id": "schema:copyrightHolder"},
    "copyrightYear": {"@id": "schema:copyrightYear"},
    "creator": {"@id": "schema:creator"},
    "dateCreated": {"@id": "schema:dateCreated", "@type": "schema:Date"},
    "dateModified": {"@id": "schema:dateModified", "@type": "schema:Date"},
    "datePublished": {"@id": "schema:datePublished", "@type": "schema:Date"},
    "description": {"@id": "schema:description"},
    "downloadUrl": {"@id": "schema:downloadUrl", "@type": "@id"},
    "email": {"@id": "schema:email"},
    "editor": {"@id": "schema:editor"},
    "encoding": {"@id": "schema:encoding"},
    "familyName": {"@id": "schema:familyName"},
    "fileFormat": {"@id": "schema:fileFormat", "@type": "@id"},
    "fileSize": {"@id": "schema:fileSize"},
    "funder": {"@id": "schema:funder"},
    "givenName": {"@id": "schema:givenName"},
    "hasPart": {"@id": "schema:hasPart"},
    "identifier": {"@id": "schema:identifier", "@type": "@id"},
    "installUrl": {"@id": "schema:installUrl", "@type": "@id"},
    "isAccessibleForFree": {"@id": "schema:isAccessibleForFree"},
    "isPartOf": {"@id": "schema:isPartOf"},
    "keywords": {"@id": "schema:keywords"},
    "license": {"@id": "schema:license", "@type": "@id"},
    "memoryRequirements": {"@id": "schema:memoryRequirements", "@type": "@id"},
    "name": {"@id": "schema:name"},
    "operatingSystem": {"@id": "schema:operatingSystem"},
    "permissions": {"@id": "schema:permissions"},
    "position": {"@id": "schema:position"},
    "processorRequirements": {"@id": "schema:processorRequirements"},
    "producer": {"@id": "schema:producer"},
    "programmingLanguage": {"@id": "schema:programmingLanguage"},
    "provider": {"@id": "schema:provider"},
    "publisher": {"@id": "schema:publisher"},
    "relatedLink": {"@id": "schema:relatedLink", "@type": "@id"},
    "releaseNotes": {"@id": "schema:releaseNotes", "@type": "@id"},
    "runtimePlatform": {"@id": "schema:runtimePlatform"},
    "sameAs": {"@id": "schema:sameAs", "@type": "@id"},
    "softwareHelp": {"@id": "schema:softwareHelp"},
    "softwareRequirements": {"@id": "schema:softwareRequirements", "@type": "@id"},
    "softwareVersion": {"@id": "schema:softwareVersion"},
    "sponsor": {"@id": "schema:sponsor"},
    "storageRequirements": {"@id": "schema:storageRequirements", "@type": "@id"},
    "supportingData": {"@id": "schema:supportingData"},
    "targetProduct": {"@id": "schema:targetProduct"},
    "url": {"@id": "schema:url", "@type": "@id"},
    "version": {"@id": "schema:version"},
    "author": {"@id": "schema:author", "@container": "@list"},
    "softwareSuggestions": {"@id": "codemeta:softwareSuggestions", "@type": "@id"},
    "contIntegration": {"@id": "codemeta:contIntegration", "@type": "@id"},
    "buildInstructions": {"@id": "codemeta:buildInstructions", "@type": "@id"},
    "developmentStatus": {"@id": "codemeta:developmentStatus", "@type": "@id"},
    "embargoDate": {"@id": "codemeta:embargoDate", "@type": "schema:Date"},
    "funding": {"@id": "codemeta:funding"},
    "readme": {"@id": "codemeta:readme", "@type": "@id"},
    "issueTracker": {"@id": "codemeta:issueTracker", "@type": "@id"},
    "referencePublication": {"@id": "codemeta:referencePublication", "@type": "@id"},
    "maintainer": {"@id": "codemeta:maintainer"},
}


def extra_validator(
    element: ElementTree.Element,
    xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Optional[Iterator[xmlschema.XMLSchemaValidationError]]:
    """Run the checks that cannot be expressed purely within XML Schema.

    For now, the only such check is that URIs are absolute.

    Args:
        element: the XML element being validated
        xsd_element: the schema element ``element`` is validated against; only
            the name of its type is looked at

    Yields:
        the errors produced by :func:`absolute_uri_validator`, for elements
        whose type is ``xsd:anyURI`` or the Codemeta ``identifierType``;
        nothing for elements of any other type.
    """
    xsd_type = xsd_element.type.name
    is_any_uri = xsd_type == "{http://www.w3.org/2001/XMLSchema}anyURI"
    is_identifier = (
        xsd_type == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType"
    )

    if not (is_any_uri or is_identifier):
        # Not a type with extra constraints
        return

    # identifierType is a made-up type, that allows both absolute URIs and
    # HAL-IDs; text starting with a HAL-ID is accepted without URI check.
    if is_identifier and re.match("hal-[0-9]+", element.text or ""):
        return

    # Check the URI is absolute.
    # This could technically be implemented in the schema like this:
    #     <xsd:simpleType name="URL">
    #       <xsd:restriction base="xsd:anyURI">
    #         <!-- https://datatracker.ietf.org/doc/html/rfc2396#section-3.1 -->
    #         <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
    #       </xsd:restriction>
    #     </xsd:simpleType>
    # However, this would give an unreadable error, so it is implemented here
    # in Python instead.
    yield from absolute_uri_validator(element, xsd_element)
def absolute_uri_validator(
    element: ElementTree.Element,
    xsd_element: xmlschema.validators.elements.Xsd11Element,
) -> Iterator[xmlschema.XMLSchemaValidationError]:
    """Yield a validation error if the text of ``element`` is not an absolute URI.

    Args:
        element: the XML element whose text is checked
        xsd_element: the schema element, passed as validator of the yielded
            errors

    Yields:
        at most one :class:`xmlschema.XMLSchemaValidationError`:

        - "is not a valid URI" if the text cannot be parsed at all, or if its
          network location contains a space;
        - "is not an absolute URI" if it lacks a scheme or a network location.
    """
    # Imported explicitly: a bare ``import urllib`` does not load the
    # ``urllib.parse`` submodule, so ``urllib.parse.urlparse`` would only work
    # if some other module happened to import the submodule first.
    from urllib.parse import urlparse

    try:
        url = urlparse(element.text)
    except ValueError:
        yield xmlschema.XMLSchemaValidationError(
            xsd_element,
            element,
            f"{element.text!r} is not a valid URI",
        )
    else:
        if not url.scheme or not url.netloc:
            yield xmlschema.XMLSchemaValidationError(
                xsd_element,
                element,
                f"{element.text!r} is not an absolute URI",
            )
        elif " " in url.netloc:
            # urllib is a little too permissive...
            yield xmlschema.XMLSchemaValidationError(
                xsd_element,
                element,
                f"{element.text!r} is not a valid URI",
            )
@dataclasses.dataclass
class Schemas:
    """Bundle of the compiled XML schemas used to validate deposit metadata."""

    # Schema that check_metadata() validates the <swh:deposit> element against
    swh: xmlschema.XMLSchema11
    # Schema that check_metadata() validates the children of the Atom entry
    # against, when their tag is one of its root elements
    codemeta: xmlschema.XMLSchema11
@functools.lru_cache(1)
def schemas() -> Schemas:
    """Build the XML schemas shipped as resources of the ``swh.deposit`` package.

    The result is memoized by ``lru_cache``, so the ``.xsd`` resources are
    read and compiled by the first call only.

    Returns:
        a :class:`Schemas` holding the schemas built from ``xsd/swh.xsd`` and
        ``xsd/codemeta.xsd``
    """

    def _compile(schema_name) -> xmlschema.XMLSchema11:
        # Resources are returned as bytes; XMLSchema11 is given the decoded text
        raw_xsd = pkg_resources.resource_string(
            "swh.deposit", f"xsd/{schema_name}.xsd"
        )
        return xmlschema.XMLSchema11(raw_xsd.decode())

    swh_schema = _compile("swh")
    codemeta_schema = _compile("codemeta")
    return Schemas(swh=swh_schema, codemeta=codemeta_schema)
def _affiliation_error(
    child: ElementTree.Element, field_name: str
) -> Optional[Dict]:
    """Return an error detail for the first nameless affiliation of ``child``.

    Only <codemeta:affiliation> elements that are direct children of ``child``
    are looked at. Returns None when all of them carry a name.
    """
    for affiliation in child.findall("codemeta:affiliation", namespaces=NAMESPACES):
        if len(affiliation) > 0:
            # Complex content: make sure there is at least a non-empty
            # <codemeta:name> child.
            has_name = bool(
                affiliation.findtext("codemeta:name", namespaces=NAMESPACES)
            )
        else:
            # Simple content: the text is the name, it must not be blank
            # (this also rejects a completely empty element).
            has_name = bool(affiliation.text and affiliation.text.strip())
        if not has_name:
            return {"fields": [field_name], "summary": AFFILIATION_NO_NAME}
    return None


def _unknown_term_error(element: ElementTree.Element) -> Optional[Dict]:
    """Return an error detail if ``element`` is an unknown Atom or Codemeta term.

    Elements from other namespaces, and known terms, yield None.
    """
    tag = element.tag

    if tag.startswith("{http://www.w3.org/2005/Atom}"):
        _, local_name = tag.split("}", 1)
        if local_name in ATOM_ELEMENTS:
            return None
        if local_name == "external_identifier":
            summary = (
                "<external_identifier> is not supported anymore, "
                "<swh:create_origin> or <swh:add_to_origin> should be used "
                "instead."
            )
        elif local_name in CODEMETA2_CONTEXT:
            # Probably confused the two namespaces, display a nicer error
            summary = (
                f"{local_name} is not a valid Atom element. "
                "However, it would be a valid a Codemeta term; make sure "
                "namespaces are not swapped"
            )
        else:
            summary = (
                f"{local_name} is not a valid Atom element, "
                "see https://datatracker.ietf.org/doc/html/rfc4287"
            )
        return {"fields": [local_name], "summary": summary}

    if tag.startswith("{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}"):
        _, local_name = tag.split("}", 1)
        if local_name in CODEMETA2_CONTEXT:
            return None
        # This must look the name up in the *Atom* elements: the name is
        # already known not to be a Codemeta term at this point.
        if local_name in ATOM_ELEMENTS:
            # Probably confused the two namespaces, display a nicer error
            summary = (
                f"{local_name} is not a valid Codemeta 2.0 term. "
                "However, it would be a valid Atom element; make sure "
                "namespaces are not swapped"
            )
        else:
            summary = (
                f"{local_name} is not a valid Codemeta 2.0 term, "
                "see "
                "https://github.com/codemeta/codemeta/blob/2.0/codemeta.jsonld"
            )
        return {"fields": [local_name], "summary": summary}

    return None


def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
    """Check deposit metadata for mandatory fields and schema conformance.

    The checks are, in order:

    1. the root element is an Atom ``entry``;
    2. a name/title and an author are present (in the Atom or Codemeta
       namespace);
    3. the ``<swh:deposit>`` element, if any, is valid according to the swh
       schema;
    4. every child of the entry known to the codemeta schema is valid
       according to it, and its affiliations have a name;
    5. every element in the Atom or Codemeta namespace is a known term.

    Checks 1 to 3 return as soon as they fail; the errors of checks 4 and 5
    are accumulated and returned together.

    Args:
        metadata: root element of the parsed metadata document

    Returns:
        tuple (status, error_detail):
          - (True, None) if metadata are ok and suggested fields are also present
          - (True, <detailed-error>) if metadata are ok but some suggestions
            are missing
          - (False, <detailed-error>) otherwise.

    """
    if metadata.tag != "{http://www.w3.org/2005/Atom}entry":
        return False, {
            "metadata": [
                {
                    "fields": ["atom:entry"],
                    "summary": (
                        "Root element should be {http://www.w3.org/2005/Atom}entry, "
                        f"but it is {metadata.tag}"
                    ),
                }
            ]
        }

    suggested_fields = []

    # at least one field of each group below is mandatory
    alternate_fields = (
        ("atom:name", "atom:title", "codemeta:name"),
        ("atom:author", "codemeta:author"),
    )
    mandatory_result = [
        " or ".join(possible_names)
        for possible_names in alternate_fields
        if not any(
            metadata.find(possible_name, namespaces=NAMESPACES) is not None
            for possible_name in possible_names
        )
    ]

    # provenance metadata is optional
    provenance_meta = parse_swh_metadata_provenance(metadata)
    if provenance_meta is None:
        suggested_fields = [
            {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]}
        ]

    if mandatory_result:
        detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}]
        return False, {"metadata": detail + suggested_fields}

    # ExtraValidatorType is a callable with "SchemaType" as second
    # argument, but extra_validator() is actually passed Xsd11Element
    # as second argument
    # https://github.com/sissaschool/xmlschema/issues/291
    validator = cast(xmlschema.aliases.ExtraValidatorType, extra_validator)

    deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
    # Compare to None explicitly: an Element without children is falsy, so a
    # plain truth test would skip the validation of an empty <swh:deposit>.
    if deposit_elt is not None:
        try:
            schemas().swh.validate(deposit_elt, extra_validator=validator)
        except xmlschema.exceptions.XMLSchemaException as e:
            return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}

    detail = []
    codemeta_schema = schemas().codemeta
    for child in metadata:
        # Exact match on the qualified name; a substring test would also match
        # tags that are a mere prefix of a schema element's name.
        schema_element = next(
            (
                candidate
                for candidate in codemeta_schema.root_elements
                if candidate.name == child.tag
            ),
            None,
        )
        if schema_element is None:
            # Tag is not specified in the schema, don't validate it
            continue

        try:
            codemeta_schema.validate(child, extra_validator=validator)
        except xmlschema.exceptions.XMLSchemaException as e:
            detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
        else:
            # Manually validate <codemeta:affiliation>. Unfortunately, this cannot be
            # validated by codemeta.xsd, because Codemeta has conflicting requirements:
            # 1. https://codemeta.github.io/terms/ requires it to be Text (represented
            #    by simple content), but
            # 2. https://doi.org/10.5063/SCHEMA/CODEMETA-2.0 requires it to be an
            #    Organization (represented by complex content)
            # And this is (legitimately) not representable in XML Schema.
            #
            # See https://github.com/codemeta/codemeta/pull/239 for a discussion about
            # this issue.
            affiliation_error = _affiliation_error(child, schema_element.prefixed_name)
            if affiliation_error is not None:
                detail.append(affiliation_error)

    for element in metadata.iter():
        term_error = _unknown_term_error(element)
        if term_error is not None:
            detail.append(term_error)

    if detail:
        return False, {"metadata": detail + suggested_fields}

    if suggested_fields:  # it's fine but warn about missing suggested fields
        return True, {"metadata": suggested_fields}

    return True, None
def check_url_match_provider(url: str, provider_url: str) -> None:
    """Ensure ``url`` is located under the provider's url.

    The provider url is normalized to end with exactly one ``/`` before the
    comparison, so ``url`` must extend it with at least that slash.

    Raises:
        DepositError: with the FORBIDDEN error code, if ``url`` does not start
            with the normalized provider url

    """
    expected_prefix = f"{provider_url.rstrip('/')}/"
    if url.startswith(expected_prefix):
        return
    raise DepositError(
        FORBIDDEN,
        f"URL mismatch: {url} must start with {expected_prefix}",
    )