Source code for swh.deposit.parsers

# Copyright (C) 2017-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes.

"""

import logging
from xml.etree import ElementTree

from django.conf import settings
from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser

from swh.deposit.errors import ParserError

# Module-level logger, named after this module's import path (``__name__``).
logger = logging.getLogger(__name__)


class SWHFileUploadZipParser(FileUploadParser):
    """Parser for file uploads whose media type is ``application/zip``."""

    media_type = "application/zip"
class SWHFileUploadTarParser(FileUploadParser):
    """Parser for tarball uploads (tar, tar.gz, tar.*).

    Only the ``application/x-tar`` media type is declared.
    """

    media_type = "application/x-tar"
class SWHXMLParser(BaseParser):
    """Parser for XML request bodies (``application/xml``)."""

    media_type = "application/xml"

    def parse(self, stream, media_type=None, parser_context=None):
        """Read the incoming byte stream as XML and return the parsed tree.

        The encoding comes from the ``encoding`` key of ``parser_context``
        when that key is present, and from ``settings.DEFAULT_CHARSET``
        otherwise.

        Args:
            stream: The stream to read the XML document from
            media_type: Not used by this implementation
            parser_context (dict): Optional context; only its ``encoding``
                key is read

        Returns:
            The result of ``ElementTree.parse`` on the stream.

        """
        context = parser_context if parser_context else {}
        charset = context.get("encoding", settings.DEFAULT_CHARSET)
        xml_parser = ElementTree.XMLParser(encoding=charset)
        return ElementTree.parse(stream, parser=xml_parser)
class SWHAtomEntryParser(SWHXMLParser):
    """Parser restricted to the atom entry media type."""

    media_type = "application/atom+xml;type=entry"

    def parse(self, stream, media_type=None, parser_context=None):
        """Hand ``stream`` back untouched, without parsing it."""
        # Parsing is deliberately deferred: the raw data must be kept as
        # well, so the actual parsing is done later in the atom entry call
        # (cf. swh.deposit.api.common.APIBase._atom_entry)
        return stream
class SWHMultiPartParser(MultiPartParser):
    """Multipart parser restricted to a subset of media types."""

    media_type = "multipart/*; *"
def parse_xml(raw_content):
    """Parse an xml body into an element tree.

    Args:
        raw_content (bytes): The content to parse

    Raises:
        ParserError: in case of a malformed xml; the message is the text of
            the underlying ``ElementTree.ParseError``, which is also attached
            as the exception's cause

    Returns:
        xml.etree.ElementTree.Element: the root element of the parsed
        document (not a dict).

    """
    try:
        return ElementTree.fromstring(raw_content)
    except ElementTree.ParseError as e:
        # Chain explicitly so the low-level parse error (with its position
        # information) stays reachable through ``__cause__``.
        raise ParserError(str(e)) from e