Source code for swh.loader.core.converters

# Copyright (C) 2015-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Convert objects to dictionaries suitable for swh.storage"""

import logging
from typing import Dict, Iterable, List, Optional, Tuple

from swh.model.hashutil import hash_to_hex
from swh.model.model import BaseContent, Content, SkippedContent

logger = logging.getLogger(__name__)


def prepare_contents(
    contents: Iterable[Dict],
    max_content_size: Optional[int] = None,
    origin_url: Optional[str] = None,
) -> Tuple[List[Dict], List[Dict]]:
    """Convert raw contents and split them into present and skipped ones.

    Every element of ``contents`` is passed through
    :func:`content_for_storage` (with the same ``max_content_size`` and
    ``origin_url``), then serialized back to a dictionary with ``to_dict()``.

    Args:
        contents: content dictionaries to convert
        max_content_size: forwarded as-is to :func:`content_for_storage`
        origin_url: forwarded as-is to :func:`content_for_storage`

    Returns:
        A tuple ``(present_contents, skipped_contents)`` of two lists of
        dictionaries; a converted content lands in the second list when it is
        a :class:`SkippedContent` instance, in the first list otherwise.
        Input order is preserved within each list.

    """
    present: List[Dict] = []
    skipped: List[Dict] = []

    for raw in contents:
        converted = content_for_storage(
            raw, max_content_size=max_content_size, origin_url=origin_url
        )
        # Pick the destination list from the converted object's type.
        bucket = skipped if isinstance(converted, SkippedContent) else present
        bucket.append(converted.to_dict())

    return present, skipped
def content_for_storage(
    content: Dict,
    max_content_size: Optional[int] = None,
    origin_url: Optional[str] = None,
) -> BaseContent:
    """Prepare content to be ready for storage

    The input dictionary is not modified; all changes are made on a shallow
    copy of it.

    Note:
        - 'data' is returned only if max_content_size is not reached.
        - a ``max_content_size`` of ``None`` or ``0`` disables the size check.

    Args:
        content: content dictionary. ``length`` is read when a size limit is
            set, ``sha1_git`` is read when the content is skipped, and
            ``path`` is read when ``data`` is not already present.
        max_content_size: contents whose ``length`` is strictly greater than
            this value are skipped
        origin_url: recorded under the ``origin`` key of skipped contents

    Returns:
        content with added data (or reason for being missing): a
        :class:`SkippedContent` when the content is too large, a
        :class:`Content` otherwise.

    Raises:
        KeyError: if a key required by the branch taken (``length``,
            ``sha1_git`` or ``path``) is missing from ``content``
        OSError: if the file at ``content["path"]`` cannot be opened or read

    """
    ret = content.copy()
    ret.pop("perms", None)

    if max_content_size and ret["length"] > max_content_size:
        # Hand the values to the logger as arguments so that the message is
        # only interpolated when the INFO level is actually enabled.
        logger.info(
            "Skipping content %s, too large (%s > %s)",
            hash_to_hex(content["sha1_git"]),
            ret["length"],
            max_content_size,
        )
        ret.pop("data", None)
        ret.update(
            {"status": "absent", "reason": "Content too large", "origin": origin_url}
        )
        # NOTE(review): unlike the branch below, "path" is not removed here;
        # confirm SkippedContent.from_dict accepts (or never receives) it.
        return SkippedContent.from_dict(ret)

    if "data" not in ret:
        # No in-memory payload was provided: load it from the file on disk.
        with open(ret["path"], "rb") as f:
            ret["data"] = f.read()

    # Extra keys added by swh.model.from_disk, that are not accepted
    # by swh-storage
    ret.pop("path", None)
    ret["status"] = "visible"

    return Content.from_dict(ret)