Source code for swh.storage.fixer
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Callable, Dict, List
from swh.model.model import Origin
logger = logging.getLogger(__name__)
def _fix_content(content: Dict[str, Any]) -> Dict[str, Any]:
"""Filters-out invalid 'perms' key that leaked from swh.model.from_disk
to the journal.
>>> _fix_content({'perms': 0o100644, 'sha1_git': b'foo'})
{'sha1_git': b'foo'}
>>> _fix_content({'sha1_git': b'bar'})
{'sha1_git': b'bar'}
"""
content = content.copy()
content.pop("perms", None)
return content
def _fix_raw_extrinsic_metadata(obj_dict: Dict) -> Dict:
"""Fix legacy RawExtrinsicMetadata with type which is no longer part of the model.
>>> _fix_raw_extrinsic_metadata({
... 'type': 'directory',
... 'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243',
... })
{'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243'}
>>> _fix_raw_extrinsic_metadata({
... 'type': 'origin',
... 'target': 'https://inria.halpreprod.archives-ouvertes.fr/hal-01667309',
... })
{'target': 'swh:1:ori:155291d5b9ada4570672510509f93fcfd9809882'}
"""
o = obj_dict.copy()
if o.pop("type", None) == "origin":
o["target"] = str(Origin(o["target"]).swhid())
return o
object_fixers: Dict[str, Callable[[Dict], Dict]] = {
"content": _fix_content,
"raw_extrinsic_metadata": _fix_raw_extrinsic_metadata,
}
[docs]
def fix_objects(object_type: str, objects: List[Dict]) -> List[Dict]:
"""
Fix legacy objects from the journal to bring them up to date with the
latest storage schema.
"""
if object_type in object_fixers:
fixer = object_fixers[object_type]
objects = [fixer(v) for v in objects]
return objects