Source code for swh.search.in_memory

# Copyright (C) 2019-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from collections import defaultdict
import itertools
import re
from typing import Any, Dict, Iterable, Iterator, List, Optional

from swh.model.identifiers import origin_identifier
from swh.search.interface import PagedResult


class InMemorySearch:
    """Store of origin documents held in plain Python containers.

    ``__init__`` sets up no state: ``initialize()`` must be called before
    ``origin_update()``, which reads the containers that ``initialize()``
    creates.
    """

    # Splits on every single non-word character, so adjacent separators
    # (e.g. the "://" of a URL) produce empty-string tokens.
    _url_splitter = re.compile(r"\W")

    def __init__(self):
        """Create an instance; storage is only attached by ``initialize()``."""

    def check(self):
        """Always return ``True``.

        NOTE(review): presumably the availability probe of the search
        interface -- confirm against ``swh.search.interface``.
        """
        return True

    def deinitialize(self) -> None:
        """Drop the storage created by ``initialize()``, if there is any.

        Calling this on an instance that was never initialized is a no-op.
        """
        if not hasattr(self, "_origins"):
            return
        del self._origins, self._origin_ids

    def initialize(self) -> None:
        """Attach fresh, empty storage, discarding any previous content."""
        # Insertion-ordered list of every origin identifier seen so far.
        self._origin_ids: List[str] = []
        # Maps an origin identifier to its accumulated document; a
        # defaultdict lets origin_update() merge into a not-yet-seen origin.
        self._origins: Dict[str, Dict[str, Any]] = defaultdict(dict)

    def flush(self) -> None:
        """Do nothing: ``origin_update()`` writes to the storage directly."""

    def origin_update(self, documents: Iterable[Dict]) -> None:
        """Merge each of ``documents`` into the stored document of its origin.

        Args:
            documents: dicts describing origins; each one is passed to
                ``origin_identifier`` to obtain the key it is stored under.
                The dicts themselves are not modified (a shallow copy is
                used).

        Documents that carry a ``"url"`` key additionally get a
        ``"_url_tokens"`` entry: the set of fragments obtained by splitting
        the URL on non-word characters.
        """
        for incoming in documents:
            # Work on a copy so the caller's dict never gains "_url_tokens".
            entry = incoming.copy()
            # The identifier is computed before "_url_tokens" is added.
            origin_id = origin_identifier(entry)

            if "url" in entry:
                fragments = self._url_splitter.split(entry["url"])
                entry["_url_tokens"] = set(fragments)

            # Merge rather than replace: keys from earlier updates survive.
            stored = self._origins[origin_id]
            stored.update(entry)

            if origin_id in self._origin_ids:
                continue
            self._origin_ids.append(origin_id)