Source code for swh.lister.elm.lister

# Copyright (C) 2023  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass
import logging
from typing import Any, Dict, Iterator, Optional, Set

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
ElmListerPage = Set[str]


[docs] @dataclass class ElmListerState: """Store lister state for incremental mode operations""" all_packages_count: Optional[int] = None """Store the count of all existing packages, used as ``since`` argument of API endpoint url. """
[docs] class ElmLister(Lister[ElmListerState, ElmListerPage]): """List Elm packages origins""" LISTER_NAME = "elm" VISIT_TYPE = "git" # Elm origins url are Git repositories INSTANCE = "elm" BASE_URL = "https://package.elm-lang.org" ALL_PACKAGES_URL_PATTERN = "{base_url}/all-packages/since/{since}" REPO_URL_PATTERN = "https://github.com/{name}" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, url: str = BASE_URL, instance: str = INSTANCE, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, ) self.all_packages_count: int = 0 self.session.headers.update({"Accept": "application/json"})
[docs] def state_from_dict(self, d: Dict[str, Any]) -> ElmListerState: return ElmListerState(**d)
[docs] def state_to_dict(self, state: ElmListerState) -> Dict[str, Any]: return asdict(state)
[docs] def get_pages(self) -> Iterator[ElmListerPage]: """Yield an iterator which returns 'page' It uses the Http api endpoint ``https://package.elm-lang.org/all-packages/since/:since`` to get a list of packages versions from where we get names corresponding to GitHub repository url suffixes. There is only one page that list all origins urls. """ if not self.state.all_packages_count: since = 0 else: since = self.state.all_packages_count response = self.http_request( self.ALL_PACKAGES_URL_PATTERN.format(base_url=self.url, since=since) ) # We’ll save this to the state in finalize() self.all_packages_count = len(response.json()) + since res = set() for entry in response.json(): res.add(entry.split("@")[0]) yield res
[docs] def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances""" assert self.lister_obj.id is not None for name in page: repo_url: str = self.REPO_URL_PATTERN.format(name=name) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=repo_url, last_update=None, )
[docs] def finalize(self) -> None: if ( self.state.all_packages_count is None or self.all_packages_count > self.state.all_packages_count ): self.state.all_packages_count = self.all_packages_count self.updated = True