# Source code for swh.lister.gnu.lister
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Iterator, Mapping, Optional
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from .tree import GNUTree
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Type of one page handled by the lister: a mapping with string keys and
# values that are unconstrained at the type level. In this module the keys are
# iterated as project names and each value is read as a dict-like object
# exposing at least the "url" and "time_modified" entries.
GNUPageType = Mapping[str, Any]
[docs]
class GNULister(StatelessLister[GNUPageType]):
    """
    Lister for the GNU projects and the artifacts attached to each of them.

    The listing is driven by a single manifest (``tree.json.gz``) located at
    the root of the configured URL. The manifest is wrapped in a
    :class:`GNUTree`, whose ``projects`` mapping is emitted as the one and only
    page and whose ``artifacts`` mapping supplies the loader arguments of each
    origin.
    """

    LISTER_NAME = "GNU"
    INSTANCE = "GNU"
    GNU_FTP_URL = "https://ftp.gnu.org"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = GNU_FTP_URL,
        instance: str = INSTANCE,
        credentials: CredentialsType = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """
        Set up the lister; every argument is handed over as-is to the base
        lister constructor.

        Args:
            scheduler: scheduler interface passed to the base lister.
            url: base URL under which ``tree.json.gz`` is looked up; defaults
                to ``GNU_FTP_URL``.
            instance: instance name passed to the base lister; defaults to
                ``INSTANCE``.
            credentials: credentials passed to the base lister.
            max_origins_per_page: passed to the base lister.
            max_pages: passed to the base lister.
            enable_origins: passed to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )
        # The constructor must stay free of side effects: as recommended by
        # the lister pattern docstring, any extra state is only built once
        # get_pages runs. Until then the tree is simply absent.
        self.gnu_tree: Optional[GNUTree] = None

    def get_pages(self) -> Iterator[GNUPageType]:
        """
        Produce the unique page of the listing: the mapping of all GNU
        projects.

        The :class:`GNUTree` built here is also stored on ``self.gnu_tree`` so
        that :meth:`get_origins_from_page` can reach the artifacts later on.
        """
        # The manifest to parse lives at the root of the configured URL.
        tree = GNUTree(f"{self.url}/tree.json.gz")
        self.gnu_tree = tree
        yield tree.projects

    def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
        """
        Turn each project of ``page`` into a ``ListedOrigin`` of visit type
        ``tar``.

        Args:
            page: mapping of project name to project information; each
                information entry must provide the ``url`` and
                ``time_modified`` keys.

        Yields:
            One ``ListedOrigin`` per project, whose ``last_update`` is the
            parsed ``time_modified`` value and whose loader arguments carry
            the artifacts recorded for that project in ``self.gnu_tree``.
        """
        lister_id = self.lister_obj.id
        assert lister_id is not None
        # get_pages is what populates the tree, so it has to have run first.
        tree = self.gnu_tree
        assert tree is not None

        artifacts_by_project = tree.artifacts
        for name, info in page.items():
            url = info["url"]
            modified_at = iso8601.parse_date(info["time_modified"])
            logger.debug("Found origin %s last updated on %s", url, modified_at)
            yield ListedOrigin(
                lister_id=lister_id,
                url=url,
                visit_type="tar",
                last_update=modified_at,
                extra_loader_arguments={"artifacts": artifacts_by_project[name]},
            )