Source code for swh.lister.packagist.lister

# Copyright (C) 2019-2023  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import dataclass
from datetime import datetime, timezone
import logging
from random import shuffle
from typing import Any, Dict, Iterator, List, Optional

import iso8601
import requests
from tenacity import RetryError

from swh.core.utils import grouper
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

PackagistPageType = List[str]


[docs] class NotModifiedSinceLastVisit(ValueError): """Exception raised when a package has seen no change since the last visit.""" pass
[docs] @dataclass class PackagistListerState: """State of Packagist lister""" last_listing_date: Optional[datetime] = None """Last date when packagist lister was executed"""
[docs] class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ List all Packagist projects and send associated origins to scheduler. The lister queries the Packagist API, whose documentation can be found at https://packagist.org/apidoc. For each package, its metadata are retrieved using Packagist API endpoints whose responses are served from static files, which are guaranteed to be efficient on the Packagist side (no dymamic queries). Furthermore, subsequent listing will send the "If-Modified-Since" HTTP header to only retrieve packages metadata updated since the previous listing operation in order to save bandwidth and return only origins which might have new released versions. """ LISTER_NAME = "Packagist" INSTANCE = "packagist" PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" PACKAGIST_PACKAGE_URL_FORMATS = [ # preferred, static, efficient on their side as it can be cached "https://repo.packagist.org/p2/{package_name}.json", # fallback from 1. ^ as it might contain development versions "https://repo.packagist.org/p2/{package_name}~dev.json", # composer v1 metadata, static, efficient but deprecated on their side. # From previous lister version: very few results but some still exists so ok to # check "https://repo.packagist.org/p/{package_name}.json", # dynamic, inefficient on packagist's end, should be used as a last resort "https://repo.packagist.org/packages/{package_name}.json", ] def __init__( self, scheduler: SchedulerInterface, url: str = PACKAGIST_PACKAGES_LIST_URL, instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, enable_origins: bool = True, record_batch_size: int = 1000, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, with_github_session=True, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, record_batch_size=record_batch_size, ) self.session.headers.update({"Accept": "application/json"}) self.listing_date = datetime.now().astimezone(tz=timezone.utc)
[docs] def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: last_listing_date = d.get("last_listing_date") if last_listing_date is not None: d["last_listing_date"] = iso8601.parse_date(last_listing_date) return PackagistListerState(**d)
[docs] def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: d: Dict[str, Optional[str]] = {"last_listing_date": None} last_listing_date = state.last_listing_date if last_listing_date is not None: d["last_listing_date"] = last_listing_date.isoformat() return d
[docs] def api_request(self, url: str) -> Dict: """Execute api request to packagist server. Raise: NotModifiedSinceLastVisit: if the url returns a 304 response. Return: The json result in case of a 200, an empty response otherwise. """ response = self.http_request(url) # response is empty when status code is 304 status_code = response.status_code if status_code == 200: return response.json() elif status_code == 304: raise NotModifiedSinceLastVisit(url) else: return {}
[docs] def get_pages(self) -> Iterator[PackagistPageType]: """Retrieve & randomize unique list of packages into pages of packages.""" package_names = self.api_request(self.url)["packageNames"] shuffle(package_names) for page_packages in grouper(package_names, n=self.record_batch_size): yield page_packages
def _get_metadata_from_page( self, package_url_format: str, package_name: str ) -> Optional[List[Dict]]: """Retrieve metadata from a package if any. Raise: NotModifiedSinceLastVisit: if the url returns a 304 response. Return: metadata from a package if any """ try: metadata_url = package_url_format.format(package_name=package_name) metadata = self.api_request(metadata_url) packages = metadata.get("packages", {}) format_json = metadata.get("minified") if not packages: # package metadata not updated since last listing return None package_info = packages.get(package_name) if package_info is None: # missing package metadata in response return None logger.debug( "package-name: %s, package-info: %s", package_name, package_info ) if format_json == "composer/2.0": # /p2/ output # In that format, the package info is a list of dict for each package # version return package_info else: # Otherwise, /p/, /packages/ urls returns a dict output return package_info.values() except requests.HTTPError: # error when getting package metadata (usually 404 when a package has # been removed), skip it and process next package return None def _get_metadata_for_package(self, package_name: str) -> Optional[List[Dict]]: """Tentatively retrieve metadata information from a package name. This tries out in order the following pages: - /p2/{package}.json: static and performant (on packagist's side) url. - /p2/{package}~dev.json: static, performant for development package url. - /packages/{package}.json: costly (for packagist's side) url If nothing is found in all urls, a None result is returned. Raise: NotModifiedSinceLastVisit: if the url returns a 304 response. Return: Metadata information on the package name if any. """ for package_url_format in self.PACKAGIST_PACKAGE_URL_FORMATS: meta_info = self._get_metadata_from_page(package_url_format, package_name) # If information, return it immediately, otherwise fallback to the next if meta_info: return meta_info return None
[docs] def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: """ Iterate on all Packagist projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None # save some bandwidth by only getting packages metadata updated since # last listing if self.state.last_listing_date is not None: if_modified_since = self.state.last_listing_date.strftime( "%a, %d %b %Y %H:%M:%S GMT" ) self.session.headers["If-Modified-Since"] = if_modified_since # to ensure origins will not be listed multiple times origin_urls = set() for package_name in page: try: versions_info = self._get_metadata_for_package(package_name) except NotModifiedSinceLastVisit: # Package was not modified server side since the last visit, we skip it continue if versions_info is None: # No info on package, we skip it continue origin_url = None visit_type = None last_update = None # extract origin url for package, vcs type and latest release date for version_info in versions_info: origin_url = version_info.get("source", {}).get("url", "") if not origin_url: continue # can be git, hg or svn visit_type = version_info.get("source", {}).get("type", "") dist_time_str = version_info.get("time", "") if not dist_time_str: continue try: dist_time = iso8601.parse_date(dist_time_str) except iso8601.iso8601.ParseError: continue if last_update is None or dist_time > last_update: last_update = dist_time # skip package with already seen origin url or with missing required info if visit_type is None or origin_url is None or origin_url in origin_urls: continue if visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical # ones assert self.github_session is not None try: origin_url = ( self.github_session.get_canonical_url(origin_url) or origin_url ) except (requests.exceptions.ConnectionError, RetryError): # server hangs up, let's ignore it for now # that might not happen later on continue # bitbucket closed its mercurial hosting service, those origins can not be # loaded into the archive anymore if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): continue origin_urls.add(origin_url) logger.debug( "Found package %s last updated on %s", package_name, last_update ) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=visit_type, last_update=last_update, )
[docs] def finalize(self) -> None: self.state.last_listing_date = self.listing_date self.updated = True