Source code for swh.lister.nuget.lister

# Copyright (C) 2022-2023  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import dataclass
from datetime import datetime
import logging
from typing import Any, Dict, Iterator, List, Optional

from bs4 import BeautifulSoup
import iso8601
from requests.exceptions import HTTPError

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)


# Aliasing the page results returned by `get_pages` method from the lister.
NugetListerPage = List[Dict[str, str]]



[docs]
@dataclass
class NugetListerState:
    """Store lister state for incremental mode operations"""

    last_listing_date: Optional[datetime] = None
    """Last date from main http api endpoint when lister was executed"""




[docs]
class NugetLister(Lister[NugetListerState, NugetListerPage]):
    """List Nuget (Package manager for .NET) origins."""

    LISTER_NAME = "nuget"
    INSTANCE = "nuget"

    API_INDEX_URL = "https://api.nuget.org/v3/catalog0/index.json"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = API_INDEX_URL,
        instance: str = INSTANCE,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=instance,
            url=url,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )
        self.listing_date: Optional[datetime] = None


[docs]
    def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState:
        last_listing_date = d.get("last_listing_date")
        if last_listing_date is not None:
            d["last_listing_date"] = iso8601.parse_date(last_listing_date)
        return NugetListerState(**d)



[docs]
    def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]:
        d: Dict[str, Optional[str]] = {"last_listing_date": None}
        last_listing_date = state.last_listing_date
        if last_listing_date is not None:
            d["last_listing_date"] = last_listing_date.isoformat()
        return d



[docs]
    def get_pages(self) -> Iterator[NugetListerPage]:
        """Yield an iterator which returns 'page'

        It uses the following endpoint `https://api.nuget.org/v3/catalog0/index.json`
        to get a list of pages endpoint to iterate.
        """
        index_response = self.http_request(url=self.url)
        index = index_response.json()

        assert "commitTimeStamp" in index
        self.listing_date = iso8601.parse_date(index["commitTimeStamp"])

        assert "items" in index
        for page in index["items"]:
            assert page["@id"]
            assert page["commitTimeStamp"]

            commit_timestamp = iso8601.parse_date(page["commitTimeStamp"])

            if (
                not self.state.last_listing_date
                or commit_timestamp > self.state.last_listing_date
            ):
                try:
                    page_response = self.http_request(url=page["@id"])
                    page_data = page_response.json()
                    assert "items" in page_data
                    yield page_data["items"]
                except HTTPError:
                    logger.warning(
                        "Failed to fetch page %s, skipping it from listing.",
                        page["@id"],
                    )
                    continue



[docs]
    def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all pages and yield ListedOrigin instances.
        .NET packages are binary, dll, etc. We retrieve only packages for which we can
        find a vcs repository.

        To check if a vcs repository exists, we need for each entry in a page to retrieve
        a .nuspec file, which is a package metadata xml file, and search for a `repository`
        value.
        """
        assert self.lister_obj.id is not None

        for elt in page:
            try:
                res = self.http_request(url=elt["@id"])
            except HTTPError:
                logger.warning(
                    "Failed to fetch page %s, skipping it from listing.",
                    elt["@id"],
                )
                continue

            data = res.json()
            pkgname = data["id"]
            nuspec_url = (
                f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/"
                f"{data['version'].lower()}/{pkgname.lower()}.nuspec"
            )

            try:
                res_metadata = self.http_request(url=nuspec_url)
            except HTTPError:
                logger.warning(
                    "Failed to fetch nuspec file %s, skipping it from listing.",
                    nuspec_url,
                )
                continue
            xml = BeautifulSoup(res_metadata.content, "xml")
            repo = xml.select_one("repository")
            if repo and "url" in repo.attrs and "type" in repo.attrs:
                vcs_url = repo.attrs["url"]
                vcs_type = repo.attrs["type"]
                last_update = iso8601.parse_date(elt["commitTimeStamp"])
                yield ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=vcs_type,
                    url=vcs_url,
                    last_update=last_update,
                )
            else:
                continue



[docs]
    def finalize(self) -> None:
        self.state.last_listing_date = self.listing_date
        self.updated = True