Source code for swh.lister.stagit.lister

# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)

Repositories = List[Dict[str, Any]]


class StagitLister(StatelessLister[Repositories]):
    """Lister class for Stagit forge instances.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.
    """

    LISTER_NAME = "stagit"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Stagit repositories.

        Args:
            url: (Optional) Root URL of the Stagit instance, i.e. url of the index
                of published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of stagit instance. Defaults to url's network location
                if unset.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        self.session.headers.update({"Accept": "application/html"})

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Stagit server."""
        bs_idx = self._get_and_parse(self.url)

        page_results = []

        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
            link = tr.find("a")
            if not link:
                continue

            repo_description_url = self.url + "/" + link["href"]

            # This retrieves the date in format "%Y-%m-%d %H:%M"
            tds = tr.find_all("td")
            last_update = tds[-1].text if tds and tds[-1] else None

            page_results.append(
                {"url": repo_description_url, "last_update": last_update}
            )

        yield page_results
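
    # Illustrative sketch (not taken from a real instance): get_pages() above
    # expects the stagit index page to contain a table whose id matches
    # "index", one row per repository, with the repository link in the first
    # cell and the last-commit date in the final cell, e.g.:
    #
    #   <table id="index">
    #     <tr><td><a href="myrepo/log.html">myrepo</a></td>
    #         <td>A demo repository</td>
    #         <td>2023-01-12 10:41</td></tr>
    #   </table>
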
    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of stagit repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=_parse_date(repo["last_update"]),
            )
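
    # For illustration (hypothetical values): a page entry such as
    #   {"url": "https://git.example.org/myrepo", "last_update": "2023-01-12 10:41"}
    # yields a ListedOrigin whose url is the clone URL extracted from that
    # repository page, with visit_type "git" and a timezone-aware
    # last_update datetime.
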
    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page"""
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            assert e.response is not None
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        urls = [
            td.find("a")["href"]
            for row in bs.find_all("tr", {"class": "url"})
            for td in row.find_all("td")
            if td.text.startswith("git clone")
        ]

        if not urls:
            return None

        urls = [url for url in urls if urlparse(url).scheme in ("https", "http", "git")]
        if not urls:
            return None

        return urls[0]
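
    # Illustrative sketch (not taken from a real instance): the repository
    # page is expected to advertise its clone URL in rows of class "url",
    # from which _get_origin_from_repository_url() keeps the first link
    # whose scheme is http(s) or git, e.g.:
    #
    #   <tr class="url">
    #     <td>git clone <a href="git://git.example.org/myrepo">git://git.example.org/myrepo</a></td>
    #   </tr>
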
def _parse_date(date: Optional[str]) -> Optional[datetime]:
    """Parse the last update date."""
    if not date:
        return None

    parsed_date = None
    try:
        parsed_date = datetime.strptime(date, "%Y-%m-%d %H:%M").replace(
            tzinfo=timezone.utc
        )
    except Exception:
        logger.warning(
            "Could not parse last_update date: %s",
            date,
        )

    return parsed_date
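
# A minimal usage sketch, assuming a running scheduler RPC service and a
# reachable stagit instance (both URLs below are hypothetical):
#
#   from swh.scheduler import get_scheduler
#
#   scheduler = get_scheduler(cls="remote", url="http://localhost:5008/")
#   lister = StagitLister(scheduler=scheduler, url="https://git.example.org")
#   stats = lister.run()  # lists pages and records origins with the scheduler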