Source code for swh.lister.gitweb.lister

# Copyright (C) 2023-2024 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, urljoin, urlparse

from bs4 import BeautifulSoup
from dateparser import parse
from requests.exceptions import HTTPError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)

Repositories = List[Dict[str, Any]]



[docs]
class GitwebLister(StatelessLister[Repositories]):
    """Lister class for Gitweb repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`. Each listed
    project page is then fetched in turn to find the clone URL to use as
    origin URL.

    """

    LISTER_NAME = "gitweb"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        base_git_url: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Gitweb repositories.

        Args:
            scheduler: Scheduler interface, forwarded to the parent lister.
            url: Root URL of the Gitweb instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of gitweb instance. Defaults to url's network location
                if unset.
            base_git_url: Base URL to clone a git project hosted on the Gitweb instance,
                should only be used if the clone URLs cannot be found when scraping project
                page or cannot be easily derived from the root URL of the instance
            credentials: Forwarded unchanged to the parent lister.
            max_origins_per_page: Forwarded unchanged to the parent lister.
            max_pages: Forwarded unchanged to the parent lister.
            enable_origins: Forwarded unchanged to the parent lister.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # NOTE(review): "application/html" is not a registered media type;
        # "text/html" may have been intended -- confirm before changing.
        self.session.headers.update({"Accept": "application/html"})
        # Derive the scheme from the URL resolved by the parent constructor
        # rather than from the raw `url` argument: the latter is None when only
        # `instance` is provided, which would disable the http -> https
        # workaround applied when picking an origin URL.
        self.instance_scheme = urlparse(self.url).scheme
        self.base_git_url = base_git_url

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Gitweb server.

        The index page is fetched once and a single page is yielded. Each
        entry of that page is a dict with the keys ``url`` (absolute URL of
        the project page) and ``last_update_interval`` (raw text of the "age"
        cell, or None when the row has no such cell).
        """
        bs_idx = self._get_and_parse(self.url)

        page_results: Repositories = []

        for tr in bs_idx.select("table.project_list tr"):
            link = tr.select_one("a")
            if not link:
                continue

            # An anchor without a target cannot lead to a project page; skip it
            # instead of failing the whole listing on a KeyError.
            href = link.get("href")
            if href is None:
                continue

            repo_url = urljoin(self.url, href).strip("/")

            # Skip this description page which is listed but won't yield any origins to list
            if repo_url.endswith("?o=descr"):
                continue

            # Keep the age as displayed, in natural language (e.g. '9 years ago');
            # it is converted to a datetime later, when origins are built.
            span = tr.select_one('td[class^="age"]')
            page_results.append(
                {"url": repo_url, "last_update_interval": span.text if span else None}
            )

        yield page_results

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of gitweb repositories into a list of ListedOrigins.

        Repositories for which no origin URL can be determined are skipped.
        """
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=parse_last_update(repo.get("last_update_interval")),
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page.

        Args:
            repository_url: URL of the project page on the Gitweb instance.

        Returns:
            The origin URL to use for the repository. An https URL is
            preferred, then an http URL upgraded to https when the instance
            itself is served over https, then the first advertised URL. When
            the page advertises no URL, the result of
            :func:`try_to_determine_git_repository` is returned. None is
            returned when the page cannot be fetched.
        """
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            assert e.response is not None
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        urls: List[str] = []
        for row in bs.select("tr.metadata_url"):
            cells = row.select("td")
            if not cells:
                # malformed row without any cell, nothing to extract
                continue

            url = cells[-1].text.strip()
            for scheme in ("http", "https", "git"):
                # remove any string prefix before origin
                pos = url.find(f"{scheme}://")
                if pos != -1:
                    url = url[pos:]
                    break

            # A cell may hold several comma-separated URLs. Strip before
            # filtering so that blank pieces (and empty cells) are dropped
            # rather than recorded as empty origin URLs.
            candidates = (piece.strip() for piece in url.split(","))
            urls.extend(candidate for candidate in candidates if candidate)

        if not urls:
            repo = try_to_determine_git_repository(repository_url, self.base_git_url)
            if not repo:
                logger.debug("No git urls found on %s", repository_url)
            return repo

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            parsed_url = urlparse(url)
            if parsed_url.scheme == "https":
                return url
            if parsed_url.scheme == "http" and self.instance_scheme == "https":
                # workaround for non-working listed http origins; only the
                # leading scheme is rewritten
                return url.replace("http://", "https://", 1)

        # otherwise, choose the first one
        return urls[0]




[docs]
def try_to_determine_git_repository(
    repository_url: str, base_git_url: Optional[str] = None
) -> Optional[str]:
    """Guess the clone URL of a project from the URL of its Gitweb page.

    Some gitweb instances do not advertise the git urls. This heuristic works
    on instances demonstrating this behavior: the project path is read from
    the ``p`` parameter of the ``;``-separated query string and appended to
    a base URL.

    Args:
        repository_url: URL of the project page.
        base_git_url: Base clone URL to prepend to the project path. When
            unset or empty, ``git://`` followed by the network location of
            `repository_url` is used.

    Returns:
        The guessed clone URL, or None when the query string carries no
        ``p`` parameter.
    """
    url_parts = urlparse(repository_url)
    projects = parse_qs(url_parts.query, separator=";").get("p")
    if not projects:
        return None

    if base_git_url:
        prefix = base_git_url.rstrip("/")
    else:
        prefix = f"git://{url_parts.netloc}"
    return f"{prefix}/{projects[0]}"




[docs]
def parse_last_update(last_update_interval: Optional[str]) -> Optional[datetime]:
    """Parse the last update string into a datetime.

    Args:
        last_update_interval: Text describing the last update (e.g.
            '9 years ago'), or None.

    Returns:
        The parsed datetime with its timezone info set to UTC, or None when
        the input is empty or cannot be parsed.
    """
    if not last_update_interval:
        return None

    parsed = parse(last_update_interval)
    # The timezone is overwritten, not converted: the parsed value is labelled as UTC.
    return None if parsed is None else parsed.replace(tzinfo=timezone.utc)