# Source code for swh.lister.hgweb.lister

# Copyright (C) 2026  The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import Iterator, List, Optional, Tuple
from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, JSONDecodeError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

Repositories = List[Tuple[str, Optional[datetime]]]



# [docs] -- documentation-link marker from the rendered HTML page, not part of the module
class HgwebLister(StatelessLister[Repositories]):
    """Lister class for Hgweb repositories.

    This lister uses the hgweb json template style if it works and
    if the url value is available in the JSON response.

    https://repo.mercurial-scm.org/hg/help/hgweb

    This lister falls back on parsing the HTML if it doesn't.

    Every page produced by :meth:`get_pages` is a list of
    ``(repository_url, last_change)`` tuples; ``last_change`` is ``None``
    when no usable date could be extracted for the repository.
    """

    LISTER_NAME = "hgweb"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        base_git_url: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
        enable_api: bool = True,
    ):
        """Lister class for Hgweb repositories.

        Args:
            scheduler: passed through to the parent lister constructor.
            url: passed through to the parent lister constructor; the crawl
                starts from ``self.url`` (presumably set from this value by
                the parent class -- confirm against ``swh.lister.pattern``).
            instance: passed through to the parent lister constructor.
            base_git_url: accepted but not used anywhere in this class.
            credentials: passed through to the parent lister constructor.
            max_origins_per_page: passed through to the parent lister
                constructor.
            max_pages: passed through to the parent lister constructor.
            enable_origins: passed through to the parent lister constructor.
            enable_api: when ``False``, the JSON template style is never
                requested and only the HTML index pages are parsed.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        self.enable_api = enable_api

    def _api_url(self, url: str) -> str:
        """Return ``url`` with its ``style`` query parameter forced to ``json``.

        Other query parameters are preserved, including those with an empty
        value (``parse_qs`` drops them unless ``keep_blank_values`` is set).
        """
        parsed = urlparse(url)
        params = parse_qs(parsed.query, keep_blank_values=True)
        params["style"] = ["json"]
        return parsed._replace(query=urlencode(params, doseq=True)).geturl()

    @staticmethod
    def _json_lastchange(entry: dict) -> Optional[datetime]:
        """Return the UTC datetime of a JSON entry's ``lastchange``, if usable.

        The first item of the ``lastchange`` sequence is interpreted as a POSIX
        timestamp. A missing, null, empty or malformed value yields ``None``
        instead of aborting the listing, mirroring the HTML code path.
        """
        try:
            timestamp = next(iter(entry.get("lastchange") or []), None)
            if timestamp is None:
                return None
            return datetime.fromtimestamp(timestamp, tz=timezone.utc)
        except (TypeError, ValueError, OverflowError, OSError):
            return None

    @staticmethod
    def _html_lastchange(age_text: Optional[str]) -> Optional[datetime]:
        """Parse the text of an hgweb "age" cell into a datetime, if possible.

        Returns ``None`` when the text is missing or matches neither format.
        """
        if not age_text:
            return None
        try:
            # Default templates use RFC 822 dates
            # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/templates/monoblue/map#L288
            return parsedate_to_datetime(age_text)
        except (TypeError, ValueError):
            # Python < 3.10 raises TypeError (not ValueError) on
            # unparseable input, so both must be caught here.
            pass
        try:
            # Mozilla templates use RFC 3339 dates
            # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/gitweb_mozilla/map#l336
            return datetime.fromisoformat(age_text)
        except ValueError:
            return None

    def _get_json_pages(self) -> Iterator[Repositories]:
        """Crawl the hgweb index through the JSON template style.

        Starting at ``self.url``, every directory entry is queued and fetched
        in turn; each fetched index yields one page of repositories.

        Raises:
            ValueError: if the JSON API is disabled, the host is under
                ``.mozilla.org``, the payload is not a JSON object, a fetched
                index has no entries, or an entry has no ``url``.
            HTTPError: propagated from ``response.raise_for_status()``.
        """
        # This will only work once this patch has been released:
        # https://foss.heptapod.net/mercurial/mercurial-devel/-/merge_requests/1822
        if not self.enable_api:
            raise ValueError("The use of the JSON API is not enabled")
        hostname = urlparse(self.url).hostname
        if hostname and hostname.endswith(".mozilla.org"):
            raise ValueError("The Mozilla hgweb JSON style is missing directories")
        self.session.headers.update({"Accept": "application/json"})
        done: set[str] = set()
        todo = {self.url}
        while todo:
            page_url = todo.pop()
            done.add(page_url)
            response = self.http_request(self._api_url(page_url))
            response.raise_for_status()
            data = response.json()
            if not isinstance(data, dict):
                # Raise ValueError so get_pages() falls back on HTML parsing
                raise ValueError("Unexpected JSON payload")
            entries = data.get("entries", [])
            if not entries:
                raise ValueError("No entries in JSON")
            page_results: Repositories = []
            for entry in entries:
                entry_url = entry.get("url")
                if not entry_url:
                    raise ValueError("No URLs in JSON entries")
                entry_url = urljoin(self.url, entry_url)
                name = entry.get("name") or ""
                # when isdirectory is not yet present, fallback on
                # that directory names have a trailing / character
                # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/hgweb/hgwebdir_mod_inner.py#L177
                if entry.get("isdirectory", name.endswith("/")):
                    # directories
                    if entry_url not in done:
                        todo.add(entry_url)
                else:
                    # repositories
                    page_results.append((entry_url, self._json_lastchange(entry)))
            yield page_results

    def _get_html_pages(self) -> Iterator[Repositories]:
        """Crawl the hgweb index by parsing its HTML with BeautifulSoup.

        Starting at ``self.url``, every directory link is queued and fetched
        in turn; each fetched index yields one page of repositories.

        Raises:
            HTTPError: propagated from ``response.raise_for_status()``.
        """
        # "text/html" is the registered media type for HTML documents
        self.session.headers.update({"Accept": "text/html"})
        done: set[str] = set()
        todo = {self.url}
        while todo:
            page_url = todo.pop()
            done.add(page_url)
            response = self.http_request(page_url)
            response.raise_for_status()
            doc = BeautifulSoup(response.text, features="html.parser")
            page_results: Repositories = []
            for tr in doc.select("table tr"):
                tds = tr.select("td")
                if not tds:
                    # rows without data cells (e.g. headers)
                    continue
                link = tr.select_one("a")
                href = link.get("href") if link else None
                if href is None:
                    # no link, or an anchor without a target
                    continue
                link_url = urljoin(self.url, href)
                if len(tds) < 4:
                    # Mozilla hgweb index row for a directory
                    # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/.patches/index.patch
                    if link_url not in done:
                        todo.add(link_url)
                    continue
                # mainline hgweb row for a repository or a directory
                if href.startswith("?sort="):
                    # skip headers
                    continue
                # the hgweb gitweb template style has extra whitespace
                name = link.text.rstrip()
                # directory names have a trailing / character
                # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/hgweb/hgwebdir_mod_inner.py#L177
                if name.endswith("/"):
                    # directories
                    if link_url not in done:
                        todo.add(link_url)
                    continue
                # repositories
                age = tr.select_one('td[class="age"]')
                # remove Mozilla timestamp prefix
                # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/gitweb_mozilla/map#l336
                age_text = age.text.strip().removeprefix("at ") if age else None
                page_results.append((link_url, self._html_lastchange(age_text)))
            yield page_results

    def get_pages(self) -> Iterator[Repositories]:
        """Generate hg "project" URLs found on the current Hgweb server.

        The JSON template style is tried first; on ``ValueError``,
        ``HTTPError`` or ``JSONDecodeError`` the HTML index is parsed instead.

        Note: the HTML crawl always restarts from ``self.url``, so if the JSON
        crawl fails after having already yielded some pages, the repositories
        of those pages are yielded a second time.
        """
        try:
            yield from self._get_json_pages()
        except (ValueError, HTTPError, JSONDecodeError):
            yield from self._get_html_pages()

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of hgweb repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for origin_url, last_update in repositories:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="hg",
                last_update=last_update,
            )