Source code for swh.lister.hgweb.lister

# Copyright (C) 2026  The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import Iterator, List, Optional, Tuple
from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, JSONDecodeError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

# One page of lister results: a list of (repository URL, last change date)
# pairs. The date is None when no usable timestamp was found for the repository.
Repositories = List[Tuple[str, Optional[datetime]]]


class HgwebLister(StatelessLister[Repositories]):
    """Lister class for Hgweb repositories.

    This lister uses the hgweb json template style if it works and if
    the url value is available in the JSON response.

    https://repo.mercurial-scm.org/hg/help/hgweb

    This lister falls back on parsing the HTML if it doesn't.
    """

    LISTER_NAME = "hgweb"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        base_git_url: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
        enable_api: bool = True,
    ):
        """Lister class for Hgweb repositories.

        Args:
            scheduler: scheduler instance, forwarded to the base lister.
            url: root URL of the hgweb index to crawl.
            instance: lister instance name, forwarded to the base lister.
            base_git_url: accepted for interface compatibility; this class
                does not use it.
            credentials: forwarded to the base lister.
            max_origins_per_page: forwarded to the base lister.
            max_pages: forwarded to the base lister.
            enable_origins: forwarded to the base lister.
            enable_api: when False, the JSON template style is never tried
                and only the HTML pages are parsed.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )
        self.enable_api = enable_api

    def _api_url(self, url: str) -> str:
        """Return ``url`` with its ``style`` query parameter forced to ``json``.

        Other query parameters already present in ``url`` are preserved.
        """
        parsed = urlparse(url)
        params = parse_qs(parsed.query)
        params["style"] = ["json"]
        return parsed._replace(query=urlencode(params, doseq=True)).geturl()

    def _get_json_pages(self) -> Iterator[Repositories]:
        """Crawl the hgweb index through the JSON template style.

        Starting from ``self.url``, every directory entry found is queued and
        visited in turn; one page of repositories is yielded per visited URL.

        Raises:
            ValueError: when the JSON API is disabled, when the host is a
                ``.mozilla.org`` one, when a response holds no entries, or
                when an entry has no ``url`` value.
        """
        # This will only work once this patch has been released:
        # https://foss.heptapod.net/mercurial/mercurial-devel/-/merge_requests/1822
        if not self.enable_api:
            raise ValueError("The use of the JSON API is not enabled")
        hostname = urlparse(self.url).hostname
        if hostname and hostname.endswith(".mozilla.org"):
            raise ValueError("The Mozilla hgweb JSON style is missing directories")

        self.session.headers.update({"Accept": "application/json"})

        done: set[str] = set()
        todo = {self.url}
        while todo:
            page_url = todo.pop()
            done.add(page_url)

            response = self.http_request(self._api_url(page_url))
            response.raise_for_status()
            entries = response.json().get("entries", [])
            if not entries:
                raise ValueError("No entries in JSON")

            page_results: Repositories = []
            for entry in entries:
                entry_url = entry.get("url")
                if not entry_url:
                    raise ValueError("No URLs in JSON entries")
                entry_url = urljoin(self.url, entry_url)
                name = entry.get("name", "")
                # when isdirectory is not yet present, fallback on
                # that directory names have a trailing / character
                # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/hgweb/hgwebdir_mod_inner.py#L177
                if entry.get("isdirectory", name.endswith("/")):
                    # directories
                    if entry_url not in done:
                        todo.add(entry_url)
                    continue
                # repositories: only the first element of "lastchange" is
                # used, as a POSIX timestamp. A missing or null value means
                # the last change date is unknown.
                lastchange = next(iter(entry.get("lastchange") or ()), None)
                if lastchange is not None:
                    lastchange = datetime.fromtimestamp(lastchange, tz=timezone.utc)
                page_results.append((entry_url, lastchange))
            yield page_results

    @staticmethod
    def _parse_lastchange(age_text: Optional[str]) -> Optional[datetime]:
        """Parse the text of an "age" cell, returning None when it is unusable."""
        if not age_text:
            return None
        try:
            # Default templates use RFC 822 dates
            # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/templates/monoblue/map#L288
            return parsedate_to_datetime(age_text)
        except (TypeError, ValueError):
            # Python < 3.10 raises TypeError instead of ValueError when the
            # text is not a valid date.
            pass
        try:
            # Mozilla templates use RFC 3339 dates
            # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/gitweb_mozilla/map#l336
            return datetime.fromisoformat(age_text)
        except ValueError:
            return None

    def _get_html_pages(self) -> Iterator[Repositories]:
        """Get the given url and parse the retrieved HTML using BeautifulSoup

        Starting from ``self.url``, every directory link found is queued and
        visited in turn; one page of repositories is yielded per visited URL.
        """
        # text/html is the registered media type for HTML documents
        self.session.headers.update({"Accept": "text/html"})

        done: set[str] = set()
        todo = {self.url}
        while todo:
            page_url = todo.pop()
            done.add(page_url)

            response = self.http_request(page_url)
            response.raise_for_status()
            doc = BeautifulSoup(response.text, features="html.parser")

            page_results: Repositories = []
            for tr in doc.select("table tr"):
                tds = tr.select("td")
                if not tds:
                    continue
                link = tr.select_one("a")
                # rows without a link, or whose link has no target, carry
                # nothing to list
                href = link.get("href") if link else None
                if not href:
                    continue

                if len(tds) < 4:
                    # Mozilla hgweb index row for a directory
                    # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/.patches/index.patch
                    directory_url = urljoin(self.url, href)
                    if directory_url not in done:
                        todo.add(directory_url)
                    continue

                # mainline hgweb row for a repository or a directory
                if href.startswith("?sort="):
                    # skip headers
                    continue
                row_url = urljoin(self.url, href)
                # the hgweb gitweb template style has extra whitespace
                name = link.text.rstrip()
                # directory names have a trailing / character
                # https://foss.heptapod.net/mercurial/mercurial-devel/-/blob/branch/default/mercurial/hgweb/hgwebdir_mod_inner.py#L177
                if name.endswith("/"):
                    # directories
                    if row_url not in done:
                        todo.add(row_url)
                    continue

                # repositories
                age = tr.select_one('td[class="age"]')
                # remove Mozilla timestamp prefix
                # https://hg-edge.mozilla.org/hgcustom/version-control-tools/file/tip/hgtemplates/gitweb_mozilla/map#l336
                age_text = age.text.strip().removeprefix("at ") if age else None
                page_results.append((row_url, self._parse_lastchange(age_text)))
            yield page_results

    def get_pages(self) -> Iterator[Repositories]:
        """Generate hg "project" URLs found on the current Hgweb server.

        The JSON template style is tried first; if it raises ValueError,
        HTTPError or JSONDecodeError, the HTML pages are crawled instead,
        starting again from the root URL.
        """
        try:
            yield from self._get_json_pages()
        except (ValueError, HTTPError, JSONDecodeError):
            yield from self._get_html_pages()

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of hgweb repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for origin_url, last_update in repositories:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="hg",
                last_update=last_update,
            )