Source code for swh.lister.stagit.lister

# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)

Repositories = List[Dict[str, Any]]


class StagitLister(StatelessLister[Repositories]):
    """Lister class for Stagit forge instances.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.
    """

    LISTER_NAME = "stagit"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Stagit repositories.

        Args:
            url: (Optional) Root URL of the Stagit instance, i.e. url of the index
                of published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of stagit instance. Defaults to url's network location
                if unset.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        self.session.headers.update({"Accept": "application/html"})

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Stagit server."""
        bs_idx = self._get_and_parse(self.url)

        page_results = []

        for tr in bs_idx.find("table", {"id": re.compile("index")}).find_all("tr"):
            link = tr.find("a")
            if not link:
                continue

            repo_description_url = self.url + "/" + link["href"]

            # This retrieves the date in format "%Y-%m-%d %H:%M"
            tds = tr.find_all("td")
            last_update = tds[-1].text if tds and tds[-1] else None

            page_results.append(
                {"url": repo_description_url, "last_update": last_update}
            )

        yield page_results
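
    # Illustrative sketch (not taken from a real instance): get_pages() above
    # expects the stagit index page to contain a table whose id matches
    # "index", one row per repository, with the repository link in the first
    # cell and the last-commit date in the final cell, e.g.:
    #
    #   <table id="index">
    #     <tr><td><a href="myrepo/log.html">myrepo</a></td>
    #         <td>A demo repository</td>
    #         <td>2023-01-12 10:41</td></tr>
    #   </table>
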
    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of stagit repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=_parse_date(repo["last_update"]),
            )
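
    # For illustration (hypothetical values): a page entry such as
    #   {"url": "https://git.example.org/myrepo", "last_update": "2023-01-12 10:41"}
    # yields a ListedOrigin whose url is the clone URL extracted from that
    # repository page, with visit_type "git" and a timezone-aware
    # last_update datetime.
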
    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page"""
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            assert e.response is not None
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        urls = [
            td.find("a")["href"]
            for row in bs.find_all("tr", {"class": "url"})
            for td in row.find_all("td")
            if td.text.startswith("git clone")
        ]

        if not urls:
            return None

        urls = [url for url in urls if urlparse(url).scheme in ("https", "http", "git")]
        if not urls:
            return None

        return urls[0]
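
    # Illustrative sketch (not taken from a real instance): the repository
    # page is expected to advertise its clone URL in rows of class "url",
    # from which _get_origin_from_repository_url() keeps the first link
    # whose scheme is http(s) or git, e.g.:
    #
    #   <tr class="url">
    #     <td>git clone <a href="git://git.example.org/myrepo">git://git.example.org/myrepo</a></td>
    #   </tr>
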
def _parse_date(date: Optional[str]) -> Optional[datetime]:
    """Parse the last update date."""
    if not date:
        return None

    parsed_date = None
    try:
        parsed_date = datetime.strptime(date, "%Y-%m-%d %H:%M").replace(
            tzinfo=timezone.utc
        )
    except Exception:
        logger.warning(
            "Could not parse last_update date: %s",
            date,
        )

    return parsed_date
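
# A minimal usage sketch, assuming a running scheduler RPC service and a
# reachable stagit instance (both URLs below are hypothetical):
#
#   from swh.scheduler import get_scheduler
#
#   scheduler = get_scheduler(cls="remote", url="http://localhost:5008/")
#   lister = StagitLister(scheduler=scheduler, url="https://git.example.org")
#   stats = lister.run()  # lists pages and records origins with the scheduler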