Source code for swh.lister.gitweb.lister

# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, urljoin, urlparse

from bs4 import BeautifulSoup
from dateparser import parse
from requests.exceptions import HTTPError

from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

logger = logging.getLogger(__name__)

# One page of scraped results: a list of dicts, one per repository, each
# carrying the keys "url" (repository page URL) and "last_update_interval"
# (age text scraped from the index, or None), as built by
# GitwebLister.get_pages.
Repositories = List[Dict[str, Any]]


class GitwebLister(StatelessLister[Repositories]):
    """Lister class for Gitweb repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    """

    LISTER_NAME = "gitweb"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        base_git_url: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Gitweb repositories.

        Args:
            scheduler: Scheduler instance, forwarded to the parent lister.
            url: Root URL of the Gitweb instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of gitweb instance. Defaults to url's network
                location if unset.
            base_git_url: Base URL to clone a git project hosted on the Gitweb
                instance, should only be used if the clone URLs cannot be found
                when scraping project page or cannot be easily derived from the
                root URL of the instance
            credentials: Forwarded unchanged to the parent lister.
            max_origins_per_page: Forwarded unchanged to the parent lister.
            max_pages: Forwarded unchanged to the parent lister.
            enable_origins: Forwarded unchanged to the parent lister.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        self.session.headers.update({"Accept": "application/html"})
        # When only `instance` is supplied, `url` is None and parsing it would
        # never yield "https", silently disabling the http -> https workaround
        # in _get_origin_from_repository_url. Fall back on self.url (the URL
        # this lister actually fetches in get_pages) in that case.
        self.instance_scheme = urlparse(url if url is not None else self.url).scheme
        self.base_git_url = base_git_url

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Gitweb server.

        The index at ``self.url`` is fetched once and a single page is yielded,
        holding one dict per listed project with the keys ``url`` and
        ``last_update_interval``.

        """
        bs_idx = self._get_and_parse(self.url)

        page_results = []

        # NOTE: if the index holds no table whose class matches "project_list",
        # find() returns None and the chained call raises AttributeError; this
        # is kept so that an unexpected index page fails the listing visibly.
        for tr in bs_idx.find("table", {"class": re.compile("project_list")}).find_all(
            "tr"
        ):
            link = tr.find("a")
            if not link:
                continue

            repo_url = urljoin(self.url, link["href"]).strip("/")

            # Skip this description page which is listed but won't yield any
            # origins to list
            if repo_url.endswith("?o=descr"):
                continue

            # Keep the age cell as scraped, in natural language (e.g. '9 years
            # ago'); it is turned into a datetime later on, by
            # parse_last_update, when origins are built from this page.
            span = tr.find("td", {"class": re.compile("age.*")})
            page_results.append(
                {"url": repo_url, "last_update_interval": span.text if span else None}
            )

        yield page_results

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of gitweb repositories into a list of ListedOrigins.

        Repositories for which no origin url can be determined are skipped.

        """
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=parse_last_update(repo.get("last_update_interval")),
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page.

        Args:
            repository_url: URL of the project page to scrape.

        Returns:
            The origin url to use, or None when the page cannot be fetched or
            no url can be determined. An https url is preferred; an http url is
            rewritten to https when the instance itself is served over https;
            otherwise the first advertised url is used. When the page
            advertises no url, :func:`try_to_determine_git_repository` is
            used as a fallback.

        """
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            assert e.response is not None
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        urls = []
        for row in bs.find_all("tr", {"class": "metadata_url"}):
            if not row.contents:
                # empty row, nothing to extract
                continue
            text = row.contents[-1].string
            if text is None:
                # `.string` is None when the cell holds several child nodes;
                # skip the row rather than aborting the whole listing
                logger.debug("Skipping unparsable url row on %s", repository_url)
                continue
            url = text.strip()
            for scheme in ("http", "https", "git"):
                # remove any string prefix before origin
                pos = url.find(f"{scheme}://")
                if pos != -1:
                    url = url[pos:]
                    break

            # A cell may list several comma-separated urls; drop fragments that
            # are empty once stripped so that "" never ends up as a candidate.
            fragments = (fragment.strip() for fragment in url.split(","))
            urls.extend(fragment for fragment in fragments if fragment)

        if not urls:
            repo = try_to_determine_git_repository(repository_url, self.base_git_url)
            if not repo:
                logger.debug("No git urls found on %s", repository_url)
            return repo

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            parsed_url = urlparse(url)
            if parsed_url.scheme == "https":
                origin_url = url
                break
            elif parsed_url.scheme == "http" and self.instance_scheme == "https":
                # workaround for non-working listed http origins
                origin_url = url.replace("http://", "https://")
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return origin_url
def try_to_determine_git_repository(
    repository_url: str, base_git_url: Optional[str] = None
) -> Optional[str]:
    """Guess the clone url of a project from its gitweb page url.

    Some gitweb instances do not advertise the git urls on the project page.
    This heuristic works on instances demonstrating this behavior: the project
    name is read from the ``p`` parameter of the ``;``-separated query string
    and appended to a base url.

    Args:
        repository_url: URL of the gitweb project page.
        base_git_url: Base clone url to prepend to the project name (trailing
            slashes are ignored). When unset or empty,
            ``git://<network location of repository_url>`` is used.

    Returns:
        The guessed clone url, or None if the query string has no ``p``
        parameter.

    """
    url_parts = urlparse(repository_url)
    project = parse_qs(url_parts.query, separator=";").get("p")
    if not project:
        return None

    if base_git_url:
        prefix = base_git_url.rstrip("/")
    else:
        prefix = f"git://{url_parts.netloc}"
    return f"{prefix}/{project[0]}"
def parse_last_update(last_update_interval: Optional[str]) -> Optional[datetime]:
    """Parse the last update string into a datetime.

    Args:
        last_update_interval: Text describing when the repository was last
            updated (e.g. '9 years ago'), or None.

    Returns:
        None when the input is empty/None or when ``dateparser.parse`` cannot
        interpret it; otherwise the parsed datetime with its ``tzinfo`` set to
        UTC (the value is relabelled, not converted).

    """
    if not last_update_interval:
        return None

    parsed = parse(last_update_interval)
    if parsed is None:
        return None
    return parsed.replace(tzinfo=timezone.utc)