Source code for swh.lister.gerrit.lister
# Copyright (C) 2026 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
from typing import Dict, Iterator, List, Optional, Union
from requests import HTTPError
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
GerritProjects = List[str]
[docs]
class GerritLister(StatelessLister[GerritProjects]):
"""Lister class for Gerrit instances.
This lister uses the Gerrit REST API projects endpoint
https://gerrit-review.googlesource.com/Documentation/rest-api-projects.html
"""
LISTER_NAME = "gerrit"
LIMITs = ("all", "", 1000, 100, 10, 1)
def __init__(
self,
scheduler: SchedulerInterface,
url: Optional[str] = None,
instance: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
enable_origins: bool = True,
):
"""Lister class for Gerrit repositories."""
super().__init__(
scheduler=scheduler,
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,
)
self.session.headers.update({"Accept": "application/json"})
self.url = self.url.rstrip("/") + "/"
self.api_url = self.url + "projects/"
[docs]
def api_request(self, query: str, more: str) -> Optional[Dict]:
url = f"{self.api_url}{query}{more}"
response = self.http_request(url)
text = response.text
text = text.strip()
# Remove Cross Site Script Inclusion (XSSI) prevention prefix
# https://gerrit-review.googlesource.com/Documentation/rest-api.html#output
text = text.removeprefix(")]}'")
text = text.strip()
try:
projects = json.loads(text)
except json.JSONDecodeError:
return None
if isinstance(projects, dict):
return projects
else:
return None
[docs]
def get_pages_limit(self, limit: Union[str, int]) -> Iterator[GerritProjects]:
if isinstance(limit, int):
query = f"{limit=}"
else:
query = limit
sep = "&" if query else "?"
query = f"?{query}" if query else ""
start = 0
while True:
more = f"{sep}{start=}" if start else ""
projects = self.api_request(query, more)
if projects is None:
raise ValueError
else:
yield list(projects)
count = len(projects)
_, info = projects.popitem()
if info.get("_more_projects", False) is True:
start += count
else:
break
[docs]
def get_pages(self) -> Iterator[GerritProjects]:
"""Generate git 'project' URLs found on the current Gerrit server."""
# Some instances do not allow the all option to be enabled
# Maybe some instances have limit requirements too?
for limit in self.LIMITs:
try:
yield from self.get_pages_limit(limit)
break
except (ValueError, HTTPError):
continue
[docs]
def get_origins_from_page(self, projects: GerritProjects) -> Iterator[ListedOrigin]:
"""Convert a list of Gerrit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
for project in projects:
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=self.url + project,
visit_type="git",
)