# Copyright (C) 2021-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import os
import shutil
from subprocess import PIPE, run
from typing import Any, Dict, Iterator, Optional
from swh.lister.pattern import StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType
logger = logging.getLogger(__name__)
PageType = str
[docs]
def opam() -> str:
    """Get the path to the opam executable.

    Returns:
        The path to the ``opam`` executable, as resolved by :func:`shutil.which`.

    Raises:
        EnvironmentError: if no opam executable is found
    """
    ret = shutil.which("opam")
    if not ret:
        # Read PATH defensively: an unset PATH must not replace the intended
        # EnvironmentError with a KeyError; report os.defpath in that case.
        search_path = os.environ.get("PATH", os.defpath)
        raise EnvironmentError(f"No opam executable found in path {search_path}")
    return ret
[docs]
class OpamLister(StatelessLister[PageType]):
    """Lister for the packages published by a single opam repository.

    Constructing the lister has no side effect on the filesystem. When pages are
    requested, the opam root is prepared through :func:`opam_init` (a bare root,
    i.e. without any ocaml compiler/switch, holding the repository ``url`` under
    the name ``instance``) and opam is asked to list every package of that
    repository. Each package name is one page, and each page gives one origin.

    Args:
        url: base URL of an opam repository
            (for instance https://opam.ocaml.org)
        instance: string identifier for the listed repository
        opam_root: directory used as the opam root

    The remaining constructor arguments are forwarded unchanged to the parent
    lister class.
    """

    # Identifier of this lister, part of the lister API
    LISTER_NAME = "opam"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
        opam_root: str = "/tmp/opam/",
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )
        # Only remember where the opam root lives; it is actually initialized
        # in :meth:`get_pages` so that instantiation stays free of side effects.
        self.opam_root = opam_root
        # Private copy of the process environment handed to the opam commands
        self.env = os.environ.copy()

    def get_pages(self) -> Iterator[PageType]:
        """Yield the name of every package of the opam repository.

        The opam root is set up first, then the standard output of ``opam list``
        is split into lines, each line being one page.
        """
        # Prepare the opam root directory (deferred from the constructor)
        opam_init(self.opam_root, self.instance, self.url, self.env)

        # Ask opam for the short, normalised list of all packages of the instance
        list_command = [
            opam(),
            "list",
            "--all",
            "--no-switch",
            "--safe",
            "--repos",
            self.instance,
            "--root",
            self.opam_root,
            "--normalise",
            "--short",
        ]
        completed = run(
            list_command,
            env=self.env,
            stdout=PIPE,
            text=True,
            check=True,
        )
        if completed.stdout is None:
            return
        yield from completed.stdout.splitlines()

    def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
        """Build the single ListedOrigin matching a page (i.e. a package name)"""
        assert self.lister_obj.id is not None
        # Everything the loader needs to find the package in the opam root
        loader_arguments = {
            "opam_root": self.opam_root,
            "opam_instance": self.instance,
            "opam_url": self.url,
            "opam_package": page,
        }
        origin_url = f"opam+{self.url}/packages/{page}/"
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type="opam",
            url=origin_url,
            last_update=None,
            extra_loader_arguments=loader_arguments,
        )
[docs]
def opam_init(opam_root: str, instance: str, url: str, env: Dict[str, Any]) -> None:
    """Prepare an opam root folder holding the given repository.

    A missing or empty ``opam_root`` is initialized from scratch with
    ``opam init``; otherwise the repository is registered in the existing root
    with ``opam repository add``.

    Args:
        opam_root: The opam root folder to initialize
        instance: Name of the opam repository to add or initialize
        url: The associated url of the opam repository to add or initialize
        env: The global environment to use for the opam command.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: if the opam command exits with a non-zero
            status
    """
    root_is_populated = os.path.exists(opam_root) and os.listdir(opam_root)
    if root_is_populated:
        # The root already holds data: only add another instance to it. If that
        # instance is already set up, this is a noop
        subcommand = ["repository", "add", "--set-default"]
    else:
        # Nothing usable on disk yet: create a bare root
        subcommand = ["init", "--reinit", "--bare", "--no-setup"]

    # Both variants end with the same root/instance/url arguments
    full_command = [opam(), *subcommand, "--root", opam_root, instance, url]
    run(full_command, env=env, check=True)