# Source code for swh.lister.julia.lister
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import datetime
import logging
from pathlib import Path
import shutil
import tempfile
from typing import Any, Dict, Iterator, Optional
from dulwich import porcelain
from dulwich.repo import Repo
from dulwich.walk import WalkEntry
import iso8601
import toml
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, Lister
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Alias for the page type produced by the lister's `get_pages` method: a mapping
# of origin url to the ISO 8601 date string built in `get_origin_data`.
JuliaListerPage = Dict[str, Any]
@dataclass
class JuliaListerState:
    """Store lister state for incremental mode operations.

    The state is a plain dataclass so that it can be converted to and from a
    dictionary with ``dataclasses.asdict`` and keyword expansion.
    """

    last_seen_commit: Optional[str] = None
    """Hash of the latest Git commit when lister was executed"""
class JuliaLister(Lister[JuliaListerState, JuliaListerPage]):
    """List Julia packages origins.

    The Julia General registry (a Git repository) is cloned on disk and its
    commit history is walked. Commits whose message starts with
    ``New package: `` or ``New version: `` are used to locate a package's
    ``Package.toml`` file, whose ``repo`` entry is the origin url.
    """

    LISTER_NAME = "julia"
    VISIT_TYPE = "git"  # Julia origins url are Git repositories
    INSTANCE = "julia"

    REPO_URL = (
        "https://github.com/JuliaRegistries/General.git"  # Julia General Registry
    )
    # ``tempfile.mkdtemp()`` runs when the class body is evaluated, so this
    # checkout location is shared by every instance created in the process.
    REPO_PATH = Path(tempfile.mkdtemp(), "General")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = REPO_URL,
        instance: str = INSTANCE,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Forward every argument unchanged to the base ``Lister``.

        Args:
            scheduler: scheduler instance handed to the base class
            url: url of the registry Git repository (defaults to ``REPO_URL``)
            instance: lister instance name (defaults to ``INSTANCE``)
            credentials: optional credentials handed to the base class
            max_origins_per_page: optional limit handed to the base class
            max_pages: optional limit handed to the base class
            enable_origins: flag handed to the base class
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=instance,
            url=url,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

    def get_registry_repository(self) -> None:
        """Get Julia General Registry Git repository up to date on disk.

        The repository is cloned into ``REPO_PATH``; when the clone fails with
        ``FileExistsError`` the existing checkout is pulled instead.
        """
        try:
            porcelain.clone(source=self.url, target=self.REPO_PATH)
        except FileExistsError:
            porcelain.pull(self.REPO_PATH, remote_location=self.url)

    def state_from_dict(self, d: Dict[str, Any]) -> JuliaListerState:
        """Build a ``JuliaListerState`` from its dictionary form."""
        return JuliaListerState(**d)

    def state_to_dict(self, state: JuliaListerState) -> Dict[str, Any]:
        """Convert a ``JuliaListerState`` to its dictionary form."""
        return asdict(state)

    def get_origin_data(self, entry: WalkEntry) -> Dict[str, Any]:
        """
        Given an entry object parse its commit message and other attributes
        to detect if the commit is valid to describe a new package or
        a new package version.

        The ``Package.toml`` file is read from the checkout in ``REPO_PATH``,
        not from the commit itself.

        Returns a dict with origin url as key and iso8601 commit date as value,
        or an empty dict when the entry does not describe a package, when no
        ``Package.toml`` can be found on disk or when that file has no ``repo``
        entry.
        """
        assert entry

        commit = entry.commit
        # Test the cheap message prefix first so that the tree diff is only
        # computed for commits that can describe a package.
        if not commit or not commit.message.startswith(
            (b"New package: ", b"New version: ")
        ):
            return {}

        package_toml: Optional[Path] = None
        for change in entry.changes():
            # Merge commits yield lists of changes (no ``new`` attribute) and
            # deleted files have no new path; neither can designate a file to
            # read from the checkout, so both are skipped.
            new_entry = getattr(change, "new", None)
            raw_path = getattr(new_entry, "path", None)
            if not raw_path:
                continue
            if raw_path.endswith(b"/Package.toml"):
                package_toml = self.REPO_PATH / raw_path.decode()
                break
            if raw_path.endswith(b"/Versions.toml"):
                versions_path = self.REPO_PATH / raw_path.decode()
                if versions_path.exists():
                    # Package.toml is the sibling of Versions.toml
                    package_toml = versions_path.with_name("Package.toml")
                    break

        if package_toml is None or not package_toml.exists():
            return {}

        origin = toml.load(package_toml).get("repo")
        if not origin:
            # One incomplete registry entry must not abort the whole listing
            logger.warning("No 'repo' entry found in %s, skipping", package_toml)
            return {}

        last_update = datetime.datetime.fromtimestamp(
            commit.commit_time,
            tz=datetime.timezone.utc,
        ).isoformat()
        return {str(origin): last_update}

    def get_pages(self) -> Iterator[JuliaListerPage]:
        """Yield an iterator which returns 'page'

        To build a list of origins the ``Julia General registry`` Git
        repository is cloned to look at commits history to discover new
        package and new package versions.

        Depending on ``last_seen_commit`` state it initiate a commit walker
        since the last time the lister has been executed.

        There is only one page that list all origins urls. When several
        commits refer to the same origin, the most recent commit date is kept.
        """
        # Clone the repository
        self.get_registry_repository()
        assert self.REPO_PATH.exists()

        packages: JuliaListerPage = {}
        # The walker reads from the repository lazily, so it has to be consumed
        # before the repository handle is closed.
        with Repo(str(self.REPO_PATH)) as repo:
            # Detect commits related to new package and new versions since
            # last_seen_commit
            if not self.state.last_seen_commit:
                walker = repo.get_walker()
            else:
                last = repo[self.state.last_seen_commit.encode()]
                walker = repo.get_walker(since=last.commit_time, exclude=[last.id])

            for entry in walker:
                for origin, last_update in self.get_origin_data(entry=entry).items():
                    # Keep the most recent date whatever the order in which
                    # commits are visited. Dates are UTC ISO 8601 strings with
                    # an identical layout, so they sort chronologically.
                    known = packages.get(origin)
                    if known is None or last_update > known:
                        packages[origin] = last_update

        yield packages

    def get_origins_from_page(self, page: JuliaListerPage) -> Iterator[ListedOrigin]:
        """Iterate on all pages and yield ListedOrigin instances

        Each directory of the Git repository have a ``Package.toml`` file from
        where we get the Git repository url as an origin for each package.
        """
        assert self.lister_obj.id is not None

        for origin, last_update in page.items():
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=origin,
                last_update=iso8601.parse_date(last_update),
            )

    def finalize(self) -> None:
        """Record the HEAD commit hash in the state and remove the checkout.

        Only ``REPO_PATH`` itself is removed; its parent temporary directory is
        left in place.
        """
        # Get Git HEAD commit hash; the handle is closed before the directory
        # is deleted.
        with Repo(str(self.REPO_PATH)) as repo:
            self.state.last_seen_commit = repo.head().decode("ascii")
        self.updated = True

        # Rm tmp directory REPO_PATH
        if self.REPO_PATH.exists():
            shutil.rmtree(self.REPO_PATH)
        assert not self.REPO_PATH.exists()