# Copyright (C) 2017-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import os
import time
import zlib
from fastimport.commands import (
BlobCommand,
CommitCommand,
FileDeleteCommand,
FileModifyCommand,
ResetCommand,
)
from swh.model import hashutil
from swh.model.from_disk import DentryPerms, mode_to_perms
from swh.model.swhids import ObjectType
from swh.model.toposort import toposort
from swh.vault.cookers.base import BaseVaultCooker
from swh.vault.cookers.utils import revision_log
from swh.vault.to_disk import get_filtered_file_content
class RevisionGitfastCooker(BaseVaultCooker):
    """Cooker to create a git fast-import bundle.

    The bundle is a gzip-compressed stream of git fast-import commands
    (``blob``, ``reset`` and ``commit``) covering every revision returned by
    ``revision_log`` for the requested revision, in topological order.
    """

    BUNDLE_TYPE = "gitfast"
    SUPPORTED_OBJECT_TYPES = {ObjectType.REVISION}

    # Maximum number of directory listings memoized per cooker instance.
    _DIR_ENTS_CACHE_SIZE = 4096

    def check_exists(self):
        """Return True if the requested revision is not missing from storage."""
        return not list(self.storage.revision_missing([self.obj_id]))

    def prepare_bundle(self):
        """Generate the whole bundle and write it out, gzip-compressed."""
        self.log = list(toposort(revision_log(self.storage, self.obj_id)))
        # wbits = MAX_WBITS | 16 makes zlib emit a gzip header and trailer.
        self.gzobj = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        self.fastexport()
        # Emit whatever the compressor still holds in its internal buffer.
        self.write(self.gzobj.flush())

    def write_cmd(self, cmd):
        """Serialize a fast-import command and write it, compressed.

        Args:
            cmd: a fastimport command object; ``bytes(cmd)`` gives its
                serialized form, to which a trailing newline is appended.
        """
        chunk = bytes(cmd) + b"\n"
        super().write(self.gzobj.compress(chunk))

    def fastexport(self):
        """Generate all the git fast-import commands from a given log."""
        self.rev_by_id = {r["id"]: r for r in self.log}
        self.obj_done = set()
        self.obj_to_mark = {}
        self.next_available_mark = 1

        total = len(self.log)
        last_progress_report = None
        for i, rev in enumerate(self.log, 1):
            # Report progress at most once every 2 seconds. A monotonic clock
            # is used so that system clock adjustments cannot stall or flood
            # the reports.
            now = time.monotonic()
            if last_progress_report is None or now - last_progress_report >= 2:
                last_progress_report = now
                pg = "Computing revision {}/{}".format(i, total)
                self.backend.set_progress(self.BUNDLE_TYPE, self.swhid, pg)

            # Compute the current commit
            self._compute_commit_command(rev)

    def mark(self, obj_id):
        """Get the mark ID as bytes of a git object.

        If the object has not yet been marked, assign a new ID and add it to
        the mark dictionary.
        """
        if obj_id not in self.obj_to_mark:
            self.obj_to_mark[obj_id] = self.next_available_mark
            self.next_available_mark += 1
        return str(self.obj_to_mark[obj_id]).encode()

    def _compute_blob_command_content(self, file_data):
        """Compute the blob command of a file entry if it has not been
        computed yet.

        The entry's ``sha1`` identifies the blob; each blob is written to the
        stream at most once.
        """
        obj_id = file_data["sha1"]
        if obj_id in self.obj_done:
            return
        content = get_filtered_file_content(self.storage, file_data)
        self.write_cmd(BlobCommand(mark=self.mark(obj_id), data=content["content"]))
        self.obj_done.add(obj_id)

    def _author_tuple_format(self, author, date):
        """Build the ``(name, email, seconds, offset)`` tuple of a commit.

        Args:
            author: mapping with optional ``name`` and ``email``, or None.
            date: mapping with optional ``timestamp`` (holding ``seconds``)
                and ``offset``, or None.

        Returns:
            A 4-tuple in which missing values are replaced by ``b""`` (name,
            email) or ``0`` (seconds, offset).
        """
        # We never want to have None values here so we replace null entries
        # by ''.
        if author is not None:
            author_tuple = (author.get("name") or b"", author.get("email") or b"")
        else:
            author_tuple = (b"", b"")

        if date is not None:
            date_tuple = (
                date.get("timestamp", {}).get("seconds") or 0,
                # presumably "offset" is in minutes and is converted to
                # seconds here -- TODO confirm against the storage schema.
                (date.get("offset") or 0) * 60,
            )
        else:
            date_tuple = (0, 0)

        return author_tuple + date_tuple

    def _compute_commit_command(self, rev):
        """Compute a commit command from a specific revision.

        Blob commands needed by the commit are written first, then the commit
        command itself. The tree is expressed as a diff against the first
        parent; the other parents are recorded as merges.
        """
        parents = rev.get("parents")
        if parents:
            from_ = b":" + self.mark(parents[0])
            merges = [b":" + self.mark(r) for r in parents[1:]]
            parent = self.rev_by_id[parents[0]]
        else:
            # We issue a reset command before all the new roots so that they
            # are not automatically added as children of the current branch.
            self.write_cmd(ResetCommand(b"refs/heads/master", None))
            from_ = None
            merges = None
            parent = None

        # Retrieve the file commands while yielding new blob commands if
        # needed. The generator must be fully consumed before the commit
        # command is written, so that blobs precede the commit in the stream.
        files = list(self._compute_file_commands(rev, parent))

        # Construct and write the commit command
        author = self._author_tuple_format(rev["author"], rev["date"])
        committer = self._author_tuple_format(rev["committer"], rev["committer_date"])
        self.write_cmd(
            CommitCommand(
                ref=b"refs/heads/master",
                mark=self.mark(rev["id"]),
                author=author,
                committer=committer,
                message=rev["message"] or b"",
                from_=from_,
                merges=merges,
                file_iter=files,
            )
        )

    def _list_dir_ents(self, dir_id):
        """List a directory from the storage as a dictionary (name -> entity)."""
        data = self.storage.directory_ls(dir_id) if dir_id is not None else []
        return {f["name"]: f for f in data}

    def _get_dir_ents(self, dir_id=None):
        """Get the entities of a directory as a dictionary (name -> entity).

        This function has a cache to avoid doing multiple requests to retrieve
        the same entities, as doing a directory_ls() is expensive. The cache
        is bounded and owned by the instance, so it is released together with
        the cooker instead of keeping it alive from a class-level cache.

        A ``dir_id`` of None yields an empty dictionary. The returned
        dictionary is shared with the cache and must not be mutated.
        """
        try:
            cached_ls = self._dir_ents_cache
        except AttributeError:
            cached_ls = functools.lru_cache(maxsize=self._DIR_ENTS_CACHE_SIZE)(
                self._list_dir_ents
            )
            self._dir_ents_cache = cached_ls
        return cached_ls(dir_id)

    def _compute_file_commands(self, rev, parent=None):
        """Compute all the file commands of a revision.

        Generate a diff of the files between the revision and its main parent
        to find the necessary file commands to apply.

        Args:
            rev: the revision whose tree is exported.
            parent: the main parent revision, or None to export the full tree.

        Yields:
            ``FileDeleteCommand`` and ``FileModifyCommand`` objects. Blob
            commands for new contents are written to the bundle as a side
            effect, before the file command referencing them is yielded.
        """
        # Initialize the stack with the root of the tree.
        root_dir_id = rev["directory"]
        parent_dir_id = parent["directory"] if parent else None
        stack = [(b"", root_dir_id, parent_dir_id)]

        while stack:
            # Retrieve the current directory and the directory of the parent
            # commit in order to compute the diff of the trees.
            root, cur_dir_id, prev_dir_id = stack.pop()
            cur_dir = self._get_dir_ents(cur_dir_id)
            prev_dir = self._get_dir_ents(prev_dir_id)

            # Find subtrees to delete:
            #  - Subtrees that are not in the new tree (file or directory
            #    deleted).
            #  - Subtrees that do not have the same type in the new tree
            #    (file -> directory or directory -> file)
            # NOTE(review): os.path.join uses the host path separator while
            # fast-import paths are "/"-separated; equivalent on POSIX hosts.
            for fname, prev in prev_dir.items():
                if fname not in cur_dir or prev["type"] != cur_dir[fname]["type"]:
                    yield FileDeleteCommand(path=os.path.join(root, fname))

            # Find subtrees to modify:
            #  - Leaves (files) will be added or modified using `filemodify`
            #  - Other subtrees (directories) will be added to the stack and
            #    processed in the next iteration.
            for fname, entry in cur_dir.items():
                prev = prev_dir.get(fname)
                # An entry whose type changed was deleted above, so it is
                # handled exactly like an entry that did not exist before.
                is_new = prev is None or prev["type"] != entry["type"]
                path = os.path.join(root, fname)

                if entry["type"] == "file":
                    # A file is added or modified if it was not in the tree,
                    # if its permissions changed or if its content changed.
                    if (
                        is_new
                        or entry["sha1"] != prev["sha1"]
                        or entry["perms"] != prev["perms"]
                    ):
                        # Issue a blob command for the new blobs if needed.
                        self._compute_blob_command_content(entry)
                        yield FileModifyCommand(
                            path=path,
                            mode=mode_to_perms(entry["perms"]).value,
                            dataref=(b":" + self.mark(entry["sha1"])),
                            data=None,
                        )

                elif entry["type"] == "rev":
                    # A revision is added or modified if it was not in the
                    # tree or if its target changed. It is referenced by its
                    # hexadecimal id rather than by a mark.
                    if is_new or entry["target"] != prev["target"]:
                        yield FileModifyCommand(
                            path=path,
                            mode=DentryPerms.revision.value,
                            dataref=hashutil.hash_to_hex(entry["target"]).encode(),
                            data=None,
                        )

                elif entry["type"] == "dir":
                    # A directory is added or modified if it was not in the
                    # tree or if its target changed; it is then diffed against
                    # its previous version (or against nothing when new).
                    prev_target = None if is_new else prev["target"]
                    if prev_target is None or entry["target"] != prev_target:
                        stack.append((path, entry["target"], prev_target))