# Source code for swh.shard.cli

# Copyright (C) 2025  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging

import click

# WARNING: do not import unnecessary things here to keep cli startup time under
# control


# module-level logger, named after this module
logger = logging.getLogger(__name__)

# marker of a deleted/non-populated index entry: 32 zero bytes
NULLKEY = bytes(32)

# accept "-h" in addition to click's default "--help"
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}

# Make this cli usable both from the swh.core's 'swh' cli group and from the
# direct swh-shard command (since swh-shard does not depend on swh.core).
try:
    from swh.core.cli import swh
except ImportError:
    # swh.core is not installed (ModuleNotFoundError is an ImportError
    # subclass): fall back to a plain click group decorator
    cli_group = click.group
else:
    # swh.core is available: register the shard group under the 'swh' group
    cli_group = swh.group


# Root "shard" command group. The sub-commands of this module are registered
# on it through @shard_cli_group.command(...). Depending on whether swh.core
# could be imported, cli_group is either swh.core's `swh.group` or plain
# `click.group`.
@cli_group(name="shard", context_settings=CONTEXT_SETTINGS)
@click.pass_context
def shard_cli_group(ctx):
    """Software Heritage Shard tools."""
    # The docstring above is the group's help text; the group itself performs
    # no work, so no further body is needed.


@shard_cli_group.command("info")
@click.argument(
    "shard", required=True, nargs=-1, type=click.Path(exists=True, dir_okay=False)
)
@click.pass_context
def shard_info(ctx, shard):
    "Display shard file information"
    # `shard` is a tuple of paths (nargs=-1); a report is printed for each one,
    # in the order given on the command line.

    from swh.shard import Shard

    for shardfile in shard:
        with Shard(shardfile) as shard_file:
            h = shard_file.header
            # one entry per output line, laid out as a small tree
            report = (
                f"Shard {shardfile}",
                f"├─version:    {h.version}",
                f"├─objects:    {h.objects_count}",
                f"│ ├─position: {h.objects_position}",
                f"│ └─size:     {h.objects_size}",
                "├─index",
                f"│ ├─position: {h.index_position}",
                f"│ └─size:     {h.index_size}",
                "└─hash",
                f"  └─position: {h.hash_position}",
            )
            for line in report:
                click.echo(line)


@shard_cli_group.command("create")
@click.argument(
    "shard", required=True, type=click.Path(exists=False, dir_okay=False, writable=True)
)
@click.argument("files", metavar="files", required=True, nargs=-1)
@click.option(
    "--sorted/--no-sorted",
    "sort_files",
    default=False,
    help=(
        "Sort files by inversed filename before adding them to the shard; "
        "it may help having better compression ratio when compressing "
        "the shard file"
    ),
)
@click.pass_context
def shard_create(ctx, shard, files, sort_files):
    "Create a shard file from given files"
    # Overview:
    #   1. refuse to overwrite an existing shard file;
    #   2. collect the file names (from the arguments, or from stdin when the
    #      only argument is "-");
    #   3. hash every readable file with sha256 and keep one file per digest;
    #   4. write each kept file to the new shard, keyed by its sha256 digest.

    import hashlib
    import os
    import sys

    from swh.shard import ShardCreator

    if os.path.exists(shard):
        raise click.ClickException(f"Shard file {shard} already exists. Aborted!")

    files = list(files)
    if files == ["-"]:
        # a lone "-" means: one file name per line on stdin
        files = [line.strip() for line in sys.stdin.read().splitlines()]
    click.echo(f"There are {len(files)} entries")

    seen_digests = set()
    files_to_add = {}  # file name -> sha256 digest, in first-seen order
    with click.progressbar(files, label="Checking files to add") as bfiles:
        for fname in bfiles:
            try:
                with open(fname, "rb") as f:
                    content = f.read()
            except OSError:
                # files that cannot be opened or read are skipped
                continue
            sha256 = hashlib.sha256(content).digest()
            if sha256 in seen_digests:
                # same content as an earlier file: keep only the first one
                continue
            seen_digests.add(sha256)
            files_to_add[fname] = sha256
    click.echo(f"after deduplication: {len(files_to_add)} entries")

    # The creator is given the number of deduplicated entries up front.
    with ShardCreator(shard, len(files_to_add)) as creator:
        if sort_files:
            # order by reversed file name, so names sharing a suffix end up
            # next to each other
            entries = sorted(files_to_add.items(), key=lambda item: item[0][::-1])
        else:
            entries = files_to_add.items()
        with click.progressbar(entries, label="Adding files to the shard") as items:
            for fname, sha256 in items:
                # the file is read a second time here; its content is not
                # kept in memory between the hashing and writing passes
                with open(fname, "rb") as f:
                    creator.write(sha256, f.read())
    click.echo("Done")


@shard_cli_group.command("ls")
@click.option("--skip-removed", default=False, is_flag=True)
@click.argument("shard", required=True, type=click.Path(exists=True, dir_okay=False))
@click.pass_context
def shard_list(ctx, skip_removed, shard):
    "List objects in a shard file"
    # Prints one "<hex key>: <size> bytes" line per key found while iterating
    # over the shard. With --skip-removed, keys equal to NULLKEY are omitted.

    from swh.shard import Shard

    with Shard(shard) as shard_file:
        for key in shard_file:
            if not skip_removed or key != NULLKEY:
                try:
                    size = shard_file.getsize(key)
                except KeyError:
                    # the size cannot be looked up for this key
                    size = "N/A"
                click.echo(f"{key.hex()}: {size} bytes")


@shard_cli_group.command("get")
@click.argument("shard", required=True, type=click.Path(exists=True, dir_okay=False))
@click.argument("keys", required=True, nargs=-1)
@click.pass_context
def shard_get(ctx, shard, keys):
    """Get objects from a shard file

    Keys are expected to be given as hex representation. The content of each
    requested object is written to stdout as is, in the order the keys are
    given and without any separator between objects.

    The command aborts on the first invalid or missing key.

    """

    from swh.shard import Shard

    with Shard(shard) as s:
        for key in keys:
            try:
                binkey = bytes.fromhex(key)
            except ValueError as exc:
                # not a valid hex string: report it instead of a traceback
                raise click.ClickException(f"{key}: key is invalid") from exc
            try:
                content = s[binkey]
            except KeyError as exc:
                # presumably what the Shard lookup raises for an unknown key,
                # as getsize() does in the sibling commands
                raise click.ClickException(f"{key}: key not found") from exc
            # nl=False: emit the object content untouched
            click.echo(content, nl=False)


@shard_cli_group.command("delete")
@click.argument(
    "shard", required=True, type=click.Path(exists=True, dir_okay=False, writable=True)
)
@click.argument("keys", required=True, nargs=-1)
@click.option(
    "--confirm/--no-confirm",
    default=True,
    help="Ask for confirmation before performing the deletion",
)
@click.pass_context
def shard_delete(ctx, shard, keys, confirm):
    """Delete objects from a shard file

    Keys to delete from the shard file are expected to be given as hex
    representation. If there is only one argument '-', then read the list of
    keys from stdin. Implies --no-confirm.

    If at least one key is missing, invalid or given more than once, the whole
    process is aborted.

    """
    import sys

    if keys == ("-",):
        # keys come from stdin, which therefore cannot be used to answer the
        # confirmation prompt
        keys = sys.stdin.read().split()
        confirm = False
    # bytes.fromhex() is case-insensitive, so compare keys case-insensitively
    # to catch the same object being listed twice with a different casing.
    if len({key.lower() for key in keys}) < len(keys):
        # click has no module-level fail(); abort with a regular ClickException
        # like the other error path of this command
        raise click.ClickException("There are duplicate keys, aborting")

    from swh.shard import Shard

    # First pass: check every key and collect object sizes, reporting all the
    # problems found before aborting.
    obj_size = {}
    with Shard(shard) as s:
        for key in keys:
            try:
                obj_size[key] = s.getsize(bytes.fromhex(key))
            except ValueError:
                click.secho(f"{key}: key is invalid", fg="red")
            except KeyError:
                click.secho(f"{key}: key not found", fg="red")
    if len(obj_size) < len(keys):
        raise click.ClickException(
            "There have been errors for at least one key, aborting"
        )
    click.echo(f"About to remove these objects from the shard file {shard}")
    for key in keys:
        click.echo(f"{key} ({obj_size[key]} bytes)")
    if confirm:
        # abort=True: a negative answer aborts the command
        click.confirm(
            click.style(
                "Proceed?",
                fg="yellow",
                bold=True,
            ),
            abort=True,
        )
    # Second pass: actually delete the objects, one at a time.
    with click.progressbar(keys, label="Deleting objects from the shard") as barkeys:
        for key in barkeys:
            Shard.delete(shard, bytes.fromhex(key))
    click.echo("Done")


def main():
    """Run the ``shard`` command group as a standalone program.

    Options of the commands may also be provided through environment
    variables prefixed with ``SWH`` (click's ``auto_envvar_prefix``).

    Returns:
        whatever invoking the ``shard_cli_group`` click group returns.
    """
    # Even though swh() sets up logging, we need an earlier basic logging setup
    # for the next few logging statements
    logging.basicConfig()
    return shard_cli_group(auto_envvar_prefix="SWH")


if __name__ == "__main__":
    main()