# Source code for swh.loader.mercurial.identify

# Copyright (C) 2020-2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from codecs import escape_decode
import json
from pathlib import Path
import re
import subprocess
from typing import Any, Dict, Iterator, List, NamedTuple, Optional, Union

# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import click

from swh.loader.mercurial.utils import get_minimum_env
from swh.model.cli import identify_object
from swh.model.git_objects import normalize_timestamp
from swh.model.hashutil import hash_to_bytehex
from swh.model.model import RevisionType
from swh.model.swhids import CoreSWHID, ObjectType

# Matches one `.hgtags` entry: a 40-character hexadecimal node id, one or more
# spaces, then the tag name (the rest of the line).
TAG_PATTERN = re.compile(b"([0-9A-Fa-f]{40}) +(.+)")



[docs]
class HgAuthor(NamedTuple):
    """Represent a Mercurial revision author.

    Instances are normally built from raw bytes with ``from_bytes`` and
    turned back into a plain dict with ``to_dict``.
    """

    fullname: bytes
    """full name of the author"""

    # `name` and `email` are optional: either of them may be None.
    name: Optional[bytes]
    """name of the author"""

    email: Optional[bytes]
    """email of the author"""


[docs]
    @staticmethod
    def from_bytes(data: bytes) -> "HgAuthor":
        """Build an HgAuthor from the raw author bytes of a revision.

        The parsing is delegated to
        ``swh.loader.mercurial.converters.parse_author``.

        Expected format: "name <email>"
        """
        # Imported inside the function rather than at module level.
        from swh.loader.mercurial.converters import parse_author

        parsed = parse_author(data)
        fullname, name, email = (
            parsed[key] for key in ("fullname", "name", "email")
        )
        return HgAuthor(fullname=fullname, name=name, email=email)



[docs]
    def to_dict(self) -> Dict[str, Optional[bytes]]:
        return {"fullname": self.fullname, "name": self.name, "email": self.email}




# `hg log` template producing one "key:value" pair per line. Each extra is
# printed on its own "extras:" line because the join separator itself starts
# a new "extras:" line. HgRevision.from_bytes parses exactly this layout, so
# the two must be kept in sync.
HG_REVISION_TEMPLATE = "\n".join(
    [
        "node_id:{node}",
        "author:{author}",
        "timestamp_offset:{date|json}",
        "p1:{p1.node}",
        "p2:{p2.node}",
        "extras:{join(extras, '\nextras:')}",
    ]
)  # Log template for HgRevision.from_bytes

# HgRevision.from_bytes drops any p1/p2 value equal to this constant.
NULL_NODE_ID = b"0" * 40  # Value used when no parent



[docs]
class HgRevision(NamedTuple):
    """Represent a Mercurial revision.

    Instances are normally built by ``from_bytes`` from the output of
    ``hg log`` rendered with ``HG_REVISION_TEMPLATE``.
    """

    # NOTE(review): from_bytes stores the value of the `node_id:` line here,
    # i.e. the `{node}` template output -- presumably the 40-character hex
    # form rather than raw bytes; confirm.
    node_id: bytes
    """raw bytes of the revision hash"""

    author: HgAuthor
    """author of the revision"""

    # NOTE(review): from_bytes fills `timestamp` and `offset` with the two
    # elements of the JSON-decoded `timestamp_offset` value, so in practice
    # they hold numbers rather than bytes despite the annotations.
    timestamp: bytes
    """timestamp of the revision"""

    offset: bytes
    """offset of the revision"""

    # from_bytes leaves out parents equal to NULL_NODE_ID.
    parents: List[bytes]
    """hex bytes of the revision's parents"""

    extras: Dict[bytes, bytes]
    """metadata of the revision"""

    # Stored exactly as passed to from_bytes.
    description: bytes
    """description of the revision"""


[docs]
    @staticmethod
    def from_bytes(data: bytes, description: bytes) -> "HgRevision":
        """Build an HgRevision from templated `hg log` output.

        Args:
            data: one ``key:value`` pair per line, in the layout shown below
            description: the revision description, stored unchanged

        Expected data format:
        '''
        node_id:{node}
        author:{author}
        timestamp_offset:[{timestamp}, {offset}]
        p1:{p1}
        p2:{p2}
        extras:{key1}={value1}
        ...
        extras:{keyn}={value}
        '''

        Parents equal to ``NULL_NODE_ID`` are left out, and each ``extras``
        line is split on its first ``=`` into a key and a value.
        """
        fields: Dict[str, Any] = {
            "parents": [],
            "extras": {},
            "description": description,
        }

        for raw_line in data.split(b"\n"):
            # Only the first colon separates the key from its value.
            key, value = raw_line.split(b":", 1)

            if key == b"author":
                fields["author"] = HgAuthor.from_bytes(value)
                continue

            if key == b"timestamp_offset":
                # The value is a two-element JSON array.
                fields["timestamp"], fields["offset"] = json.loads(value)
                continue

            if key == b"p1" or key == b"p2":
                if value != NULL_NODE_ID:
                    fields["parents"].append(value)
                continue

            if key == b"extras":
                extra_name, extra_content = value.split(b"=", 1)
                fields["extras"][extra_name] = extra_content
                continue

            # Any other key is used verbatim as a field name.
            fields[key.decode()] = value

        return HgRevision(**fields)



[docs]
    def branch(self) -> bytes:
        """Return the branch stored in the extras, or b"default" if absent."""
        if b"branch" in self.extras:
            return self.extras[b"branch"]
        return b"default"



[docs]
    def to_dict(self) -> Dict:
        """Serialise the revision into the dict used for SWHID computation.

        The author doubles as committer and the date as committer date. The
        extras are emitted as extra headers, after a leading
        ``time_offset_seconds`` header built from the offset.
        """
        date = normalize_timestamp(int(self.timestamp))

        headers = [(b"time_offset_seconds", str(self.offset).encode("utf-8"))]
        for name, content in self.extras.items():
            if (name, content) == (b"branch", b"default"):
                # The default branch is left out to match the historical
                # implementation.
                continue
            if name == b"transplant_source":
                # Hex-encoded to match the historical implementation.
                headers.append((name, hash_to_bytehex(escape_decode(content)[0])))
            else:
                headers.append((name, content))

        person = self.author.to_dict()

        return {
            "author": person,
            "date": date,
            "committer": person,
            "committer_date": date,
            "type": RevisionType.MERCURIAL.value,
            "message": self.description,
            "metadata": {"node": self.node_id},
            "extra_headers": tuple(headers),
            "synthetic": False,
            "parents": self.parents,
        }





[docs]
class HgBranch(NamedTuple):
    """Represent a Mercurial branch."""

    name: bytes
    """name of the branch"""

    # NOTE(review): "row" below is presumably a typo for "raw". Hg.branches()
    # fills this field with the `{node}` template output of `hg branches`.
    node_id: bytes
    """row bytes of the target revision hash"""




[docs]
class HgTag(NamedTuple):
    """Represent a Mercurial tag."""

    # Second capture group of TAG_PATTERN when built by Hg.tags().
    name: bytes
    """name of the tag"""

    # First capture group of TAG_PATTERN (40 hexadecimal characters) when
    # built by Hg.tags().
    node_id: bytes
    """hex bytes of the target revision"""




[docs]
class Hg:
    """Provide methods to extract data from a Mercurial repository.

    Commands are run through the `hg` executable, using the repository root
    as working directory.
    """

    def __init__(self, repository_root: Path) -> None:
        """Remember the repository root; no command is run at this point."""
        self._root = repository_root

    def _output(self, *args) -> bytes:
        """Run `hg` with the given arguments and return its standard output."""
        command = ["hg", *args]
        environment = get_minimum_env()
        return subprocess.check_output(command, cwd=self._root, env=environment)

    def _call(self, *args) -> None:
        """Perform a `hg` call, discarding its output.

        Raises:
            subprocess.CalledProcessError: if `hg` exits with a non-zero status
        """
        # `subprocess.run` reads both pipes while the command runs. With
        # `subprocess.check_call` nothing reads them, so a command printing
        # more than the OS pipe buffer can hold would block forever.
        subprocess.run(
            ["hg", *args],
            cwd=self._root,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            env=get_minimum_env(),
            check=True,
        )


[docs]
    def root(self) -> Path:
        """Return the root of the Mercurial repository.

        This is the path given to the constructor, returned unchanged.
        """
        return self._root



[docs]
    def log(self, rev: Optional[Union[bytes, str]] = None) -> List[HgRevision]:
        """Return the specified revisions of the Mercurial repository.

        Mercurial revsets are supported. (See `hg help revsets`)

        If no revision range is specified, return all revisions.

        The revisions are returned in the reverse of the order in which
        `hg log` prints them.
        """
        command: List[Union[bytes, str]] = ["log"]
        if rev:
            command += ["-r", rev]
        command += ["-T", "{node}\n"]

        node_ids = self._output(*command).splitlines()
        return [self._revision(node_id) for node_id in reversed(node_ids)]


    def _revision(self, revision: bytes) -> HgRevision:
        """Load one revision: its templated log fields and raw description."""
        templated = self._output("log", "-r", revision, "-T", HG_REVISION_TEMPLATE)

        # `hg log` strips the description, so the raw one is read from
        # `hg debugdata -c` instead. In that output the description comes
        # after some metadata, separated from it by the first empty line.
        changelog_entry = self._output("debugdata", "-c", revision)
        _metadata, raw_description = changelog_entry.split(b"\n\n", 1)

        return HgRevision.from_bytes(templated, raw_description)


[docs]
    def up(self, rev: bytes) -> None:
        """Update the repository working directory to the specified revision.

        Runs `hg up <rev>` through `_call`; nothing is returned.
        """
        self._call("up", rev)



[docs]
    def branches(self) -> List[HgBranch]:
        """List the repository named branches.

        Returns:
            one `HgBranch` per block printed by `hg branches`, or an empty
            list when the command prints nothing.
        """
        output = self._output("branches", "-T", "{branch}\n{node}\n\n").strip()

        # `b"".split(b"\n\n")` yields `[b""]`, and that single empty block
        # cannot be unpacked into a name and a node id: return early instead
        # of raising ValueError.
        if not output:
            return []

        branches = []

        # Each block holds the branch name on one line and its node on the next.
        for block in output.split(b"\n\n"):
            name, node_id = block.splitlines()
            branches.append(HgBranch(name=name, node_id=node_id))

        return branches



[docs]
    def tip(self) -> HgRevision:
        """Return the `tip` revision.

        This is the first element of `self.log("tip")`.
        """
        return self.log("tip")[0]



[docs]
    def tags(self) -> List[HgTag]:
        """Return the repository's tags as defined in the `.hgtags` file.

        `.hgtags` being like any other repository's tracked file, its content can vary
        from revision to revision. The returned value therefore depends on the current
        revision of the repository.

        Each tag name is returned once, with the node id of its last entry in
        the file. Tags whose last entry is the null node id are left out.
        Lines that do not match `TAG_PATTERN` are ignored. An empty list is
        returned when there is no `.hgtags` file.
        """
        hgtags = self._root / ".hgtags"
        if not hgtags.is_file():
            return []

        # Keyed by tag name, not by node id: several tags may point to the
        # same revision and must all be kept, while a later line for the same
        # name replaces the earlier one.
        node_id_by_name: Dict[bytes, bytes] = {}
        for line in hgtags.read_bytes().splitlines():
            match = TAG_PATTERN.match(line)
            if match is None:
                continue
            node_id, name = match.groups()
            node_id_by_name[name] = node_id

        # Mercurial records a tag removal as an entry pointing to the null
        # node id; such a tag has no revision to refer to.
        return [
            HgTag(name=name, node_id=node_id)
            for name, node_id in node_id_by_name.items()
            if node_id != NULL_NODE_ID
        ]




@click.group()
@click.option(
    "--directory",
    "-d",
    help=("Path to the Mercurial repository. If unset, the current directory is used"),
)
@click.pass_context
def main(ctx, directory=None):
    """Compute the Software Heritage persistent identifier (SWHID) for the given
       source code object(s).

    For more details about SWHIDs see:

    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    """
    # NOTE: the docstring above is shown by click as the command help text.

    # Ensure that ctx.obj exists and is a dict (in case `main()` is called
    # by means other than the `if __name__ == "__main__"` block of this module).
    ctx.ensure_object(dict)

    # Without --directory, `Path()` designates the current directory.
    root = Path(directory) if directory else Path()
    if not root.exists():
        raise IOError(f"{root!r} does not exist")

    # The sub-commands read the repository root back from the context.
    ctx.obj["HG_ROOT"] = root



[docs]
def identify_directory(path: Path) -> CoreSWHID:
    """Return the SWHID of the directory at the given path.

    Symlinks are followed and the `.hg` pattern is excluded.
    """
    swhid_text = identify_object(
        "directory",
        follow_symlinks=True,
        exclude_patterns=[b".hg"],
        obj=str(path),
    )
    return CoreSWHID.from_string(swhid_text)




[docs]
class RevisionIdentity(NamedTuple):
    """Represent a swh revision identity.

    Its string form is the revision SWHID followed by a tab and the decoded
    node id.
    """

    swhid: CoreSWHID
    """SWH Identifier of the revision."""

    node_id: bytes
    """node_id hex bytes"""

    directory_swhid: CoreSWHID
    """SWH Identifier of the directory"""


[docs]
    def dir_uri(self) -> str:
        """Return the directory SWHID and the decoded node id, tab-separated."""
        node = self.node_id.decode()
        return f"{self.directory_swhid}\t{node}"


    def __str__(self) -> str:
        """Return the revision SWHID and the decoded node id, tab-separated."""
        node = self.node_id.decode()
        return f"{self.swhid}\t{node}"




[docs]
def identify_revision(
    hg: Hg,
    rev: Optional[bytes] = None,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
) -> Iterator[RevisionIdentity]:
    """Yield the identity of each selected revision of the repository.

    Args:
        hg: A `Hg` repository instance
        rev: An optional revision or Mercurial revsets (See `hg help revsets`)
             If not provided all the repository revisions will be computed.
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
            It will be updated in place with new mappings.

    The working directory of the repository is updated to every revision
    that gets identified.
    """
    from swh.model.model import Revision

    if node_id_2_swhid is None:
        node_id_2_swhid = {}

    for hg_revision in hg.log(rev):
        revision_dict = hg_revision.to_dict()

        # The directory identifier is computed from the checked-out files.
        hg.up(hg_revision.node_id)
        directory_swhid = identify_directory(hg.root())
        revision_dict["directory"] = directory_swhid.object_id

        parent_ids = []
        for parent_node in revision_dict["parents"]:
            if parent_node not in node_id_2_swhid:
                # Parents missing from the cache are identified recursively,
                # so the recursion depth grows with the number of uncached
                # ancestors.
                parent_identity = next(
                    identify_revision(hg, parent_node, node_id_2_swhid)
                )
                node_id_2_swhid[parent_node] = parent_identity.swhid
            parent_swhid = node_id_2_swhid[parent_node]
            assert parent_swhid.object_type == ObjectType.REVISION
            parent_ids.append(parent_swhid.object_id)
        revision_dict["parents"] = parent_ids

        revision_swhid = Revision.from_dict(revision_dict).swhid()
        node_id_2_swhid[hg_revision.node_id] = revision_swhid

        yield RevisionIdentity(
            swhid=revision_swhid,
            node_id=hg_revision.node_id,
            directory_swhid=directory_swhid,
        )




[docs]
class ReleaseIdentity(NamedTuple):
    """Represent a swh release identity.

    Its string form is the release SWHID followed by a tab and the decoded
    release name.
    """

    swhid: CoreSWHID
    """SWH Identifier of the release."""

    node_id: bytes
    """node_id hex bytes"""

    name: bytes
    """name of the release"""

    def __str__(self) -> str:
        """Return the release SWHID and the decoded name, tab-separated."""
        label = self.name.decode()
        return f"{self.swhid}\t{label}"




[docs]
def identify_release(
    hg: Hg,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
) -> Iterator[ReleaseIdentity]:
    """Yield the identity of the release built from each repository tag.

    Args:
        hg: A `Hg` repository instance
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
            If not provided it will be computed using `identify_revision`.
    """
    from swh.model.model import Release, ReleaseTargetType

    if node_id_2_swhid is None:
        # No cache given: identify every revision of the repository.
        node_id_2_swhid = {}
        for identity in identify_revision(hg):
            node_id_2_swhid[identity.node_id] = identity.swhid

    for tag in hg.tags():
        target_swhid = node_id_2_swhid[tag.node_id]
        assert target_swhid.object_type == ObjectType.REVISION

        # Only the name and the target are filled in; the release has no
        # message, no date and an empty author.
        release = Release.from_dict(
            {
                "name": tag.name,
                "target": target_swhid.object_id,
                "target_type": ReleaseTargetType.REVISION.value,
                "message": None,
                "metadata": None,
                "synthetic": False,
                "author": {"name": None, "email": None, "fullname": b""},
                "date": None,
            }
        )

        yield ReleaseIdentity(
            swhid=release.swhid(),
            node_id=tag.node_id,
            name=tag.name,
        )




[docs]
def identify_snapshot(
    hg: Hg,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
    releases: Optional[List[ReleaseIdentity]] = None,
) -> CoreSWHID:
    """Return the repository snapshot identity.

    Args:
        hg: A `Hg` repository instance
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
             If not provided it will be computed using `identify_revision`.
        releases: an optional list of `ReleaseIdentity`.
            If not provided it will be computed using `identify_release`.

    The snapshot holds a `HEAD` alias to the branch of the tip revision, one
    entry per named branch and one entry per release. Entries are written in
    that order, so a later entry replaces an earlier one with the same name.
    """
    from swh.model.model import Snapshot, SnapshotTargetType

    if node_id_2_swhid is None:
        node_id_2_swhid = {}
        for identity in identify_revision(hg):
            node_id_2_swhid[identity.node_id] = identity.swhid

    if releases is None:
        releases = list(identify_release(hg, node_id_2_swhid))

    snapshot_branches = {
        b"HEAD": {
            "target": hg.tip().branch(),
            "target_type": SnapshotTargetType.ALIAS.value,
        }
    }

    for hg_branch in hg.branches():
        head_swhid = node_id_2_swhid[hg_branch.node_id]
        assert head_swhid.object_type == ObjectType.REVISION
        snapshot_branches[hg_branch.name] = {
            "target": head_swhid.object_id,
            "target_type": SnapshotTargetType.REVISION.value,
        }

    for release_identity in releases:
        assert release_identity.swhid.object_type == ObjectType.RELEASE
        snapshot_branches[release_identity.name] = {
            "target": release_identity.swhid.object_id,
            "target_type": SnapshotTargetType.RELEASE.value,
        }

    return Snapshot.from_dict({"branches": snapshot_branches}).swhid()



@main.command()
@click.argument("rev", required=False)
@click.pass_context
def revision(ctx, rev):
    """Compute the SWHID of a given revision.

    If specified REV allow to select a single or multiple revisions
    (using the Mercurial revsets language: `hg help revsets`)
    """
    # The repository root is read from the click context object.
    repository = Hg(ctx.obj["HG_ROOT"])
    identities = identify_revision(repository, rev)

    # One line per revision, as rendered by RevisionIdentity.__str__.
    for identity in identities:
        click.echo(identity)


@main.command()
@click.pass_context
def snapshot(ctx):
    """Compute the SWHID of the snapshot."""
    repository_root = ctx.obj["HG_ROOT"]
    swhid = identify_snapshot(Hg(repository_root))

    # Output format: snapshot SWHID, a tab, then the repository root.
    click.echo(f"{swhid}\t{repository_root}")


@main.command()
@click.pass_context
def all(ctx):
    """Compute the SWHID of all the repository objects."""
    repository_root = ctx.obj["HG_ROOT"]
    repository = Hg(repository_root)

    directory_lines = []
    revision_lines = []
    node_id_2_swhid = {}
    for revision_identity in identify_revision(repository):
        directory_lines.append(revision_identity.dir_uri())
        revision_lines.append(str(revision_identity))
        node_id_2_swhid[revision_identity.node_id] = revision_identity.swhid

    # The revision cache built above is reused for releases and the snapshot.
    release_lines = []
    release_identities = []
    for release_identity in identify_release(repository, node_id_2_swhid):
        release_lines.append(str(release_identity))
        release_identities.append(release_identity)

    snapshot_swhid = identify_snapshot(
        repository, node_id_2_swhid, release_identities
    )

    # Nothing is printed before everything is computed. The output order is
    # directories, revisions, releases, and finally the snapshot.
    for lines in (directory_lines, revision_lines, release_lines):
        for line in lines:
            click.echo(line)

    click.echo(f"{snapshot_swhid}\t{repository_root}")


# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    main()