Source code for swh.graph.luigi.blobs_datasets

# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""
Luigi tasks for blob-centric datasets
=====================================

This module contains `Luigi <https://luigi.readthedocs.io/>`_ tasks
driving the creation of derived datasets centered around a subset of
content objects in the graph. Currently, this means:

* the `license dataset <https://annex.softwareheritage.org/public/dataset/license-blobs/>`_, and
* the `citation dataset <https://annex.softwareheritage.org/public/dataset/citation-blobs/>`_

File layout
-----------

This assumes a local compressed graph (from :mod:`swh.graph.luigi.compressed_graph`)
is present, and generates/manipulates the following files::

    base_dir/
        <date>[_<flavor>]/
            citation-blobs/
                blobs-earliest.csv.zst
                blobs-fileinfo.csv.zst
                blobs-nb-origins.csv.zst
                blobs-origins.csv.zst
                blobs-sample20k.tar.zst
                blobs.tar.zst
                import-dataset.sql
                license-blobs.csv.zst
            license-blobs/
                <same as above, plus these two:>
                blobs-scancode.csv.zst
                blobs-scancode.ndjson.zst
"""

# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import contextlib
import functools
import hashlib
import logging
import os
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ContextManager,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    TypeVar,
    cast,
)

import luigi

from swh.dataset.luigi import Format, LocalExport

if TYPE_CHECKING:
    import asyncio

    import magic
    from requests import Session

    from swh.graph.grpc.swhgraph_pb2_grpc import TraversalServiceStub


def _s3_url_to_bucket_path(s3_url: str) -> Tuple[str, str]:
    loc = _removeprefix(s3_url, "s3://")
    bucket, path = loc.split("/", 1)
    return bucket, path


# XXX in wait of Python 3.9 for PER 616...
def _removeprefix(s, prefix):
    if s.startswith(prefix):
        return s[len(prefix) :]


logger = logging.getLogger(__name__)

COMPRESS_LEVEL = 19
GRAPH_REQUEST_CONCURRENCY = 70
EMPTY_FILE_SHA1 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"


SELECTION_QUERIES = {
    "citation": r"""
        SELECT
            concat('swh:1:cnt:', t1.target) AS swhid,
            t2.sha1 AS sha1,
            t1.filename AS name
        FROM (
            SELECT DISTINCT target, lower(trim(TRY_CAST(name AS VARCHAR))) AS filename
            FROM directory_entry
            WHERE (
                type='file'
                AND (
                    lower(trim(TRY_CAST(name AS VARCHAR))) = 'codemeta.json'
                    OR lower(trim(TRY_CAST(name AS VARCHAR))) = 'citation.cff'
                )
            )
        ) AS t1
        LEFT JOIN (SELECT sha1,sha1_git FROM content) AS t2
        ON (t1.target=t2.sha1_git)
        ORDER BY sha1
    """,
    "license": r"""
        SELECT
            concat('swh:1:cnt:', t1.target) AS swhid,
            t2.sha1 AS sha1,
            t1.filename AS name
        FROM (
            SELECT DISTINCT target, lower(trim(TRY_CAST(name AS VARCHAR))) AS filename
            FROM directory_entry
            WHERE (
                type = 'file' AND
                -- TODO: replace not(empty(regexp_match())) with regexp_find()
                not(empty(regexp_match(
                    lower(TRY_CAST(name AS VARCHAR)),
                    '^([a-z0-9._-]+\.)?(copying|licen(c|s)(e|ing)|notice|copyright|disclaimer|authors)(\.[a-z0-9\._-]+)?$'
                )))
            )
        ) AS t1
        LEFT JOIN (SELECT sha1,sha1_git FROM content) AS t2
        ON (t1.target=t2.sha1_git)
        ORDER BY sha1
    """,
    "readme": r"""
        SELECT
            concat('swh:1:cnt:', t1.target) AS swhid,
            t2.sha1 AS sha1,
            t1.filename AS name
        FROM (
            SELECT DISTINCT target, lower(trim(TRY_CAST(name AS VARCHAR))) AS filename
            FROM directory_entry
            WHERE (
                type = 'file' AND
                -- TODO: replace not(empty(regexp_match())) with regexp_find()
                not(empty(regexp_match(
                    lower(TRY_CAST(name AS VARCHAR)),
                    '^(readme)(\.[a-z0-9\._-]+)?$'
                )))
            )
        ) AS t1
        LEFT JOIN (SELECT sha1,sha1_git FROM content) AS t2
        ON (t1.target=t2.sha1_git)
        ORDER BY sha1
    """,
    "known_swhids": r"""
        SELECT
            concat('swh:1:cnt:', content.sha1_git) AS swhid,
            content.sha1 AS sha1,
            '<unknown>' AS name
        FROM swhids
        INNER JOIN content
        ON (swhids.swhid=concat('swh:1:cnt:', content.sha1_git))
        ORDER BY sha1
    """,
}


_mime_guesser = None


def _init_mime_guesser():
    global _mime_guesser
    if _mime_guesser is None:
        import magic

        _mime_guesser = magic.Magic(mime=True, mime_encoding=True)

    return _mime_guesser


def _guess_mime(path: str) -> Tuple[str, str]:
    _mime_guesser = _init_mime_guesser()
    info = _mime_guesser.from_file(path)
    mime_type, encoding = info.split()
    mime_type, encoding = mime_type.rstrip(";"), _removeprefix(encoding, "charset=")

    return (mime_type, encoding)



[docs]
@contextlib.contextmanager
def atomic_zstd_writer(result_path: Path):
    """Returns a file-like object, which writes to a temporary file,
    then atomically renames it to the ``result_path`` on success."""
    import pyzstd

    tmp_result_path = Path(f"{result_path}.tmp")
    try:
        with pyzstd.open(
            tmp_result_path, "wt", level_or_option=COMPRESS_LEVEL
        ) as output_fd:
            yield output_fd

        tmp_result_path.replace(result_path)

    except BaseException:
        tmp_result_path.unlink()
        raise




[docs]
@contextlib.contextmanager
def atomic_csv_zstd_writer(result_path: Path):
    """Returns a ``csv.writer`` object, which writes to a temporary file,
    then atomically renames it to the ``result_path`` on success."""
    import csv

    with atomic_zstd_writer(result_path) as output_fd:
        yield csv.writer(output_fd, lineterminator="\n")



# luigi.Task with some helpers to get paths
class _BaseTask(luigi.Task):
    blob_filter: str
    derived_datasets_path: Path
    previous_derived_datasets_path: Optional[Path]

    def blob_count(self) -> int:
        """Returns the total number of selected blobs"""
        with self.blob_count_path().open() as fd:
            return int(fd.read().strip())

    def blob_size(self) -> int:
        """Returns the total size of selected blobs"""
        with self.blob_size_path().open() as fd:
            return int(fd.read().strip())

    def blob_size_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "stats" / "size.txt"

    def blob_count_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "stats" / "count.txt"

    def blob_dir(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "blobs"

    def blob_list_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "blobs.csv.zst"

    def blob_tarball_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "blobs.tar.zst"

    def previous_blob_tarball_path(self) -> Optional[Path]:
        if self.previous_derived_datasets_path:
            return (
                self.previous_derived_datasets_path / self.blob_filter / "blobs.tar.zst"
            )
        else:
            return None

    def sample_blob_tarball_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "blobs-sample20k.tar.zst"

    def iter_blobs(
        self, *, unique_sha1: bool, with_tqdm: bool = True
    ) -> Iterator[Tuple[str, str, str]]:
        """Yields ``(swhid, sha1, name)`` by reading :file:`blobs.csv.zst`,
        and uses tqdm for progress report.

        If ``unique_sha1`` is True, skips all but the first occurrence of each sha1."""
        import csv

        import pyzstd
        import tqdm

        last_sha1 = "" * 20
        with pyzstd.open(self.blob_list_path(), "rt") as fd:
            reader = csv.reader(cast(Iterator[str], fd))
            header = next(reader)
            if header != ["swhid", "sha1", "name"]:
                raise ValueError(
                    "Unexpected header in %s: %r", self.blob_list_path(), header
                )

            rows_it: Iterable[List[str]] = reader
            if with_tqdm:
                rows_it = tqdm.tqdm(rows_it, total=self.blob_count())

            for row in rows_it:
                try:
                    (swhid, sha1, name) = row
                except ValueError:
                    raise ValueError(f"Unexpected row: {row!r}") from None
                if sha1 < last_sha1:
                    raise ValueError(f"Not sorted by sha1 ({last_sha1} before {sha1}")
                if not unique_sha1 or sha1 != last_sha1:
                    yield tuple(row)  # type: ignore[misc]
                last_sha1 = sha1

    def blob_paths(self, sha1: str) -> Tuple[Path, Path]:
        """Returns ``(sharded_path, unsharded_path)``, which are the two possible
        paths for this blob, depending on the blob dir layout."""
        sharded_path = self.blob_dir() / sha1[0:2] / sha1[2:4] / sha1
        unsharded_path = self.blob_dir() / sha1

        return (sharded_path, unsharded_path)

    def complete(self) -> bool:
        if not super().complete():
            return False

        for target in self.output():
            output_path = target.path
            if output_path.endswith(".csv.zst"):
                check_csv(Path(output_path))

        return True


_TCallable = TypeVar("_TCallable", bound=Callable)


def _log_exceptions(f: _TCallable) -> _TCallable:
    """Decorator for functions called by asyncio that would never be awaited if they
    crashed, causing asyncio to silently hide the exception."""

    @functools.wraps(f)
    def newf(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except BaseException:
            logger.exception(
                "Error while calling %s with %r and %r", f.__name__, args, kwargs
            )
            raise

    return newf  # type: ignore[return-value]


class _ConcurrentCsvWritingTask(_BaseTask):
    """Base classes for tasks writing a CSV using asyncio.

    asyncio is only used for gRPC requires to swh-graph; file writes are synchronous
    to keep the code simpler, as performance improvements from making them async
    would be negligeable."""

    CSV_HEADER: Tuple[str, str]

    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()
    grpc_api = luigi.Parameter()

    stub: "TraversalServiceStub"

    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`SelectBlobs`"""
        return SelectBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )

    def run(self) -> None:
        """Calls the :meth:`process_one` function, and writes its results as
        a two-column CSV to the target defined by :meth:`output`.
        """
        import asyncio

        asyncio.run(self._run_async())

    async def _run_async(self) -> None:
        import asyncio

        import grpc.aio

        import swh.graph.grpc.swhgraph_pb2_grpc as swhgraph_grpc

        input_queue: asyncio.Queue[Tuple[str, str, str]] = asyncio.Queue(maxsize=20)
        result_queue: asyncio.Queue[Tuple[str, str]] = asyncio.Queue(maxsize=20)

        async with grpc.aio.insecure_channel(self.grpc_api) as channel:
            self.stub = swhgraph_grpc.TraversalServiceStub(channel)

            fill_queue_task = asyncio.create_task(self._fill_input_queue(input_queue))
            write_task = asyncio.create_task(self._write_results(result_queue))

            worker_tasks = [
                asyncio.create_task(self._worker(input_queue, result_queue))
                for _ in range(GRAPH_REQUEST_CONCURRENCY)
            ]

            await write_task  # wait for workers to write everything

            await fill_queue_task  # should be instant

            for task in worker_tasks:
                task.cancel()

            await asyncio.gather(
                fill_queue_task,
                write_task,
                *worker_tasks,
                return_exceptions=True,
            )

    @_log_exceptions
    async def _fill_input_queue(
        self, input_queue: "asyncio.Queue[Tuple[str, str, str]]"
    ) -> None:
        for swhid, sha1, name in self.iter_blobs(with_tqdm=False, unique_sha1=True):
            if not swhid.startswith("swh:1:"):
                raise ValueError(f"Invalid SWHID: {swhid}")

            await input_queue.put((swhid, sha1, name))

    @_log_exceptions
    async def _worker(
        self,
        input_queue: "asyncio.Queue[Tuple[str, str, str]]",
        result_queue: "asyncio.Queue[Tuple[str, str]]",
    ) -> None:
        while True:  # exit via Task.cancel()
            row = await input_queue.get()
            (swhid, sha1, name) = row
            try:
                res = await self.process_one(row)
            except BaseException as e:
                res = (swhid, "")
                logger.exception("Error while processing %r", row)
                if not isinstance(e, Exception):
                    # KeyboardInterrupt, ...
                    raise
            await result_queue.put(res)

    async def _write_results(
        self, result_queue: "asyncio.Queue[Tuple[str, str]]"
    ) -> None:
        import tqdm.asyncio

        (target,) = self.output()
        result_path = Path(target.path)

        with atomic_csv_zstd_writer(result_path) as writer:
            writer.writerow(self.CSV_HEADER)

            async for i in tqdm.asyncio.trange(self.blob_count()):
                (swhid, result) = await result_queue.get()
                writer.writerow((swhid, result))

    async def process_one(self, row: Tuple[str, str, str]) -> Tuple[str, str]:
        raise NotImplementedError(f"{self.__class__.__name__}.process_one")



[docs]
def check_csv(csv_path: Path) -> None:
    import re

    import pyzstd

    with cast(ContextManager[Iterator[str]], pyzstd.open(csv_path, "rt")) as fd:
        try:
            header = next(fd)
        except StopIteration:
            raise ValueError(f"{csv_path} is empty") from None
        except pyzstd.ZstdError:
            raise ValueError(f"{csv_path} could not be decompressed as zstd") from None

        # check the header contains no whitespace
        if len(header.split()) != 1:
            raise ValueError(
                f"{csv_path.name} is not comma-separated "
                f"(or has whitespaces in column name)"
            )

        columns = header.split(",")
        if columns[0] != "swhid":
            raise ValueError(
                f"First column of {csv_path.name} is {columns[0]!r} "
                f"but should be 'swhid'"
            )

        try:
            first_line = next(fd)
        except StopIteration:
            raise ValueError(f"{csv_path} has no content") from None

        if not re.match("^swh:1:cnt:[0-9a-f]{40},", first_line):
            raise ValueError(f"{csv_path} has unexpected first row: {first_line}")




[docs]
class SelectBlobs(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    local_export_path = luigi.PathParameter()
    derived_datasets_path = luigi.PathParameter()
    known_swhids_csv = luigi.Parameter(default="")


[docs]
    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalExport`"""
        return [
            LocalExport(
                local_export_path=self.local_export_path,
                formats=[Format.orc],  # type: ignore[attr-defined]
            ),
        ]



[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs.csv.zst` and :file:`stats/count.txt` in
        ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(self.blob_list_path()),
            luigi.LocalTarget(self.blob_count_path()),
        ]



[docs]
    def run(self) -> None:
        """Runs an Athena query to get the list of blobs and writes it to
        :file:`blobs.csv.zst`."""
        import tempfile
        import textwrap

        import datafusion
        import pyarrow.dataset

        from ..shell import AtomicFileSink, Command

        ctx = datafusion.SessionContext()
        for table in ("directory_entry", "content"):
            ctx.register_dataset(
                table,
                pyarrow.dataset.dataset(
                    self.local_export_path / "orc" / table, format="orc"
                ),
            )

        if self.blob_filter == "known_swhids":
            assert self.known_swhids_csv, "Missing --SelectBlobs-known-swhids-csv"
            ctx.sql(
                f"""
                CREATE EXTERNAL TABLE swhids (
                    swhid VARCHAR NOT NULL
                )
                STORED AS CSV
                LOCATION '{self.known_swhids_csv}'
                OPTIONS (
                    'has_header' 'true',
                    'format.compression' 'zstd',
                );
                """
            )

        with tempfile.NamedTemporaryFile(suffix=".csv") as sql_res:
            logger.info("Running query...")
            query = f"""
                COPY ({SELECTION_QUERIES[self.blob_filter]})
                TO '{sql_res.name}'
                OPTIONS (
                    'has_header' 'true'
                )
            """
            logger.debug("%s", textwrap.indent(query, "    "))
            df = ctx.sql(query)
            logger.info("Reformatting...")

            columns = df.schema().names
            assert columns == ["swhid", "sha1", "name"], columns

            output_path = self.blob_list_path()
            output_path.parent.mkdir(parents=True, exist_ok=True)

            df.cache()

            # In the 2022-04-24 license dataset, the 'egrep' command filters
            # just 5 entries:
            #
            # $ egrep -v '^"[^"]*","[^"]*","[^"]*"$' license-blobs.csv
            # "swh:1:cnt:03e1933241b8c3878d81c0184d7f2fd3d8cd6185","037d40bc6bcb42dfd740be545dbdf2df3405442f","LICENSE
            # "
            # "swh:1:cnt:65a5c662900ee946583147129720563fd4ba286d","40e9258799f752fe25d7518155c615c1c497b7ac","LICENSE.md
            # "
            # "swh:1:cnt:8751b326784c7e145b242637866a4d46e8e274a5","a6bad643d9defc1e667c708d0d9aa9f1d6752fbc","LICENSE
            # "
            # "swh:1:cnt:e69de29bb2d1d6434b8b29ae775ad8c2e48c5391","da39a3ee5e6b4b0d3255bfef95601890afd80709","license.txt
            # "
            # "swh:1:cnt:82714d7648eb4f6cda2ed88fc4768e7d05472fe6","f096063880f4d0329856d3fca51c6d8afa13af9b","LICENSE.txt
            # "
            # fmt: off
            (
                Command.pv(sql_res.name)
                | Command.egrep('^[^,]*,[^,]*,[^,]*$')
                | Command.zstdmt("-")
                > AtomicFileSink(output_path)
            ).run()
            # fmt: on

            logger.info("Counting...")
            count = sum(1 for _ in self.iter_blobs(with_tqdm=False, unique_sha1=True))
            self.blob_count_path().parent.mkdir(exist_ok=True, parents=True)
            with self.blob_count_path().open("wt") as fd:
                fd.write(f"{count}\n")





[docs]
class DownloadBlobs(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()
    previous_derived_datasets_path = luigi.OptionalPathParameter(default=None)
    parallel_downloads = luigi.IntParameter(default=10, significant=False)
    download_url = luigi.Parameter(
        default="https://softwareheritage.s3.amazonaws.com/content/{sha1}",
        description="""Where to download blobs from. {sha1} will be replaced by
        the file's SHA1 hexdigest. Alternative value:
        https://archive.softwareheritage.org/api/1/content/sha1:{sha1}/raw/""",
        significant=False,
    )
    decompression_algo = luigi.ChoiceParameter(
        choices=["none", "gzip"],
        default="gzip",
        description="""The decompression algorithm to use after downloading.
        Defaults to 'none' to match the SWH API. Should be 'none' when downloading from
        archive.softwareheritage.org""",
    )

    _session = None
    _session_pid = None


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`SelectBlobs`"""
        return SelectBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )



[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`stats/size.txt` in ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(self.blob_dir()),
            luigi.LocalTarget(self.blob_size_path()),
        ]


    @classmethod
    def _compute_sha1(cls, path: Path) -> str:
        with path.open("rb") as fd:
            h = hashlib.sha1()
            while True:
                data = fd.read(40960)
                if not data:
                    break
                h.update(data)
        return h.hexdigest()

    def _download_blob(self, session: "Session", path: Path, sha1: str) -> int:
        """Returns the size in bytes."""
        import shutil
        import time

        import requests

        while True:
            url = self.download_url.format(sha1=sha1)
            try:
                resp = session.get(url, stream=True)
            except requests.exceptions.ConnectionError:
                logger.exception("Failed to query %s, retrying in 10 seconds:", url)
                time.sleep(10)
                continue
            if resp.status_code == 429:
                rate_limit_reset = int(resp.headers["X-RateLimit-Reset"])
                wait_seconds = max(10, rate_limit_reset - time.time())
                logger.warning("Waiting timeout for %d seconds", wait_seconds)
                time.sleep(wait_seconds)
                continue
            elif 500 <= resp.status_code < 600:
                logger.warning("Got %d error, retrying in 10 seconds", resp.status_code)
                time.sleep(10)
            elif resp.status_code == 200:
                break
            elif resp.status_code == 404:
                logger.error("%s returned 404", url)
                return 0
            else:
                msg = f"Unexpected status code: {resp.status_code}"
                logger.error(msg)
                logger.error(resp.text)
                raise Exception(msg)

        tmp_path = path.parent / f".tmp_{sha1}"

        if self.decompression_algo == "none":
            with tmp_path.open("wb") as fd:
                for chunk in resp.iter_content(chunk_size=40960):
                    fd.write(chunk)
        elif self.decompression_algo == "gzip":
            import gzip

            if not hasattr(gzip, "BadGzipFile"):
                # Python < 3.8
                BadGzipFile = OSError
            else:
                BadGzipFile = gzip.BadGzipFile

            try:
                with gzip.open(resp.raw, "rb") as compressed_fd:
                    with tmp_path.open("wb") as decompressed_fd:
                        shutil.copyfileobj(compressed_fd, decompressed_fd)
            except BadGzipFile as e:
                if e.args[0] == r"Not a gzipped file (b'\x00\x00')":
                    # WTF? https://gitlab.softwareheritage.org/swh/meta/-/issues/5034
                    print(f"{sha1} has null bytes instead of magic value")
                    return 0
                else:
                    raise
        else:
            assert False, f"Unexpected decompression algo: {self.decompression_algo}"

        if self._compute_sha1(tmp_path) != sha1:
            if tmp_path.stat().st_size == 0 and sha1 != EMPTY_FILE_SHA1:
                msg = f"Blob downloaded to {tmp_path} is empty"
            else:
                msg = f"Blob downloaded to {tmp_path} does not match its checksum"
            logger.error(msg)
            raise Exception(msg)

        # Atomically write to destination
        tmp_path.rename(path)

        return path.stat().st_size

    def _download_blob_if_missing(self, session: "Session", sha1: str) -> int:
        """Returns the size in bytes."""
        assert set(sha1) <= set("0123456789abcdef"), sha1
        assert len(sha1) == 40, sha1
        (sharded_path, unsharded_path) = self.blob_paths(sha1)

        for path in (sharded_path, unsharded_path):
            if path.exists():
                if self._compute_sha1(path) != sha1:
                    msg = f"Existing blob at {path} does not match its checksum"
                    logger.error(msg)
                    raise Exception(msg)
                logger.debug("Skipping %s, already exists", sha1)
                return path.stat().st_size
        else:
            logger.debug("Downloading %s", sha1)
            return self._download_blob(session, sharded_path, sha1)


[docs]
    def run(self) -> None:
        """Reads file SHA1 hashes from :file:`blobs.csv.zst` and downloads them
        to :file:`blobs/`."""

        import multiprocessing
        import multiprocessing.dummy

        import tqdm

        from ..shell import Command

        # Create sharded directories for the blobs
        for i in range(256):
            for j in range(256):
                (self.blob_dir() / f"{i:02x}" / f"{j:02x}").mkdir(
                    exist_ok=True, parents=True
                )

        previous_blob_tarball_path = self.previous_blob_tarball_path()
        if previous_blob_tarball_path:
            # reuse blobs from a previous version of the dataset, so we don't have
            # to download them one by one
            print(f"Unpacking previous blobs from {previous_blob_tarball_path}")
            # fmt: off
            (
                Command.pv(previous_blob_tarball_path)
                | Command.zstdcat()
                | Command.tar("-x", "-C", self.blob_dir().parent)  # tar root is blobs/
            ).run()
            # fmt: on
            print("Done unpacking")

        total_size = 0

        # Use a thread pool (more efficient because no IPC) if there is no compression
        # (because then it's IO bound), and a process pool when there is (it would
        # be single-thread CPU bound otherwise)
        Pool = multiprocessing.Pool
        if self.decompression_algo == "none":
            Pool = multiprocessing.dummy.Pool  # type: ignore[assignment]
        with Pool(self.parallel_downloads) as pool:
            for size in tqdm.tqdm(
                pool.imap_unordered(
                    self._worker,
                    self.iter_blobs(unique_sha1=True, with_tqdm=False),
                    chunksize=100,
                ),
                total=self.blob_count(),
            ):
                total_size += size

        with self.blob_size_path().open("wt") as fd:
            fd.write(f"{total_size}\n")



[docs]
    def session(self):
        if self._session_pid != os.getpid():
            # we forked; create a new session for this process
            import requests

            self._session_pid = os.getpid()

            self._session = requests.Session()
            self._session.headers["User-Agent"] = (
                f"SWH {self.blob_filter} Dataset blob downloader"
            )

            auth_token = os.environ.get("SWH_AUTH_TOKEN")
            if auth_token:
                self._session.headers["Authorization"] = f"Bearer {auth_token}"
        return self._session


    def _worker(self, args):
        (swhid, sha1, name) = args
        return self._download_blob_if_missing(self.session(), sha1)




[docs]
class MakeBlobTarball(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`DownloadBlobs`"""
        return DownloadBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )



[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs.tar.zst` in ``self.derived_datasets_path / self.blob_filter``"""
        return [luigi.LocalTarget(self.blob_tarball_path())]



[docs]
    def run(self) -> None:
        """Run task."""
        from ..shell import AtomicFileSink, Command

        approx_tarball_size = (
            self.blob_size()  # the content itself
            + 512 * self.blob_count()  # assuming one header per file
        )

        cwd = self.derived_datasets_path / self.blob_filter

        # fmt: off
        (
            Command.tar("-c", "--sort=name", "blobs/", cwd=cwd)
            | Command.pv("--size", str(approx_tarball_size))
            | Command.zstdmt(f"-{COMPRESS_LEVEL}")
            > AtomicFileSink(self.blob_tarball_path())
        ).run()


        # fmt: on



[docs]
class MakeSampleBlobTarball(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`DownloadBlobs`"""
        return DownloadBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )



[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs.tar.zst` in ``self.derived_datasets_path / self.blob_filter``"""
        return [luigi.LocalTarget(self.sample_blob_tarball_path())]



[docs]
    def run(self) -> None:
        """Selects a sample of 20k random blobs and puts them in a tarball."""
        from ..shell import AtomicFileSink, Command

        cwd = self.derived_datasets_path / self.blob_filter
        # fmt: off
        (
            Command.zstdcat(self.blob_list_path())
            | Command.tail("-n", "+2")
            | Command.cut("-d", ",", "-f", "2")
            | Command.uniq()
            | Command.shuf("--head-count=20000")
            | Command.sort()
            | Command.sed(r"s#\(\(..\)\(..\)\(..*\)\)#blobs/\2/\3/\1#")
            | Command.tar(
                "-c",
                "--files-from=/dev/stdin",
                "--transform=s/^blobs/blobs-sample20k/",
                cwd=cwd
            )
            | Command.zstdmt(f"-{COMPRESS_LEVEL}")
            > AtomicFileSink(self.sample_blob_tarball_path())
        ).run()


        # fmt: on



[docs]
class ComputeBlobFileinfo(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()

    CSV_HEADER = (
        "swhid",
        "mime_type",
        "encoding",
        "line_count",
        "word_count",
        "size",
    )
    READABLE_ENCODINGS = ("us-ascii", "utf-8", "iso-8859-1")


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`DownloadBlobs`"""
        return DownloadBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )



[docs]
    def output(self) -> List[luigi.LocalTarget]:
        """:file:`blobs-fileinfo.csv.zst` in
        ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(
                self.derived_datasets_path / self.blob_filter / "blobs-fileinfo.csv.zst"
            )
        ]



[docs]
    def run(self) -> None:
        """Run task."""
        import multiprocessing

        import tqdm

        with atomic_csv_zstd_writer(self.output()[0].path) as writer:
            writer.writerow(self.CSV_HEADER)

            with multiprocessing.Pool() as pool:
                # imap instead of imap_unordered, to preserve sha1 order of blobs
                for row in tqdm.tqdm(
                    pool.imap(
                        self._analyze_blob,
                        self.iter_blobs(unique_sha1=True, with_tqdm=False),
                    ),
                    total=self.blob_count(),
                ):
                    writer.writerow(row)


    def _analyze_blob(self, row) -> Tuple[str, str, str, str, str, str]:
        (swhid, sha1, name) = row

        (path, _) = self.blob_paths(sha1)
        assert path.exists(), f"{path} does not exist"

        size = path.stat().st_size
        mime_type, encoding = _guess_mime(str(path))
        line_count, word_count = None, None

        if mime_type == "text/plain" and encoding in self.READABLE_ENCODINGS:
            line_count = 0
            word_count = 0
            try:
                with open(path, encoding="utf8") as f:
                    for line in f:
                        line_count += 1
                        word_count += len(line.rstrip().split())
            except UnicodeDecodeError:
                line_count = None
                word_count = None

        return (
            swhid,
            mime_type,
            encoding,
            str(line_count) if line_count is not None else "",
            str(word_count) if word_count is not None else "",
            str(size),  # byte count
        )




[docs]
class BlobScancode(_BaseTask):
    """Runs scancode-toolkit on the blob dataset"""

    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()

    FIELDNAMES = [
        "swhid",
        "license",
        "score",
    ]

    DEFAULT_MIN_SCORE = 0
    DEFAULT_JOBS = 1
    DEFAULT_TIMEOUT = 120
    MAP_CHUNKSIZE = 1
    WORKER_MAX_TASKS = 1000  # to workaround Scancode get_licenses() memory leaks
    FIELD_SEP = ","
    READABLE_ENCODINGS = ("us-ascii", "utf-8", "iso-8859-1")

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.mime_guesser: Optional[magic.Magic] = None  # set in child processes


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`DownloadBlobs`"""
        return DownloadBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )


    def _csv_path(self) -> Path:
        return self.derived_datasets_path / self.blob_filter / "blobs-scancode.csv.zst"

    def _json_path(self) -> Path:
        return (
            self.derived_datasets_path / self.blob_filter / "blobs-scancode.ndjson.zst"
        )


[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs-scancode.csv.zst` and :file:`blobs-scancode.ndjson.zst` in
        ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(self._csv_path()),
            luigi.LocalTarget(self._json_path()),
        ]


    def _detect_licenses(self, row) -> Tuple[Set[Tuple[str, str, float]], str]:
        import json
        import time

        # needs to be initialized before importing scancode:
        # https://github.com/nexB/scancode-plugins/issues/30
        _init_mime_guesser()

        from scancode.api import get_copyrights, get_licenses

        (swhid, sha1, name) = row
        (path, _) = self.blob_paths(sha1)
        assert path.exists(), f"{path} does not exist"

        mime_type, encoding = _guess_mime(str(path))
        license_rows = set()

        res: Dict[str, Any] = {"swhid": swhid}

        if mime_type == "text/plain" and encoding in self.READABLE_ENCODINGS:
            deadline = time.time() + self.DEFAULT_TIMEOUT
            res["licenses"] = get_licenses(
                str(path), min_score=self.DEFAULT_MIN_SCORE, deadline=deadline
            )
            license_rows = {
                (
                    swhid,
                    lic["spdx_license_key"],
                    lic["score"],
                )
                for lic in res["licenses"]["licenses"]
            }
            res["copyrights"] = get_copyrights(str(path))

        return (license_rows, json.dumps(res))


[docs]
    def run(self) -> None:
        """Detect license(s) of license blobs located under blob_dir using scancode.

        Save frequencies to csv_outfile in a 3-column (sha1, license, score) CSV format.

        """
        import csv
        import multiprocessing
        import multiprocessing.pool

        import tqdm

        # ensure clean slate
        if self._csv_path().exists():
            self._csv_path().unlink()
        if self._json_path().exists():
            self._json_path().unlink()

        context = multiprocessing.get_context(method="spawn")
        with (
            atomic_zstd_writer(self._csv_path()) as csvfile,
            atomic_zstd_writer(self._json_path()) as jsonfile,
        ):
            csv_writer = csv.writer(csvfile, delimiter=self.FIELD_SEP)
            csv_writer.writerow(self.FIELDNAMES)
            with multiprocessing.pool.Pool(
                maxtasksperchild=self.WORKER_MAX_TASKS,
                context=context,
            ) as pool:
                for license_rows, results in tqdm.tqdm(
                    pool.imap_unordered(
                        self._detect_licenses,
                        self.iter_blobs(unique_sha1=True, with_tqdm=False),
                        chunksize=self.MAP_CHUNKSIZE,
                    ),
                    total=self.blob_count(),
                ):
                    # each detect() call can return multiple licenses, flatten them
                    for sha1, license, score in license_rows:
                        csv_writer.writerow([sha1, license, str(score)])
                    assert "\n" not in results
                    jsonfile.write(results + "\n")
                print("Done")





[docs]
class FindBlobOrigins(_ConcurrentCsvWritingTask):
    previous_derived_datasets_path = luigi.OptionalPathParameter(default=None)


[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs.tar.zst` in ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(
                self.derived_datasets_path / self.blob_filter / "blobs-origins.csv.zst"
            )
        ]


    CSV_HEADER = ("swhid", "origin_url")


[docs]
    async def process_one(self, row: Tuple[str, str, str]) -> Tuple[str, str]:
        from google.protobuf.field_mask_pb2 import FieldMask

        import swh.graph.grpc.swhgraph_pb2 as swhgraph

        (swhid, sha1, name) = row
        if not swhid.startswith("swh:1:"):
            raise ValueError(f"Invalid SWHID: {swhid}")

        # If we are running incrementally, skip the request
        origin_url = self.existing_swhids.get(swhid)

        if not origin_url:
            response = self.stub.Traverse(
                swhgraph.TraversalRequest(
                    src=[swhid],
                    direction=swhgraph.GraphDirection.BACKWARD,
                    mask=FieldMask(paths=["swhid", "ori.url"]),
                    return_nodes=swhgraph.NodeFilter(
                        types="ori",  # return only origins...
                    ),
                    max_matching_nodes=1,  # return at most one
                )
            )
            async for item in response:
                origin_url = item.ori.url
                if origin_url:
                    break
                else:
                    print(f"{item.swhid} does not have an associated URL")
            else:
                # no origin found
                origin_url = ""

        assert origin_url is not None

        return (swhid, origin_url)



[docs]
    def run(self) -> None:
        import pyzstd

        self.existing_swhids: Dict[str, str] = {}

        if self.previous_derived_datasets_path:
            separator = ","  # or "\t" for datasets before 2022-12-07
            # reuse blobs from a previous version of the dataset, so we don't have
            # to recompute them all
            path = (
                self.previous_derived_datasets_path
                / self.blob_filter
                / "blobs-origins.csv.zst"
            )
            with pyzstd.open(path, "rt") as fd:
                self.existing_swhids = dict(
                    line.strip().split(separator)
                    for line in cast(Iterable[str], fd)
                    if line[-2] != separator
                )

        super().run()





[docs]
class CountBlobOrigins(_ConcurrentCsvWritingTask):
    CSV_HEADER = ("swhid", "origin_count")


[docs]
    def output(self) -> List[luigi.Target]:
        """:file:`blobs.tar.zst` in ``self.derived_datasets_path / self.blob_filter``"""
        return [
            luigi.LocalTarget(
                self.derived_datasets_path
                / self.blob_filter
                / "blobs-nb-origins.csv.zst"
            )
        ]



[docs]
    async def process_one(self, row: Tuple[str, str, str]) -> Tuple[str, str]:
        from google.protobuf.field_mask_pb2 import FieldMask

        import swh.graph.grpc.swhgraph_pb2 as swhgraph

        (swhid, sha1, name) = row
        if not swhid.startswith("swh:1:"):
            raise ValueError(f"Invalid SWHID: {swhid}")

        response = await self.stub.CountNodes(
            swhgraph.TraversalRequest(
                src=[swhid],
                direction=swhgraph.GraphDirection.BACKWARD,
                mask=FieldMask(paths=["url"]),
                return_nodes=swhgraph.NodeFilter(
                    types="ori",  # count only origins
                ),
            )
        )

        return (swhid, response.count)





[docs]
class FindEarliestRevisions(_BaseTask):
    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()
    local_graph_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")


[docs]
    def requires(self) -> luigi.Task:
        """Returns an instance of :class:`SelectBlobs`"""
        return SelectBlobs(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )



[docs]
    def output(self) -> luigi.LocalTarget:
        """:file:`blobs-earliest.csv.zst` in
        ``self.derived_datasets_path / self.blob_filter``"""
        return luigi.LocalTarget(
            self.derived_datasets_path / self.blob_filter / "blobs-earliest.csv.zst"
        )



[docs]
    def run(self) -> None:
        """Run task."""
        from ..shell import AtomicFileSink, Command, Rust

        # fmt: off
        (
            Command.zstdcat(self.blob_list_path())
            | Command.sed("s/,.*//")
            | Command.uniq()
            | Rust(
                "find-earliest-revision",
                self.local_graph_path / self.graph_name,
            )
            | Command.pv("--wait", "--line-mode", "--size", str(self.blob_count()))
            | Command.zstdmt(f"-{COMPRESS_LEVEL}")
            > AtomicFileSink(self.output())
        ).run()


        # fmt: on



[docs]
class RunBlobDataset(luigi.Task):
    """Runs all tasks to build a blob dataset with the given filter."""

    blob_filter = luigi.ChoiceParameter(choices=list(SELECTION_QUERIES))
    derived_datasets_path = luigi.PathParameter()


[docs]
    def requires(self) -> Sequence[luigi.Task]:
        """Returns a list of task such that every task in this module are transitively
        depended on."""
        kwargs = dict(
            blob_filter=self.blob_filter,
            derived_datasets_path=self.derived_datasets_path,
        )
        tasks = [
            DownloadBlobs(**kwargs),
            MakeBlobTarball(**kwargs),
            MakeSampleBlobTarball(**kwargs),
            FindBlobOrigins(**kwargs),
            CountBlobOrigins(**kwargs),
            FindEarliestRevisions(**kwargs),
            ComputeBlobFileinfo(**kwargs),
        ]

        if self.blob_filter in ("citation", "readme", "known_swhids"):
            pass
        elif self.blob_filter == "license":
            tasks.append(BlobScancode(**kwargs))
        else:
            raise ValueError(f"Unexpected blob filter: {self.blob_filter}")

        return tasks



[docs]
    def complete(self) -> bool:
        """Always returns False; status is checked by dependencies."""
        return False



[docs]
    def run(self):
        """Checks all files are in the correct format and contain a well-known SWHID"""
        dir_path = self.derived_datasets_path / self.blob_filter
        for csv_path in dir_path.glob("*.csv.zst"):
            check_csv(csv_path)

        if self.blob_filter == "license":
            # Text of the GPLv3
            swhid = "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"
            expected_fileinfo = f"{swhid},text/plain,us-ascii,674,5644,35147\n"
            min_expected_origins = 2_000_000
        elif self.blob_filter == "citation":
            # Codemeta's own codemeta.json
            swhid = "swh:1:cnt:6daebd857f6f6a98dd9288ef7b942283f7fa4f0e"
            expected_fileinfo = f"{swhid},application/json,us-ascii,,,7173\n"
            min_expected_origins = 5
        else:
            assert False, f"Unexpected blob filter: {self.blob_filter}"

        self._check_fileinfo(
            swhid, expected_fileinfo, dir_path / "blobs-fileinfo.csv.zst"
        )
        if self.blob_filter == "license":
            self._check_scancode(swhid, dir_path)
        self._check_nb_origins(
            swhid, min_expected_origins, dir_path / "blobs-nb-origins.csv.zst"
        )
        self._check_exactly_one_line(swhid, dir_path / "blobs-origins.csv.zst")
        self._check_exactly_one_line(swhid, dir_path / "blobs-earliest.csv.zst")


    def _check_fileinfo(self, swhid: str, expected_fileinfo: str, path: Path) -> None:
        from ..shell import Command, Sink

        # fmt: off
        results = (
            Command.zstdcat(path)
            | Command.grep("^" + swhid)
            > Sink()
        ).run()
        # fmt: on

        assert results.decode() == expected_fileinfo, f"Unexpected fileinfo for {swhid}"

    def _check_scancode(self, swhid: str, dir_path: Path) -> None:
        import json

        from ..shell import Command, Sink

        assert swhid == "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"

        # fmt: off
        csv_results = (
            Command.zstdcat(dir_path / "blobs-scancode.csv.zst")
            | Command.grep("^" + swhid)
            > Sink()
        ).run().decode()
        # fmt: on

        assert csv_results == (
            f"{swhid},GPL-3.0-only,100.0\r\n"
        ), f"Unexpected scancode CSV for {swhid}: {csv_results!r}"

        # fmt: off
        json_results = (
            Command.zstdcat(dir_path / "blobs-scancode.ndjson.zst")
            | Command.grep(swhid)
            > Sink()
        ).run().decode()
        # fmt: on
        assert (
            json_results.count("\n") == 1
        ), f"Unexpectedly number of results for {swhid} in scancode NDJSON:\n{json_results}"
        result = json.loads(json_results)
        assert result["swhid"] == swhid, result
        licenses = [license for license in result["licenses"]["licenses"]]
        assert (
            len(licenses) == 1
        ), f"Unexpected number of licenses for {swhid}: {licenses}"
        assert licenses[0]["key"] == "gpl-3.0"
        assert licenses[0]["score"] == 100.0

    def _check_exactly_one_line(self, swhid: str, path: Path) -> None:
        from ..shell import Command, Sink

        # fmt: off
        results = (
            Command.zstdcat(path)
            | Command.grep("^" + swhid)
            > Sink()
        ).run()
        # fmt: on

        assert (
            results.count(b"\n") == 1
        ), f"Unexpected number of line for {swhid} in {path}:\n{results}"

    def _check_nb_origins(
        self, swhid: str, min_expected_origins: int, path: Path
    ) -> None:
        from ..shell import Command, Sink

        # fmt: off
        results = (
            Command.zstdcat(path)
            | Command.grep("^" + swhid)
            > Sink()
        ).run().decode()
        # fmt: on

        assert (
            results.count("\n") == 1
        ), f"Unexpected number of origin counts for {swhid}:\n{results}"
        count = int(results.split("\n")[0].split(",")[1])
        assert (
            min_expected_origins <= count <= 1_000_000_000
        ), f"Unexpected number of origins for {swhid}: {count}"