# Source code for swh.graph.luigi
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Luigi tasks
===========
This package contains `Luigi <https://luigi.readthedocs.io/>`_ tasks.
These come in two kinds:

* in :mod:`swh.graph.luigi.compressed_graph`: an alternative to the 'swh graph compress'
  CLI that can be composed with other tasks, such as swh-dataset's
* in other submodules: tasks driving the creation of specific datasets that are
  generated using the compressed graph

The overall directory structure is::

    base_dir/
        <date>[_<flavor>]/
            edges/
                ...
            orc/
                ...
            compressed/
                graph.graph
                graph.mph
                ...
                meta/
                    export.json
                    compression.json
            datasets/
                contribution_graph.csv.zst
            topology/
                topological_order_dfs.csv.zst

And optionally::

    sensitive_base_dir/
        <date>[_<flavor>]/
            persons_sha256_to_name.csv.zst
            datasets/
                contribution_graph.deanonymized.csv.zst
"""
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
from typing import List
import luigi
from .aggregate_datasets import * # noqa
from .blobs_datasets import * # noqa
from .compressed_graph import * # noqa
from .file_names import * # noqa
from .origin_contributors import * # noqa
from .provenance import * # noqa
from .subdataset import * # noqa
from .topology import * # noqa
class RunExportCompressUpload(luigi.Task):
    """Umbrella task: runs the dataset export and the graph compression, and
    generates datasets using the graph.

    All of the actual work is delegated to the tasks returned by
    :meth:`requires`; this class only ties them together.
    """

    def requires(self) -> List[luigi.Task]:
        """Returns the tasks this one depends on: one instance of
        :class:`swh.dataset.luigi.RunExportAll` followed by one instance of
        :class:`swh.graph.luigi.compressed_graph.UploadGraphToS3`.

        Those two tasks in turn recursively depend on the whole export and
        compression pipeline.
        """
        # Local imports: only resolved when the dependencies are requested.
        from swh.dataset.luigi import RunExportAll

        from .compressed_graph import UploadGraphToS3

        export_task = RunExportAll()
        upload_task = UploadGraphToS3()
        return [export_task, upload_task]

    def complete(self) -> bool:
        """Always reports this task as not complete."""
        # There is nothing to check here: this task does no work of its own,
        # and each dependency performs its own completeness check.
        return False