Source code for swh.graph.luigi.aggregate_datasets

# Copyright (C) 2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""
Luigi tasks for producing the aggregated derived datasets
=========================================================

"""  # noqa

# WARNING: do not import unnecessary things here to keep cli startup time under
# control
from pathlib import Path
from typing import Dict, List, Tuple

import luigi

from swh.dataset.luigi import S3PathParameter

from .compressed_graph import LocalGraph
from .file_names import PopularContentNames
from .provenance import ComputeEarliestTimestamps
from .utils import _ParquetToS3ToAthenaTask


[docs] class ExportNodesTable(luigi.Task): """Creates a Parquet dataset that contains the id and SWHID of each node""" local_graph_path = luigi.PathParameter() graph_name = luigi.Parameter(default="graph") aggregate_datasets_path = luigi.PathParameter()
[docs] def requires(self) -> Dict[str, luigi.Task]: """Returns an instance of :class:`LocalGraph`.""" return { "graph": LocalGraph(local_graph_path=self.local_graph_path), }
[docs] def output(self) -> luigi.Target: """Directory of Parquet files.""" return luigi.LocalTarget(self.aggregate_datasets_path / "nodes")
[docs] def run(self) -> None: """Runs ``export-nodes`` from :file:`tools/aggregate`""" from ..shell import Rust # fmt: on ( Rust( "export-nodes", self.local_graph_path / self.graph_name, "--nodes-out", self.output(), ) ).run()
# fmt: off
[docs] class AggregateContentDatasets(luigi.Task): """Creates a Parquet dataset that contains a column for each of: * the content id * the content's length * the most popular name of each content * number of occurrences of that name for the content * its date of first occurrence in a revision or release, if any * said revision or release, if any * an origin containing said revision or release, if any """ local_graph_path = luigi.PathParameter() graph_name = luigi.Parameter(default="graph") popular_content_names_path = luigi.PathParameter() provenance_dir = luigi.PathParameter() aggregate_datasets_path = luigi.PathParameter()
[docs] def requires(self) -> Dict[str, luigi.Task]: """Returns an instance of :class:`LocalGraph`.""" return { "graph": LocalGraph(local_graph_path=self.local_graph_path), "popular_content_names": PopularContentNames( local_graph_path=self.local_graph_path, popular_contents_path=self.popular_content_names_path, max_results_per_content=1, ), "earliest_timestamps": ComputeEarliestTimestamps( local_graph_path=self.local_graph_path, provenance_dir=self.provenance_dir, provenance_node_filter="all", ), }
[docs] def output(self) -> luigi.Target: """Directory of Parquet files.""" return luigi.LocalTarget(self.aggregate_datasets_path / "contents")
[docs] def run(self) -> None: """Runs ``aggregate-content-datasets`` from :file:`tools/aggregate`""" from ..shell import Rust # fmt: on ( Rust( "aggregate-content-datasets", self.local_graph_path / self.graph_name, "--file-names", self.popular_content_names_path, "--earliest-timestamps", self.provenance_dir / "earliest_timestamps.bin", "--out", self.output(), ) ).run()
# fmt: off
[docs] class UploadNodesTable(_ParquetToS3ToAthenaTask): """Uploads the result of :class:`AggregateContentDatasets` to S3 and registers a table on Athena to query it""" aggregate_datasets_path = luigi.PathParameter() dataset_name = luigi.Parameter() s3_athena_output_location = S3PathParameter()
[docs] def requires(self) -> luigi.Task: """Returns an instance of :class:`ExportNodesTable`.""" return ExportNodesTable(aggregate_datasets_path=self.aggregate_datasets_path)
def _input_parquet_path(self) -> Path: return self.aggregate_datasets_path / "nodes" def _s3_bucket(self) -> str: # TODO: configurable return "softwareheritage" def _s3_prefix(self) -> str: # TODO: configurable return f"derived_datasets/{self.dataset_name}/nodes" def _parquet_columns(self) -> List[Tuple[str, str]]: return [ ("id", "bigint"), ("swhid", "string"), ("url", "string"), ] def _athena_db_name(self) -> str: return f"derived_{self.dataset_name.replace('-', '')}" def _athena_table_name(self) -> str: return "nodes"
[docs] def create_table_extras(self) -> str: """Extra clauses to add to the ``CREATE EXTERNAL TABLE`` statement.""" return "PARTITIONED BY (node_type string)"
[docs] class UploadAggregatedContentDataset(_ParquetToS3ToAthenaTask): """Uploads the result of :class:`AggregateContentDatasets` to S3 and registers a table on Athena to query it""" aggregate_datasets_path = luigi.PathParameter() dataset_name = luigi.Parameter() s3_athena_output_location = S3PathParameter()
[docs] def requires(self) -> luigi.Task: """Returns an instance of :class:`AggregateContentDatasets`.""" return AggregateContentDatasets( aggregate_datasets_path=self.aggregate_datasets_path )
def _input_parquet_path(self) -> Path: return self.aggregate_datasets_path / "contents" def _s3_bucket(self) -> str: # TODO: configurable return "softwareheritage" def _s3_prefix(self) -> str: # TODO: configurable return f"derived_datasets/{self.dataset_name}/contents" def _parquet_columns(self) -> List[Tuple[str, str]]: return [ ("id", "bigint"), ("length", "bigint"), ("filename", "binary"), ("filename_occurrences", "bigint"), ("first_occurrence_timestamp", "bigint"), ("first_occurrence_revrel", "bigint"), ("first_occurrence_origin", "bigint"), ] def _athena_db_name(self) -> str: return f"derived_{self.dataset_name.replace('-', '')}" def _athena_table_name(self) -> str: return "contents"
[docs] class RunAggregatedDatasets(luigi.WrapperTask): """Runs :class:`UploadNodesTable`, :class:`UploadAggregatedContentDataset`, and their recursive dependencies.""" aggregate_datasets_path = luigi.PathParameter() dataset_name = luigi.Parameter() s3_athena_output_location = S3PathParameter()
[docs] def requires(self) -> Dict[str, luigi.Task]: """Returns an instance of :class:`AggregateContentDatasets`.""" kwargs = dict( aggregate_datasets_path=self.aggregate_datasets_path, dataset_name=self.dataset_name, s3_athena_output_location=self.s3_athena_output_location, ) return { "upload_nodes": UploadNodesTable(**kwargs), "upload_contents": UploadAggregatedContentDataset(**kwargs), }