# Source code for swh.graph.example_dataset.generate_dataset

#!/usr/bin/env python3

# Copyright (C) 2021-2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

# type: ignore

import argparse
import logging
from pathlib import Path
import shutil

from swh.dataset.exporters.edges import GraphEdgesExporter
from swh.dataset.exporters.orc import ORCExporter
from swh.graph.example_dataset import DATASET
from swh.graph.webgraph import compress


def main():
    """Generate the example dataset, and optionally compress it.

    Command-line arguments:

    * ``output`` (optional positional, defaults to ``"."``): directory under
      which the dataset is written.
    * ``--compress``: when given, also run :func:`swh.graph.webgraph.compress`
      on the ORC export.

    Every object in ``DATASET`` is fed to each exporter (``edges`` and
    ``orc``), each of which is given its own ``<output>/<name>``
    subdirectory.  A subdirectory that already exists is removed first, so
    results from a previous run are not mixed with the new ones.
    """
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Generate a test dataset")
    parser.add_argument(
        "--compress",
        action="store_true",
        default=False,
        help="Also compress the dataset",
    )
    parser.add_argument("output", help="output directory", nargs="?", default=".")
    args = parser.parse_args()

    exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter}
    # NOTE(review): the meaning of this setting is defined by the exporters,
    # not by this script; confirm against swh.dataset before changing it.
    config = {"test_unique_file_id": "all"}
    output_path = Path(args.output)

    for name, exporter in exporters.items():
        export_dir = output_path / name
        # Start from a clean directory so stale files never survive a re-run.
        if export_dir.exists():
            shutil.rmtree(export_dir)
        with exporter(config, export_dir) as e:
            for obj in DATASET:
                e.process_object(obj.object_type, obj.to_dict())

    if args.compress:
        compressed_dir = output_path / "compressed"
        if compressed_dir.exists():
            shutil.rmtree(compressed_dir)
        # The compression step takes the ORC export written above as input.
        compress("example", output_path / "orc", compressed_dir)
# Only generate the dataset when this file is run as a script, not on import.
if __name__ == "__main__":
    main()