Source code for swh.graph.example_dataset.generate_dataset
#!/usr/bin/env python3
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# type: ignore
import argparse
import logging
from pathlib import Path
import shutil
from swh.export.exporters.edges import GraphEdgesExporter
from swh.export.exporters.orc import ORCExporter
from swh.graph.example_dataset import DATASET
from swh.graph.webgraph import compress
[docs]
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Generate a test dataset")
parser.add_argument(
"--compress",
action="store_true",
default=False,
help="Also compress the dataset",
)
parser.add_argument(
"--target", default="release", help="rust target to use for compression"
)
parser.add_argument("output", help="output directory", nargs="?", default=".")
args = parser.parse_args()
exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter}
config = {"test_unique_file_id": "all"}
output_path = Path(args.output)
for name, exporter in exporters.items():
if (output_path / name).exists():
shutil.rmtree(output_path / name)
with exporter(config, output_path / name) as e:
for obj in DATASET:
e.process_object(obj.object_type, obj.to_dict())
if args.compress:
if (output_path / "compressed").exists():
shutil.rmtree(output_path / "compressed")
compress(
"example",
output_path / "orc",
output_path / "compressed",
"example",
conf={"target": args.target},
)
if __name__ == "__main__":
main()