Source code for swh.export.fullnames

# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import base64
import csv
import os
from pathlib import Path
import subprocess
import tempfile

import pyorc
from tqdm import tqdm


def process_fullnames(fullnames_orc: Path, dedup_dir: Path) -> None:
    """Merge the CSV files of ``dedup_dir`` into a single ORC file.

    Every entry of ``dedup_dir`` is handed to ``sort -m``, which merges them
    into one temporary CSV file while keeping a single line per distinct key
    (the key starts at the second comma-separated field). Each row of the
    merged file is then unpacked as ``fullname, sha256_fullname``, both values
    are base64-decoded, and the pair is appended to ``fullnames_orc``.

    The ORC file is always (re)created; when ``dedup_dir`` has no entries the
    sort step is skipped and no rows are written.

    NOTE(review): ``sort -m`` only merges, so the input files are presumably
    already sorted on the same key by whatever produced them -- confirm
    against the producer.

    Args:
        fullnames_orc: path of the ORC file to write (opened in ``"wb"`` mode).
        dedup_dir: directory whose entries are the CSV files to merge.

    Raises:
        subprocess.CalledProcessError: if ``sort`` exits with a non-zero status.
        ValueError: if a non-blank row does not have exactly two fields.
    """
    with tempfile.NamedTemporaryFile(suffix=".csv") as result_file:
        entries = list(dedup_dir.iterdir())
        # Without any input file, ``sort`` would fall back to reading stdin,
        # so only run it when there is something to merge.
        if entries:
            # Pin the locale so that collation does not depend on the
            # caller's environment.
            env = {**os.environ, "LC_ALL": "C", "LC_COLLATE": "C", "LANG": "C"}
            # fmt: off
            subprocess.run(
                [
                    "sort",
                    "-t", ",",
                    "-k", "2",
                    "-u",
                    "-S", "100M",
                    "-m",
                    *entries,
                    "-o", result_file.name,
                ],
                env=env,
                # Fail loudly instead of silently producing a truncated ORC
                # file when sort reports an error.
                check=True,
            )
            # fmt: on

        schema = pyorc.Struct(
            fullname=pyorc.Binary(), sha256_fullname=pyorc.Binary()
        )
        with open(fullnames_orc, "wb") as output:
            # newline="" lets the csv module handle line endings itself.
            with open(result_file.name, "r", newline="") as merged_csv:
                reader = csv.reader(merged_csv)
                with pyorc.Writer(
                    output,
                    schema,
                    bloom_filter_columns=[0, 1],
                ) as writer:
                    for row in tqdm(
                        reader, desc="Writing persons' fullnames to ORC file"
                    ):
                        # csv.reader yields lists: a blank line comes out as
                        # [] and a line holding one empty field as [""].
                        if not row or row == [""]:
                            continue
                        fullname, sha256_fullname = row
                        writer.write(
                            (
                                base64.b64decode(fullname),
                                base64.b64decode(sha256_fullname),
                            )
                        )