Source code for swh.datasets.download

# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from __future__ import annotations

import concurrent.futures
from pathlib import Path
from typing import TYPE_CHECKING, Optional

import boto3
import botocore
from botocore.handlers import disable_signing
import tqdm

if TYPE_CHECKING:
    from types_boto3_s3.service_resource import ObjectSummary


class DatasetDownloader:
    """Utility class to help download SWH datasets (ORC exports, for
    instance) from S3."""

    def __init__(
        self,
        local_path: Path,
        s3_url: str,
        prefix: str,
        parallelism: int = 5,
    ) -> None:
        if not s3_url.startswith("s3://"):
            raise ValueError("Unsupported S3 URL")
        self.s3 = boto3.resource("s3")
        # don't require credentials to list the bucket
        self.s3.meta.client.meta.events.register("choose-signer.s3.*", disable_signing)
        self.client = boto3.client(
            "s3",
            config=botocore.client.Config(
                # https://github.com/boto/botocore/issues/619
                max_pool_connections=10 * parallelism,
                # don't require credentials to download files
                signature_version=botocore.UNSIGNED,
            ),
        )
        self.local_path = local_path
        self.s3_url = s3_url.rstrip("/")
        s3_dataset_url = self.s3_url + f"/{prefix.strip('/')}/"
        self.bucket_name, self.prefix = s3_dataset_url[len("s3://") :].split("/", 1)
        self.parallelism = parallelism

    def _download_file(
        self, obj: ObjectSummary, prefix: str, local_path: Optional[Path] = None
    ):
        assert obj.key.startswith(prefix)
        if local_path is None:
            relative_path = obj.key.removeprefix(prefix).lstrip("/")
            local_path = self.local_path / relative_path
        local_path.parent.mkdir(parents=True, exist_ok=True)
        self.client.download_file(
            Bucket=self.bucket_name,
            Key=obj.key,
            Filename=str(local_path),
        )

    def download(self):
        bucket = self.s3.Bucket(self.bucket_name)
        # recursively download the dataset files from S3, ending with the
        # JSON metadata files
        objects = list(bucket.objects.filter(Prefix=self.prefix))
        if not objects:
            raise ValueError(f"No dataset found at URL {self.s3_url}")
        # first download data files, excluding metadata JSON files
        with tqdm.tqdm(total=len(objects), desc="Downloading") as progress:
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.parallelism
            ) as executor:
                for _ in concurrent.futures.as_completed(
                    executor.submit(self._download_file, obj, self.prefix)
                    for obj in objects
                    if not obj.key.endswith(".json")
                ):
                    progress.update()
        # download JSON metadata files as stamps after data files
        for obj in bucket.objects.filter(Prefix=self.prefix):
            if obj.key.endswith(".json"):
                self._download_file(obj, self.prefix)
        # also download {s3_url}/meta/ contents if available
        prefix = self.s3_url[len("s3://") :].split("/", 1)[1] + "/meta/"
        for obj in bucket.objects.filter(Prefix=prefix):
            sub_path = obj.key.split("/meta/")[-1]
            self._download_file(obj, prefix, self.local_path / "meta" / sub_path)
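

# Example usage (a minimal sketch; the bucket URL, prefix, and local path
# below are illustrative placeholders, not a guaranteed dataset location):
#
#     from pathlib import Path
#
#     from swh.datasets.download import DatasetDownloader
#
#     downloader = DatasetDownloader(
#         local_path=Path("swh-dataset"),
#         s3_url="s3://softwareheritage/graph",
#         prefix="2024-05-16/orc",
#         parallelism=10,
#     )
#     downloader.download()
#
# Because the client is configured with botocore.UNSIGNED and bucket listing
# is unsigned via disable_signing, no AWS credentials are needed to fetch a
# publicly readable dataset.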