Source code for swh.datasets.download
# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, List
from swh.core.s3.downloader import S3Downloader
if TYPE_CHECKING:
from types_boto3_s3.service_resource import ObjectSummary
logger = logging.getLogger(__name__)
[docs]
class DatasetDownloader(S3Downloader):
"""Utility class to help downloading SWH datasets (ORC exports for instance)
from S3.
It also implements a download resumption feature in case some files fail to
be downloaded (when connection errors happen for instance).
Example of use::
from swh.datasets.download import DatasetDownloader
# download "2025-05-18-popular-1k" ORC dataset into a sub-directory of the
# current working directory named "2025-05-18-popular-1k-orc"
dataset_downloader = DatasetDownloader(
local_path="2025-05-18-popular-1k-orc",
s3_url="s3://softareheritage/graph/2025-05-18-popular-1k/orc/",
)
while not dataset_downloader.download():
continue
"""
[docs]
def filter_objects(self, objects: List[ObjectSummary]) -> List[ObjectSummary]:
# filter out JSON metadata files, they will be downloaded after all other files
return [obj for obj in objects if not obj.key.endswith(".json")]
[docs]
def post_downloads(self) -> None:
# download JSON metadata files as stamps after data files
for obj in self.bucket.objects.filter(Prefix=self.prefix):
if obj.key.endswith(".json"):
self._download_file(obj.key)
# also download {s3_url}/../meta/ contents if available
prefix = self.prefix.rsplit("/", 2)[0] + "/meta/"
for obj in self.bucket.objects.filter(Prefix=prefix):
sub_path = obj.key.split("/meta/")[-1]
self._download_file(
obj.key,
local_file_path=self.local_path / "meta" / sub_path,
prefix=prefix,
)