#!/usr/bin/env python3
# Copyright (C) 2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Any, Callable, Dict, List
import click
from swh.core.cli import CONTEXT_SETTINGS
from swh.core.cli import swh as swh_cli_group
[docs]
def common_options(func):
    """Decorate ``func`` with the click options shared by the Sentry commands.

    A thin wrapper forwarding all its arguments to ``func`` is created, and the
    ``--sentry-url``, ``--sentry-token``, ``--sentry-issue-number`` and
    ``--environment`` options are attached to that wrapper.

    Args:
        func: the command callback receiving the option values

    Returns:
        The wrapper around ``func`` carrying the four options.
    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    # Options listed top-down, in the order they would be stacked as decorators.
    option_decorators = [
        click.option(
            "--sentry-url",
            "-u",
            default="https://sentry.softwareheritage.org",
            show_default=True,
            help="Sentry URL",
        ),
        click.option(
            "--sentry-token",
            "-t",
            default=None,
            envvar="SENTRY_TOKEN",
            help=(
                "Bearer token required to communicate with Sentry API (can also be provided "
                "in SENTRY_TOKEN environment variable)"
            ),
            required=True,
        ),
        click.option(
            "--sentry-issue-number",
            "-i",
            help="Sentry issue number to extract origin URLs from its events",
            required=True,
        ),
        click.option(
            "--environment",
            "-e",
            default="",
            help="Filter on environment: production or staging, both are selected by default",
        ),
    ]

    # Stacked decorators are applied bottom-up, so walk the list backwards to
    # register the options in the same order as a decorator stack would.
    decorated = wrapper
    for add_option in reversed(option_decorators):
        decorated = add_option(decorated)
    return decorated
@swh_cli_group.group(name="sentry", context_settings=CONTEXT_SETTINGS)
def sentry():
    """Software Heritage tools for extracting data from the events associated to
    a Sentry issue using Sentry REST API."""
    # Intentionally empty: this group only serves as the parent of the
    # sub-commands registered through ``@sentry.command``; its docstring is
    # the text click displays as the group help.
def _process_sentry_events_pages(
    sentry_url,
    sentry_token,
    sentry_issue_number,
    events_page_process_callback: Callable[[List[Dict[str, Any]]], None],
    full_sentry_responses: bool = False,
):
    """Fetch every page of events of a Sentry issue and process each of them.

    Pages are requested from ``<sentry_url>/api/0/issues/<issue>/events/`` and
    followed through the ``next`` relation of the ``Link`` response header
    until a page is empty or no further page is advertised.

    Args:
        sentry_url: base URL of the Sentry instance, a trailing slash is ignored
        sentry_token: bearer token sent in the ``Authorization`` header
        sentry_issue_number: identifier of the issue whose events are fetched
        events_page_process_callback: called once per non-empty page with the
            decoded JSON list of events
        full_sentry_responses: when ``True``, ``?full=true`` is appended to the
            first requested URL

    Raises:
        requests.HTTPError: if Sentry answers with an error status code
    """
    import requests

    sentry_api_base_url = f"{sentry_url.rstrip('/')}/api/0"
    sentry_issue_events_url = (
        f"{sentry_api_base_url}/issues/{sentry_issue_number}/events/"
    )
    if full_sentry_responses:
        sentry_issue_events_url += "?full=true"

    headers = {"Authorization": f"Bearer {sentry_token}"}

    # Looping on the URL ends the pagination cleanly when no next page is
    # advertised, instead of issuing a request on ``None``.
    while sentry_issue_events_url:
        response = requests.get(sentry_issue_events_url, headers=headers)
        # An error payload (e.g. invalid token, unknown issue) is a non-empty
        # JSON object: fail loudly rather than hand it to the callback as if
        # it were a page of events.
        response.raise_for_status()

        events = response.json()
        if not events:
            break
        events_page_process_callback(events)

        next_link = response.links.get("next", {})
        # Sentry flags a link pointing to an empty page with results="false";
        # stopping here saves a useless extra request.
        if next_link.get("results") == "false":
            break
        sentry_issue_events_url = next_link.get("url")
@sentry.command(name="extract-origin-urls", context_settings=CONTEXT_SETTINGS)
@common_options
def extract_origin_urls(sentry_url, sentry_token, sentry_issue_number, environment):
    """Extract origin URLs from events.

    This command allows to extract origin URLs from Sentry events related to
    a Software Heritage loader and dumps them to stdout."""
    # Name of the event tag holding the origin URL.
    origin_url_tag = "swh.loader.origin_url"
    # A set, so that an origin seen in several events is printed only once.
    collected = set()

    def _collect_from_page(page: List[Dict[str, Any]]):
        for event in page:
            # Flatten the list of {"key": ..., "value": ...} tags into a mapping.
            tags = {}
            for tag in event.get("tags", []):
                tags[tag["key"]] = tag["value"]
            # Substring test: the default empty filter matches every environment.
            environment_matches = environment in tags.get("environment", "")
            if not environment_matches or origin_url_tag not in tags:
                continue
            collected.add(tags[origin_url_tag])

    _process_sentry_events_pages(
        sentry_url=sentry_url,
        sentry_token=sentry_token,
        sentry_issue_number=sentry_issue_number,
        events_page_process_callback=_collect_from_page,
    )

    # One URL per line, in sorted order.
    for url in sorted(collected):
        click.echo(url)
@sentry.command(name="extract-scheduler-tasks", context_settings=CONTEXT_SETTINGS)
@common_options
def extract_scheduler_tasks(sentry_url, sentry_token, sentry_issue_number, environment):
    """Extract scheduler task parameters from events.

    This command allows to extract scheduler task parameters from Sentry events related to
    a Software Heritage scheduler task and dumps a CSV file to stdout that can be consumed
    by the CLI command:

    $ swh scheduler task schedule --columns type --columns kwargs <csv_file>.
    """
    import csv
    import json
    import sys

    # Deduplicated tasks: (task name, canonical JSON of kwargs) -> (name, kwargs).
    task_params = {}

    def _extract_scheduler_tasks(events):
        for event in events:
            # Honour the --environment option the same way extract-origin-urls
            # does: substring match, the default empty filter keeps everything.
            tags = {tag["key"]: tag["value"] for tag in event.get("tags", [])}
            if environment not in tags.get("environment", ""):
                continue

            celery_job = event.get("context", {}).get("celery-job", {})
            task_name = celery_job.get("task_name")
            task_param = celery_job.get("kwargs")
            if task_param:
                # A canonical JSON dump is always hashable, even when kwargs
                # hold lists or dicts, and it takes the parameter names into
                # account, not only their values.
                key = (task_name, json.dumps(task_param, sort_keys=True))
                task_params[key] = (task_name, task_param)

    _process_sentry_events_pages(
        sentry_url,
        sentry_token,
        sentry_issue_number,
        _extract_scheduler_tasks,
        full_sentry_responses=True,
    )

    # One CSV row per distinct task: its type, then its kwargs as JSON,
    # ordered by the "url" parameter when there is one.
    csv_writer = csv.writer(sys.stdout)
    for task_type, task_param in sorted(
        task_params.values(), key=lambda p: p[1].get("url", "")
    ):
        csv_writer.writerow([task_type, json.dumps(task_param)])