Source code for swh.indexer.fossology_license
# Copyright (C) 2016-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import subprocess
from typing import Any, Dict, List, Optional
import sentry_sdk
from swh.core.config import merge_configs
from swh.indexer.storage.interface import IndexerStorageInterface, Sha1
from swh.indexer.storage.model import ContentLicenseRow
from swh.model import hashutil
from .indexer import ContentIndexer, write_to_temp
logger = logging.getLogger(__name__)
[docs]
def compute_license(path) -> Dict:
"""Determine license from file at path.
Args:
path: filepath to determine the license
Returns:
dict: A dict with the following keys:
- licenses ([str]): associated detected licenses to path
- path (bytes): content filepath
"""
try:
properties = subprocess.check_output(["nomossa", path], universal_newlines=True)
if properties:
res = properties.rstrip().split(" contains license(s) ")
licenses = res[1].split(",")
else:
licenses = []
return {
"licenses": licenses,
"path": path,
}
except subprocess.CalledProcessError:
from os import path as __path
logger.exception(
"Problem during license detection for sha1 %s" % __path.basename(path)
)
sentry_sdk.capture_exception()
return {
"licenses": [],
"path": path,
}
DEFAULT_CONFIG: Dict[str, Any] = {
"workdir": "/tmp/swh/indexer.fossology.license",
"tools": {
"name": "nomos",
"version": "3.1.0rc2-31-ga2cbb8c",
"configuration": {
"command_line": "nomossa <filepath>",
},
},
"write_batch_size": 1000,
}
[docs]
class MixinFossologyLicenseIndexer:
"""Mixin fossology license indexer.
See :class:`FossologyLicenseIndexer`
"""
tool: Any
idx_storage: IndexerStorageInterface
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.config = merge_configs(DEFAULT_CONFIG, self.config)
self.working_directory = self.config["workdir"]
[docs]
def index(
self, id: Sha1, data: Optional[bytes] = None, **kwargs
) -> List[ContentLicenseRow]:
"""Index sha1s' content and store result.
Args:
id (bytes): content's identifier
raw_content (bytes): associated raw content to content id
Returns:
dict: A dict, representing a content_license, with keys:
- id (bytes): content's identifier (sha1)
- license (bytes): license in bytes
- path (bytes): path
- indexer_configuration_id (int): tool used to compute the output
"""
assert data is not None
with write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data,
working_directory=self.working_directory,
) as content_path:
properties = compute_license(path=content_path)
return [
ContentLicenseRow(
id=id,
indexer_configuration_id=self.tool["id"],
license=license,
)
for license in properties["licenses"]
]
[docs]
def persist_index_computations(
self, results: List[ContentLicenseRow]
) -> Dict[str, int]:
"""Persist the results in storage.
Args:
results: list of content_license dict with the
following keys:
- id (bytes): content's identifier (sha1)
- license (bytes): license in bytes
- path (bytes): path
"""
return self.idx_storage.content_fossology_license_add(results)
[docs]
class FossologyLicenseIndexer(
MixinFossologyLicenseIndexer, ContentIndexer[ContentLicenseRow]
):
"""Indexer in charge of:
- filtering out content already indexed
- reading content from objstorage per the content's id (sha1)
- computing {license, encoding} from that content
- store result in storage
"""
[docs]
def filter(self, ids):
"""Filter out known sha1s and return only missing ones."""
yield from self.idx_storage.content_fossology_license_missing(
(
{
"id": sha1,
"indexer_configuration_id": self.tool["id"],
}
for sha1 in ids
)
)