Source code for swh.graph.config

# Copyright (C) 2019-2023  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os.path
from pathlib import Path

# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import psutil

logger = logging.getLogger(__name__)


def find_graph_jar() -> str:
    """Return the location of ``swh-graph.jar``, the Java part of swh-graph.

    The JAR is looked up next to this module, inside the package directory
    (per the original note, this is where in-production deployments that
    fetched the JAR from PyPI have it installed).

    Returns:
        The JAR path as a string. The path is computed, not checked: no
        test is made that the file actually exists.
    """
    logger.debug("Looking for swh-graph JAR")
    # Sibling file of this module: same directory, different file name.
    jar_path = Path(__file__).with_name("swh-graph.jar")
    logger.info("using swh-graph JAR: %s", jar_path)
    return str(jar_path)
def check_config(conf):
    """Check the configuration and propagate defaults.

    The input mapping is left untouched: a shallow copy is made and every
    missing key is filled in on that copy.

    Defaults applied when the key is absent:

    - ``batch_size``: 0.1% of the total RAM, capped at ``2**30 - 1``
    - ``llp_gammas``: ``"-0,-1,-2,-3,-4"``
    - ``max_ram``: 90% of the total RAM, as a string
    - ``java_tool_options``: a set of JVM flags including ``-Xmx{max_ram}``
    - ``java``: ``$JAVA_HOME/bin/java`` if ``JAVA_HOME`` is set, else ``java``
    - ``classpath``: the path returned by :func:`find_graph_jar`
    - ``object_types``: ``"*"``

    Whether defaulted or user-provided, ``java_tool_options`` is then passed
    through ``str.format`` so that a ``{max_ram}`` placeholder is replaced by
    the configured value.

    Args:
        conf: configuration mapping (must support ``.copy()``).

    Returns:
        A new configuration mapping with all defaults filled in.
    """
    conf = conf.copy()

    if "batch_size" not in conf:
        # Use 0.1% of the RAM as a batch size:
        # ~1 billion for big servers, ~10 million for small desktop machines
        conf["batch_size"] = min(int(psutil.virtual_memory().total / 1000), 2**30 - 1)
        logger.debug("batch_size not configured, defaulting to %s", conf["batch_size"])

    if "llp_gammas" not in conf:
        conf["llp_gammas"] = "-0,-1,-2,-3,-4"
        logger.debug("llp_gammas not configured, defaulting to %s", conf["llp_gammas"])

    if "max_ram" not in conf:
        conf["max_ram"] = str(int(psutil.virtual_memory().total * 0.9))
        logger.debug("max_ram not configured, defaulting to %s", conf["max_ram"])

    if "java_tool_options" not in conf:
        conf["java_tool_options"] = " ".join(
            [
                "-Xmx{max_ram}",
                "-XX:PretenureSizeThreshold=512M",
                "-XX:MaxNewSize=4G",
                "-XX:+UseLargePages",
                "-XX:+UseTransparentHugePages",
                "-XX:+UseNUMA",
                "-XX:+UseTLAB",
                "-XX:+ResizeTLAB",
            ]
        )
        logger.debug(
            "java_tool_options not configured, defaulting to %s",
            conf["java_tool_options"],
        )
    # Expand the {max_ram} placeholder; this also applies to user-provided
    # options, which may reference it.
    conf["java_tool_options"] = conf["java_tool_options"].format(
        max_ram=conf["max_ram"]
    )

    if "java" not in conf:
        if "JAVA_HOME" in os.environ:
            conf["java"] = os.path.join(os.environ["JAVA_HOME"], "bin", "java")
        else:
            # Rely on the executable search path.
            conf["java"] = "java"

    if "classpath" not in conf:
        conf["classpath"] = find_graph_jar()

    if "object_types" not in conf:
        conf["object_types"] = "*"

    return conf
def check_config_compress(config, graph_name, in_dir, out_dir):
    """Check compression-specific configuration and initialize its execution
    environment.

    On top of the defaults applied by :func:`check_config`, this:

    - records ``graph_name``, ``in_dir`` and ``out_dir`` in the configuration
      (directories are stored as strings);
    - creates ``out_dir`` and the temporary directory (``tmp_dir`` from the
      configuration, or ``out_dir / "tmp"`` when not configured);
    - unless a ``logback`` configuration file is already configured, writes a
      default ``logback.xml`` into the temporary directory;
    - appends ``-Dlogback.configurationFile=...`` and ``-Djava.io.tmpdir=...``
      to ``java_tool_options``.

    Args:
        config: base configuration mapping, passed to :func:`check_config`.
        graph_name: name of the graph, stored as-is under ``graph_name``.
        in_dir: input directory (anything ``str()`` can render as a path).
        out_dir: output directory, as a ``pathlib.Path`` or a string.

    Returns:
        The completed configuration mapping.
    """
    # Accept plain strings as well as Path objects: out_dir is used below
    # both for mkdir() and for the "/" operator.
    out_dir = Path(out_dir)

    conf = check_config(config)

    conf["graph_name"] = graph_name
    conf["in_dir"] = str(in_dir)
    conf["out_dir"] = str(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if "tmp_dir" not in conf:
        tmp_dir = out_dir / "tmp"
        conf["tmp_dir"] = str(tmp_dir)
    else:
        tmp_dir = Path(conf["tmp_dir"])
    tmp_dir.mkdir(parents=True, exist_ok=True)

    if "logback" not in conf:
        # No logback configuration provided: generate one that sends
        # INFO-and-above messages to stderr.
        logback_confpath = tmp_dir / "logback.xml"
        with open(logback_confpath, "w", encoding="utf-8") as conffile:
            conffile.write(
                """
<configuration>
  <appender name="STDERR" class="ch.qos.logback.core.ConsoleAppender">
    <encoder>
      <pattern>%d %r %p [%t] %logger{1} - %m%n</pattern>
    </encoder>
    <target>System.err</target>
  </appender>
  <root level="INFO">
    <appender-ref ref="STDERR"/>
  </root>
</configuration>
"""
            )
        conf["logback"] = str(logback_confpath)

    # Point the JVM at the logback configuration and the temporary directory.
    conf["java_tool_options"] += " -Dlogback.configurationFile={logback}"
    conf["java_tool_options"] += " -Djava.io.tmpdir={tmp_dir}"
    conf["java_tool_options"] = conf["java_tool_options"].format(
        logback=conf["logback"],
        tmp_dir=conf["tmp_dir"],
    )

    return conf