diff options
author | Bas van Dijk <bas@van.dijk.ch> | 2021-10-27 14:18:00 +0200 |
---|---|---|
committer | Bas van Dijk <bas@dfinity.org> | 2021-11-04 11:15:21 +0000 |
commit | f12e976aded8113cb466efcdc2abc1932d9beb05 (patch) | |
tree | a7fad1dd1d6e4561977f691d45d2e7faf7578890 | |
parent | 8d5213123af4ba144de3dadbe3f9353ba9f4e8f7 (diff) | |
download | nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.gz nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.bz2 nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.lz nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.xz nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.zst nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.zip |
module/prometheus: optionally support reloading on config changes
The new option `services.prometheus.enableReload` has been introduced which, when enabled, causes the prometheus systemd service to reload when its config file changes. More specifically, the following property holds: switching to a configuration (`switch-to-configuration`) that changes the prometheus configuration only finishes successfully when prometheus has finished loading the new configuration. `enableReload` is `false` by default, in which case the old semantics of restarting the prometheus systemd service are in effect.
-rw-r--r-- | nixos/doc/manual/from_md/release-notes/rl-2111.section.xml | 8 | ||||
-rw-r--r-- | nixos/doc/manual/release-notes/rl-2111.section.md | 2 | ||||
-rw-r--r-- | nixos/modules/services/monitoring/prometheus/default.nix | 97 | ||||
-rw-r--r-- | nixos/tests/prometheus.nix | 100 |
4 files changed, 204 insertions, 3 deletions
diff --git a/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml b/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml index 6378520762b..9ea83ab023f 100644 --- a/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml +++ b/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml @@ -1717,6 +1717,14 @@ Superuser created successfully. </listitem> <listitem> <para> + A new option + <literal>services.prometheus.enableReload</literal> has been + added which can be enabled to reload the prometheus service + when its config file changes instead of restarting. + </para> + </listitem> + <listitem> + <para> Dokuwiki now supports caddy! However </para> <itemizedlist spacing="compact"> diff --git a/nixos/doc/manual/release-notes/rl-2111.section.md b/nixos/doc/manual/release-notes/rl-2111.section.md index c2ea7bb24e0..f3339b1671d 100644 --- a/nixos/doc/manual/release-notes/rl-2111.section.md +++ b/nixos/doc/manual/release-notes/rl-2111.section.md @@ -487,6 +487,8 @@ In addition to numerous new and upgraded packages, this release has the followin - The `cawbird` Twitter client now uses its own API keys to count as different application than upstream builds. This is done to evade application-level rate limiting. While existing accounts continue to work, users may want to remove and re-register their account in the client to enjoy a better user experience and benefit from this change. +- A new option `services.prometheus.enableReload` has been added which can be enabled to reload the prometheus service when its config file changes instead of restarting. + - Dokuwiki now supports caddy! However - the nginx option has been removed, in the new configuration, please use the `dokuwiki.webserver = "nginx"` instead. 
- The "${hostname}" option has been deprecated, please use `dokuwiki.sites = [ "${hostname}" ]` instead diff --git a/nixos/modules/services/monitoring/prometheus/default.nix b/nixos/modules/services/monitoring/prometheus/default.nix index d2b37cf688b..5f7bda1acbc 100644 --- a/nixos/modules/services/monitoring/prometheus/default.nix +++ b/nixos/modules/services/monitoring/prometheus/default.nix @@ -7,6 +7,30 @@ let workingDir = "/var/lib/" + cfg.stateDir; + prometheusYmlOut = "${workingDir}/prometheus-substituted.yaml"; + + writeConfig = pkgs.writeShellScriptBin "write-prometheus-config" '' + PATH="${makeBinPath (with pkgs; [ coreutils envsubst ])}" + touch '${prometheusYmlOut}' + chmod 600 '${prometheusYmlOut}' + envsubst -o '${prometheusYmlOut}' -i '${prometheusYml}' + ''; + + triggerReload = pkgs.writeShellScriptBin "trigger-reload-prometheus" '' + PATH="${makeBinPath (with pkgs; [ systemd ])}" + if systemctl -q is-active prometheus.service; then + systemctl reload prometheus.service + fi + ''; + + reload = pkgs.writeShellScriptBin "reload-prometheus" '' + PATH="${makeBinPath (with pkgs; [ systemd coreutils gnugrep ])}" + cursor=$(journalctl --show-cursor -n0 | grep -oP "cursor: \K.*") + kill -HUP $MAINPID + journalctl -u prometheus.service --after-cursor="$cursor" -f \ + | grep -m 1 "Completed loading of configuration file" > /dev/null + ''; + # a wrapper that verifies that the configuration is valid promtoolCheck = what: name: file: if cfg.checkConfig then @@ -47,7 +71,11 @@ let cmdlineArgs = cfg.extraFlags ++ [ "--storage.tsdb.path=${workingDir}/data/" - "--config.file=/run/prometheus/prometheus-substituted.yaml" + "--config.file=${ + if cfg.enableReload + then prometheusYmlOut + else "/run/prometheus/prometheus-substituted.yaml" + }" "--web.listen-address=${cfg.listenAddress}:${builtins.toString cfg.port}" "--alertmanager.notification-queue-capacity=${toString cfg.alertmanagerNotificationQueueCapacity}" "--alertmanager.timeout=${toString 
cfg.alertmanagerTimeout}s" @@ -731,6 +759,25 @@ in { ''; }; + enableReload = mkOption { + default = false; + type = types.bool; + description = '' + Reload prometheus when configuration file changes (instead of restart). + + The following property holds: switching to a configuration + (<literal>switch-to-configuration</literal>) that changes the prometheus + configuration only finishes successully when prometheus has finished + loading the new configuration. + + Note that prometheus will also get reloaded when the location of the + <option>environmentFile</option> changes but not when its contents + changes. So when you change it contents make sure to reload prometheus + manually or include the hash of <option>environmentFile</option> in its + name. + ''; + }; + environmentFile = mkOption { type = types.nullOr types.path; default = null; @@ -928,7 +975,7 @@ in { systemd.services.prometheus = { wantedBy = [ "multi-user.target" ]; after = [ "network.target" ]; - preStart = '' + preStart = mkIf (!cfg.enableReload) '' ${lib.getBin pkgs.envsubst}/bin/envsubst -o "/run/prometheus/prometheus-substituted.yaml" \ -i "${prometheusYml}" ''; @@ -936,9 +983,10 @@ in { ExecStart = "${cfg.package}/bin/prometheus" + optionalString (length cmdlineArgs != 0) (" \\\n " + concatStringsSep " \\\n " cmdlineArgs); + ExecReload = mkIf cfg.enableReload "+${reload}/bin/reload-prometheus"; User = "prometheus"; Restart = "always"; - EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ]; + EnvironmentFile = mkIf (cfg.environmentFile != null && !cfg.enableReload) [ cfg.environmentFile ]; RuntimeDirectory = "prometheus"; RuntimeDirectoryMode = "0700"; WorkingDirectory = workingDir; @@ -946,5 +994,48 @@ in { StateDirectoryMode = "0700"; }; }; + systemd.services.prometheus-config-write = mkIf cfg.enableReload { + wantedBy = [ "prometheus.service" ]; + before = [ "prometheus.service" ]; + serviceConfig = { + Type = "oneshot"; + User = "prometheus"; + StateDirectory = 
cfg.stateDir; + StateDirectoryMode = "0700"; + EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ]; + ExecStart = "${writeConfig}/bin/write-prometheus-config"; + }; + }; + # prometheus-config-reload will activate after prometheus. However, what we + # don't want is that on startup it immediately reloads prometheus because + # prometheus itself might have just started. + # + # Instead we only want to reload prometheus when the config file has + # changed. So on startup prometheus-config-reload will just output a + # harmless message and then stay active (RemainAfterExit). + # + # Then, when the config file has changed, switch-to-configuration notices + # that this service has changed and needs to be reloaded + # (reloadIfChanged). The reload command then actually writes the new config + # and reloads prometheus. + systemd.services.prometheus-config-reload = mkIf cfg.enableReload { + wantedBy = [ "prometheus.service" ]; + after = [ "prometheus.service" ]; + reloadIfChanged = true; + serviceConfig = { + Type = "oneshot"; + User = "prometheus"; + StateDirectory = cfg.stateDir; + StateDirectoryMode = "0700"; + EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ]; + RemainAfterExit = true; + TimeoutSec = 60; + ExecStart = "${pkgs.logger}/bin/logger 'prometheus-config-reload will only reload prometheus when reloaded itself.'"; + ExecReload = [ + "${writeConfig}/bin/write-prometheus-config" + "+${triggerReload}/bin/trigger-reload-prometheus" + ]; + }; + }; }; } diff --git a/nixos/tests/prometheus.nix b/nixos/tests/prometheus.nix index 70ac78a4a46..d102b4c0751 100644 --- a/nixos/tests/prometheus.nix +++ b/nixos/tests/prometheus.nix @@ -41,6 +41,7 @@ in import ./make-test-python.nix { networking.firewall.allowedTCPPorts = [ grpcPort ]; services.prometheus = { enable = true; + enableReload = true; scrapeConfigs = [ { job_name = "prometheus"; @@ -118,6 +119,36 @@ in import ./make-test-python.nix { # }; #}; }; + # Adds a 
"specialisation" of the above config which allows us to + # "switch" to it and see if the services.prometheus.enableReload + # functionality actually reloads the prometheus service instead of + # restarting it. + specialisation = { + "prometheus-config-change" = { + configuration = { + environment.systemPackages = [ pkgs.yq ]; + + # This configuration just adds a new prometheus job + # to scrape the node_exporter metrics of the s3 machine. + # We also use an environmentFile to test if that works correctly. + services.prometheus = { + environmentFile = pkgs.writeText "prometheus-config-env-file" '' + JOB_NAME=s3-node_exporter + ''; + scrapeConfigs = [ + { + job_name = "$JOB_NAME"; + static_configs = [ + { + targets = [ "s3:9100" ]; + } + ]; + } + ]; + }; + }; + }; + }; }; query = { pkgs, ... }: { @@ -171,10 +202,17 @@ in import ./make-test-python.nix { }; environment.systemPackages = [ pkgs.minio-client ]; + + services.prometheus.exporters.node = { + enable = true; + openFirewall = true; + }; }; }; testScript = { nodes, ... } : '' + import json + # Before starting the other machines we first make sure that our S3 service is online # and has a bucket added for thanos: s3.start() @@ -193,6 +231,12 @@ in import ./make-test-python.nix { # Check if prometheus responds to requests: prometheus.wait_for_unit("prometheus.service") + + # Check if prometheus' config file is correctly locked down because it could contain secrets. 
+ prometheus.succeed( + "stat -c '%a %U' /var/lib/prometheus2/prometheus-substituted.yaml | grep '600 prometheus'" + ) + prometheus.wait_for_open_port(${toString queryPort}) prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics") @@ -245,5 +289,61 @@ in import ./make-test-python.nix { + "jq .thanos.labels.some_label | " + "grep 'required by thanos'" ) + + # Check if switching to a NixOS configuration that changes the prometheus + # configuration reloads (instead of restarts) prometheus before the switch + # finishes successfully: + with subtest("config change reloads prometheus"): + # We check if prometheus has finished reloading by looking for the message + # "Completed loading of configuration file" in the journal between the start + # and finish of switching to the new NixOS configuration. + # + # To mark the start we record the journal cursor before starting the switch: + cursor_before_switching = json.loads( + prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR") + )["__CURSOR"] + + # Now we switch: + prometheus_config_change = prometheus.succeed( + "readlink /run/current-system/specialisation/prometheus-config-change" + ).strip() + prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test") + + # Next we retrieve all logs since the start of switching: + logs_after_starting_switching = prometheus.succeed( + """ + journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE + """.format( + cursor_before_switching=cursor_before_switching + ) + ) + + # Finally we check if the message "Completed loading of configuration file" + # occurs before the "finished switching to system configuration" message: + finished_switching_msg = ( + "finished switching to system configuration " + prometheus_config_change + ) + reloaded_before_switching_finished = False + finished_switching = False + for log_line in logs_after_starting_switching.split("\n"): + msg = json.loads(log_line)["MESSAGE"] + 
if "Completed loading of configuration file" in msg: + reloaded_before_switching_finished = True + if msg == finished_switching_msg: + finished_switching = True + break + + assert reloaded_before_switching_finished + assert finished_switching + + # Check if the reloaded config includes the new s3-node_exporter job: + prometheus.succeed( + """ + curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \ + | jq -r .data.yaml \ + | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \ + | grep true + """ + ) ''; } |