summary refs log tree commit diff
diff options
context:
space:
mode:
author Bas van Dijk <bas@van.dijk.ch> 2021-10-27 14:18:00 +0200
committer Bas van Dijk <bas@dfinity.org> 2021-11-04 11:15:21 +0000
commit f12e976aded8113cb466efcdc2abc1932d9beb05 (patch)
tree a7fad1dd1d6e4561977f691d45d2e7faf7578890
parent 8d5213123af4ba144de3dadbe3f9353ba9f4e8f7 (diff)
download nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.gz
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.bz2
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.lz
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.xz
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.tar.zst
nixpkgs-f12e976aded8113cb466efcdc2abc1932d9beb05.zip
module/prometheus: optionally support reloading on config changes
The new option `services.prometheus.enableReload` has been introduced
which, when enabled, causes the prometheus systemd service to reload
when its config file changes.

More specifically the following property holds: switching to a
configuration (`switch-to-configuration`) that changes the prometheus
configuration only finishes successfully when prometheus has finished
loading the new configuration.

`enableReload` is `false` by default in which case the old semantics
of restarting the prometheus systemd service are in effect.
-rw-r--r-- nixos/doc/manual/from_md/release-notes/rl-2111.section.xml 8
-rw-r--r-- nixos/doc/manual/release-notes/rl-2111.section.md 2
-rw-r--r-- nixos/modules/services/monitoring/prometheus/default.nix 97
-rw-r--r-- nixos/tests/prometheus.nix 100
4 files changed, 204 insertions, 3 deletions
diff --git a/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml b/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml
index 6378520762b..9ea83ab023f 100644
--- a/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml
+++ b/nixos/doc/manual/from_md/release-notes/rl-2111.section.xml
@@ -1717,6 +1717,14 @@ Superuser created successfully.
       </listitem>
       <listitem>
         <para>
+          A new option
+          <literal>services.prometheus.enableReload</literal> has been
+          added which can be enabled to reload the prometheus service
+          when its config file changes instead of restarting.
+        </para>
+      </listitem>
+      <listitem>
+        <para>
           Dokuwiki now supports caddy! However
         </para>
         <itemizedlist spacing="compact">
diff --git a/nixos/doc/manual/release-notes/rl-2111.section.md b/nixos/doc/manual/release-notes/rl-2111.section.md
index c2ea7bb24e0..f3339b1671d 100644
--- a/nixos/doc/manual/release-notes/rl-2111.section.md
+++ b/nixos/doc/manual/release-notes/rl-2111.section.md
@@ -487,6 +487,8 @@ In addition to numerous new and upgraded packages, this release has the followin
 
 - The `cawbird` Twitter client now uses its own API keys to count as different application than upstream builds. This is done to evade application-level rate limiting. While existing accounts continue to work, users may want to remove and re-register their account in the client to enjoy a better user experience and benefit from this change.
 
+- A new option `services.prometheus.enableReload` has been added which can be enabled to reload the prometheus service when its config file changes instead of restarting.
+
 - Dokuwiki now supports caddy! However
   - the nginx option has been removed, in the new configuration, please use the `dokuwiki.webserver = "nginx"` instead.
   - The "${hostname}" option has been deprecated, please use `dokuwiki.sites = [ "${hostname}" ]` instead
diff --git a/nixos/modules/services/monitoring/prometheus/default.nix b/nixos/modules/services/monitoring/prometheus/default.nix
index d2b37cf688b..5f7bda1acbc 100644
--- a/nixos/modules/services/monitoring/prometheus/default.nix
+++ b/nixos/modules/services/monitoring/prometheus/default.nix
@@ -7,6 +7,30 @@ let
 
   workingDir = "/var/lib/" + cfg.stateDir;
 
+  prometheusYmlOut = "${workingDir}/prometheus-substituted.yaml";
+
+  writeConfig = pkgs.writeShellScriptBin "write-prometheus-config" ''
+    PATH="${makeBinPath (with pkgs; [ coreutils envsubst ])}"
+    touch '${prometheusYmlOut}'
+    chmod 600 '${prometheusYmlOut}'
+    envsubst -o '${prometheusYmlOut}' -i '${prometheusYml}'
+  '';
+
+  triggerReload = pkgs.writeShellScriptBin "trigger-reload-prometheus" ''
+    PATH="${makeBinPath (with pkgs; [ systemd ])}"
+    if systemctl -q is-active prometheus.service; then
+      systemctl reload prometheus.service
+    fi
+  '';
+
+  reload = pkgs.writeShellScriptBin "reload-prometheus" ''
+    PATH="${makeBinPath (with pkgs; [ systemd coreutils gnugrep ])}"
+    cursor=$(journalctl --show-cursor -n0 | grep -oP "cursor: \K.*")
+    kill -HUP $MAINPID
+    journalctl -u prometheus.service --after-cursor="$cursor" -f \
+      | grep -m 1 "Completed loading of configuration file" > /dev/null
+  '';
+
   # a wrapper that verifies that the configuration is valid
   promtoolCheck = what: name: file:
     if cfg.checkConfig then
@@ -47,7 +71,11 @@ let
 
   cmdlineArgs = cfg.extraFlags ++ [
     "--storage.tsdb.path=${workingDir}/data/"
-    "--config.file=/run/prometheus/prometheus-substituted.yaml"
+    "--config.file=${
+      if cfg.enableReload
+      then prometheusYmlOut
+      else "/run/prometheus/prometheus-substituted.yaml"
+    }"
     "--web.listen-address=${cfg.listenAddress}:${builtins.toString cfg.port}"
     "--alertmanager.notification-queue-capacity=${toString cfg.alertmanagerNotificationQueueCapacity}"
     "--alertmanager.timeout=${toString cfg.alertmanagerTimeout}s"
@@ -731,6 +759,25 @@ in {
       '';
     };
 
+    enableReload = mkOption {
+      default = false;
+      type = types.bool;
+      description = ''
+        Reload prometheus when configuration file changes (instead of restart).
+
+        The following property holds: switching to a configuration
+        (<literal>switch-to-configuration</literal>) that changes the prometheus
+        configuration only finishes successfully when prometheus has finished
+        loading the new configuration.
+
+        Note that prometheus will also get reloaded when the location of the
+        <option>environmentFile</option> changes but not when its contents
+        change. So when you change its contents make sure to reload prometheus
+        manually or include the hash of <option>environmentFile</option> in its
+        name.
+      '';
+    };
+
     environmentFile = mkOption {
       type = types.nullOr types.path;
       default = null;
@@ -928,7 +975,7 @@ in {
     systemd.services.prometheus = {
       wantedBy = [ "multi-user.target" ];
       after    = [ "network.target" ];
-      preStart = ''
+      preStart = mkIf (!cfg.enableReload) ''
          ${lib.getBin pkgs.envsubst}/bin/envsubst -o "/run/prometheus/prometheus-substituted.yaml" \
                                                   -i "${prometheusYml}"
       '';
@@ -936,9 +983,10 @@ in {
         ExecStart = "${cfg.package}/bin/prometheus" +
           optionalString (length cmdlineArgs != 0) (" \\\n  " +
             concatStringsSep " \\\n  " cmdlineArgs);
+        ExecReload = mkIf cfg.enableReload "+${reload}/bin/reload-prometheus";
         User = "prometheus";
         Restart  = "always";
-        EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
+        EnvironmentFile = mkIf (cfg.environmentFile != null && !cfg.enableReload) [ cfg.environmentFile ];
         RuntimeDirectory = "prometheus";
         RuntimeDirectoryMode = "0700";
         WorkingDirectory = workingDir;
@@ -946,5 +994,48 @@ in {
         StateDirectoryMode = "0700";
       };
     };
+    systemd.services.prometheus-config-write = mkIf cfg.enableReload {
+      wantedBy = [ "prometheus.service" ];
+      before = [ "prometheus.service" ];
+      serviceConfig = {
+        Type = "oneshot";
+        User = "prometheus";
+        StateDirectory = cfg.stateDir;
+        StateDirectoryMode = "0700";
+        EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
+        ExecStart = "${writeConfig}/bin/write-prometheus-config";
+      };
+    };
+    # prometheus-config-reload will activate after prometheus. However, what we
+    # don't want is that on startup it immediately reloads prometheus because
+    # prometheus itself might have just started.
+    #
+    # Instead we only want to reload prometheus when the config file has
+    # changed. So on startup prometheus-config-reload will just output a
+    # harmless message and then stay active (RemainAfterExit).
+    #
+    # Then, when the config file has changed, switch-to-configuration notices
+    # that this service has changed and needs to be reloaded
+    # (reloadIfChanged). The reload command then actually writes the new config
+    # and reloads prometheus.
+    systemd.services.prometheus-config-reload = mkIf cfg.enableReload {
+      wantedBy = [ "prometheus.service" ];
+      after = [ "prometheus.service" ];
+      reloadIfChanged = true;
+      serviceConfig = {
+        Type = "oneshot";
+        User = "prometheus";
+        StateDirectory = cfg.stateDir;
+        StateDirectoryMode = "0700";
+        EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
+        RemainAfterExit = true;
+        TimeoutSec = 60;
+        ExecStart = "${pkgs.logger}/bin/logger 'prometheus-config-reload will only reload prometheus when reloaded itself.'";
+        ExecReload = [
+          "${writeConfig}/bin/write-prometheus-config"
+          "+${triggerReload}/bin/trigger-reload-prometheus"
+        ];
+      };
+    };
   };
 }
diff --git a/nixos/tests/prometheus.nix b/nixos/tests/prometheus.nix
index 70ac78a4a46..d102b4c0751 100644
--- a/nixos/tests/prometheus.nix
+++ b/nixos/tests/prometheus.nix
@@ -41,6 +41,7 @@ in import ./make-test-python.nix {
       networking.firewall.allowedTCPPorts = [ grpcPort ];
       services.prometheus = {
         enable = true;
+        enableReload = true;
         scrapeConfigs = [
           {
             job_name = "prometheus";
@@ -118,6 +119,36 @@ in import ./make-test-python.nix {
         #  };
         #};
       };
+      # Adds a "specialisation" of the above config which allows us to
+      # "switch" to it and see if the services.prometheus.enableReload
+      # functionality actually reloads the prometheus service instead of
+      # restarting it.
+      specialisation = {
+        "prometheus-config-change" = {
+          configuration = {
+            environment.systemPackages = [ pkgs.yq ];
+
+            # This configuration just adds a new prometheus job
+            # to scrape the node_exporter metrics of the s3 machine.
+            # We also use an environmentFile to test if that works correctly.
+            services.prometheus = {
+              environmentFile = pkgs.writeText "prometheus-config-env-file" ''
+                JOB_NAME=s3-node_exporter
+              '';
+              scrapeConfigs = [
+                {
+                  job_name = "$JOB_NAME";
+                  static_configs = [
+                    {
+                      targets = [ "s3:9100" ];
+                    }
+                  ];
+                }
+              ];
+            };
+          };
+        };
+      };
     };
 
     query = { pkgs, ... }: {
@@ -171,10 +202,17 @@ in import ./make-test-python.nix {
       };
 
       environment.systemPackages = [ pkgs.minio-client ];
+
+      services.prometheus.exporters.node = {
+        enable = true;
+        openFirewall = true;
+      };
     };
   };
 
   testScript = { nodes, ... } : ''
+    import json
+
     # Before starting the other machines we first make sure that our S3 service is online
     # and has a bucket added for thanos:
     s3.start()
@@ -193,6 +231,12 @@ in import ./make-test-python.nix {
 
     # Check if prometheus responds to requests:
     prometheus.wait_for_unit("prometheus.service")
+
+    # Check if prometheus' config file is correctly locked down because it could contain secrets.
+    prometheus.succeed(
+        "stat -c '%a %U' /var/lib/prometheus2/prometheus-substituted.yaml | grep '600 prometheus'"
+    )
+
     prometheus.wait_for_open_port(${toString queryPort})
     prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")
 
@@ -245,5 +289,61 @@ in import ./make-test-python.nix {
         + "jq .thanos.labels.some_label | "
         + "grep 'required by thanos'"
     )
+
+    # Check if switching to a NixOS configuration that changes the prometheus
+    # configuration reloads (instead of restarts) prometheus before the switch
+    # finishes successfully:
+    with subtest("config change reloads prometheus"):
+        # We check if prometheus has finished reloading by looking for the message
+        # "Completed loading of configuration file" in the journal between the start
+        # and finish of switching to the new NixOS configuration.
+        #
+        # To mark the start we record the journal cursor before starting the switch:
+        cursor_before_switching = json.loads(
+            prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
+        )["__CURSOR"]
+
+        # Now we switch:
+        prometheus_config_change = prometheus.succeed(
+            "readlink /run/current-system/specialisation/prometheus-config-change"
+        ).strip()
+        prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")
+
+        # Next we retrieve all logs since the start of switching:
+        logs_after_starting_switching = prometheus.succeed(
+            """
+              journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
+            """.format(
+                cursor_before_switching=cursor_before_switching
+            )
+        )
+
+        # Finally we check if the message "Completed loading of configuration file"
+        # occurs before the "finished switching to system configuration" message:
+        finished_switching_msg = (
+            "finished switching to system configuration " + prometheus_config_change
+        )
+        reloaded_before_switching_finished = False
+        finished_switching = False
+        for log_line in logs_after_starting_switching.split("\n"):
+            msg = json.loads(log_line)["MESSAGE"]
+            if "Completed loading of configuration file" in msg:
+                reloaded_before_switching_finished = True
+            if msg == finished_switching_msg:
+                finished_switching = True
+                break
+
+        assert reloaded_before_switching_finished
+        assert finished_switching
+
+        # Check if the reloaded config includes the new s3-node_exporter job:
+        prometheus.succeed(
+          """
+            curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
+              | jq -r .data.yaml \
+              | yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
+              | grep true
+          """
+        )
   '';
 }