summary refs log tree commit diff
path: root/nixos/modules/services/computing/slurm/slurm.nix
diff options
context:
space:
mode:
Diffstat (limited to 'nixos/modules/services/computing/slurm/slurm.nix')
-rw-r--r--nixos/modules/services/computing/slurm/slurm.nix437
1 files changed, 437 insertions, 0 deletions
diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
new file mode 100644
index 00000000000..8cbe54c6060
--- /dev/null
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -0,0 +1,437 @@
+{ config, lib, options, pkgs, ... }:
+
+with lib;
+
+let
+
+  cfg = config.services.slurm;
+  opt = options.services.slurm;
+  # configuration file can be generated by http://slurm.schedmd.com/configurator.html
+
+  defaultUser = "slurm";
+
+  configFile = pkgs.writeTextDir "slurm.conf"
+    ''
+      ClusterName=${cfg.clusterName}
+      StateSaveLocation=${cfg.stateSaveLocation}
+      SlurmUser=${cfg.user}
+      ${optionalString (cfg.controlMachine != null) "controlMachine=${cfg.controlMachine}"}
+      ${optionalString (cfg.controlAddr != null) "controlAddr=${cfg.controlAddr}"}
+      ${toString (map (x: "NodeName=${x}\n") cfg.nodeName)}
+      ${toString (map (x: "PartitionName=${x}\n") cfg.partitionName)}
+      PlugStackConfig=${plugStackConfig}/plugstack.conf
+      ProctrackType=${cfg.procTrackType}
+      ${cfg.extraConfig}
+    '';
+
+  plugStackConfig = pkgs.writeTextDir "plugstack.conf"
+    ''
+      ${optionalString cfg.enableSrunX11 "optional ${pkgs.slurm-spank-x11}/lib/x11.so"}
+      ${cfg.extraPlugstackConfig}
+    '';
+
+  cgroupConfig = pkgs.writeTextDir "cgroup.conf"
+   ''
+     ${cfg.extraCgroupConfig}
+   '';
+
+  slurmdbdConf = pkgs.writeText "slurmdbd.conf"
+   ''
+     DbdHost=${cfg.dbdserver.dbdHost}
+     SlurmUser=${cfg.user}
+     StorageType=accounting_storage/mysql
+     StorageUser=${cfg.dbdserver.storageUser}
+     ${cfg.dbdserver.extraConfig}
+   '';
+
+  # slurm expects some additional config files to be
+  # in the same directory as slurm.conf
+  etcSlurm = pkgs.symlinkJoin {
+    name = "etc-slurm";
+    paths = [ configFile cgroupConfig plugStackConfig ] ++ cfg.extraConfigPaths;
+  };
+in
+
+{
+
+  ###### interface
+
+  meta.maintainers = [ maintainers.markuskowa ];
+
+  options = {
+
+    services.slurm = {
+
+      server = {
+        enable = mkOption {
+          type = types.bool;
+          default = false;
+          description = ''
+            Whether to enable the slurm control daemon.
+            Note that the standard authentication method is "munge".
+            The "munge" service needs to be provided with a password file in order for
+            slurm to work properly (see <literal>services.munge.password</literal>).
+          '';
+        };
+      };
+
+      dbdserver = {
+        enable = mkEnableOption "SlurmDBD service";
+
+        dbdHost = mkOption {
+          type = types.str;
+          default = config.networking.hostName;
+          defaultText = literalExpression "config.networking.hostName";
+          description = ''
+            Hostname of the machine where <literal>slurmdbd</literal>
+            is running (i.e. name returned by <literal>hostname -s</literal>).
+          '';
+        };
+
+        storageUser = mkOption {
+          type = types.str;
+          default = cfg.user;
+          defaultText = literalExpression "config.${opt.user}";
+          description = ''
+            Database user name.
+          '';
+        };
+
+        storagePassFile = mkOption {
+          type = with types; nullOr str;
+          default = null;
+          description = ''
+            Path to file with database password. The content of this will be used to
+            create the password for the <literal>StoragePass</literal> option.
+          '';
+        };
+
+        extraConfig = mkOption {
+          type = types.lines;
+          default = "";
+          description = ''
+            Extra configuration for <literal>slurmdbd.conf</literal> See also:
+            <citerefentry><refentrytitle>slurmdbd.conf</refentrytitle>
+            <manvolnum>8</manvolnum></citerefentry>.
+          '';
+        };
+      };
+
+      client = {
+        enable = mkEnableOption "slurm client daemon";
+      };
+
+      enableStools = mkOption {
+        type = types.bool;
+        default = false;
+        description = ''
+          Whether to provide a slurm.conf file.
+          Enable this option if you do not run a slurm daemon on this host
+          (i.e. <literal>server.enable</literal> and <literal>client.enable</literal> are <literal>false</literal>)
+          but you still want to run slurm commands from this host.
+        '';
+      };
+
+      package = mkOption {
+        type = types.package;
+        default = pkgs.slurm.override { enableX11 = ! cfg.enableSrunX11; };
+        defaultText = literalExpression "pkgs.slurm";
+        example = literalExpression "pkgs.slurm-full";
+        description = ''
+          The package to use for slurm binaries.
+        '';
+      };
+
+      controlMachine = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        example = null;
+        description = ''
+          The short hostname of the machine where SLURM control functions are
+          executed (i.e. the name returned by the command "hostname -s", use "tux001"
+          rather than "tux001.my.com").
+        '';
+      };
+
+      controlAddr = mkOption {
+        type = types.nullOr types.str;
+        default = cfg.controlMachine;
+        defaultText = literalExpression "config.${opt.controlMachine}";
+        example = null;
+        description = ''
+          Name that ControlMachine should be referred to in establishing a
+          communications path.
+        '';
+      };
+
+      clusterName = mkOption {
+        type = types.str;
+        default = "default";
+        example = "myCluster";
+        description = ''
+          Necessary to distinguish accounting records in a multi-cluster environment.
+        '';
+      };
+
+      nodeName = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        example = literalExpression ''[ "linux[1-32] CPUs=1 State=UNKNOWN" ];'';
+        description = ''
+          Name that SLURM uses to refer to a node (or base partition for BlueGene
+          systems). Typically this would be the string that "/bin/hostname -s"
+          returns. Note that now you have to write node's parameters after the name.
+        '';
+      };
+
+      partitionName = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        example = literalExpression ''[ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ];'';
+        description = ''
+          Name by which the partition may be referenced. Note that now you have
+          to write the partition's parameters after the name.
+        '';
+      };
+
+      enableSrunX11 = mkOption {
+        default = false;
+        type = types.bool;
+        description = ''
+          If enabled srun will accept the option "--x11" to allow for X11 forwarding
+          from within an interactive session or a batch job. This activates the
+          slurm-spank-x11 module. Note that this option also enables
+          <option>services.openssh.forwardX11</option> on the client.
+
+          This option requires slurm to be compiled without native X11 support.
+          The default behavior is to re-compile the slurm package with native X11
+          support disabled if this option is set to true.
+
+          To use the native X11 support add <literal>PrologFlags=X11</literal> in <option>extraConfig</option>.
+          Note that this method will only work RSA SSH host keys.
+        '';
+      };
+
+      procTrackType = mkOption {
+        type = types.str;
+        default = "proctrack/linuxproc";
+        description = ''
+          Plugin to be used for process tracking on a job step basis.
+          The slurmd daemon uses this mechanism to identify all processes
+          which are children of processes it spawns for a user job step.
+        '';
+      };
+
+      stateSaveLocation = mkOption {
+        type = types.str;
+        default = "/var/spool/slurmctld";
+        description = ''
+          Directory into which the Slurm controller, slurmctld, saves its state.
+        '';
+      };
+
+      user = mkOption {
+        type = types.str;
+        default = defaultUser;
+        description = ''
+          Set this option when you want to run the slurmctld daemon
+          as something else than the default slurm user "slurm".
+          Note that the UID of this user needs to be the same
+          on all nodes.
+        '';
+      };
+
+      extraConfig = mkOption {
+        default = "";
+        type = types.lines;
+        description = ''
+          Extra configuration options that will be added verbatim at
+          the end of the slurm configuration file.
+        '';
+      };
+
+      extraPlugstackConfig = mkOption {
+        default = "";
+        type = types.lines;
+        description = ''
+          Extra configuration that will be added to the end of <literal>plugstack.conf</literal>.
+        '';
+      };
+
+      extraCgroupConfig = mkOption {
+        default = "";
+        type = types.lines;
+        description = ''
+          Extra configuration for <literal>cgroup.conf</literal>. This file is
+          used when <literal>procTrackType=proctrack/cgroup</literal>.
+        '';
+      };
+
+      extraConfigPaths = mkOption {
+        type = with types; listOf path;
+        default = [];
+        description = ''
+          Slurm expects config files for plugins in the same path
+          as <literal>slurm.conf</literal>. Add extra nix store
+          paths that should be merged into same directory as
+          <literal>slurm.conf</literal>.
+        '';
+      };
+
+      etcSlurm = mkOption {
+        type = types.path;
+        internal = true;
+        default = etcSlurm;
+        defaultText = literalDocBook ''
+          Directory created from generated config files and
+          <literal>config.${opt.extraConfigPaths}</literal>.
+        '';
+        description = ''
+          Path to directory with slurm config files. This option is set by default from the
+          Slurm module and is meant to make the Slurm config file available to other modules.
+        '';
+      };
+
+    };
+
+  };
+
+  imports = [
+    (mkRemovedOptionModule [ "services" "slurm" "dbdserver" "storagePass" ] ''
+      This option has been removed so that the database password is not exposed via the nix store.
+      Use services.slurm.dbdserver.storagePassFile to provide the database password.
+    '')
+    (mkRemovedOptionModule [ "services" "slurm" "dbdserver" "configFile" ] ''
+      This option has been removed. Use services.slurm.dbdserver.storagePassFile
+      and services.slurm.dbdserver.extraConfig instead.
+    '')
+  ];
+
+  ###### implementation
+
+  config =
+    let
+      wrappedSlurm = pkgs.stdenv.mkDerivation {
+        name = "wrappedSlurm";
+
+        builder = pkgs.writeText "builder.sh" ''
+          source $stdenv/setup
+          mkdir -p $out/bin
+          find  ${getBin cfg.package}/bin -type f -executable | while read EXE
+          do
+            exename="$(basename $EXE)"
+            wrappername="$out/bin/$exename"
+            cat > "$wrappername" <<EOT
+          #!/bin/sh
+          if [ -z "$SLURM_CONF" ]
+          then
+            SLURM_CONF="${cfg.etcSlurm}/slurm.conf" "$EXE" "\$@"
+          else
+            "$EXE" "\$0"
+          fi
+          EOT
+            chmod +x "$wrappername"
+          done
+
+          mkdir -p $out/share
+          ln -s ${getBin cfg.package}/share/man $out/share/man
+        '';
+      };
+
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {
+
+    environment.systemPackages = [ wrappedSlurm ];
+
+    services.munge.enable = mkDefault true;
+
+    # use a static uid as default to ensure it is the same on all nodes
+    users.users.slurm = mkIf (cfg.user == defaultUser) {
+      name = defaultUser;
+      group = "slurm";
+      uid = config.ids.uids.slurm;
+    };
+
+    users.groups.slurm.gid = config.ids.uids.slurm;
+
+    systemd.services.slurmd = mkIf (cfg.client.enable) {
+      path = with pkgs; [ wrappedSlurm coreutils ]
+        ++ lib.optional cfg.enableSrunX11 slurm-spank-x11;
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "systemd-tmpfiles-clean.service" ];
+      requires = [ "network.target" ];
+
+      serviceConfig = {
+        Type = "forking";
+        KillMode = "process";
+        ExecStart = "${wrappedSlurm}/bin/slurmd";
+        PIDFile = "/run/slurmd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+        LimitMEMLOCK = "infinity";
+      };
+    };
+
+    systemd.tmpfiles.rules = mkIf cfg.client.enable [
+      "d /var/spool/slurmd 755 root root -"
+    ];
+
+    services.openssh.forwardX11 = mkIf cfg.client.enable (mkDefault true);
+
+    systemd.services.slurmctld = mkIf (cfg.server.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ]
+        ++ lib.optional cfg.enableSrunX11 slurm-spank-x11;
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" ];
+      requires = [ "munged.service" ];
+
+      serviceConfig = {
+        Type = "forking";
+        ExecStart = "${wrappedSlurm}/bin/slurmctld";
+        PIDFile = "/run/slurmctld.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+
+      preStart = ''
+        mkdir -p ${cfg.stateSaveLocation}
+        chown -R ${cfg.user}:slurm ${cfg.stateSaveLocation}
+      '';
+    };
+
+    systemd.services.slurmdbd = let
+      # slurm strips the last component off the path
+      configPath = "$RUNTIME_DIRECTORY/slurmdbd.conf";
+    in mkIf (cfg.dbdserver.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" "mysql.service" ];
+      requires = [ "munged.service" "mysql.service" ];
+
+      preStart = ''
+        install -m 600 -o ${cfg.user} -T ${slurmdbdConf} ${configPath}
+        ${optionalString (cfg.dbdserver.storagePassFile != null) ''
+          echo "StoragePass=$(cat ${cfg.dbdserver.storagePassFile})" \
+            >> ${configPath}
+        ''}
+      '';
+
+      script = ''
+        export SLURM_CONF=${configPath}
+        exec ${cfg.package}/bin/slurmdbd -D
+      '';
+
+      serviceConfig = {
+        RuntimeDirectory = "slurmdbd";
+        Type = "simple";
+        PIDFile = "/run/slurmdbd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+
+  };
+
+}