# path: root/nixos/modules/security/systemd-confinement.nix
# blob: 0a400f1d535ba5bf6a447a660ba4f7974e22dc50
{ config, pkgs, lib, ... }:

let
  # The per-service submodule below takes its own `config` argument, which
  # shadows the module-level one; keep a handle on the outer configuration.
  toplevelConfig = config;
  types = lib.types;
  # Helper from the systemd library, used here to derive derivation names
  # from service names.
  mkPathSafeName = (import ../system/boot/systemd-lib.nix {
    inherit config pkgs lib;
  }).mkPathSafeName;
in {
  options.systemd.services = lib.mkOption {
    type = types.attrsOf (types.submodule ({ name, config, ... }: {
      # Master switch: the confinement settings of this submodule and the
      # generated bind-mount drop-in are only applied when this is true.
      options.confinement.enable = lib.mkOption {
        default = false;
        type = types.bool;
        description = ''
          If set, all the required runtime store paths for this service are
          bind-mounted into a <literal>tmpfs</literal>-based <citerefentry>
            <refentrytitle>chroot</refentrytitle>
            <manvolnum>2</manvolnum>
          </citerefentry>.
        '';
      };

      # Selects which store paths seed the chroot closure: the whole rendered
      # unit (true) or only the values of the Exec* options (false).
      options.confinement.fullUnit = lib.mkOption {
        default = false;
        type = types.bool;
        description = ''
          Whether to include the full closure of the systemd unit file into the
          chroot, instead of just the dependencies for the executables.

          <warning><para>While it may be tempting to just enable this option to
          make things work quickly, please be aware that this might add paths
          to the closure of the chroot that you didn't anticipate. It's better
          to use <option>confinement.packages</option> to <emphasis
          role="strong">explicitly</emphasis> add additional store paths to the
          chroot.</para></warning>
        '';
      };

      # Extra closure roots for the chroot. Entries are either packages or
      # strings carrying string context; the closure of all of them gets
      # bind-mounted into the service's root directory.
      options.confinement.packages = lib.mkOption {
        default = [];
        type = types.listOf (types.either types.str types.package);
        description = let
          # Render an option name as a DocBook reference to serviceConfig.*.
          asOption = optName: "<option>serviceConfig.${optName}</option>";
          # Every default Exec* option except ExecStart, which is appended
          # separately so the sentence reads "a, b, ... and ExecStart".
          leadingOpts = lib.concatStringsSep ", " (map asOption [
            "ExecReload" "ExecStartPost" "ExecStartPre" "ExecStop"
            "ExecStopPost"
          ]);
        in ''
          Additional packages or strings with context to add to the closure of
          the chroot. By default, this includes all the packages from the
          ${leadingOpts} and ${asOption "ExecStart"} options. If you want to have all the
          dependencies of this systemd unit, you can use
          <option>confinement.fullUnit</option>.

          <note><para>The store paths listed in <option>path</option> are
          <emphasis role="strong">not</emphasis> included in the closure as
          well as paths from other options except those listed
          above.</para></note>
        '';
      };

      # Program that is bind-mounted to /bin/sh inside the chroot, or null to
      # provide no /bin/sh at all. Defaults to the system-wide
      # environment.binsh, read from the outer (non-submodule) configuration.
      options.confinement.binSh = lib.mkOption {
        type = types.nullOr types.path;
        default = toplevelConfig.environment.binsh;
        defaultText = "config.environment.binsh";
        # literalExample is rendered verbatim as a Nix expression, so the
        # string has to carry its own quotes to be valid when copy-pasted.
        example = lib.literalExample ''"''${pkgs.dash}/bin/dash"'';
        description = ''
          The program to make available as <filename>/bin/sh</filename> inside
          the chroot. If this is set to <literal>null</literal>, no
          <filename>/bin/sh</filename> is provided at all.

          This is useful for some applications, which for example use the
          <citerefentry>
            <refentrytitle>system</refentrytitle>
            <manvolnum>3</manvolnum>
          </citerefentry> library function to execute commands.
        '';
      };

      # Chooses between a bare chroot and a chroot with private API file
      # systems; the value drives the defaults of MountAPIVFS, PrivateDevices,
      # PrivateTmp, PrivateUsers and the Protect* options set in `config`.
      options.confinement.mode = lib.mkOption {
        default = "full-apivfs";
        type = types.enum [ "full-apivfs" "chroot-only" ];
        description = ''
          The value <literal>full-apivfs</literal> (the default) sets up
          private <filename class="directory">/dev</filename>, <filename
          class="directory">/proc</filename>, <filename
          class="directory">/sys</filename> and <filename
          class="directory">/tmp</filename> file systems in a separate user
          name space.

          If this is set to <literal>chroot-only</literal>, only the file
          system name space is set up along with the call to <citerefentry>
            <refentrytitle>chroot</refentrytitle>
            <manvolnum>2</manvolnum>
          </citerefentry>.

          <note><para>This doesn't cover network namespaces and is solely for
          file system level isolation.</para></note>
        '';
      };

      # Per-service configuration that is applied when confinement is enabled:
      # sets up the (empty) root directory plus sandboxing defaults and seeds
      # `confinement.packages` with the store paths the unit needs.
      config = let
        # Name of the empty directory derivation used as RootDirectory.
        rootName = "${mkPathSafeName name}-chroot";
        inherit (config.confinement) binSh fullUnit;
        # mkDefault so that users can still override the individual options.
        wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs");
      in lib.mkIf config.confinement.enable {
        serviceConfig = {
          RootDirectory = pkgs.runCommand rootName {} "mkdir \"$out\"";
          TemporaryFileSystem = "/";
          PrivateMounts = lib.mkDefault true;

          # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
          # to change some of these to default to true.
          #
          # If we run in chroot-only mode, having something like PrivateDevices
          # set to true by default will mount /dev within the chroot, whereas
          # with "chroot-only" it's expected that there are no /dev, /proc and
          # /sys file systems available.
          #
          # However, if this suddenly becomes true, the attack surface will
          # increase, so let's explicitly set these options to true/false
          # depending on the mode.
          MountAPIVFS = wantsAPIVFS;
          PrivateDevices = wantsAPIVFS;
          PrivateTmp = wantsAPIVFS;
          PrivateUsers = wantsAPIVFS;
          ProtectControlGroups = wantsAPIVFS;
          ProtectKernelModules = wantsAPIVFS;
          ProtectKernelTunables = wantsAPIVFS;
        };
        confinement.packages = let
          execOpts = [
            "ExecReload" "ExecStart" "ExecStartPost" "ExecStartPre" "ExecStop"
            "ExecStopPost"
          ];
          # A serviceConfig value may be a single string or a list of strings
          # (one directive per element, e.g. several ExecStartPre commands).
          # Normalise both shapes to a flat list so that the result matches
          # the `listOf (either str package)` type of confinement.packages;
          # unset options contribute nothing.
          execPkgs = lib.concatMap (opt:
            lib.toList (config.serviceConfig.${opt} or [])
          ) execOpts;
          unitAttrs = toplevelConfig.systemd.units."${name}.service";
          # Serialising the whole unit keeps the string context of everything
          # it references, which is what pulls in the full closure.
          allPkgs = lib.singleton (builtins.toJSON unitAttrs);
          unitPkgs = if fullUnit then allPkgs else execPkgs;
        in unitPkgs ++ lib.optional (binSh != null) binSh;
      };
    }));
  };

  # For every confined service, reject serviceConfig options that are known
  # not to work together with confinement.
  config.assertions = lib.concatLists (lib.mapAttrsToList (name: cfg: let
    whatOpt = optName: "The 'serviceConfig' option '${optName}' for"
                    + " service '${name}' is enabled in conjunction with"
                    + " 'confinement.enable'";
    # serviceConfig values are not necessarily Nix booleans: ProtectSystem
    # for instance also accepts "full" and "strict", and booleans may be
    # spelled as strings. Negating such a value with `!` aborts evaluation
    # with a type error instead of showing the message below, so compare
    # against the values that mean "off" instead. Unset counts as off.
    isOff = optName: lib.elem (cfg.serviceConfig.${optName} or false) [
      false "0" "no" "n" "false" "f" "off"
    ];
  in lib.optionals cfg.confinement.enable [
    { assertion = isOff "RootDirectoryStartOnly";
      message = "${whatOpt "RootDirectoryStartOnly"}, but right now systemd"
              + " doesn't support restricting bind-mounts to 'ExecStart'."
              + " Please either define a separate service or find a way to run"
              + " commands other than ExecStart within the chroot.";
    }
    { assertion = isOff "DynamicUser";
      message = "${whatOpt "DynamicUser"}. Please create a dedicated user via"
              + " the 'users.users' option instead as this combination is"
              + " currently not supported.";
    }
    { assertion = isOff "ProtectSystem";
      message = "${whatOpt "ProtectSystem"}. ProtectSystem is not compatible"
              + " with service confinement as it fails to remount /usr within"
              + " our chroot. Please disable the option.";
    }
  ]) config.systemd.services);

  # For each confined service, build a package containing a drop-in
  # "<name>.service" file that consists of one BindReadOnlyPaths= line per
  # store path in the closure of the service's confinement.packages.
  config.systemd.packages = lib.mapAttrsToList (name: cfg: let
    safeName = mkPathSafeName name;

    # Text file listing all closure roots; its string context is what makes
    # closureInfo see the referenced store paths.
    contextFile = pkgs.writeText "${safeName}-string-contexts.txt"
      (lib.concatStringsSep "\n" cfg.confinement.packages);

  in pkgs.runCommand "${safeName}-chroot-paths" {
      closureInfo = pkgs.closureInfo { rootPaths = contextFile; };
      serviceName = "${name}.service";
      # The context file itself is part of the closure but has no business
      # being inside the chroot, so the script below skips it.
      excludedPath = contextFile;
    } ''
      mkdir -p "$out/lib/systemd/system"
      serviceFile="$out/lib/systemd/system/$serviceName"

      echo '[Service]' > "$serviceFile"

      # /bin/sh is special here, because the option value could contain a
      # symlink and we need to properly resolve it.
      ${lib.optionalString (cfg.confinement.binSh != null) ''
        binsh=${lib.escapeShellArg cfg.confinement.binSh}
        realprog="$(readlink -e "$binsh")"
        echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
      ''}

      while read storePath; do
        if [ -L "$storePath" ]; then
          # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
          # so let's just bind-mount the target to that location.
          echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
        elif [ "$storePath" != "$excludedPath" ]; then
          echo "BindReadOnlyPaths=$storePath"
        fi
      done < "$closureInfo/store-paths" >> "$serviceFile"
    ''
  ) (lib.filterAttrs (_: cfg: cfg.confinement.enable) config.systemd.services);
}