# pkgs/os-specific/linux/dcgm/default.nix

# Package expression for NVIDIA DCGM (pname "dcgm").
# Every argument below is a dependency the caller must supply; none has a default.
#   - gcc11Stdenv provides the mkDerivation used to build the package.
#   - cudaPackages_10_2, cudaPackages_11_8 and cudaPackages_12 are all used
#     together, because DCGM builds against three CUDA versions at once.
{ lib
, gcc11Stdenv
, fetchFromGitHub
, catch2
, cmake
, cudaPackages_10_2
, cudaPackages_11_8
, cudaPackages_12
, fmt_9
, git
, jsoncpp
, libevent
, plog
, python3
, symlinkJoin
, tclap_1_4
, yaml-cpp
}:
let
  # libevent with its `sslSupport` option switched off; base for the static
  # variant below.
  libevent-nossl = libevent.override { sslSupport = false; };

  # Static-only, position-independent build of the libevent above.
  # Flags copied from DCGM's libevent build script.
  libevent-nossl-static = libevent-nossl.overrideAttrs (prev:
    let
      # The same warning suppressions and -fPIC apply to both C and C++.
      compileFlags = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
    in {
      CFLAGS = compileFlags;
      CXXFLAGS = compileFlags;
      # `or [ ]` keeps evaluation working even if the overridden derivation
      # defines no configureFlags of its own.
      configureFlags = (prev.configureFlags or [ ]) ++ [ "--disable-shared" "--with-pic" ];
    });

  # jsoncpp with its `enableStatic` option turned on; this is the variant
  # listed in buildInputs.
  jsoncpp-static = jsoncpp.override { enableStatic = true; };

  # DCGM is built against three CUDA versions simultaneously. Most things are
  # linked statically, so the runtime closure thankfully stays quite small.
  # Each entry pairs a CUDA major version with the packages that supply its
  # headers and libraries.
  cudaPackageSetByVersion =
    let
      # Builds one { version, pkgSet } record.
      entry = version: pkgSet: { inherit version pkgSet; };
      # Nixpkgs cudaPackages_10 doesn't have redist packages broken out, so
      # the toolkit and its `.lib` are used instead.
      inherit (cudaPackages_10_2) cudatoolkit;
    in [
      (entry "10" [ cudatoolkit cudatoolkit.lib ])
      (entry "11" (getCudaPackages cudaPackages_11_8))
      (entry "12" (getCudaPackages cudaPackages_12))
    ];

  # Picks the redist packages DCGM needs out of a cudaPackages set `p`,
  # returning them as a list in the order given below.
  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
  getCudaPackages = p:
    map (name: p.${name}) [
      "cuda_cccl"
      "cuda_cudart"
      "cuda_nvcc"
      "cuda_nvml_dev"
      "libcublas"
      "libcufft"
      "libcurand"
    ];

  # Renders the CMake statements that append the include and lib directories
  # of one CUDA version's packages to the Cuda<version>_* path lists.
  #   version: string spliced into the CMake variable names and the
  #            combined tree's name
  #   pkgSet:  list of packages providing that version's headers and libraries
  mkAppendCudaPaths = { version, pkgSet }:
    let
      # DCGM's CMake expects every header to live in the directory that holds
      # cuda.h, so all packages are merged into a single tree for includes.
      # Joining only the include subdirectories would be tidier, but not every
      # package has one, and handling that is not worth the effort for a
      # temporary, build-time package.
      mergedTree = symlinkJoin {
        name = "cuda-combined-${version}";
        paths = pkgSet;
      };
      # Using the merged tree for libraries breaks the build for some reason,
      # so each package's own lib directory is passed instead, double-quoted.
      quotedLibDirs = map (pkg: "\"${pkg}/lib\"") pkgSet;
      libPathArgs = builtins.concatStringsSep " " quotedLibDirs;
    in ''
      list(APPEND Cuda${version}_INCLUDE_PATHS "${mergedTree}/include")
      list(APPEND Cuda${version}_LIB_PATHS ${libPathArgs})
    '';

# gcc11 is required by DCGM's very particular build system
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
in gcc11Stdenv.mkDerivation rec {
  pname = "dcgm";
  version = "3.2.5"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.

  # Source is fetched from the upstream tag matching `version`.
  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    rev = "refs/tags/v${version}";
    hash = "sha256-iMyYOr3dSpdRV2S/TlB/tEOAWYhK09373ZRbd5vzogQ=";
  };

  # Add our paths to the CUDA paths so FindCuda.cmake can find them.
  # One block of list(APPEND ...) statements per CUDA version, newline-separated.
  EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
  # Prepend those statements to cmake/FindCuda.cmake. The shell expands the
  # "$(cat ...)" substitution before it applies the `>` redirection, so the
  # original contents are read before the file is truncated.
  prePatch = ''
    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
  '';

  # Every hardening flag is turned off for this build.
  # NOTE(review): the reason is not recorded here; confirm it is still needed.
  hardeningDisable = [ "all" ];

  strictDeps = true;

  nativeBuildInputs = [
    # autoAddOpenGLRunpathHook does not actually depend on or incur any dependency
    # of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of
    # executables that need to use cuda at runtime.
    cudaPackages_12.autoAddOpenGLRunpathHook

    cmake
    git
    python3
  ];

  buildInputs = [
    plog.dev # header-only
    tclap_1_4 # header-only

    catch2
    fmt_9
    jsoncpp-static
    libevent-nossl-static
    yaml-cpp
  ];

  # Every CUDA package from every version above is listed here, so the build
  # fails if the output keeps a reference to any of them.
  disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;

  meta = {
    # No trailing period, per the Nixpkgs convention for meta.description.
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    maintainers = lib.teams.deshaw.members;
    mainProgram = "dcgmi";
    platforms = lib.platforms.linux;
  };
}