summary refs log tree commit diff
diff options
context:
space:
mode:
author Samuel Ainsworth <skainsworth@gmail.com> 2023-07-26 14:48:38 -0700
committer GitHub <noreply@github.com> 2023-07-26 14:48:38 -0700
commit 931999d13b29aed0c85b4d24d1afe665bd7a8f9b (patch)
tree dc7ae6274876c40b4bfe6b884de7fea97a6ff636
parent 0c7765a38ad7bd8182c6284a8b5877218bfa078a (diff)
parent b25101f159010487c512c14ddde689dca253111d (diff)
download nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar.gz
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar.bz2
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar.lz
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar.xz
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.tar.zst
nixpkgs-931999d13b29aed0c85b4d24d1afe665bd7a8f9b.zip
Merge pull request #235024 from deshaw/upstream-dcgm
Add NVIDIA DCGM and DCGM-exporter (prometheus)
-rw-r--r-- pkgs/development/compilers/cudatoolkit/common.nix 6
-rw-r--r-- pkgs/development/libraries/jsoncpp/default.nix 10
-rw-r--r-- pkgs/development/libraries/libevent/default.nix 2
-rw-r--r-- pkgs/development/libraries/tclap/1.2.nix (renamed from pkgs/development/libraries/tclap/default.nix) 0
-rw-r--r-- pkgs/development/libraries/tclap/1.4.nix 48
-rw-r--r-- pkgs/os-specific/linux/dcgm/default.nix 147
-rw-r--r-- pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix 66
-rw-r--r-- pkgs/top-level/all-packages.nix 9
8 files changed, 277 insertions, 11 deletions
diff --git a/pkgs/development/compilers/cudatoolkit/common.nix b/pkgs/development/compilers/cudatoolkit/common.nix
index a7a2e52b322..1f934ef5d46 100644
--- a/pkgs/development/compilers/cudatoolkit/common.nix
+++ b/pkgs/development/compilers/cudatoolkit/common.nix
@@ -138,7 +138,7 @@ backendStdenv.mkDerivation rec {
     (ucx.override { enableCuda = false; }) # Avoid infinite recursion
     xorg.libxshmfence
     xorg.libxkbfile
-  ] ++ (lib.optionals (lib.versionAtLeast version "12.1") (map lib.getLib ([
+  ] ++ (lib.optionals (lib.versionAtLeast version "12") (map lib.getLib ([
     # Used by `/target-linux-x64/CollectX/clx` and `/target-linux-x64/CollectX/libclx_api.so` for:
     # - `libcurl.so.4`
     curlMinimal
@@ -183,7 +183,9 @@ backendStdenv.mkDerivation rec {
     "libcom_err.so.2"
   ];
 
-  preFixup = ''
+  preFixup = if lib.versionOlder version "11" then ''
+    patchelf $out/targets/*/lib/libnvrtc.so --add-needed libnvrtc-builtins.so
+  '' else ''
     patchelf $out/lib64/libnvrtc.so --add-needed libnvrtc-builtins.so
   '';
 
diff --git a/pkgs/development/libraries/jsoncpp/default.nix b/pkgs/development/libraries/jsoncpp/default.nix
index 41e9a2d0a03..59572afc627 100644
--- a/pkgs/development/libraries/jsoncpp/default.nix
+++ b/pkgs/development/libraries/jsoncpp/default.nix
@@ -40,19 +40,13 @@ stdenv.mkDerivation rec {
     "-DBUILD_SHARED_LIBS=ON"
     "-DBUILD_OBJECT_LIBS=OFF"
     "-DJSONCPP_WITH_CMAKE_PACKAGE=ON"
+    "-DBUILD_STATIC_LIBS=${if enableStatic then "ON" else "OFF"}"
   ]
     # the test's won't compile if secureMemory is used because there is no
     # comparison operators and conversion functions between
     # std::basic_string<..., Json::SecureAllocator<char>> vs.
     # std::basic_string<..., [default allocator]>
-    ++ lib.optional ((stdenv.buildPlatform != stdenv.hostPlatform) || secureMemory) "-DJSONCPP_WITH_TESTS=OFF"
-    ++ lib.optional (!enableStatic) "-DBUILD_STATIC_LIBS=OFF";
-
-  # this is fixed and no longer necessary in 1.9.5 but there they use
-  # memset_s without switching to a different c++ standard in the cmake files
-  postInstall = lib.optionalString enableStatic ''
-    (cd $out/lib && ln -sf libjsoncpp_static.a libjsoncpp.a)
-  '';
+    ++ lib.optional ((stdenv.buildPlatform != stdenv.hostPlatform) || secureMemory) "-DJSONCPP_WITH_TESTS=OFF";
 
   meta = with lib; {
     homepage = "https://github.com/open-source-parsers/jsoncpp";
diff --git a/pkgs/development/libraries/libevent/default.nix b/pkgs/development/libraries/libevent/default.nix
index bd5edec68a0..782d86f1f58 100644
--- a/pkgs/development/libraries/libevent/default.nix
+++ b/pkgs/development/libraries/libevent/default.nix
@@ -20,6 +20,8 @@ stdenv.mkDerivation rec {
     })
   ];
 
+  configureFlags = lib.optional (!sslSupport) "--disable-openssl";
+
   preConfigure = lib.optionalString (lib.versionAtLeast stdenv.hostPlatform.darwinMinVersion "11") ''
     MACOSX_DEPLOYMENT_TARGET=10.16
   '';
diff --git a/pkgs/development/libraries/tclap/default.nix b/pkgs/development/libraries/tclap/1.2.nix
index cdb2b519329..cdb2b519329 100644
--- a/pkgs/development/libraries/tclap/default.nix
+++ b/pkgs/development/libraries/tclap/1.2.nix
diff --git a/pkgs/development/libraries/tclap/1.4.nix b/pkgs/development/libraries/tclap/1.4.nix
new file mode 100644
index 00000000000..7a0b5765963
--- /dev/null
+++ b/pkgs/development/libraries/tclap/1.4.nix
@@ -0,0 +1,48 @@
+{ lib
+, stdenv
+, fetchgit
+, cmake
+, doxygen
+, python3
+}:
+stdenv.mkDerivation {
+  pname = "tclap";
+
+  # This version is slightly newer than 1.4.0-rc1:
+  # See https://github.com/mirror/tclap/compare/1.4.0-rc1..3feeb7b2499b37d9cb80890cadaf7c905a9a50c6
+  version = "1.4-3feeb7b";
+
+  src = fetchgit {
+    url = "git://git.code.sf.net/p/tclap/code";
+    rev = "3feeb7b2499b37d9cb80890cadaf7c905a9a50c6"; # 1.4 branch
+    hash = "sha256-byLianB6Vf+I9ABMmsmuoGU2o5RO9c5sMckWW0F+GDM=";
+  };
+
+  postPatch = ''
+    substituteInPlace CMakeLists.txt \
+      --replace '$'{CMAKE_INSTALL_LIBDIR_ARCHIND} '$'{CMAKE_INSTALL_LIBDIR}
+    substituteInPlace packaging/pkgconfig.pc.in \
+      --replace '$'{prefix}/@CMAKE_INSTALL_INCLUDEDIR@ @CMAKE_INSTALL_FULL_INCLUDEDIR@
+  '';
+
+  nativeBuildInputs = [
+    cmake
+    doxygen
+    python3
+  ];
+
+  # Installing docs is broken in this package+version so we stub out some files
+  preInstall = ''
+    touch docs/manual.html
+  '';
+
+  doCheck = true;
+
+  meta = with lib; {
+    description = "Templatized C++ Command Line Parser Library (v1.4)";
+    homepage = "https://tclap.sourceforge.net/";
+    license = licenses.mit;
+    maintainers = teams.deshaw.members;
+    platforms = platforms.all;
+  };
+}
diff --git a/pkgs/os-specific/linux/dcgm/default.nix b/pkgs/os-specific/linux/dcgm/default.nix
new file mode 100644
index 00000000000..36c7e3ca688
--- /dev/null
+++ b/pkgs/os-specific/linux/dcgm/default.nix
@@ -0,0 +1,147 @@
+{ lib
+, callPackage
+, gcc11Stdenv
+, fetchFromGitHub
+, addOpenGLRunpath
+, catch2
+, cmake
+, cudaPackages_10_2
+, cudaPackages_11_8
+, cudaPackages_12
+, fmt_9
+, git
+, jsoncpp
+, libevent
+, plog
+, python3
+, symlinkJoin
+, tclap_1_4
+, yaml-cpp
+}:
+let
+  # Flags copied from DCGM's libevent build script
+  libevent-nossl = libevent.override { sslSupport = false; };
+  libevent-nossl-static = libevent-nossl.overrideAttrs (super: {
+    CFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+    CXXFLAGS = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
+    configureFlags = super.configureFlags ++ [ "--disable-shared" "--with-pic" ];
+  });
+
+  jsoncpp-static = jsoncpp.override { enableStatic = true; };
+
+  # DCGM depends on 3 different versions of CUDA at the same time.
+  # The runtime closure, thankfully, is quite small because most things
+  # are statically linked.
+  cudaPackageSetByVersion = [
+    {
+      version = "10";
+      # Nixpkgs cudaPackages_10 doesn't have redist packages broken out.
+      pkgSet = [
+        cudaPackages_10_2.cudatoolkit
+        cudaPackages_10_2.cudatoolkit.lib
+      ];
+    }
+    {
+      version = "11";
+      pkgSet = getCudaPackages cudaPackages_11_8;
+    }
+    {
+      version = "12";
+      pkgSet = getCudaPackages cudaPackages_12;
+    }
+  ];
+
+  # Select needed redist packages from cudaPackages
+  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
+  getCudaPackages = p: with p; [
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    cuda_nvml_dev
+    libcublas
+    libcufft
+    libcurand
+  ];
+
+  # Builds CMake code to add CUDA paths for include and lib.
+  mkAppendCudaPaths = { version, pkgSet }:
+    let
+      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
+      # combine everything together for headers to work.
+      # It would be more convenient to use symlinkJoin on *just* the include subdirectories
+      # of each package, but not all of them have an include directory and making that work
+      # is more effort than it's worth for this temporary, build-time package.
+      combined = symlinkJoin {
+        name = "cuda-combined-${version}";
+        paths = pkgSet;
+      };
+      # The combined package above breaks the build for some reason so we just configure
+      # each package's library path.
+      libs = lib.concatMapStringsSep " " (x: ''"${x}/lib"'') pkgSet;
+    in ''
+      list(APPEND Cuda${version}_INCLUDE_PATHS "${combined}/include")
+      list(APPEND Cuda${version}_LIB_PATHS ${libs})
+    '';
+
+# gcc11 is required by DCGM's very particular build system
+# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
+in gcc11Stdenv.mkDerivation rec {
+  pname = "dcgm";
+  version = "3.1.8";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "DCGM";
+    rev = "refs/tags/v${version}";
+    hash = "sha256-OXqXkP2ZUNPzafGIgJ0MKa39xB84keVFFYl+JsHgnks=";
+  };
+
+  # Add our paths to the CUDA paths so FindCuda.cmake can find them.
+  EXTRA_CUDA_PATHS = lib.concatMapStringsSep "\n" mkAppendCudaPaths cudaPackageSetByVersion;
+  prePatch = ''
+    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
+  '';
+
+  hardeningDisable = [ "all" ];
+
+  nativeBuildInputs = [
+    addOpenGLRunpath
+    cmake
+    git
+    python3
+
+    jsoncpp-static
+    jsoncpp-static.dev
+    libevent-nossl-static
+    libevent-nossl-static.dev
+    plog.dev # header-only
+    tclap_1_4 # header-only
+  ];
+
+  buildInputs = [
+    catch2
+    fmt_9
+    yaml-cpp
+  ];
+
+  # libcuda.so must be found at runtime because it is supplied by the NVIDIA
+  # driver. autoAddOpenGLRunpathHook breaks on the statically linked exes.
+  postFixup = ''
+    find "$out/bin" "$out/lib" -type f -executable -print0 | while IFS= read -r -d "" f; do
+      if isELF "$f" && [[ $(patchelf --print-needed "$f" || true) == *libcuda.so* ]]; then
+        addOpenGLRunpath "$f"
+      fi
+    done
+  '';
+
+  disallowedReferences = lib.concatMap (x: x.pkgSet) cudaPackageSetByVersion;
+
+  meta = with lib; {
+    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs.";
+    homepage = "https://developer.nvidia.com/dcgm";
+    license = licenses.asl20;
+    maintainers = teams.deshaw.members;
+    mainProgram = "dcgmi";
+    platforms = platforms.linux;
+  };
+}
diff --git a/pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix b/pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix
new file mode 100644
index 00000000000..173a978cf2e
--- /dev/null
+++ b/pkgs/servers/monitoring/prometheus/dcgm-exporter/default.nix
@@ -0,0 +1,66 @@
+{ lib
+, buildGoModule
+, fetchFromGitHub
+, cudaPackages
+, dcgm
+, linuxPackages
+}:
+buildGoModule rec {
+  pname = "dcgm-exporter";
+  version = "3.1.8-3.1.5";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = pname;
+    rev = "refs/tags/${version}";
+    hash = "sha256-Jzv3cU3gmGIXV+DV3wV/1zSWwz18s3Jax6JC7WZW7Z4=";
+  };
+
+  # Upgrade to go 1.17 during the vendoring FOD build because it fails otherwise.
+  overrideModAttrs = _: {
+    preBuild = ''
+      substituteInPlace go.mod --replace 'go 1.16' 'go 1.17'
+      go mod tidy
+    '';
+    postInstall = ''
+      cp go.mod "$out/go.mod"
+    '';
+  };
+
+  CGO_LDFLAGS = "-ldcgm";
+
+  buildInputs = [
+    dcgm
+  ];
+
+  # gonvml and go-dcgm do not work with ELF BIND_NOW hardening because not all
+  # symbols are available on startup.
+  hardeningDisable = [ "bindnow" ];
+
+  # Copy the modified go.mod we got from the vendoring process.
+  preBuild = ''
+    cp vendor/go.mod go.mod
+  '';
+
+  vendorHash = "sha256-KMCV79kUY1sNYysH0MmB7pVU98r7v+DpLIoYHxyyG4U=";
+
+  nativeBuildInputs = [
+    cudaPackages.autoAddOpenGLRunpathHook
+  ];
+
+  # Tests try to interact with running DCGM service.
+  doCheck = false;
+
+  postFixup = ''
+    patchelf --add-needed libnvidia-ml.so "$out/bin/dcgm-exporter"
+  '';
+
+  meta = with lib; {
+    description = "NVIDIA GPU metrics exporter for Prometheus leveraging DCGM";
+    homepage = "https://github.com/NVIDIA/dcgm-exporter";
+    license = licenses.asl20;
+    maintainers = teams.deshaw.members;
+    mainProgram = "dcgm-exporter";
+    platforms = platforms.linux;
+  };
+}
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 76b08a4c430..406e76d15aa 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -555,6 +555,8 @@ with pkgs;
 
   dbip-country-lite = callPackage ../data/misc/dbip-country-lite { };
 
+  dcgm = callPackage ../os-specific/linux/dcgm { };
+
   dhallDirectoryToNix = callPackage ../build-support/dhall/directory-to-nix.nix { };
 
   dhallPackageToNix = callPackage ../build-support/dhall/package-to-nix.nix { };
@@ -25017,7 +25019,11 @@ with pkgs;
 
   taskflow = callPackage ../development/libraries/taskflow { };
 
-  tclap = callPackage ../development/libraries/tclap { };
+  tclap = tclap_1_2;
+
+  tclap_1_2 = callPackage ../development/libraries/tclap/1.2.nix { };
+
+  tclap_1_4 = callPackage ../development/libraries/tclap/1.4.nix { };
 
   tcllib = callPackage ../development/libraries/tcllib { };
 
@@ -26847,6 +26853,7 @@ with pkgs;
   prometheus-cloudflare-exporter = callPackage ../servers/monitoring/prometheus/cloudflare-exporter.nix { };
   prometheus-collectd-exporter = callPackage ../servers/monitoring/prometheus/collectd-exporter.nix { };
   prometheus-consul-exporter = callPackage ../servers/monitoring/prometheus/consul-exporter.nix { };
+  prometheus-dcgm-exporter = callPackage ../servers/monitoring/prometheus/dcgm-exporter { };
   prometheus-dnsmasq-exporter = callPackage ../servers/monitoring/prometheus/dnsmasq-exporter.nix { };
   prometheus-dovecot-exporter = callPackage ../servers/monitoring/prometheus/dovecot-exporter.nix { };
   prometheus-domain-exporter = callPackage ../servers/monitoring/prometheus/domain-exporter.nix { };