nixpkgs/pkgs/by-name/dc/dcgm/package.nix

{ lib
, gcc11Stdenv
, fetchFromGitHub
, autoAddDriverRunpath
, catch2
, cmake
, ninja
, cudaPackages_11_8
, cudaPackages_12
, boost
, fmt_9
, git
, jsoncpp
, libevent
, plog
, python3
, symlinkJoin
, tclap_1_4
, yaml-cpp

, static ? gcc11Stdenv.hostPlatform.isStatic
}:
let
  # Every CUDA package set DCGM is built against; two major versions are
  # needed simultaneously. The CUDA libraries themselves stay out of the
  # runtime closure, which keeps it small.
  cudaPackageSets = [ cudaPackages_11_8 cudaPackages_12 ];

  # Given one CUDA package set, return the list of redist packages DCGM needs from it.
  # The selection mirrors upstream's own CUDA setup script:
  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
  getCudaPackages = cudaPkgs: [
    cudaPkgs.cuda_cccl
    cudaPkgs.cuda_cudart
    cudaPkgs.cuda_nvcc
    cudaPkgs.cuda_nvml_dev
    cudaPkgs.libcublas
    cudaPkgs.libcufft
    cudaPkgs.libcurand
  ];

  # For one CUDA package set, produce the list of CMake -D flags that tell DCGM's
  # build where that CUDA major version's headers and libraries live.
  mkCudaFlags = cudaPackages:
    let
      major = cudaPackages.cudaMajorVersion;

      # Shorthand for a string-typed CMake flag named CUDA<major>_<suffix>.
      cudaFeature = suffix: lib.cmakeFeature "CUDA${major}_${suffix}";

      # DCGM's CMake expects every CUDA header to sit in the same directory as cuda.h,
      # so the include directories of all selected packages are merged into a single tree.
      combinedHeaders = symlinkJoin {
        name = "cuda-headers-combined-${major}";
        paths = map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
      };

      cudartLibDir = "${lib.getLib cudaPackages.cuda_cudart}/lib";
      cublasLibDir = "${lib.getLib cudaPackages.libcublas}/lib";
    in [
      (cudaFeature "INCLUDE_DIR" "${combinedHeaders}")
      # The driver library is taken from the cudart stubs output.
      (cudaFeature "LIBS" "${cudaPackages.cuda_cudart.stubs}/lib/stubs/libcuda.so")
      # Note: despite the STATIC in these variable names, shared objects (.so) are passed.
      (cudaFeature "STATIC_LIBS" "${cudartLibDir}/libcudart.so")
      # Semicolon-separated, i.e. a two-element CMake list.
      (cudaFeature "STATIC_CUBLAS_LIBS" "${cublasLibDir}/libcublas.so;${cublasLibDir}/libcublasLt.so")
    ];

# DCGM's very particular build system requires gcc11, hence gcc11Stdenv.
# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
in gcc11Stdenv.mkDerivation rec {
  pname = "dcgm";
  # N.B: When changing the version, make sure prometheus-dcgm-exporter supports it.
  version = "3.3.9";

  src = fetchFromGitHub {
    repo = "DCGM";
    owner = "NVIDIA";
    hash = "sha256-PysxuN5WT7GB0oOvT5ezYeOau6AMVDDWE5HOAcmqw/Y=";
    rev = "refs/tags/v${version}";
  };

  strictDeps = true;

  # NOTE(review): all hardening flags are switched off; the reason is not recorded here.
  hardeningDisable = [ "all" ];

  nativeBuildInputs = [
    # This hook neither depends on cudaPackages nor pulls any of it in. All it
    # does is add an impure, non-Nix path to the RPATHs of executables that
    # need to use cuda at runtime.
    autoAddDriverRunpath

    cmake
    ninja
    git
    python3
  ];

  buildInputs = [
    # Header-only libraries
    boost
    catch2
    plog.dev
    tclap_1_4

    # Libraries whose linkage follows the `static` argument.
    # (DCGM's build uses fmt's static outputs regardless of enableShared.)
    (fmt_9.override { enableShared = !static; })
    (yaml-cpp.override { stdenv = gcc11Stdenv; static = static; })

    # TODO: DCGM's CMake hard-codes these dependencies to be static-only.
    (jsoncpp.override { enableStatic = true; })
    (libevent.override { static = true; sslSupport = false; })
  ];

  # Hand FindCuda.cmake the include/lib locations for every CUDA package set.
  cmakeFlags = lib.concatLists (map mkCudaFlags cudaPackageSets);

  # The output must not reference any of the selected CUDA packages.
  disallowedReferences = lib.concatLists (map getCudaPackages cudaPackageSets);

  meta = {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    platforms = lib.platforms.linux;
    maintainers = lib.teams.deshaw.members;
    mainProgram = "dcgmi";
  };
}
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`{ lib`
			`, gcc11Stdenv`
			`, fetchFromGitHub`
dcgm: use pkgs.autoAddDriverRunpath 2024-03-30 17:17:46 +00:00			`, autoAddDriverRunpath`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`, catch2`
			`, cmake`
dcgm: use Ninja 2024-11-19 18:31:39 +00:00			`, ninja`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`, cudaPackages_11_8`
			`, cudaPackages_12`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`, boost`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`, fmt_9`
			`, git`
			`, jsoncpp`
			`, libevent`
			`, plog`
			`, python3`
			`, symlinkJoin`
			`, tclap_1_4`
			`, yaml-cpp`
dcgm: add static build option Currently only one dependency (yaml-cpp) actually supports being either static or dynamic because DCGM's build system hard-codes the use of static artifacts for other dependencies. We note that in the comments as future work. 2024-03-15 19:02:37 +00:00
			`, static ? gcc11Stdenv.hostPlatform.isStatic`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`}:`
			`let`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`# DCGM depends on 2 different versions of CUDA at the same time.`
			`# The runtime closure, thankfully, is quite small as it does not`
			`# include the CUDA libraries.`
			`cudaPackageSets = [`
			`cudaPackages_11_8`
			`cudaPackages_12`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`];`

			`# Select needed redist packages from cudaPackages`
			`# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39`
			`getCudaPackages = p: with p; [`
			`cuda_cccl`
			`cuda_cudart`
			`cuda_nvcc`
			`cuda_nvml_dev`
			`libcublas`
			`libcufft`
			`libcurand`
			`];`

dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`# Builds CMake flags to add CUDA paths for include and lib.`
			`mkCudaFlags = cudaPackages:`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`let`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`version = cudaPackages.cudaMajorVersion;`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`# The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must`
			`# combine everything together for headers to work.`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`headers = symlinkJoin {`
			`name = "cuda-headers-combined-${version}";`
			`paths = lib.map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`};`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`in [`
			`(lib.cmakeFeature "CUDA${version}_INCLUDE_DIR" "${headers}")`
			`(lib.cmakeFeature "CUDA${version}_LIBS" "${cudaPackages.cuda_cudart.stubs}/lib/stubs/libcuda.so")`
			`(lib.cmakeFeature "CUDA${version}_STATIC_LIBS" "${lib.getLib cudaPackages.cuda_cudart}/lib/libcudart.so")`
			`(lib.cmakeFeature "CUDA${version}_STATIC_CUBLAS_LIBS" (lib.concatStringsSep ";" [`
			`"${lib.getLib cudaPackages.libcublas}/lib/libcublas.so"`
			`"${lib.getLib cudaPackages.libcublas}/lib/libcublasLt.so"`
			`]))`
			`];`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00
			`# gcc11 is required by DCGM's very particular build system`
			`# C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22`
			`in gcc11Stdenv.mkDerivation rec {`
			`pname = "dcgm";`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`version = "3.3.9"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00
			`src = fetchFromGitHub {`
			`owner = "NVIDIA";`
			`repo = "DCGM";`
			`rev = "refs/tags/v${version}";`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`hash = "sha256-PysxuN5WT7GB0oOvT5ezYeOau6AMVDDWE5HOAcmqw/Y=";`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`};`

			`hardeningDisable = [ "all" ];`

dcgm: use strictDeps 2023-08-05 00:58:03 +00:00			`strictDeps = true;`

dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`nativeBuildInputs = [`
cudaPackages: generalize and refactor setup hook This PR refactor CUDA setup hooks, and in particular autoAddOpenGLRunpath and autoAddCudaCompatRunpathHook, that were using a lot of code in common (in fact, I introduced the latter by copy pasting most of the bash script of the former). This is not satisfying for maintenance, as a recent patch showed, because we need to duplicate changes to both hooks. This commit abstract the common part in a single shell script that applies a generic patch action to every elf file in the output. For autoAddOpenGLRunpath the action is just addOpenGLRunpath (now addDriverRunpath), and is few line function for autoAddCudaCompatRunpathHook. Doing so, we also takes the occasion to use the newer addDriverRunpath instead of the previous addOpenGLRunpath, and rename the CUDA hook to reflect that as well. Co-Authored-By: Connor Baker <connor.baker@tweag.io> 2024-01-17 15:32:24 +00:00			`# autoAddDriverRunpath does not actually depend on or incur any dependency`
dcgm: switch to fixed autoAddOpenGLRunpathHook 2023-07-27 20:58:23 +00:00			`# of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of`
			`# executables that need to use cuda at runtime.`
dcgm: use pkgs.autoAddDriverRunpath 2024-03-30 17:17:46 +00:00			`autoAddDriverRunpath`
dcgm: switch to fixed autoAddOpenGLRunpathHook 2023-07-27 20:58:23 +00:00
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`cmake`
dcgm: use Ninja 2024-11-19 18:31:39 +00:00			`ninja`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`git`
			`python3`
dcgm: use strictDeps 2023-08-05 00:58:03 +00:00			`];`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00
dcgm: use strictDeps 2023-08-05 00:58:03 +00:00			`buildInputs = [`
dcgm: add static build option Currently only one dependency (yaml-cpp) actually supports being either static or dynamic because DCGM's build system hard-codes the use of static artifacts for other dependencies. We note that in the comments as future work. 2024-03-15 19:02:37 +00:00			`# Header-only`
dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`boost`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`catch2`
dcgm: add static build option Currently only one dependency (yaml-cpp) actually supports being either static or dynamic because DCGM's build system hard-codes the use of static artifacts for other dependencies. We note that in the comments as future work. 2024-03-15 19:02:37 +00:00			`plog.dev`
			`tclap_1_4`

			`# Dependencies that can be either static or dynamic.`
			`(fmt_9.override { enableShared = !static; }) # DCGM's build uses the static outputs regardless of enableShared`
			`(yaml-cpp.override { inherit static; stdenv = gcc11Stdenv; })`

			`# TODO: Dependencies that DCGM's CMake hard-codes to be static-only.`
			`(jsoncpp.override { enableStatic = true; })`
			`(libevent.override { sslSupport = false; static = true; })`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00			`];`

dcgm: 3.3.5 -> 3.3.9 Fixes the build and matches upstream in dropping CUDA 10. Diff: https://github.com/NVIDIA/DCGM/compare/refs/tags/v3.3.5...v3.3.9 2024-11-19 18:31:39 +00:00			`# Add our paths to the CMake flags so FindCuda.cmake can find them.`
			`cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;`

			`disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;`
dcgm: init at 3.1.8 2023-04-25 03:42:44 +00:00
			`meta = with lib; {`
			`description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";`
			`homepage = "https://developer.nvidia.com/dcgm";`
			`license = licenses.asl20;`
			`maintainers = teams.deshaw.members;`
			`mainProgram = "dcgmi";`
			`platforms = platforms.linux;`
			`};`
			`}`