2023-04-25 03:42:44 +00:00
|
|
|
# Dependencies of the dcgm derivation, supplied by the caller.
{
  lib,
  stdenv,
  fetchFromGitHub,
  symlinkJoin,

  # Tools listed in nativeBuildInputs.
  autoAddDriverRunpath,
  cmake,
  git,
  ninja,
  python3,

  # The two CUDA package sets DCGM is built against.
  cudaPackages_11_8,
  cudaPackages_12,

  # Libraries listed in buildInputs.
  boost,
  catch2,
  fmt_9,
  jsoncpp,
  libevent,
  plog,
  tclap_1_4,
  yaml-cpp,
}:
|
|
|
|
let
|
2024-11-19 18:31:39 +00:00
|
|
|
# DCGM is built against two CUDA versions simultaneously. The CUDA libraries
# themselves are kept out of the runtime closure, so the closure stays small.
cudaPackageSets = [ cudaPackages_11_8 cudaPackages_12 ];
|
|
|
|
|
|
|
|
# Given one CUDA package set, returns the list of redist packages the DCGM
# build needs from it.
# Upstream reference:
# https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
getCudaPackages = cudaPkgs: [
  cudaPkgs.cuda_cccl
  cudaPkgs.cuda_cudart
  cudaPkgs.cuda_nvcc
  cudaPkgs.cuda_nvml_dev
  cudaPkgs.libcublas
  cudaPkgs.libcufft
  cudaPkgs.libcurand
];
|
|
|
|
|
2024-11-19 18:31:39 +00:00
|
|
|
# Given one CUDA package set, returns the list of CMake flags that tell the
# DCGM build where that CUDA version's headers and libraries live.
mkCudaFlags =
  cudaPackages:
  let
    major = cudaPackages.cudaMajorVersion;

    # Every flag is named CUDA<major>_<suffix>.
    flag = suffix: lib.cmakeFeature "CUDA${major}_${suffix}";

    cudartLibDir = "${lib.getLib cudaPackages.cuda_cudart}/lib";
    cublasLibDir = "${lib.getLib cudaPackages.libcublas}/lib";

    # DCGM's CMake expects every CUDA header to sit in the same directory as
    # cuda.h, so the include directories of all selected packages are merged
    # into a single tree.
    combinedHeaders = symlinkJoin {
      name = "cuda-headers-combined-${major}";
      paths = map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
    };
  in
  [
    (flag "INCLUDE_DIR" "${combinedHeaders}")
    (flag "LIBS" "${cudaPackages.cuda_cudart.stubs}/lib/stubs/libcuda.so")
    (flag "STATIC_LIBS" "${cudartLibDir}/libcudart.so")
    # CMake list: entries are separated by ';'.
    (flag "STATIC_CUBLAS_LIBS" "${cublasLibDir}/libcublas.so;${cublasLibDir}/libcublasLt.so")
  ];
|
2024-11-19 18:31:39 +00:00
|
|
|
in
|
|
|
|
# Derivation for NVIDIA DCGM, built from the tagged GitHub source with CMake
# and Ninja against every CUDA package set in cudaPackageSets.
stdenv.mkDerivation rec {
  pname = "dcgm";
  # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
  version = "3.3.9";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    rev = "refs/tags/v${version}";
    hash = "sha256-PysxuN5WT7GB0oOvT5ezYeOau6AMVDDWE5HOAcmqw/Y=";
  };

  patches = [
    ./fix-includes.patch
    ./dynamic-libs.patch
  ];

  hardeningDisable = [ "all" ];

  strictDeps = true;

  nativeBuildInputs = [
    # autoAddDriverRunpath neither depends on cudaPackages nor pulls any of it
    # in. All it does is append an impure, non-Nix path to the RPATH of
    # executables that need CUDA at runtime.
    autoAddDriverRunpath

    cmake
    ninja
    git
    python3
  ];

  buildInputs = [
    # Header-only
    boost
    catch2
    plog.dev
    tclap_1_4

    fmt_9
    yaml-cpp
    jsoncpp
    libevent
  ];

  # One group of CUDA<major>_* path flags per CUDA package set, so that
  # FindCuda.cmake can locate headers and libraries.
  cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;

  # Upstream's C++ triggers many warnings; do not let them fail the build.
  env.NIX_CFLAGS_COMPILE = "-Wno-error";

  doCheck = true;

  checkPhase =
    let
      # Tests excluded from the ctest run.
      # NOTE(review): presumably these need host state that is unavailable
      # in the build sandbox — confirm.
      skippedTests = [
        "DcgmModuleSysmon Watches"
        "DcgmModuleSysmon maxSampleAge"
        "DcgmModuleSysmon::CalculateCoreUtilization"
        "DcgmModuleSysmon::ParseProcStatCpuLine"
        "DcgmModuleSysmon::ParseThermalFileContentsAndStore"
        "DcgmModuleSysmon::PopulateTemperatureFileMap"
        "DcgmModuleSysmon::ReadCoreSpeed"
        "DcgmModuleSysmon::ReadTemperature"
        "Sysmon: initialize module"
      ];

      # Match each test name exactly: escape regex metacharacters and anchor
      # both ends, then join the alternatives with '|'.
      exactMatch = name: "^${lib.escapeRegex name}$";
      excludeRegex = lib.concatMapStringsSep "|" exactMatch skippedTests;
    in
    ''
      runHook preCheck

      ctest -j $NIX_BUILD_CORES --output-on-failure --exclude-regex ${lib.escapeShellArg excludeRegex}

      runHook postCheck
    '';

  # The build output must not retain references to any of the CUDA packages
  # used at build time.
  disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;

  meta = {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    maintainers = lib.teams.deshaw.members;
    mainProgram = "dcgmi";
    platforms = lib.platforms.linux;
  };
}
|