Merge pull request #233581 from SomeoneSerge/CUDAToolkit_ROOT

cudaPackages.setupCudaHook: init
This commit is contained in:
Connor Baker 2023-07-25 14:24:12 -04:00 committed by GitHub
commit 5825210d3f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 278 additions and 67 deletions

View File

@ -24,6 +24,7 @@ args@
, libkrb5
, krb5
, makeWrapper
, markForCudatoolkitRootHook
, ncurses5
, numactl
, nss
@ -31,6 +32,7 @@ args@
, python3 # FIXME: CUDAToolkit 10 may still need python27
, pulseaudio
, requireFile
, setupCudaHook
, stdenv
, backendStdenv # E.g. gcc11Stdenv, set in extension.nix
, unixODBC
@ -80,11 +82,15 @@ backendStdenv.mkDerivation rec {
addOpenGLRunpath
autoPatchelfHook
autoAddOpenGLRunpathHook
markForCudatoolkitRootHook
] ++ lib.optionals (lib.versionOlder version "11") [
libsForQt5.wrapQtAppsHook
] ++ lib.optionals (lib.versionAtLeast version "11.8") [
qt6Packages.wrapQtAppsHook
];
depsTargetTargetPropagated = [
setupCudaHook
];
buildInputs = lib.optionals (lib.versionOlder version "11") [
libsForQt5.qt5.qtwebengine
freeglut
@ -280,24 +286,12 @@ backendStdenv.mkDerivation rec {
sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h"
'' +
# Point NVCC at a compatible compiler
# FIXME: redist cuda_nvcc copy-pastes this code
# Refer to comments in the overrides for cuda_nvcc for explanation
# CUDA_TOOLKIT_ROOT_DIR is legacy,
# Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the compiled
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
# the default set of CUDA capabilities we build can regularly cause this to occur (for
# example, with Magma).
''
mkdir -p $out/nix-support
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX=${backendStdenv.cc}/bin;
fi
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin -Xfatbin=-compress-all'
EOF
# Move some libraries to the lib output so that programs that

View File

@ -38,10 +38,39 @@ final: prev: let
cudaFlags = final.callPackage ./flags.nix {};
# Internal hook, used by cudatoolkit and cuda redist packages
# to accommodate automatic CUDAToolkit_ROOT construction
markForCudatoolkitRootHook = (final.callPackage
({ makeSetupHook }:
makeSetupHook
{ name = "mark-for-cudatoolkit-root-hook"; }
./hooks/mark-for-cudatoolkit-root-hook.sh)
{ });
# Normally propagated by cuda_nvcc or cudatoolkit through their depsHostHostPropagated
setupCudaHook = (final.callPackage
({ makeSetupHook, backendStdenv }:
makeSetupHook
{
name = "setup-cuda-hook";
substitutions.ccRoot = "${backendStdenv.cc}";
# Required in addition to ccRoot as otherwise bin/gcc is looked up
# when building CMakeCUDACompilerId.cu
substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
}
./hooks/setup-cuda-hook.sh)
{ });
in
{
inherit
backendStdenv
cudatoolkit
cudaFlags;
cudaFlags
markForCudatoolkitRootHook
setupCudaHook;
saxpy = final.callPackage ./saxpy { };
}

View File

@ -0,0 +1,8 @@
# shellcheck shell=bash
markForCUDAToolkit_ROOT() {
mkdir -p "${prefix}/nix-support"
touch "${prefix}/nix-support/include-in-cudatoolkit-root"
}
fixupOutputHooks+=(markForCUDAToolkit_ROOT)

View File

@ -0,0 +1,5 @@
# shellcheck shell=bash
# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
# CUDAToolkit_ROOT. We have to help it locate libcudart
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"

View File

@ -0,0 +1,68 @@
# shellcheck shell=bash
echo Sourcing setup-cuda-hook >&2
extendCUDAToolkit_ROOT() {
if [[ -f "$1/nix-support/include-in-cudatoolkit-root" ]] ; then
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$1"
if [[ -d "$1/include" ]] ; then
addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$1/include"
fi
fi
}
addEnvHooks "$targetOffset" extendCUDAToolkit_ROOT
setupCUDAToolkitCompilers() {
echo Executing setupCUDAToolkitCompilers >&2
if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
return
fi
# Point NVCC at a compatible compiler
# For CMake-based projects:
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@"
export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"
# For non-CMake projects:
# We prepend --compiler-bindir to nvcc flags.
# Downstream packages can override these, because NVCC
# uses the last --compiler-bindir it gets on the command line.
# FIXME: this results in "incompatible redefinition" warnings.
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
if [ -z "${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX="@ccFullPath@";
fi
export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin"
# NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
# binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
# the default set of CUDA capabilities we build can regularly cause this to occur (for
# example, with Magma).
#
# @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
if [[ -z "${dontCompressFatbin-}" ]]; then
export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
fi
# CMake's enable_language(CUDA) runs a compiler test and it doesn't account for
# CUDAToolkit_ROOT. We have to help it locate libcudart
if [[ -z "${nvccDontPrependCudartFlags-}" ]] ; then
export NVCC_APPEND_FLAGS+=" -L@cudartRoot@/lib -I@cudartRoot@/include"
fi
}
setupCMakeCUDAToolkit_ROOT() {
export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
}
postHooks+=(setupCUDAToolkitCompilers)
preConfigureHooks+=(setupCMakeCUDAToolkit_ROOT)

View File

@ -4,6 +4,7 @@
, fetchurl
, autoPatchelfHook
, autoAddOpenGLRunpathHook
, markForCudatoolkitRootHook
}:
pname:
@ -28,6 +29,7 @@ backendStdenv.mkDerivation {
# directory to the rpath of all ELF binaries.
# Check e.g. with `patchelf --print-rpath path/to/my/binary
autoAddOpenGLRunpathHook
markForCudatoolkitRootHook
];
buildInputs = [

View File

@ -27,35 +27,27 @@ in
inherit (prev.backendStdenv) cc;
in
{
# Required by cmake's enable_language(CUDA) to build a test program
# When implementing cross-compilation support: this is
# final.pkgs.targetPackages.cudaPackages.cuda_cudart
env.cudartRoot = "${prev.lib.getDev final.cuda_cudart}";
# Point NVCC at a compatible compiler
# FIXME: non-redist cudatoolkit copy-pastes this code
# For CMake-based projects:
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
# For non-CMake projects:
# We prepend --compiler-bindir to nvcc flags.
# Downstream packages can override these, because NVCC
# uses the last --compiler-bindir it gets on the command line.
# FIXME: this results in "incompatible redefinition" warnings.
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
# NOTE: We unconditionally set -Xfatbin=-compress-all, which reduces the size of the
# compiled binaries. If binaries grow over 2GB, they will fail to link. This is a problem
# for us, as the default set of CUDA capabilities we build can regularly cause this to
# occur (for example, with Magma).
postInstall = (oldAttrs.postInstall or "") + ''
mkdir -p $out/nix-support
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX=${cc}/bin;
fi
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${cc}/bin -Xfatbin=-compress-all'
EOF
'';
# Desiredata: whenever a package (e.g. magma) adds cuda_nvcc to
# nativeBuildInputs (offsets `(-1, 0)`), magma should also source the
# setupCudaHook, i.e. we want it the hook to be propagated into the
# same nativeBuildInputs.
#
# Logically, cuda_nvcc should include the hook in depsHostHostPropagated,
# so that the final offsets for the propagated hook would be `(-1, 0) +
# (0, 0) = (-1, 0)`.
#
# In practice, TargetTarget appears to work:
# https://gist.github.com/fd80ff142cd25e64603618a3700e7f82
depsTargetTargetPropagated = [
final.setupCudaHook
];
});
cuda_nvprof = prev.cuda_nvprof.overrideAttrs (oldAttrs: {

View File

@ -0,0 +1,12 @@
cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)
find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)
add_executable(saxpy saxpy.cu)
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
target_compile_features(saxpy PRIVATE cxx_std_14)
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--expt-relaxed-constexpr>)
install(TARGETS saxpy)

View File

@ -0,0 +1,50 @@
{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:
backendStdenv.mkDerivation {
pname = "saxpy";
version = "unstable-2023-07-11";
src = ./.;
buildInputs = [
libcublas
cuda_cudart
cuda_cccl
];
nativeBuildInputs = [
cmake
# NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
# cross-compilation to work. This should work automatically once we move to
# spliced scopes. Delete this comment once that happens
cuda_nvcc
# Alternatively, we could remove the propagated hook from cuda_nvcc and add
# directly:
# setupCudaHook
autoAddOpenGLRunpathHook
];
cmakeFlags = [
"-DCMAKE_VERBOSE_MAKEFILE=ON"
"-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
];
meta = {
description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
license = lib.licenses.mit;
maintainers = lib.teams.cuda.members;
platforms = lib.platforms.unix;
};
}

View File

@ -0,0 +1,68 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
static inline void check(cudaError_t err, const char *context) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
std::exit(EXIT_FAILURE);
}
}
#define CHECK(x) check(x, #x)
__global__ void saxpy(int n, float a, float *x, float *y) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(void) {
setbuf(stderr, NULL);
fprintf(stderr, "Start\n");
int rtVersion, driverVersion;
CHECK(cudaRuntimeGetVersion(&rtVersion));
CHECK(cudaDriverGetVersion(&driverVersion));
fprintf(stderr, "Runtime version: %d\n", rtVersion);
fprintf(stderr, "Driver version: %d\n", driverVersion);
constexpr int N = 1 << 10;
std::vector<float> xHost(N), yHost(N);
for (int i = 0; i < N; i++) {
xHost[i] = 1.0f;
yHost[i] = 2.0f;
}
fprintf(stderr, "Host memory initialized, copying to the device\n");
fflush(stderr);
float *xDevice, *yDevice;
CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
CHECK(cudaMalloc(&yDevice, N * sizeof(float)));
CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
cudaMemcpyHostToDevice));
fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");
saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
fprintf(stderr, "Scheduled a kernel call\n");
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
cudaMemcpyDeviceToHost));
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(yHost[i] - 4.0f));
fprintf(stderr, "Max error: %f\n", maxError);
CHECK(cudaFree(xDevice));
CHECK(cudaFree(yDevice));
}

View File

@ -86,29 +86,6 @@ let
# "75" -> "750" Cf. https://bitbucket.org/icl/magma/src/f4ec79e2c13a2347eff8a77a3be6f83bc2daec20/CMakeLists.txt#lines-273
"${minArch'}0";
cuda-common-redist = with cudaPackages; [
libcublas # cublas_v2.h
libcusparse # cusparse.h
];
# Build-time dependencies
cuda-native-redist = symlinkJoin {
name = "cuda-native-redist-${cudaVersion}";
paths = with cudaPackages; [
cuda_cudart # cuda_runtime.h
cuda_nvcc
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
cuda_nvprof # <cuda_profiler_api.h>
] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
cuda_profiler_api # <cuda_profiler_api.h>
] ++ cuda-common-redist;
};
# Run-time dependencies
cuda-redist = symlinkJoin {
name = "cuda-redist-${cudaVersion}";
paths = cuda-common-redist;
};
in
assert (builtins.match "[^[:space:]]*" gpuTargetString) != null;
@ -128,16 +105,22 @@ stdenv.mkDerivation {
ninja
gfortran
] ++ lists.optionals cudaSupport [
cuda-native-redist
cudaPackages.cuda_nvcc
];
buildInputs = [
libpthreadstubs
lapack
blas
] ++ lists.optionals cudaSupport [
cuda-redist
] ++ lists.optionals rocmSupport [
] ++ lists.optionals cudaSupport (with cudaPackages; [
cuda_cudart
libcublas # cublas_v2.h
libcusparse # cusparse.h
] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
cuda_nvprof # <cuda_profiler_api.h>
] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
cuda_profiler_api # <cuda_profiler_api.h>
]) ++ lists.optionals rocmSupport [
hip
hipblas
hipsparse