Merge pull request #230881 from kira-bruneau/rocfft

rocfft: split kernel compilation into separate derivations
This commit is contained in:
Kira Bruneau 2023-06-02 07:50:38 -04:00 committed by GitHub
commit 564e538d49
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 356 additions and 173 deletions

View File

@ -1,199 +1,243 @@
{ lib
{ rocfft
, lib
, stdenv
, fetchFromGitHub
, rocmUpdateScript
, runCommand
, cmake
, rocm-cmake
, rocrand
, hip
, openmp
, sqlite
, python3
, gtest
, rocm-cmake
, sqlite
, boost
, fftw
, fftwFloat
, buildTests ? false
, buildBenchmarks ? false
, gtest
, openmp
, rocrand
# NOTE: Update the default GPU targets on every update
, gpuTargets ? [
"gfx803"
"gfx900"
"gfx906"
"gfx908"
"gfx90a"
"gfx1030"
"gfx1100"
"gfx1102"
]
}:
let
name-zero = "librocfft-device-0.so.0.1";
name-one = "librocfft-device-1.so.0.1";
name-two = "librocfft-device-2.so.0.1";
name-three = "librocfft-device-3.so.0.1";
# To avoid output limit exceeded errors in hydra, we build kernel
# device libs and the kernel RTC cache database in separate derivations
kernelDeviceLibs = map
(target:
(rocfft.overrideAttrs (prevAttrs: {
pname = "rocfft-device-${target}";
# This is over 3GB, to allow hydra caching we separate it
rf = stdenv.mkDerivation (finalAttrs: {
pname = "rocfft";
version = "5.4.3";
patches = prevAttrs.patches ++ [
# Add back install rule for device library
# This workaround is needed because rocm_install_targets
# doesn't support an EXCLUDE_FROM_ALL option
./device-install.patch
];
outputs = [
"out"
"libzero"
"libone"
"libtwo"
"libthree"
] ++ lib.optionals buildTests [
"test"
] ++ lib.optionals buildBenchmarks [
"benchmark"
];
buildFlags = [ "rocfft-device-${target}" ];
src = fetchFromGitHub {
owner = "ROCmSoftwarePlatform";
repo = "rocFFT";
rev = "rocm-${finalAttrs.version}";
hash = "sha256-FsefE0B2hF5ZcHDB6TscwFeZ1NKFkWX7VDpEvvbDbOk=";
};
installPhase = ''
runHook preInstall
cmake --install . --component device
runHook postInstall
'';
nativeBuildInputs = [
cmake
rocm-cmake
hip
];
requiredSystemFeatures = [ "big-parallel" ];
})).override {
gpuTargets = [ target ];
}
)
gpuTargets;
buildInputs = [
sqlite
python3
] ++ lib.optionals buildTests [
gtest
] ++ lib.optionals (buildTests || buildBenchmarks) [
rocrand
boost
fftw
fftwFloat
openmp
];
# TODO: Figure out how to also split this by GPU target
#
# It'll be bit more complicated than what we're doing for the kernel
# device libs, because the kernel cache needs to be compiled into
# one sqlite database (whereas the device libs can be linked into
# rocfft as separate libraries for each GPU target).
#
# It's not clear why this needs to even be a db in the first place.
# It would simplify things A LOT if we could just store these
# pre-compiled kernels as files (but that'd need a lot of patching).
kernelRtcCache = rocfft.overrideAttrs (_: {
pname = "rocfft-kernel-cache";
propagatedBuildInputs = lib.optionals buildTests [
fftw
fftwFloat
];
buildFlags = [ "rocfft_kernel_cache_target" ];
cmakeFlags = [
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DUSE_HIP_CLANG=ON"
"-DSQLITE_USE_SYSTEM_PACKAGE=ON"
# Manually define CMAKE_INSTALL_<DIR>
# See: https://github.com/NixOS/nixpkgs/pull/197838
"-DCMAKE_INSTALL_BINDIR=bin"
"-DCMAKE_INSTALL_LIBDIR=lib"
"-DCMAKE_INSTALL_INCLUDEDIR=include"
] ++ lib.optionals buildTests [
"-DBUILD_CLIENTS_TESTS=ON"
] ++ lib.optionals buildBenchmarks [
"-DBUILD_CLIENTS_RIDER=ON"
"-DBUILD_CLIENTS_SAMPLES=ON"
];
postInstall = ''
mv $out/lib/${name-zero} $libzero
mv $out/lib/${name-one} $libone
mv $out/lib/${name-two} $libtwo
mv $out/lib/${name-three} $libthree
ln -s $libzero $out/lib/${name-zero}
ln -s $libone $out/lib/${name-one}
ln -s $libtwo $out/lib/${name-two}
ln -s $libthree $out/lib/${name-three}
'' + lib.optionalString buildTests ''
mkdir -p $test/{bin,lib/fftw}
cp -a $out/bin/* $test/bin
ln -s ${fftw}/lib/libfftw*.so $test/lib/fftw
ln -s ${fftwFloat}/lib/libfftw*.so $test/lib/fftw
rm -r $out/lib/fftw
rm $test/bin/{rocfft_rtc_helper,*-rider} || true
'' + lib.optionalString buildBenchmarks ''
mkdir -p $benchmark/bin
cp -a $out/bin/* $benchmark/bin
rm $benchmark/bin/{rocfft_rtc_helper,*-test} || true
'' + lib.optionalString (buildTests || buildBenchmarks ) ''
mv $out/bin/rocfft_rtc_helper $out
rm -r $out/bin/*
mv $out/rocfft_rtc_helper $out/bin
installPhase = ''
runHook preInstall
cmake --install . --component kernel_cache
runHook postInstall
'';
passthru.updateScript = rocmUpdateScript {
requiredSystemFeatures = [ "big-parallel" ];
});
in
stdenv.mkDerivation (finalAttrs: {
pname = "rocfft";
version = "5.4.3";
src = fetchFromGitHub {
owner = "ROCmSoftwarePlatform";
repo = "rocFFT";
rev = "rocm-${finalAttrs.version}";
hash = "sha256-FsefE0B2hF5ZcHDB6TscwFeZ1NKFkWX7VDpEvvbDbOk=";
};
patches = [
# Exclude kernel compilation & installation from "all" target,
# and split device libraries by GPU target
./split-kernel-compilation.patch
];
nativeBuildInputs = [
cmake
hip
python3
rocm-cmake
];
buildInputs = [
sqlite
] ++ lib.optionals (finalAttrs.pname == "rocfft") kernelDeviceLibs;
cmakeFlags = [
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DUSE_HIP_CLANG=ON"
"-DSQLITE_USE_SYSTEM_PACKAGE=ON"
# Manually define CMAKE_INSTALL_<DIR>
# See: https://github.com/NixOS/nixpkgs/pull/197838
"-DCMAKE_INSTALL_BINDIR=bin"
"-DCMAKE_INSTALL_LIBDIR=lib"
"-DCMAKE_INSTALL_INCLUDEDIR=include"
"-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
];
postInstall = lib.optionalString (finalAttrs.pname == "rocfft") ''
ln -s ${kernelRtcCache}/lib/rocfft_kernel_cache.db "$out/lib"
'';
passthru = {
test = stdenv.mkDerivation {
pname = "${finalAttrs.pname}-test";
inherit (finalAttrs) version src;
sourceRoot = "source/clients/tests";
nativeBuildInputs = [
cmake
hip
rocm-cmake
];
buildInputs = [
boost
fftw
fftwFloat
finalAttrs.finalPackage
gtest
openmp
rocrand
];
cmakeFlags = [
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
];
postInstall = ''
rm -r "$out/lib/fftw"
rmdir "$out/lib"
'';
};
benchmark = stdenv.mkDerivation {
pname = "${finalAttrs.pname}-benchmark";
inherit (finalAttrs) version src;
sourceRoot = "source/clients/rider";
nativeBuildInputs = [
cmake
hip
rocm-cmake
];
buildInputs = [
boost
finalAttrs.finalPackage
openmp
(python3.withPackages (ps: with ps; [
pandas
scipy
]))
rocrand
];
cmakeFlags = [
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
];
postInstall = ''
cp -a ../../../scripts/perf "$out/bin"
'';
};
samples = stdenv.mkDerivation {
pname = "${finalAttrs.pname}-samples";
inherit (finalAttrs) version src;
sourceRoot = "source/clients/samples";
nativeBuildInputs = [
cmake
hip
rocm-cmake
];
buildInputs = [
boost
finalAttrs.finalPackage
openmp
rocrand
];
cmakeFlags = [
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
];
installPhase = ''
runHook preInstall
mkdir "$out"
cp -a bin "$out"
runHook postInstall
'';
};
updateScript = rocmUpdateScript {
name = finalAttrs.pname;
owner = finalAttrs.src.owner;
repo = finalAttrs.src.repo;
};
};
meta = with lib; {
description = "FFT implementation for ROCm ";
homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT";
license = with licenses; [ mit ];
maintainers = teams.rocm.members;
platforms = platforms.linux;
broken = versions.minor finalAttrs.version != versions.minor hip.version;
};
});
rf-zero = runCommand name-zero { preferLocalBuild = true; } ''
cp -a ${rf.libzero} $out
'';
rf-one = runCommand name-one { preferLocalBuild = true; } ''
cp -a ${rf.libone} $out
'';
rf-two = runCommand name-two { preferLocalBuild = true; } ''
cp -a ${rf.libtwo} $out
'';
rf-three = runCommand name-three { preferLocalBuild = true; } ''
cp -a ${rf.libthree} $out
'';
in stdenv.mkDerivation {
inherit (rf) pname version src passthru meta;
outputs = [
"out"
] ++ lib.optionals buildTests [
"test"
] ++ lib.optionals buildBenchmarks [
"benchmark"
];
dontUnpack = true;
dontPatch = true;
dontConfigure = true;
dontBuild = true;
installPhase = ''
runHook preInstall
mkdir -p $out/lib
ln -sf ${rf-zero} $out/lib/${name-zero}
ln -sf ${rf-one} $out/lib/${name-one}
ln -sf ${rf-two} $out/lib/${name-two}
ln -sf ${rf-three} $out/lib/${name-three}
cp -an ${rf}/* $out
'' + lib.optionalString buildTests ''
cp -a ${rf.test} $test
'' + lib.optionalString buildBenchmarks ''
cp -a ${rf.benchmark} $benchmark
'' + ''
runHook postInstall
'';
# Fix paths
preFixup = ''
substituteInPlace $out/include/*.h $out/rocfft/include/*.h \
--replace "${rf}" "$out"
patchelf --set-rpath \
$(patchelf --print-rpath $out/lib/librocfft.so | sed 's,${rf}/lib,'"$out/lib"',') \
$out/lib/librocfft.so
'' + lib.optionalString buildTests ''
patchelf --set-rpath \
$(patchelf --print-rpath $test/bin/rocfft-test | sed 's,${rf}/lib,'"$out/lib"',') \
$test/bin/rocfft-test
'' + lib.optionalString buildBenchmarks ''
patchelf --set-rpath \
$(patchelf --print-rpath $benchmark/bin/rocfft-rider | sed 's,${rf}/lib,'"$out/lib"',') \
$benchmark/bin/rocfft-rider
'';
}
meta = with lib; {
description = "FFT implementation for ROCm";
homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT";
license = with licenses; [ mit ];
maintainers = with maintainers; [ kira-bruneau ] ++ teams.rocm.members;
platforms = platforms.linux;
broken = versions.minor finalAttrs.version != versions.minor hip.version;
};
})

View File

@ -0,0 +1,15 @@
diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
index 73a8ec9..9bfd4b8 100644
--- a/library/src/device/CMakeLists.txt
+++ b/library/src/device/CMakeLists.txt
@@ -255,4 +255,10 @@ foreach( sub ${AMDGPU_TARGETS} )
if( NOT BUILD_SHARED_LIBS )
set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
endif( )
+
+ rocm_install_targets(
+ TARGETS
+ rocfft-device-${sub}
+ COMPONENT device
+ )
endforeach()

View File

@ -0,0 +1,124 @@
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 3a16304..606b711 100644
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -250,12 +250,12 @@ foreach( target
endforeach()
-add_executable( rocfft_aot_helper
+add_executable( rocfft_aot_helper EXCLUDE_FROM_ALL
rocfft_aot_helper.cpp
rocfft_stub.cpp
)
-add_executable( rocfft_config_search
+add_executable( rocfft_config_search EXCLUDE_FROM_ALL
rocfft_config_search.cpp
rocfft_stub.cpp
)
@@ -279,10 +279,10 @@ endif()
target_link_libraries( rocfft PRIVATE ${ROCFFT_DEVICE_LINK_LIBS} )
-target_link_libraries( rocfft PRIVATE rocfft-device-0 )
-target_link_libraries( rocfft PRIVATE rocfft-device-1 )
-target_link_libraries( rocfft PRIVATE rocfft-device-2 )
-target_link_libraries( rocfft PRIVATE rocfft-device-3 )
+foreach( sub ${AMDGPU_TARGETS} )
+ target_link_libraries( rocfft PRIVATE -lrocfft-device-${sub} )
+endforeach()
+
foreach( target rocfft rocfft_aot_helper rocfft_config_search )
# RTC uses dladdr to find the RTC helper program
if( NOT WIN32 )
@@ -347,7 +347,7 @@ add_custom_command(
DEPENDS rocfft_aot_helper rocfft_rtc_helper
COMMENT "Compile kernels into shipped cache file"
)
-add_custom_target( rocfft_kernel_cache_target ALL
+add_custom_target( rocfft_kernel_cache_target
DEPENDS rocfft_kernel_cache.db
VERBATIM
)
@@ -392,7 +392,8 @@ else()
endif()
rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
- COMPONENT runtime
+ COMPONENT kernel_cache
+ EXCLUDE_FROM_ALL
)
# PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
index 9f7b85f..73a8ec9 100644
--- a/library/src/device/CMakeLists.txt
+++ b/library/src/device/CMakeLists.txt
@@ -170,11 +170,11 @@ list( SORT rocfft_device_source )
# functions callable by rocFFT and depends on amdhip64, and another
# one usable by AOT RTC that contains no device code
list( FILTER rocfft_device_source EXCLUDE REGEX function_pool.cpp )
-add_library( rocfft-function-pool OBJECT
+add_library( rocfft-function-pool OBJECT EXCLUDE_FROM_ALL
function_pool.cpp
)
target_compile_definitions( rocfft-function-pool PRIVATE FUNCTION_POOL_STANDALONE_BODY= )
-add_library( rocfft-function-pool-standalone OBJECT
+add_library( rocfft-function-pool-standalone OBJECT EXCLUDE_FROM_ALL
function_pool.cpp
)
target_compile_definitions( rocfft-function-pool-standalone PRIVATE FUNCTION_POOL_STANDALONE_BODY={} )
@@ -193,26 +193,15 @@ foreach( pool rocfft-function-pool rocfft-function-pool-standalone )
add_dependencies(${pool} gen_headers_target)
endforeach()
-list( LENGTH rocfft_device_source rocfft_device_source_len )
-math(EXPR split_len "${rocfft_device_source_len} / 4")
-math(EXPR split_idx_2 "${rocfft_device_source_len} / 4 * 2")
-math(EXPR split_idx_3 "${rocfft_device_source_len} / 4 * 3")
-
-list( SUBLIST rocfft_device_source 0 ${split_len} rocfft_device_source_0 )
-list( SUBLIST rocfft_device_source ${split_len} ${split_len} rocfft_device_source_1 )
-list( SUBLIST rocfft_device_source ${split_idx_2} ${split_len} rocfft_device_source_2 )
-list( SUBLIST rocfft_device_source ${split_idx_3} -1 rocfft_device_source_3 )
-
-foreach( sub RANGE 3 )
- set( rocfft_device_source_var rocfft_device_source_${sub} )
+foreach( sub ${AMDGPU_TARGETS} )
if(NOT SINGLELIB)
- add_library( rocfft-device-${sub}
- ${${rocfft_device_source_var}} )
+ add_library( rocfft-device-${sub} EXCLUDE_FROM_ALL
+ ${rocfft_device_source} )
else()
# Compile the device lib as a static library, which is then linked
# into librocfft.so Useful for testing purposes.
- add_library( rocfft-device-${sub} STATIC
- ${${rocfft_device_source_var}} )
+ add_library( rocfft-device-${sub} STATIC EXCLUDE_FROM_ALL
+ ${rocfft_device_source} )
# if we're building singlelib, we don't want to export any of the
# device library symbols to the main library
@@ -241,9 +230,7 @@ foreach( sub RANGE 3 )
# Set AMD GPU architecture options
# Enable compilation of desired architectures
- foreach( target ${AMDGPU_TARGETS} )
- target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${target} )
- endforeach( )
+ target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${sub} )
target_include_directories( rocfft-device-${sub}
PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
@@ -268,9 +255,4 @@ foreach( sub RANGE 3 )
if( NOT BUILD_SHARED_LIBS )
set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
endif( )
-
- rocm_install_targets(
- TARGETS
- rocfft-device-${sub}
- )
endforeach()