mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-11-22 15:03:28 +00:00
mpiCheckPhaseHook: add parameters to bypass errors in sandbox (#350112)
This commit is contained in:
commit
c4875a446f
@ -195,7 +195,7 @@ stdenv.mkDerivation rec {
|
||||
doCheck = false;
|
||||
|
||||
doInstallCheck = true;
|
||||
nativeCheckInputs = [ mpiCheckPhaseHook ];
|
||||
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
|
||||
installCheckPhase = ''
|
||||
runHook preInstallCheck
|
||||
|
||||
@ -211,7 +211,7 @@ stdenv.mkDerivation rec {
|
||||
meta = with lib; {
|
||||
description = "Open Source High-Performance Computational Chemistry";
|
||||
mainProgram = "nwchem";
|
||||
platforms = [ "x86_64-linux" ];
|
||||
platforms = [ "x86_64-linux" "aarch64-linux" ];
|
||||
maintainers = with maintainers; [ sheepforce markuskowa ];
|
||||
homepage = "https://nwchemgit.github.io";
|
||||
license = licenses.ecl20;
|
||||
|
@ -2,4 +2,8 @@
|
||||
|
||||
makeSetupHook {
|
||||
name = "mpi-checkPhase-hook";
|
||||
|
||||
substitutions = {
|
||||
topology = ./topology.xml;
|
||||
};
|
||||
} ./mpi-check-hook.sh
|
||||
|
@ -44,6 +44,17 @@ setupMpiCheck() {
|
||||
# Disable CPU pinning
|
||||
export OMPI_MCA_hwloc_base_binding_policy=none
|
||||
export PRTE_MCA_hwloc_default_binding_policy=none
|
||||
|
||||
# OpenMPI get confused by the sandbox environment and spew errors like this (both to stdout and stderr):
|
||||
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
|
||||
# [1729458724.473282] [localhost:78 :0] tcp_iface.c:893 UCX ERROR scandir(/sys/class/net) failed: No such file or directory
|
||||
# These messages contaminate test output, which makes the difftest to fail.
|
||||
# The solution is to use a preset cpu topology file and disable ucx model.
|
||||
|
||||
# Disable sysfs cpu topology directory discovery.
|
||||
export PRTE_MCA_hwloc_use_topo_file="@topology@"
|
||||
# Use the network model ob1 instead of ucx.
|
||||
export OMPI_MCA_pml=ob1
|
||||
;;
|
||||
MPICH)
|
||||
# Fix to make mpich run in a sandbox
|
||||
|
10
pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml
Normal file
10
pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml
Normal file
@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
|
||||
<topology version="2.0">
|
||||
<object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
|
||||
<object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
|
||||
<object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
|
||||
<object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
|
||||
</object>
|
||||
</object>
|
||||
</topology>
|
@ -1,112 +0,0 @@
|
||||
diff --git a/src/snes/tutorials/makefile b/src/snes/tutorials/makefile
|
||||
index fa15faad39e..7670e80931e 100644
|
||||
--- a/src/snes/tutorials/makefile
|
||||
+++ b/src/snes/tutorials/makefile
|
||||
@@ -13,6 +13,7 @@ ex55: ex55.o ex55k.o
|
||||
# these tests are used by the makefile in PETSC_DIR for basic tests of the install and should not be removed
|
||||
testex5f: ex5f.PETSc
|
||||
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex5f -snes_rtol 1e-4 > ex5f_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex5f_1.tmp; \
|
||||
if (${DIFF} output/ex5f_1.testout ex5f_1.tmp > /dev/null 2>&1) then \
|
||||
echo "Fortran example src/snes/tutorials/ex5f run successfully with 1 MPI process"; \
|
||||
else \
|
||||
@@ -25,6 +26,7 @@ testex5f: ex5f.PETSc
|
||||
${MAKE} PETSC_ARCH=${PETSC_ARCH} PETSC_DIR=${PETSC_DIR} ex5f.rm;
|
||||
testex19: ex19.PETSc
|
||||
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with 1 MPI process"; \
|
||||
else \
|
||||
@@ -36,6 +38,7 @@ testex19: ex19.PETSc
|
||||
${RM} -f ex19_1.tmp;
|
||||
testex19_mpi:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with 2 MPI processes"; \
|
||||
else \
|
||||
@@ -48,6 +51,7 @@ testex19_mpi:
|
||||
#use unpreconditioned norm because HYPRE device installations use different AMG parameters
|
||||
runex19_hypre:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE"; \
|
||||
else \
|
||||
@@ -57,6 +61,7 @@ runex19_hypre:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex19_hypre_cuda:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type cuda -dm_mat_type aijcusparse -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/CUDA"; \
|
||||
else \
|
||||
@@ -66,6 +71,7 @@ runex19_hypre_cuda:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex19_hypre_hip:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type hip -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/HIP"; \
|
||||
else \
|
||||
@@ -75,6 +81,7 @@ runex19_hypre_hip:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex19_cuda:
|
||||
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -snes_monitor -dm_mat_type seqaijcusparse -dm_vec_type seqcuda -pc_type gamg -ksp_monitor -mg_levels_ksp_max_it 1 > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_cuda_1.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with CUDA"; \
|
||||
else \
|
||||
@@ -84,6 +91,7 @@ runex19_cuda:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex19_ml:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type ml > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_ml.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with ML"; \
|
||||
else \
|
||||
@@ -93,6 +101,7 @@ runex19_ml:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex19_fieldsplit_mumps:
|
||||
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -pc_type fieldsplit -pc_fieldsplit_block_size 4 -pc_fieldsplit_type SCHUR -pc_fieldsplit_0_fields 0,1,2 -pc_fieldsplit_1_fields 3 -fieldsplit_0_pc_type lu -fieldsplit_1_pc_type lu -snes_monitor_short -ksp_monitor_short -fieldsplit_0_pc_factor_mat_solver_type mumps -fieldsplit_1_pc_factor_mat_solver_type mumps > ex19_6.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_6.tmp; \
|
||||
if (${DIFF} output/ex19_fieldsplit_5.out ex19_6.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with MUMPS"; \
|
||||
else \
|
||||
@@ -102,6 +111,7 @@ runex19_fieldsplit_mumps:
|
||||
${RM} -f ex19_6.tmp
|
||||
runex19_superlu_dist:
|
||||
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_grid_x 20 -da_grid_y 20 -pc_type lu -pc_factor_mat_solver_type superlu_dist > ex19.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19.tmp; \
|
||||
if (${DIFF} output/ex19_superlu.out ex19.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuperLU_DIST"; \
|
||||
else \
|
||||
@@ -111,6 +121,7 @@ runex19_superlu_dist:
|
||||
${RM} -f ex19.tmp
|
||||
runex19_suitesparse:
|
||||
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type lu -pc_factor_mat_solver_type umfpack > ex19_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
|
||||
if (${DIFF} output/ex19_suitesparse.out ex19_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuiteSparse"; \
|
||||
else \
|
||||
@@ -120,6 +131,7 @@ runex19_suitesparse:
|
||||
${RM} -f ex19_1.tmp
|
||||
runex3k_kokkos: ex3k.PETSc
|
||||
-@OMP_PROC_BIND=false ${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex3k -view_initial -dm_vec_type kokkos -dm_mat_type aijkokkos -use_gpu_aware_mpi 0 -snes_monitor > ex3k_1.tmp 2>&1 ;\
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex3k_1.tmp; \
|
||||
if (${DIFF} output/ex3k_1.out ex3k_1.tmp) then \
|
||||
echo "C/C++ example src/snes/tutorials/ex3k run successfully with Kokkos Kernels"; \
|
||||
else \
|
||||
diff --git a/src/vec/vec/tests/makefile b/src/vec/vec/tests/makefile
|
||||
index d1f047820ec..aab400535dd 100644
|
||||
--- a/src/vec/vec/tests/makefile
|
||||
+++ b/src/vec/vec/tests/makefile
|
||||
@@ -5,6 +5,7 @@ include ${PETSC_DIR}/lib/petsc/conf/rules
|
||||
|
||||
runex47: ex47.PETSc
|
||||
-@H5OUT=`mktemp -t petsc.h5.XXXXXX`; ${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex47 -filename $${H5OUT} > ex47_1.tmp 2>&1; \
|
||||
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex47_1.tmp; \
|
||||
if (${DIFF} output/ex47_1.out ex47_1.tmp) then \
|
||||
echo "C/C++ example src/vec/vec/tests/ex47 run successfully with HDF5"; \
|
||||
else \
|
@ -9,6 +9,7 @@
|
||||
lapack,
|
||||
mpiSupport ? true,
|
||||
mpi, # generic mpi dependency
|
||||
mpiCheckPhaseHook,
|
||||
openssh, # required for openmpi tests
|
||||
petsc-withp4est ? false,
|
||||
hdf5-support ? false,
|
||||
@ -52,12 +53,6 @@ stdenv.mkDerivation rec {
|
||||
--replace /usr/bin/install_name_tool ${cctools}/bin/install_name_tool
|
||||
'';
|
||||
|
||||
# Both OpenMPI and MPICH get confused by the sandbox environment and spew errors like this (both to stdout and stderr):
|
||||
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
|
||||
# [1684747490.391106] [localhost:14258:0] tcp_iface.c:837 UCX ERROR opendir(/sys/class/net) failed: No such file or directory
|
||||
# These messages contaminate test output, which makes the quicktest suite to fail. The patch adds filtering for these messages.
|
||||
patches = [ ./filter_mpi_warnings.patch ];
|
||||
|
||||
configureFlags = [
|
||||
"--with-blas=1"
|
||||
"--with-lapack=1"
|
||||
@ -112,6 +107,7 @@ stdenv.mkDerivation rec {
|
||||
# the library is installed and available.
|
||||
doInstallCheck = true;
|
||||
installCheckTarget = "check_install";
|
||||
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
|
||||
|
||||
passthru = {
|
||||
inherit mpiSupport;
|
||||
|
Loading…
Reference in New Issue
Block a user