mpiCheckPhaseHook: add parameters to bypass errors in sandbox (#350112)

This commit is contained in:
Markus Kowalewski 2024-10-21 14:11:57 +02:00 committed by GitHub
commit c4875a446f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 29 additions and 120 deletions

View File

@ -195,7 +195,7 @@ stdenv.mkDerivation rec {
doCheck = false;
doInstallCheck = true;
nativeCheckInputs = [ mpiCheckPhaseHook ];
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
installCheckPhase = ''
runHook preInstallCheck
@ -211,7 +211,7 @@ stdenv.mkDerivation rec {
meta = with lib; {
description = "Open Source High-Performance Computational Chemistry";
mainProgram = "nwchem";
platforms = [ "x86_64-linux" ];
platforms = [ "x86_64-linux" "aarch64-linux" ];
maintainers = with maintainers; [ sheepforce markuskowa ];
homepage = "https://nwchemgit.github.io";
license = licenses.ecl20;

View File

@ -2,4 +2,8 @@
# Build the "mpi-checkPhase-hook" setup hook from the script ./mpi-check-hook.sh.
makeSetupHook {
name = "mpi-checkPhase-hook";
# Values substituted into the hook script: the `@topology@` placeholder in
# the script is replaced with the path of the bundled ./topology.xml file.
substitutions = {
topology = ./topology.xml;
};
} ./mpi-check-hook.sh

View File

@ -44,6 +44,17 @@ setupMpiCheck() {
# Disable CPU pinning
export OMPI_MCA_hwloc_base_binding_policy=none
export PRTE_MCA_hwloc_default_binding_policy=none
# OpenMPI gets confused by the sandbox environment and emits errors like these (on both stdout and stderr):
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
# [1729458724.473282] [localhost:78 :0] tcp_iface.c:893 UCX ERROR scandir(/sys/class/net) failed: No such file or directory
# These messages contaminate the test output, which causes diff-based tests to fail.
# The workaround is to supply a preset CPU topology file and to avoid the UCX PML.
# Load a preset topology file instead of discovering the CPU topology through sysfs.
export PRTE_MCA_hwloc_use_topo_file="@topology@"
# Use the ob1 PML (point-to-point messaging layer) instead of ucx.
export OMPI_MCA_pml=ob1
;;
MPICH)
# Fix to make mpich run in a sandbox

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
<topology version="2.0">
<object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
<object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
<object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
<object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
</object>
</object>
</topology>

View File

@ -1,112 +0,0 @@
diff --git a/src/snes/tutorials/makefile b/src/snes/tutorials/makefile
index fa15faad39e..7670e80931e 100644
--- a/src/snes/tutorials/makefile
+++ b/src/snes/tutorials/makefile
@@ -13,6 +13,7 @@ ex55: ex55.o ex55k.o
# these tests are used by the makefile in PETSC_DIR for basic tests of the install and should not be removed
testex5f: ex5f.PETSc
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex5f -snes_rtol 1e-4 > ex5f_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex5f_1.tmp; \
if (${DIFF} output/ex5f_1.testout ex5f_1.tmp > /dev/null 2>&1) then \
echo "Fortran example src/snes/tutorials/ex5f run successfully with 1 MPI process"; \
else \
@@ -25,6 +26,7 @@ testex5f: ex5f.PETSc
${MAKE} PETSC_ARCH=${PETSC_ARCH} PETSC_DIR=${PETSC_DIR} ex5f.rm;
testex19: ex19.PETSc
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with 1 MPI process"; \
else \
@@ -36,6 +38,7 @@ testex19: ex19.PETSc
${RM} -f ex19_1.tmp;
testex19_mpi:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with 2 MPI processes"; \
else \
@@ -48,6 +51,7 @@ testex19_mpi:
#use unpreconditioned norm because HYPRE device installations use different AMG parameters
runex19_hypre:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE"; \
else \
@@ -57,6 +61,7 @@ runex19_hypre:
${RM} -f ex19_1.tmp
runex19_hypre_cuda:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type cuda -dm_mat_type aijcusparse -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/CUDA"; \
else \
@@ -66,6 +71,7 @@ runex19_hypre_cuda:
${RM} -f ex19_1.tmp
runex19_hypre_hip:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type hip -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/HIP"; \
else \
@@ -75,6 +81,7 @@ runex19_hypre_hip:
${RM} -f ex19_1.tmp
runex19_cuda:
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -snes_monitor -dm_mat_type seqaijcusparse -dm_vec_type seqcuda -pc_type gamg -ksp_monitor -mg_levels_ksp_max_it 1 > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_cuda_1.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with CUDA"; \
else \
@@ -84,6 +91,7 @@ runex19_cuda:
${RM} -f ex19_1.tmp
runex19_ml:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type ml > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_ml.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with ML"; \
else \
@@ -93,6 +101,7 @@ runex19_ml:
${RM} -f ex19_1.tmp
runex19_fieldsplit_mumps:
-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -pc_type fieldsplit -pc_fieldsplit_block_size 4 -pc_fieldsplit_type SCHUR -pc_fieldsplit_0_fields 0,1,2 -pc_fieldsplit_1_fields 3 -fieldsplit_0_pc_type lu -fieldsplit_1_pc_type lu -snes_monitor_short -ksp_monitor_short -fieldsplit_0_pc_factor_mat_solver_type mumps -fieldsplit_1_pc_factor_mat_solver_type mumps > ex19_6.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_6.tmp; \
if (${DIFF} output/ex19_fieldsplit_5.out ex19_6.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with MUMPS"; \
else \
@@ -102,6 +111,7 @@ runex19_fieldsplit_mumps:
${RM} -f ex19_6.tmp
runex19_superlu_dist:
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_grid_x 20 -da_grid_y 20 -pc_type lu -pc_factor_mat_solver_type superlu_dist > ex19.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19.tmp; \
if (${DIFF} output/ex19_superlu.out ex19.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuperLU_DIST"; \
else \
@@ -111,6 +121,7 @@ runex19_superlu_dist:
${RM} -f ex19.tmp
runex19_suitesparse:
-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type lu -pc_factor_mat_solver_type umfpack > ex19_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
if (${DIFF} output/ex19_suitesparse.out ex19_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuiteSparse"; \
else \
@@ -120,6 +131,7 @@ runex19_suitesparse:
${RM} -f ex19_1.tmp
runex3k_kokkos: ex3k.PETSc
-@OMP_PROC_BIND=false ${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex3k -view_initial -dm_vec_type kokkos -dm_mat_type aijkokkos -use_gpu_aware_mpi 0 -snes_monitor > ex3k_1.tmp 2>&1 ;\
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex3k_1.tmp; \
if (${DIFF} output/ex3k_1.out ex3k_1.tmp) then \
echo "C/C++ example src/snes/tutorials/ex3k run successfully with Kokkos Kernels"; \
else \
diff --git a/src/vec/vec/tests/makefile b/src/vec/vec/tests/makefile
index d1f047820ec..aab400535dd 100644
--- a/src/vec/vec/tests/makefile
+++ b/src/vec/vec/tests/makefile
@@ -5,6 +5,7 @@ include ${PETSC_DIR}/lib/petsc/conf/rules
runex47: ex47.PETSc
-@H5OUT=`mktemp -t petsc.h5.XXXXXX`; ${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex47 -filename $${H5OUT} > ex47_1.tmp 2>&1; \
+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex47_1.tmp; \
if (${DIFF} output/ex47_1.out ex47_1.tmp) then \
echo "C/C++ example src/vec/vec/tests/ex47 run successfully with HDF5"; \
else \

View File

@ -9,6 +9,7 @@
lapack,
mpiSupport ? true,
mpi, # generic mpi dependency
mpiCheckPhaseHook,
openssh, # required for openmpi tests
petsc-withp4est ? false,
hdf5-support ? false,
@ -52,12 +53,6 @@ stdenv.mkDerivation rec {
--replace /usr/bin/install_name_tool ${cctools}/bin/install_name_tool
'';
# Both OpenMPI and MPICH get confused by the sandbox environment and spew errors like this (both to stdout and stderr):
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
# [1684747490.391106] [localhost:14258:0] tcp_iface.c:837 UCX ERROR opendir(/sys/class/net) failed: No such file or directory
# These messages contaminate test output, which makes the quicktest suite to fail. The patch adds filtering for these messages.
patches = [ ./filter_mpi_warnings.patch ];
configureFlags = [
"--with-blas=1"
"--with-lapack=1"
@ -112,6 +107,7 @@ stdenv.mkDerivation rec {
# the library is installed and available.
doInstallCheck = true;
installCheckTarget = "check_install";
nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
passthru = {
inherit mpiSupport;