From 0c5883bbce990809800106eaad2512f9a6d1b7eb Mon Sep 17 00:00:00 2001 From: qbisi Date: Mon, 21 Oct 2024 07:20:30 +0800 Subject: [PATCH 1/4] mpiCheckPhaseHook: add parameters to bypass errors in sandbox --- .../setup-hooks/mpi-check-hook/default.nix | 4 ++++ .../setup-hooks/mpi-check-hook/mpi-check-hook.sh | 11 +++++++++++ .../setup-hooks/mpi-check-hook/topology.xml | 10 ++++++++++ 3 files changed, 25 insertions(+) create mode 100644 pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml diff --git a/pkgs/build-support/setup-hooks/mpi-check-hook/default.nix b/pkgs/build-support/setup-hooks/mpi-check-hook/default.nix index 2834cfcc44ff..a49e8e338bea 100644 --- a/pkgs/build-support/setup-hooks/mpi-check-hook/default.nix +++ b/pkgs/build-support/setup-hooks/mpi-check-hook/default.nix @@ -2,4 +2,8 @@ makeSetupHook { name = "mpi-checkPhase-hook"; + + substitutions = { + topology = ./topology.xml; + }; } ./mpi-check-hook.sh diff --git a/pkgs/build-support/setup-hooks/mpi-check-hook/mpi-check-hook.sh b/pkgs/build-support/setup-hooks/mpi-check-hook/mpi-check-hook.sh index 41b6d864c8a5..d7275b62244d 100644 --- a/pkgs/build-support/setup-hooks/mpi-check-hook/mpi-check-hook.sh +++ b/pkgs/build-support/setup-hooks/mpi-check-hook/mpi-check-hook.sh @@ -44,6 +44,17 @@ setupMpiCheck() { # Disable CPU pinning export OMPI_MCA_hwloc_base_binding_policy=none export PRTE_MCA_hwloc_default_binding_policy=none + + # OpenMPI gets confused by the sandbox environment and spews errors like this (both to stdout and stderr): + # [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery. + # [1729458724.473282] [localhost:78 :0] tcp_iface.c:893 UCX ERROR scandir(/sys/class/net) failed: No such file or directory + # These messages contaminate test output, which makes the difftest fail. + # The solution is to use a preset cpu topology file and disable the ucx model. + + # Disable sysfs cpu topology directory discovery. 
+ export PRTE_MCA_hwloc_use_topo_file="@topology@" + # Use the network model ob1 instead of ucx. + export OMPI_MCA_pml=ob1 ;; MPICH) # Fix to make mpich run in a sandbox diff --git a/pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml b/pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml new file mode 100644 index 000000000000..616ed839f004 --- /dev/null +++ b/pkgs/build-support/setup-hooks/mpi-check-hook/topology.xml @@ -0,0 +1,10 @@ + + + + + + + + + + From 161892e932643c769117d621a70e014f3b2d1563 Mon Sep 17 00:00:00 2001 From: qbisi Date: Mon, 21 Oct 2024 07:20:52 +0800 Subject: [PATCH 2/4] petsc: use mpiCheckPhaseHook to bypass check in sandbox --- .../pe/petsc/filter_mpi_warnings.patch | 112 ------------------ pkgs/by-name/pe/petsc/package.nix | 8 +- 2 files changed, 2 insertions(+), 118 deletions(-) delete mode 100644 pkgs/by-name/pe/petsc/filter_mpi_warnings.patch diff --git a/pkgs/by-name/pe/petsc/filter_mpi_warnings.patch b/pkgs/by-name/pe/petsc/filter_mpi_warnings.patch deleted file mode 100644 index 6659f2d084d9..000000000000 --- a/pkgs/by-name/pe/petsc/filter_mpi_warnings.patch +++ /dev/null @@ -1,112 +0,0 @@ -diff --git a/src/snes/tutorials/makefile b/src/snes/tutorials/makefile -index fa15faad39e..7670e80931e 100644 ---- a/src/snes/tutorials/makefile -+++ b/src/snes/tutorials/makefile -@@ -13,6 +13,7 @@ ex55: ex55.o ex55k.o - # these tests are used by the makefile in PETSC_DIR for basic tests of the install and should not be removed - testex5f: ex5f.PETSc - -@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex5f -snes_rtol 1e-4 > ex5f_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex5f_1.tmp; \ - if (${DIFF} output/ex5f_1.testout ex5f_1.tmp > /dev/null 2>&1) then \ - echo "Fortran example src/snes/tutorials/ex5f run successfully with 1 MPI process"; \ - else \ -@@ -25,6 +26,7 @@ testex5f: ex5f.PETSc - ${MAKE} PETSC_ARCH=${PETSC_ARCH} PETSC_DIR=${PETSC_DIR} ex5f.rm; - 
testex19: ex19.PETSc - -@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with 1 MPI process"; \ - else \ -@@ -36,6 +38,7 @@ testex19: ex19.PETSc - ${RM} -f ex19_1.tmp; - testex19_mpi: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with 2 MPI processes"; \ - else \ -@@ -48,6 +51,7 @@ testex19_mpi: - #use unpreconditioned norm because HYPRE device installations use different AMG parameters - runex19_hypre: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE"; \ - else \ -@@ -57,6 +61,7 @@ runex19_hypre: - ${RM} -f ex19_1.tmp - runex19_hypre_cuda: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type cuda -dm_mat_type aijcusparse -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with 
HYPRE/CUDA"; \ - else \ -@@ -66,6 +71,7 @@ runex19_hypre_cuda: - ${RM} -f ex19_1.tmp - runex19_hypre_hip: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type hip -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/HIP"; \ - else \ -@@ -75,6 +81,7 @@ runex19_hypre_hip: - ${RM} -f ex19_1.tmp - runex19_cuda: - -@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -snes_monitor -dm_mat_type seqaijcusparse -dm_vec_type seqcuda -pc_type gamg -ksp_monitor -mg_levels_ksp_max_it 1 > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_cuda_1.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with CUDA"; \ - else \ -@@ -84,6 +91,7 @@ runex19_cuda: - ${RM} -f ex19_1.tmp - runex19_ml: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type ml > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_ml.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with ML"; \ - else \ -@@ -93,6 +101,7 @@ runex19_ml: - ${RM} -f ex19_1.tmp - runex19_fieldsplit_mumps: - -@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -pc_type fieldsplit -pc_fieldsplit_block_size 4 -pc_fieldsplit_type SCHUR -pc_fieldsplit_0_fields 0,1,2 -pc_fieldsplit_1_fields 3 -fieldsplit_0_pc_type lu -fieldsplit_1_pc_type lu -snes_monitor_short -ksp_monitor_short -fieldsplit_0_pc_factor_mat_solver_type mumps -fieldsplit_1_pc_factor_mat_solver_type mumps > ex19_6.tmp 2>&1; \ -+ sed -i 
'/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_6.tmp; \ - if (${DIFF} output/ex19_fieldsplit_5.out ex19_6.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with MUMPS"; \ - else \ -@@ -102,6 +111,7 @@ runex19_fieldsplit_mumps: - ${RM} -f ex19_6.tmp - runex19_superlu_dist: - -@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_grid_x 20 -da_grid_y 20 -pc_type lu -pc_factor_mat_solver_type superlu_dist > ex19.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19.tmp; \ - if (${DIFF} output/ex19_superlu.out ex19.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuperLU_DIST"; \ - else \ -@@ -111,6 +121,7 @@ runex19_superlu_dist: - ${RM} -f ex19.tmp - runex19_suitesparse: - -@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type lu -pc_factor_mat_solver_type umfpack > ex19_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \ - if (${DIFF} output/ex19_suitesparse.out ex19_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuiteSparse"; \ - else \ -@@ -120,6 +131,7 @@ runex19_suitesparse: - ${RM} -f ex19_1.tmp - runex3k_kokkos: ex3k.PETSc - -@OMP_PROC_BIND=false ${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex3k -view_initial -dm_vec_type kokkos -dm_mat_type aijkokkos -use_gpu_aware_mpi 0 -snes_monitor > ex3k_1.tmp 2>&1 ;\ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex3k_1.tmp; \ - if (${DIFF} output/ex3k_1.out ex3k_1.tmp) then \ - echo "C/C++ example src/snes/tutorials/ex3k run successfully with Kokkos Kernels"; \ - else \ -diff --git a/src/vec/vec/tests/makefile b/src/vec/vec/tests/makefile -index d1f047820ec..aab400535dd 100644 ---- a/src/vec/vec/tests/makefile -+++ 
b/src/vec/vec/tests/makefile -@@ -5,6 +5,7 @@ include ${PETSC_DIR}/lib/petsc/conf/rules - - runex47: ex47.PETSc - -@H5OUT=`mktemp -t petsc.h5.XXXXXX`; ${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex47 -filename $${H5OUT} > ex47_1.tmp 2>&1; \ -+ sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex47_1.tmp; \ - if (${DIFF} output/ex47_1.out ex47_1.tmp) then \ - echo "C/C++ example src/vec/vec/tests/ex47 run successfully with HDF5"; \ - else \ diff --git a/pkgs/by-name/pe/petsc/package.nix b/pkgs/by-name/pe/petsc/package.nix index 498a14c96dc3..8d0db4a768db 100644 --- a/pkgs/by-name/pe/petsc/package.nix +++ b/pkgs/by-name/pe/petsc/package.nix @@ -9,6 +9,7 @@ lapack, mpiSupport ? true, mpi, # generic mpi dependency + mpiCheckPhaseHook, openssh, # required for openmpi tests petsc-withp4est ? false, hdf5-support ? false, @@ -52,12 +53,6 @@ stdenv.mkDerivation rec { --replace /usr/bin/install_name_tool ${cctools}/bin/install_name_tool ''; - # Both OpenMPI and MPICH get confused by the sandbox environment and spew errors like this (both to stdout and stderr): - # [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery. - # [1684747490.391106] [localhost:14258:0] tcp_iface.c:837 UCX ERROR opendir(/sys/class/net) failed: No such file or directory - # These messages contaminate test output, which makes the quicktest suite to fail. The patch adds filtering for these messages. - patches = [ ./filter_mpi_warnings.patch ]; - configureFlags = [ "--with-blas=1" "--with-lapack=1" @@ -112,6 +107,7 @@ stdenv.mkDerivation rec { # the library is installed and available. 
doInstallCheck = true; installCheckTarget = "check_install"; + nativeInstallCheckInputs = [ mpiCheckPhaseHook ]; passthru = { inherit mpiSupport; From 3aeaac6b8e65aa1561b32067c6946b052d633eed Mon Sep 17 00:00:00 2001 From: qbisi Date: Mon, 21 Oct 2024 07:21:08 +0800 Subject: [PATCH 3/4] nwchem: replace nativeCheckInputs with nativeInstallCheckInputs --- pkgs/applications/science/chemistry/nwchem/default.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkgs/applications/science/chemistry/nwchem/default.nix b/pkgs/applications/science/chemistry/nwchem/default.nix index fb24f903f522..8ea4d96a2a25 100644 --- a/pkgs/applications/science/chemistry/nwchem/default.nix +++ b/pkgs/applications/science/chemistry/nwchem/default.nix @@ -195,7 +195,7 @@ stdenv.mkDerivation rec { doCheck = false; doInstallCheck = true; - nativeCheckInputs = [ mpiCheckPhaseHook ]; + nativeInstallCheckInputs = [ mpiCheckPhaseHook ]; installCheckPhase = '' runHook preInstallCheck From 3e08945b40e08fc0a36cf24dc73e5022c58df1d1 Mon Sep 17 00:00:00 2001 From: qbisi Date: Mon, 21 Oct 2024 07:21:19 +0800 Subject: [PATCH 4/4] nwchem: mark supported on aarch64 --- pkgs/applications/science/chemistry/nwchem/default.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkgs/applications/science/chemistry/nwchem/default.nix b/pkgs/applications/science/chemistry/nwchem/default.nix index 8ea4d96a2a25..d3d349969bb2 100644 --- a/pkgs/applications/science/chemistry/nwchem/default.nix +++ b/pkgs/applications/science/chemistry/nwchem/default.nix @@ -211,7 +211,7 @@ stdenv.mkDerivation rec { meta = with lib; { description = "Open Source High-Performance Computational Chemistry"; mainProgram = "nwchem"; - platforms = [ "x86_64-linux" ]; + platforms = [ "x86_64-linux" "aarch64-linux" ]; maintainers = with maintainers; [ sheepforce markuskowa ]; homepage = "https://nwchemgit.github.io"; license = licenses.ecl20;