mpiCheckPhaseHook: add parameters to bypass errors in sandbox

This commit is contained in:
qbisi 2024-10-21 07:20:30 +08:00
parent f8f18110b6
commit 0c5883bbce
3 changed files with 25 additions and 0 deletions

View File

@ -2,4 +2,8 @@
# Build the setup hook named "mpi-checkPhase-hook" from ./mpi-check-hook.sh.
makeSetupHook {
name = "mpi-checkPhase-hook";
substitutions = {
# Preset topology file shipped next to this expression. Presumably this
# value replaces the "@topology@" placeholder used in the hook script
# (PRTE_MCA_hwloc_use_topo_file) -- confirm against makeSetupHook.
topology = ./topology.xml;
};
} ./mpi-check-hook.sh

View File

@ -44,6 +44,17 @@ setupMpiCheck() {
# Disable CPU pinning
export OMPI_MCA_hwloc_base_binding_policy=none
export PRTE_MCA_hwloc_default_binding_policy=none
# OpenMPI gets confused by the sandbox environment and spews errors like these (to both stdout and stderr):
# [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
# [1729458724.473282] [localhost:78 :0] tcp_iface.c:893 UCX ERROR scandir(/sys/class/net) failed: No such file or directory
# These messages contaminate the test output, which makes diff-based tests fail.
# The solution is to use a preset CPU topology file and to select the ob1 PML instead of UCX.
# Disable sysfs cpu topology directory discovery.
export PRTE_MCA_hwloc_use_topo_file="@topology@"
# Use the ob1 PML (point-to-point messaging layer) instead of ucx.
export OMPI_MCA_pml=ob1
;;
MPICH)
# Fix to make mpich run in a sandbox

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
<topology version="2.0">
<object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
<object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
<object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
<object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
</object>
</object>
</topology>