mirror of
https://github.com/NixOS/nixpkgs.git
synced 2025-01-23 21:33:49 +00:00
b1cce3b54e
mpi.conf is required for PMIx configuration. Setting PMIxCliTmpDirBase in mpi.conf by default avoids PMIx errors about a missing temporary directory.
461 lines
14 KiB
Nix
461 lines
14 KiB
Nix
{ config, lib, options, pkgs, ... }:
|
|
let
|
|
|
|
# Account name used as the default of `services.slurm.user`.
defaultUser = "slurm";

# Shorthands for this module's evaluated settings and its option declarations.
cfg = config.services.slurm;
opt = options.services.slurm;

# configuration file can be generated by https://slurm.schedmd.com/configurator.html
|
|
|
|
# Store directory holding the generated slurm.conf, assembled from the
# module options with `extraConfig` appended last.
configFile =
  let
    # Renders one "<key>=<entry>\n" string per list element; `toString`
    # joins the resulting list with single spaces.
    renderEntries = key: entries:
      toString (map (entry: "${key}=${entry}\n") entries);

    # Both controller settings are omitted entirely when left at null.
    controlMachineLine = lib.optionalString (cfg.controlMachine != null)
      "controlMachine=${cfg.controlMachine}";
    controlAddrLine = lib.optionalString (cfg.controlAddr != null)
      "controlAddr=${cfg.controlAddr}";
  in
  pkgs.writeTextDir "slurm.conf" ''
    ClusterName=${cfg.clusterName}
    StateSaveLocation=${cfg.stateSaveLocation}
    SlurmUser=${cfg.user}
    ${controlMachineLine}
    ${controlAddrLine}
    ${renderEntries "NodeName" cfg.nodeName}
    ${renderEntries "PartitionName" cfg.partitionName}
    PlugStackConfig=${plugStackConfig}/plugstack.conf
    ProctrackType=${cfg.procTrackType}
    ${cfg.extraConfig}
  '';
|
|
|
|
# Store directory holding plugstack.conf; slurm.conf points at it through
# its PlugStackConfig entry.
plugStackConfig =
  let
    # Only emitted when X11 forwarding through the spank plugin is requested.
    x11PluginLine = lib.optionalString cfg.enableSrunX11
      "optional ${pkgs.slurm-spank-x11}/lib/x11.so";
  in
  pkgs.writeTextDir "plugstack.conf" ''
    ${x11PluginLine}
    ${cfg.extraPlugstackConfig}
  '';
|
|
|
|
# Store directory holding cgroup.conf, filled only from `extraCgroupConfig`.
cgroupConfig =
  let
    contents = ''
      ${cfg.extraCgroupConfig}
    '';
  in
  pkgs.writeTextDir "cgroup.conf" contents;
|
|
|
|
# Store directory holding mpi.conf: the PMIx temporary directory base
# followed by any user-supplied extra lines.
mpiConf =
  let
    contents = ''
      PMIxCliTmpDirBase=${cfg.mpi.PmixCliTmpDirBase}
      ${cfg.mpi.extraMpiConfig}
    '';
  in
  pkgs.writeTextDir "mpi.conf" contents;
|
|
|
|
# Template for slurmdbd.conf. It is a plain store file (not a directory)
# because the slurmdbd unit copies it to its runtime directory before use.
slurmdbdConf =
  let
    contents = ''
      DbdHost=${cfg.dbdserver.dbdHost}
      SlurmUser=${cfg.user}
      StorageType=accounting_storage/mysql
      StorageUser=${cfg.dbdserver.storageUser}
      ${cfg.dbdserver.extraConfig}
    '';
  in
  pkgs.writeText "slurmdbd.conf" contents;
|
|
|
|
# Slurm looks for its auxiliary config files next to slurm.conf, so all
# generated files and the user-supplied extra paths are merged into a
# single directory.
etcSlurm =
  let
    generatedConfigs = [ configFile cgroupConfig plugStackConfig mpiConf ];
  in
  pkgs.symlinkJoin {
    paths = generatedConfigs ++ cfg.extraConfigPaths;
    name = "etc-slurm";
  };
|
|
in
|
|
|
|
{
|
|
|
|
###### interface
|
|
|
|
meta.maintainers = with lib.maintainers; [ markuskowa ];
|
|
|
|
options = {
|
|
|
|
services.slurm = {
|
|
|
|
# Controller role: gates the slurmctld systemd unit.
server.enable = lib.mkOption {
  description = ''
    Whether to enable the slurm control daemon.
    Note that the standard authentication method is "munge".
    The "munge" service needs to be provided with a password file in order for
    slurm to work properly (see `services.munge.password`).
  '';
  default = false;
  type = lib.types.bool;
};
|
|
|
|
# Accounting database daemon (slurmdbd) role and the settings that end up
# in the generated slurmdbd.conf.
dbdserver = {
  enable = lib.mkEnableOption "SlurmDBD service";

  dbdHost = lib.mkOption {
    type = lib.types.str;
    default = config.networking.hostName;
    defaultText = lib.literalExpression "config.networking.hostName";
    description = ''
      Hostname of the machine where `slurmdbd`
      is running (i.e. name returned by `hostname -s`).
    '';
  };

  storageUser = lib.mkOption {
    type = lib.types.str;
    default = cfg.user;
    defaultText = lib.literalExpression "config.${opt.user}";
    description = ''
      Database user name.
    '';
  };

  # The password is read from this file when the unit starts, so it never
  # becomes part of the store copy of slurmdbd.conf.
  storagePassFile = lib.mkOption {
    type = with lib.types; nullOr str;
    default = null;
    description = ''
      Path to file with database password. The content of this will be used to
      create the password for the `StoragePass` option.
    '';
  };

  extraConfig = lib.mkOption {
    type = lib.types.lines;
    default = "";
    description = ''
      Extra configuration for `slurmdbd.conf`. See also:
      {manpage}`slurmdbd.conf(8)`.
    '';
  };
};
|
|
|
|
# Compute-node role: gates the slurmd systemd unit and its tmpfiles rules.
client.enable = lib.mkEnableOption "slurm client daemon";
|
|
|
|
# Tools-only role: activates the module's config without any daemon.
enableStools = lib.mkOption {
  description = ''
    Whether to provide a slurm.conf file.
    Enable this option if you do not run a slurm daemon on this host
    (i.e. `server.enable` and `client.enable` are `false`)
    but you still want to run slurm commands from this host.
  '';
  default = false;
  type = lib.types.bool;
};
|
|
|
|
# Standard package option whose default is replaced by a slurm build with
# native X11 support switched off whenever `enableSrunX11` is set.
package =
  let
    declared = lib.mkPackageOption pkgs "slurm" { example = "slurm-full"; };
  in
  declared // {
    default = pkgs.slurm.override { enableX11 = !cfg.enableSrunX11; };
  };
|
|
|
|
# Left out of slurm.conf entirely while it is null.
controlMachine = lib.mkOption {
  description = ''
    The short hostname of the machine where SLURM control functions are
    executed (i.e. the name returned by the command "hostname -s", use "tux001"
    rather than "tux001.my.com").
  '';
  type = with lib.types; nullOr str;
  example = null;
  default = null;
};
|
|
|
|
# Follows `controlMachine` unless set explicitly; omitted from slurm.conf
# while it is null.
controlAddr = lib.mkOption {
  description = ''
    Name that ControlMachine should be referred to in establishing a
    communications path.
  '';
  type = with lib.types; nullOr str;
  example = null;
  defaultText = lib.literalExpression "config.${opt.controlMachine}";
  default = cfg.controlMachine;
};
|
|
|
|
# Written to slurm.conf as ClusterName.
clusterName = lib.mkOption {
  description = ''
    Necessary to distinguish accounting records in a multi-cluster environment.
  '';
  example = "myCluster";
  default = "default";
  type = lib.types.str;
};
|
|
|
|
# Each list element becomes one `NodeName=...` entry in slurm.conf.
nodeName = lib.mkOption {
  description = ''
    Name that SLURM uses to refer to a node (or base partition for BlueGene
    systems). Typically this would be the string that "/bin/hostname -s"
    returns. Note that now you have to write node's parameters after the name.
  '';
  example = lib.literalExpression ''[ "linux[1-32] CPUs=1 State=UNKNOWN" ];'';
  default = [];
  type = with lib.types; listOf str;
};
|
|
|
|
# Each list element becomes one `PartitionName=...` entry in slurm.conf.
partitionName = lib.mkOption {
  description = ''
    Name by which the partition may be referenced. Note that now you have
    to write the partition's parameters after the name.
  '';
  example = lib.literalExpression ''[ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ];'';
  default = [];
  type = with lib.types; listOf str;
};
|
|
|
|
# Switches X11 forwarding to the slurm-spank-x11 plugin: adds the plugin
# line to plugstack.conf, puts the plugin on the daemons' PATH and makes
# the default slurm package build without native X11 support.
enableSrunX11 = lib.mkOption {
  default = false;
  type = lib.types.bool;
  description = ''
    If enabled srun will accept the option "--x11" to allow for X11 forwarding
    from within an interactive session or a batch job. This activates the
    slurm-spank-x11 module. Note that
    {option}`services.openssh.settings.X11Forwarding` is enabled by default
    on hosts where {option}`services.slurm.client.enable` is set.

    This option requires slurm to be compiled without native X11 support.
    The default behavior is to re-compile the slurm package with native X11
    support disabled if this option is set to true.

    To use the native X11 support add `PrologFlags=X11` in {option}`extraConfig`.
    Note that this method will only work with RSA SSH host keys.
  '';
};
|
|
|
|
# Written to slurm.conf as ProctrackType.
procTrackType = lib.mkOption {
  description = ''
    Plugin to be used for process tracking on a job step basis.
    The slurmd daemon uses this mechanism to identify all processes
    which are children of processes it spawns for a user job step.
  '';
  default = "proctrack/linuxproc";
  type = lib.types.str;
};
|
|
|
|
# Written to slurm.conf as StateSaveLocation; the slurmctld unit creates
# and chowns this directory before starting.
stateSaveLocation = lib.mkOption {
  description = ''
    Directory into which the Slurm controller, slurmctld, saves its state.
  '';
  default = "/var/spool/slurmctld";
  type = lib.types.str;
};
|
|
|
|
# Written as SlurmUser to both slurm.conf and slurmdbd.conf. The module only
# declares the account itself while this stays at its default.
user = lib.mkOption {
  description = ''
    Set this option when you want to run the slurmctld daemon
    as something else than the default slurm user "slurm".
    Note that the UID of this user needs to be the same
    on all nodes.
  '';
  default = defaultUser;
  type = lib.types.str;
};
|
|
|
|
# Free-form tail of the generated slurm.conf.
extraConfig = lib.mkOption {
  description = ''
    Extra configuration options that will be added verbatim at
    the end of the slurm configuration file.
  '';
  type = lib.types.lines;
  default = "";
};
|
|
|
|
# Settings rendered into the generated mpi.conf.
mpi = {
  # Also used for the tmpfiles rule that creates the directory on clients.
  PmixCliTmpDirBase = lib.mkOption {
    default = "/tmp/pmix";
    type = lib.types.str;
    description = ''
      Base path for PMIx temporary files.
    '';
  };

  extraMpiConfig = lib.mkOption {
    default = "";
    type = lib.types.lines;
    description = ''
      Extra configuration that will be added to `mpi.conf`.
    '';
  };
};
|
|
|
|
# Free-form tail of the generated plugstack.conf.
extraPlugstackConfig = lib.mkOption {
  description = ''
    Extra configuration that will be added to the end of `plugstack.conf`.
  '';
  type = lib.types.lines;
  default = "";
};
|
|
|
|
# Sole content of the generated cgroup.conf.
extraCgroupConfig = lib.mkOption {
  description = ''
    Extra configuration for `cgroup.conf`. This file is
    used when `procTrackType=proctrack/cgroup`.
  '';
  type = lib.types.lines;
  default = "";
};
|
|
|
|
# Appended after the generated files when the config directory is joined.
extraConfigPaths = lib.mkOption {
  description = ''
    Slurm expects config files for plugins in the same path
    as `slurm.conf`. Add extra nix store
    paths that should be merged into same directory as
    `slurm.conf`.
  '';
  default = [];
  type = lib.types.listOf lib.types.path;
};
|
|
|
|
# Internal option exposing the merged config directory; the command
# wrappers read slurm.conf from it.
etcSlurm = lib.mkOption {
  internal = true;
  description = ''
    Path to directory with slurm config files. This option is set by default from the
    Slurm module and is meant to make the Slurm config file available to other modules.
  '';
  defaultText = lib.literalMD ''
    Directory created from generated config files and
    `config.${opt.extraConfigPaths}`.
  '';
  default = etcSlurm;
  type = lib.types.path;
};
|
|
|
|
};
|
|
|
|
};
|
|
|
|
# Both removed options lived under services.slurm.dbdserver; setting either
# one now fails evaluation with the given migration hint.
imports =
  let
    removedDbdOption = optionName:
      lib.mkRemovedOptionModule [ "services" "slurm" "dbdserver" optionName ];
  in
  [
    (removedDbdOption "storagePass" ''
      This option has been removed so that the database password is not exposed via the nix store.
      Use services.slurm.dbdserver.storagePassFile to provide the database password.
    '')
    (removedDbdOption "configFile" ''
      This option has been removed. Use services.slurm.dbdserver.storagePassFile
      and services.slurm.dbdserver.extraConfig instead.
    '')
  ];
|
|
|
|
###### implementation
|
|
|
|
config =
|
|
let
|
|
# Wrapper package: for every executable in the slurm package's bin
# directory, generates a small /bin/sh script that points SLURM_CONF at the
# generated slurm.conf unless the caller already set SLURM_CONF, then runs
# the real binary with the caller's arguments. The package's man pages are
# linked in as well.
#
# Escaping inside the heredoc matters: "$EXE" and the Nix interpolations
# are expanded while the wrapper is being generated, whereas the
# backslash-escaped \$SLURM_CONF and \$@ are written out literally so that
# they are evaluated each time the wrapper runs.
wrappedSlurm = pkgs.stdenv.mkDerivation {
  name = "wrappedSlurm";

  builder = pkgs.writeText "builder.sh" ''
    source $stdenv/setup
    mkdir -p $out/bin
    find ${lib.getBin cfg.package}/bin -type f -executable | while read -r EXE
    do
      exename="$(basename "$EXE")"
      wrappername="$out/bin/$exename"
      cat > "$wrappername" <<EOT
    #!/bin/sh
    if [ -z "\$SLURM_CONF" ]
    then
      SLURM_CONF="${cfg.etcSlurm}/slurm.conf" "$EXE" "\$@"
    else
      "$EXE" "\$@"
    fi
    EOT
      chmod +x "$wrappername"
    done

    mkdir -p $out/share
    ln -s ${lib.getBin cfg.package}/share/man $out/share/man
  '';
};
|
|
|
|
in lib.mkIf ( cfg.enableStools ||
|
|
cfg.client.enable ||
|
|
cfg.server.enable ||
|
|
cfg.dbdserver.enable ) {
|
|
|
|
# Put the wrapped slurm commands on the system PATH.
environment.systemPackages = [ wrappedSlurm ];

# munge is switched on by default but can still be overridden.
services.munge.enable = lib.mkDefault true;

users = {
  # use a static uid as default to ensure it is the same on all nodes
  users.slurm = lib.mkIf (cfg.user == defaultUser) {
    uid = config.ids.uids.slurm;
    group = "slurm";
    name = defaultUser;
  };

  # NOTE(review): the gid is read from `ids.uids`, not `ids.gids` — confirm
  # that this is intentional.
  groups.slurm.gid = config.ids.uids.slurm;
};
|
|
|
|
# slurmd unit, only defined when `client.enable` is set. It is started
# through the wrapper so that SLURM_CONF points at the generated config.
systemd.services.slurmd = lib.mkIf (cfg.client.enable) {
  path = with pkgs; [ wrappedSlurm coreutils ]
    ++ lib.optional cfg.enableSrunX11 slurm-spank-x11;

  wantedBy = [ "multi-user.target" ];
  after = [
    "systemd-tmpfiles-clean.service"
    # Same munge unit name that the slurmctld and slurmdbd units order
    # against.
    "munged.service"
    "network-online.target"
    "remote-fs.target"
  ];
  wants = [ "network-online.target" ];

  serviceConfig = {
    Type = "forking";
    KillMode = "process";
    ExecStart = "${wrappedSlurm}/bin/slurmd";
    PIDFile = "/run/slurmd.pid";
    ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
    LimitMEMLOCK = "infinity";
    Delegate="Yes";
  };
};
|
|
|
|
# Directories created on client nodes only: the slurmd spool directory and
# the PMIx temporary directory base referenced from mpi.conf.
systemd.tmpfiles.rules =
  let
    clientDirectories = [
      "d /var/spool/slurmd 755 root root -"
      "d ${cfg.mpi.PmixCliTmpDirBase} 755 root root -"
    ];
  in
  lib.optionals cfg.client.enable clientDirectories;

# Client nodes get sshd X11 forwarding as an overridable default.
services.openssh.settings.X11Forwarding =
  lib.mkIf cfg.client.enable (lib.mkDefault true);
|
|
|
|
# slurmctld unit, only defined when `server.enable` is set.
systemd.services.slurmctld = lib.mkIf (cfg.server.enable) {
  wantedBy = [ "multi-user.target" ];
  requires = [ "munged.service" ];
  after = [ "network.target" "munged.service" ];

  path = [ wrappedSlurm pkgs.munge pkgs.coreutils ]
    ++ lib.optional cfg.enableSrunX11 pkgs.slurm-spank-x11;

  # Make sure the state directory exists and is owned by the slurm user
  # before the daemon starts.
  preStart = ''
    mkdir -p ${cfg.stateSaveLocation}
    chown -R ${cfg.user}:slurm ${cfg.stateSaveLocation}
  '';

  serviceConfig = {
    ExecStart = "${wrappedSlurm}/bin/slurmctld";
    ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
    PIDFile = "/run/slurmctld.pid";
    Type = "forking";
  };
};
|
|
|
|
# slurmdbd unit, only defined when `dbdserver.enable` is set. The config
# template is copied into the unit's runtime directory so that the database
# password can be appended there when the unit starts.
systemd.services.slurmdbd =
  let
    # slurm strips the last component off the path
    configPath = "$RUNTIME_DIRECTORY/slurmdbd.conf";

    # Shell snippet appending the password line; empty when no password
    # file is configured.
    appendStoragePass = lib.optionalString (cfg.dbdserver.storagePassFile != null) ''
      echo "StoragePass=$(cat ${cfg.dbdserver.storagePassFile})" \
        >> ${configPath}
    '';
  in
  lib.mkIf (cfg.dbdserver.enable) {
    wantedBy = [ "multi-user.target" ];
    requires = [ "munged.service" "mysql.service" ];
    after = [ "network.target" "munged.service" "mysql.service" ];

    path = [ wrappedSlurm pkgs.munge pkgs.coreutils ];

    preStart = ''
      install -m 600 -o ${cfg.user} -T ${slurmdbdConf} ${configPath}
      ${appendStoragePass}
    '';

    # The unwrapped binary is run with SLURM_CONF pointing at the
    # runtime copy of slurmdbd.conf.
    script = ''
      export SLURM_CONF=${configPath}
      exec ${cfg.package}/bin/slurmdbd -D
    '';

    serviceConfig = {
      Type = "simple";
      RuntimeDirectory = "slurmdbd";
      PIDFile = "/run/slurmdbd.pid";
      ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
    };
  };
|
|
|
|
};
|
|
|
|
}
|