Merge pull request #11870 from lancelotsix/improve_slurm_service

Improve slurm service configuration
This commit is contained in:
Arseniy Seroka 2015-12-25 18:36:18 +03:00
commit 7e14e28a80
2 changed files with 128 additions and 8 deletions

View File

@ -34,6 +34,15 @@ in
};
# Derivation that provides the slurm binaries.  Defaults to pkgs.slurm-llnl;
# the example shows pkgs.slurm-llnl-full as an alternative.
# NOTE(review): presumably this is the option read as cfg.package by the
# implementation section — confirm against the enclosing options set.
package = mkOption {
  type = types.package;
  default = pkgs.slurm-llnl;
  example = literalExample "pkgs.slurm-llnl-full";
  description = ''
    The package to use for slurm binaries.
  '';
};
controlMachine = mkOption {
type = types.nullOr types.str;
default = null;
@ -91,38 +100,69 @@ in
###### implementation
config = mkIf (cfg.client.enable || cfg.server.enable) {
config =
let
wrappedSlurm = pkgs.stdenv.mkDerivation {
name = "wrappedSlurm";
environment.systemPackages = [ pkgs.slurm-llnl ];
propagatedBuildInputs = [ cfg.package configFile ];
# Builder script: for every executable found in cfg.package's bin directory,
# write a /bin/sh wrapper of the same name into $out/bin.  A wrapper runs the
# real binary with SLURM_CONF defaulting to the generated configFile, and
# leaves a SLURM_CONF that the caller already exported untouched.
#
# The here-document delimiter (EOT) is unquoted, so the *builder's* shell
# expands $EXE (and Nix has already substituted configFile) while the wrapper
# is being written.  Everything that must be evaluated when the wrapper itself
# runs is therefore escaped as \$...:
#   - the SLURM_CONF test; unescaped it is replaced by the builder's own
#     (empty) value, which makes the check unconditionally true;
#   - the argument list, which has to be "$@" in both branches; "$0" would
#     hand the wrapper's own path to the binary and drop the real arguments.
builder = pkgs.writeText "builder.sh" ''
  source $stdenv/setup
  mkdir -p $out/bin
  find ${cfg.package}/bin -type f -executable | while read -r EXE
  do
    exename="$(basename "$EXE")"
    wrappername="$out/bin/$exename"
    cat > "$wrappername" <<EOT
  #!/bin/sh
  if [ -z "\$SLURM_CONF" ]
  then
    SLURM_CONF="${configFile}" "$EXE" "\$@"
  else
    "$EXE" "\$@"
  fi
  EOT
    chmod +x "$wrappername"
  done
'';
};
in mkIf (cfg.client.enable || cfg.server.enable) {
environment.systemPackages = [ wrappedSlurm ];
# systemd unit for the slurmd daemon; only defined when cfg.client.enable is set.
# NOTE(review): this text looks like a rendered diff without +/- markers, so
# attributes that appear twice below (path, ExecStart) are presumably the
# removed line followed by its replacement — confirm against the real file.
systemd.services.slurmd = mkIf (cfg.client.enable) {
path = with pkgs; [ slurm-llnl coreutils ];
path = with pkgs; [ wrappedSlurm coreutils ];
wantedBy = [ "multi-user.target" ];
after = [ "systemd-tmpfiles-clean.service" ];
serviceConfig = {
# Declared as a forking service that systemd tracks through PIDFile.
Type = "forking";
# The wrappedSlurm variant carries no "-f" flag: the wrapper generated by the
# wrappedSlurm builder sets SLURM_CONF to configFile before running the binary.
ExecStart = "${pkgs.slurm-llnl}/bin/slurmd -f ${configFile}";
ExecStart = "${wrappedSlurm}/bin/slurmd";
PIDFile = "/run/slurmd.pid";
# Reload is implemented by sending SIGHUP to the main process.
ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
};
# Make sure /var/spool exists before the daemon is started.
preStart = ''
mkdir -p /var/spool
'';
};
# systemd unit for the slurmctld daemon; only defined when cfg.server.enable is set.
# NOTE(review): this text looks like a rendered diff without +/- markers, so
# attributes that appear twice below (path, after, ExecStart) are presumably
# the removed line followed by its replacement — confirm against the real file.
systemd.services.slurmctld = mkIf (cfg.server.enable) {
path = with pkgs; [ slurm-llnl munge coreutils ];
path = with pkgs; [ wrappedSlurm munge coreutils ];
wantedBy = [ "multi-user.target" ];
after = [ "network.target" "auditd.service" "munged.service" "slurmdbd.service" ];
after = [ "network.target" "munged.service" ];
# munged is required, not merely ordered before this unit.
requires = [ "munged.service" ];
serviceConfig = {
# Declared as a forking service that systemd tracks through PIDFile.
Type = "forking";
ExecStart = "${pkgs.slurm-llnl}/bin/slurmctld";
ExecStart = "${wrappedSlurm}/bin/slurmctld";
PIDFile = "/run/slurmctld.pid";
# Reload is implemented by sending SIGHUP to the main process.
ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
};
# Export the generated configuration file to the daemon through SLURM_CONF.
environment = { SLURM_CONF = "${configFile}"; };
};
};

80
nixos/tests/slurm.nix Normal file
View File

@ -0,0 +1,80 @@
import ./make-test.nix ({ pkgs, ... }:
let
  # Shared secret that the test script writes to /etc/munge/munge.key on
  # every machine.
  mungekey = "mungeverryweakkeybuteasytointegratoinatest";

  # services.slurm settings applied to every machine of the test cluster.
  # Each machine enables the client side; "control" is named as the control
  # machine, and the "debug" partition spans node[1-3].
  slurmconfig = {
    partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
    nodeName = ''
      control
      NodeName=node[1-3] CPUs=1 State=UNKNOWN
    '';
    controlMachine = "control";
    client = { enable = true; };
  };
in {
name = "slurm";
# Test machines: one controller ("control") and three identical workers.
nodes =
  let
    # Configuration shared by node1, node2 and node3.
    # TODO: the slurmd and slurmctld ports should become module options that
    # are opened in the firewall automatically; for now the firewall is off.
    worker = { config, pkgs, ... }: {
      networking.firewall.enable = false;
      services = {
        munge.enable = true;
        slurm = slurmconfig;
      };
    };

    # Same as a worker, with the slurm server side switched on as well.
    controller = { config, pkgs, ... }: {
      networking.firewall.enable = false;
      services = {
        munge.enable = true;
        slurm = { server.enable = true; } // slurmconfig;
      };
    };
  in
  {
    control = controller;
    node1 = worker;
    node2 = worker;
    node3 = worker;
  };
# Perl test driver.  It installs the shared munge key on every machine,
# restarts munged, slurmctld and slurmd, and finally checks that a 3-node
# `srun hostname` reports three distinct host names.
testScript =
  ''
    startAll;

    # Set up authentication across the cluster
    foreach my $node (($control,$node1,$node2,$node3))
    {
      $node->waitForUnit("default.target");

      # -p keeps this step from failing when /etc/munge already exists
      $node->succeed("mkdir -p /etc/munge");
      $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
      $node->succeed("chmod 0400 /etc/munge/munge.key");
      $node->succeed("systemctl restart munged");
    }

    # Restart the services since they have probably failed due to the munge
    # init failure

    subtest "can_start_slurmctld", sub {
      $control->succeed("systemctl restart slurmctld");
      $control->waitForUnit("slurmctld.service");
    };

    subtest "can_start_slurmd", sub {
      foreach my $node (($control,$node1,$node2,$node3))
      {
        $node->succeed("systemctl restart slurmd.service");
        $node->waitForUnit("slurmd");
      }
    };

    # Test that the cluster works and can distribute jobs
    subtest "run_distributed_command", sub {
      # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
      # The output must contain the 3 different names
      $control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
    };
  '';
})