nixpkgs/nixos/tests/slurm.nix
2018-10-30 19:50:52 +01:00

133 lines
3.4 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import ./make-test.nix ({ lib, ... }:
let
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
slurmconfig = {
controlMachine = "control";
nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
extraConfig = ''
AccountingStorageHost=dbd
AccountingStorageType=accounting_storage/slurmdbd
'';
};
in {
name = "slurm";
meta.maintainers = [ lib.maintainers.markuskowa ];
nodes =
let
computeNode =
{ ...}:
{
# TODO slurmd port and slurmctld port should be configurations and
# automatically allowed by the firewall.
networking.firewall.enable = false;
services.slurm = {
client.enable = true;
} // slurmconfig;
};
in {
control =
{ ...}:
{
networking.firewall.enable = false;
services.slurm = {
server.enable = true;
} // slurmconfig;
};
submit =
{ ...}:
{
networking.firewall.enable = false;
services.slurm = {
enableStools = true;
} // slurmconfig;
};
dbd =
{ pkgs, ... } :
{
networking.firewall.enable = false;
services.slurm.dbdserver = {
enable = true;
};
services.mysql = {
enable = true;
package = pkgs.mysql;
ensureDatabases = [ "slurm_acct_db" ];
ensureUsers = [{
ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
name = "slurm";
}];
};
};
node1 = computeNode;
node2 = computeNode;
node3 = computeNode;
};
testScript =
''
startAll;
# Set up authentification across the cluster
foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
{
$node->waitForUnit("default.target");
$node->succeed("mkdir /etc/munge");
$node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
$node->succeed("chmod 0400 /etc/munge/munge.key");
$node->succeed("chown munge:munge /etc/munge/munge.key");
$node->succeed("systemctl restart munged");
$node->waitForUnit("munged");
};
# Restart the services since they have probably failed due to the munge init
# failure
subtest "can_start_slurmdbd", sub {
$dbd->succeed("systemctl restart slurmdbd");
$dbd->waitForUnit("slurmdbd.service");
};
# there needs to be an entry for the current
# cluster in the database before slurmctld is restarted
subtest "add_account", sub {
$control->succeed("sacctmgr -i add cluster default");
};
subtest "can_start_slurmctld", sub {
$control->succeed("systemctl restart slurmctld");
$control->waitForUnit("slurmctld.service");
};
subtest "can_start_slurmd", sub {
foreach my $node (($node1,$node2,$node3))
{
$node->succeed("systemctl restart slurmd.service");
$node->waitForUnit("slurmd");
}
};
# Test that the cluster works and can distribute jobs;
subtest "run_distributed_command", sub {
# Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
# The output must contain the 3 different names
$submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
};
subtest "check_slurm_dbd", sub {
# find the srun job from above in the database
$submit->succeed("sacct | grep hostname");
};
'';
})