import ./make-test-python.nix ( { pkgs, lib, ... }: rec { name = "pacemaker"; meta = with pkgs.lib.maintainers; { maintainers = [ astro ]; }; nodes = let node = i: { networking.interfaces.eth1.ipv4.addresses = [ { address = "192.168.0.${toString i}"; prefixLength = 24; } ]; services.corosync = { enable = true; clusterName = "zentralwerk-network"; nodelist = lib.imap (i: name: { nodeid = i; inherit name; ring_addrs = [ (builtins.head nodes.${name}.networking.interfaces.eth1.ipv4.addresses).address ]; }) (builtins.attrNames nodes); }; environment.etc."corosync/authkey" = { source = builtins.toFile "authkey" # minimum length: 128 bytes "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest"; mode = "0400"; }; services.pacemaker.enable = true; # used for pacemaker resource systemd.services.ha-cat = { description = "Highly available netcat"; serviceConfig.ExecStart = "${pkgs.netcat}/bin/nc -l discard"; }; }; in { node1 = node 1; node2 = node 2; node3 = node 3; }; # sets up pacemaker with resources configuration, then crashes a # node and waits for service restart on another node testScript = let resources = builtins.toFile "cib-resources.xml" '' ''; in '' import re import time start_all() ${lib.concatMapStrings (node: '' ${node}.wait_until_succeeds("corosync-quorumtool") ${node}.wait_for_unit("pacemaker.service") '') (builtins.attrNames nodes)} # No STONITH device node1.succeed("crm_attribute -t crm_config -n stonith-enabled -v false") # Configure the cat resource node1.succeed("cibadmin --replace --scope resources --xml-file ${resources}") # wait until the service is started while True: output = node1.succeed("crm_resource -r cat --locate") match = re.search("is running on: (.+)", output) if match: for machine in machines: if machine.name == match.group(1): current_node = machine break time.sleep(1) current_node.log("Service running here!") current_node.crash() # pick another node that's still up for machine in machines: if machine.booted: check_node = machine # find where the service has been started next while True: output = check_node.succeed("crm_resource -r cat --locate") match = re.search("is running on: (.+)", output) # output will remain the old current_node until the crash is detected by pacemaker if match and match.group(1) != current_node.name: for machine in machines: if machine.name == match.group(1): next_node = machine break time.sleep(1) next_node.log("Service migrated here!") ''; } )