diff options
Diffstat (limited to 'nixos/tests/consul.nix')
-rw-r--r-- | nixos/tests/consul.nix | 229 |
1 files changed, 229 insertions, 0 deletions
diff --git a/nixos/tests/consul.nix b/nixos/tests/consul.nix new file mode 100644 index 00000000000..ee85f1d0b91 --- /dev/null +++ b/nixos/tests/consul.nix @@ -0,0 +1,229 @@ +import ./make-test-python.nix ({pkgs, lib, ...}: + +let + # Settings for both servers and agents + webUi = true; + retry_interval = "1s"; + raft_multiplier = 1; + + defaultExtraConfig = { + inherit retry_interval; + performance = { + inherit raft_multiplier; + }; + }; + + allConsensusServerHosts = [ + "192.168.1.1" + "192.168.1.2" + "192.168.1.3" + ]; + + allConsensusClientHosts = [ + "192.168.2.1" + "192.168.2.2" + ]; + + firewallSettings = { + # See https://www.consul.io/docs/install/ports.html + allowedTCPPorts = [ 8301 8302 8600 8500 8300 ]; + allowedUDPPorts = [ 8301 8302 8600 ]; + }; + + client = index: { pkgs, ... }: + let + ip = builtins.elemAt allConsensusClientHosts index; + in + { + environment.systemPackages = [ pkgs.consul ]; + + networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ + { address = ip; prefixLength = 16; } + ]; + networking.firewall = firewallSettings; + + services.consul = { + enable = true; + inherit webUi; + extraConfig = defaultExtraConfig // { + server = false; + retry_join = allConsensusServerHosts; + bind_addr = ip; + }; + }; + }; + + server = index: { pkgs, ... }: + let + numConsensusServers = builtins.length allConsensusServerHosts; + thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index; + ip = thisConsensusServerHost; # since we already use IPs to identify servers + in + { + networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ + { address = ip; prefixLength = 16; } + ]; + networking.firewall = firewallSettings; + + services.consul = + assert builtins.elem thisConsensusServerHost allConsensusServerHosts; + { + enable = true; + inherit webUi; + extraConfig = defaultExtraConfig // { + server = true; + bootstrap_expect = numConsensusServers; + # Tell Consul that we never intend to drop below this many servers. + # Ensures to not permanently lose consensus after temporary loss. + # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 + autopilot.min_quorum = numConsensusServers; + retry_join = + # If there's only 1 node in the network, we allow self-join; + # otherwise, the node must not try to join itself, and join only the other servers. + # See https://github.com/hashicorp/consul/issues/2868 + if numConsensusServers == 1 + then allConsensusServerHosts + else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts; + bind_addr = ip; + }; + }; + }; +in { + name = "consul"; + + nodes = { + server1 = server 0; + server2 = server 1; + server3 = server 2; + + client1 = client 0; + client2 = client 1; + }; + + testScript = '' + servers = [server1, server2, server3] + machines = [server1, server2, server3, client1, client2] + + for m in machines: + m.wait_for_unit("consul.service") + + + def wait_for_healthy_servers(): + # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 + # for why the `Voter` column of `list-peers` has that info. + # TODO: The `grep true` relies on the fact that currently in + # the output like + # # consul operator raft list-peers + # Node ID Address State Voter RaftProtocol + # server3 ... 192.168.1.3:8300 leader true 3 + # server2 ... 192.168.1.2:8300 follower true 3 + # server1 ... 192.168.1.1:8300 follower false 3 + # `Voter`is the only boolean column. + # Change this to the more reliable way to be defined by + # https://github.com/hashicorp/consul/issues/8118 + # once that ticket is closed. + for m in machines: + m.wait_until_succeeds( + "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]" + ) + + + def wait_for_all_machines_alive(): + """ + Note that Serf-"alive" does not mean "Raft"-healthy; + see `wait_for_healthy_servers()` for that instead. + """ + for m in machines: + m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") + + + wait_for_healthy_servers() + # Also wait for clients to be alive. + wait_for_all_machines_alive() + + client1.succeed("consul kv put testkey 42") + client2.succeed("[ $(consul kv get testkey) == 42 ]") + + + def rolling_reboot_test(proper_rolling_procedure=True): + """ + Tests that the cluster can tolearate failures of any single server, + following the recommended rolling upgrade procedure from + https://www.consul.io/docs/upgrading#standard-upgrades. + + Optionally, `proper_rolling_procedure=False` can be given + to wait only for each server to be back `Healthy`, not `Stable` + in the Raft consensus, see Consul setting `ServerStabilizationTime` and + https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040. + """ + + for server in servers: + server.crash() + + # For each client, wait until they have connection again + # using `kv get -recurse` before issuing commands. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") + + # Do some consul actions while one server is down. + client1.succeed("consul kv put testkey 43") + client2.succeed("[ $(consul kv get testkey) == 43 ]") + client2.succeed("consul kv delete testkey") + + # Restart crashed machine. + server.start() + + if proper_rolling_procedure: + # Wait for recovery. + wait_for_healthy_servers() + else: + # NOT proper rolling upgrade procedure, see above. + wait_for_all_machines_alive() + + # Wait for client connections. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") + + # Do some consul actions with server back up. + client1.succeed("consul kv put testkey 44") + client2.succeed("[ $(consul kv get testkey) == 44 ]") + client2.succeed("consul kv delete testkey") + + + def all_servers_crash_simultaneously_test(): + """ + Tests that the cluster will eventually come back after all + servers crash simultaneously. + """ + + for server in servers: + server.crash() + + for server in servers: + server.start() + + # Wait for recovery. + wait_for_healthy_servers() + + # Wait for client connections. + client1.wait_until_succeeds("consul kv get -recurse") + client2.wait_until_succeeds("consul kv get -recurse") + + # Do some consul actions with servers back up. + client1.succeed("consul kv put testkey 44") + client2.succeed("[ $(consul kv get testkey) == 44 ]") + client2.succeed("consul kv delete testkey") + + + # Run the tests. + + print("rolling_reboot_test()") + rolling_reboot_test() + + print("all_servers_crash_simultaneously_test()") + all_servers_crash_simultaneously_test() + + print("rolling_reboot_test(proper_rolling_procedure=False)") + rolling_reboot_test(proper_rolling_procedure=False) + ''; +}) |