pacemaker: fix race condition on resource creation (#11750)

* remove pacemaker wait arg and fix race condition

* fix up pacemaker resource and stonith polling

* add changelog for pacemaker timeout bug

* remove env from test case and fix changelog file name

* Update changelogs/fragments/11750-pacemaker-wait-race-condition.yml

Co-authored-by: Felix Fontein <felix@fontein.de>

---------

Co-authored-by: Felix Fontein <felix@fontein.de>
This commit is contained in:
munchtoast
2026-04-18 16:45:58 -04:00
committed by GitHub
parent afe9de7562
commit 6c809dd9db
8 changed files with 382 additions and 17 deletions

View File

@@ -0,0 +1,2 @@
bugfixes:
- pacemaker_resource, pacemaker_stonith - fix resource and stonith creation race condition by polling PCS status (https://github.com/ansible-collections/community.general/issues/11574, https://github.com/ansible-collections/community.general/pull/11750).

View File

@@ -5,6 +5,7 @@
from __future__ import annotations
import re
import time
import typing as t
from ansible_collections.community.general.plugins.module_utils.cmd_runner import CmdRunner, cmd_runner_fmt
@@ -59,6 +60,22 @@ def get_pacemaker_maintenance_mode(runner: CmdRunner) -> bool:
return bool(maintenance_mode_output)
def wait_for_resource(runner: CmdRunner, cli_noun: str, name: str, wait: int, sleep_interval: int = 5) -> None:
    """Block until ``pcs <cli_noun> status`` reports the resource as Started.

    Polls via *runner* every *sleep_interval* seconds and gives up once
    *wait* seconds have elapsed, raising an exception describing the timeout.
    ``name`` is used for the error message; the runner resolves the actual
    resource name from the module parameters.
    """
    give_up_at = time.monotonic() + wait
    while True:
        with runner("cli_action state name") as ctx:
            _rc, stdout, _err = ctx.run(cli_action=cli_noun, state="status")
        # Any output mentioning "Started" means the resource came up.
        if stdout is not None and "Started" in stdout:
            return
        if time.monotonic() >= give_up_at:
            raise Exception(f"Timed out waiting {wait}s for {cli_noun} resource '{name}' to start")
        time.sleep(sleep_interval)
def pacemaker_runner(module: AnsibleModule, **kwargs) -> CmdRunner:
runner_command = ["pcs"]
runner = CmdRunner(

View File

@@ -149,6 +149,7 @@ from ansible_collections.community.general.plugins.module_utils.module_helper im
from ansible_collections.community.general.plugins.module_utils.pacemaker import (
get_pacemaker_maintenance_mode,
pacemaker_runner,
wait_for_resource,
)
@@ -237,7 +238,7 @@ class PacemakerResource(StateModuleHelper):
def state_present(self):
with self.runner(
"cli_action state name resource_type resource_option resource_operation resource_meta resource_argument "
"resource_clone_ids resource_clone_meta wait",
"resource_clone_ids resource_clone_meta",
output_process=self._process_command_output(
not get_pacemaker_maintenance_mode(self.runner), "already exists"
),
@@ -247,10 +248,12 @@ class PacemakerResource(StateModuleHelper):
cli_action="resource",
resource_clone_ids=self.fmt_as_stack_argument(self.module.params["resource_clone_ids"], "clone"),
)
if not self.module.check_mode and self.vars.wait and not get_pacemaker_maintenance_mode(self.runner):
wait_for_resource(self.runner, "resource", self.vars.name, self.vars.wait)
def state_cloned(self):
with self.runner(
"cli_action state name resource_clone_ids resource_clone_meta wait",
"cli_action state name resource_clone_ids resource_clone_meta",
output_process=self._process_command_output(
not get_pacemaker_maintenance_mode(self.runner), "already a clone resource"
),
@@ -260,6 +263,8 @@ class PacemakerResource(StateModuleHelper):
cli_action="resource",
resource_clone_meta=self.fmt_as_stack_argument(self.module.params["resource_clone_meta"], "meta"),
)
if not self.module.check_mode and self.vars.wait and not get_pacemaker_maintenance_mode(self.runner):
wait_for_resource(self.runner, "resource", self.vars.name, self.vars.wait)
def state_enabled(self):
with self.runner(

View File

@@ -125,7 +125,7 @@ value:
"""
from ansible_collections.community.general.plugins.module_utils.module_helper import StateModuleHelper
from ansible_collections.community.general.plugins.module_utils.pacemaker import pacemaker_runner
from ansible_collections.community.general.plugins.module_utils.pacemaker import pacemaker_runner, wait_for_resource
class PacemakerStonith(StateModuleHelper):
@@ -206,7 +206,7 @@ class PacemakerStonith(StateModuleHelper):
def state_present(self):
with self.runner(
"cli_action state name resource_type resource_option resource_operation resource_meta resource_argument agent_validation wait",
"cli_action state name resource_type resource_option resource_operation resource_meta resource_argument agent_validation",
output_process=self._process_command_output(True, "already exists"),
check_mode_skip=True,
) as ctx:
@@ -218,6 +218,8 @@ class PacemakerStonith(StateModuleHelper):
resource_meta=self.vars.stonith_metas,
resource_argument=self.vars.stonith_argument,
)
if not self.module.check_mode and self.vars.wait:
wait_for_resource(self.runner, "stonith", self.vars.name, self.vars.wait)
def state_enabled(self):
with self.runner(

View File

@@ -10,8 +10,138 @@
from __future__ import annotations
import json
import pytest
from ansible_collections.community.general.plugins.modules import pacemaker_resource
from .uthelper import RunCommandMock, UTHelper
UTHelper.from_module(pacemaker_resource, __name__, mocks=[RunCommandMock])
# ---------------------------------------------------------------------------
# Race condition tests: resource starts after one or more Stopped polls
# ---------------------------------------------------------------------------
NO_MAINTENANCE_OUT = (
"Cluster Properties: cib-bootstrap-options\n"
"cluster-infrastructure=corosync\n"
"cluster-name=hacluster\n"
"dc-version=2.1.9-1.fc41-7188dbf\n"
"have-watchdog=false\n"
)
@pytest.fixture
def patch_bin(mocker):
    """Make ``AnsibleModule.get_bin_path`` resolve every binary under /testbin."""
    mocker.patch(
        "ansible.module_utils.basic.AnsibleModule.get_bin_path",
        lambda self_, path, *args, **kwargs: f"/testbin/{path}",
    )
@pytest.mark.usefixtures("patch_bin")
def test_present_race_condition_stopped_then_started(mocker, capfd):
    """Resource reports Stopped on the first poll then Started on the second — must succeed."""
    mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep")
    # Sequence of run_command calls:
    # 1. initial _get(): resource status → not found (rc=1)
    # 2. state_present: property config → no maintenance
    # 3. state_present: resource create → rc=0
    # 4. post-create maintenance check → no maintenance
    # 5. wait_for_resource poll 1: status → Stopped (not yet running)
    # 6. wait_for_resource poll 2: status → Started
    # 7. __quit_module__ _get(): status → Started
    run_command_calls = [
        (1, "", "Error: resource or tag id 'virtual-ip' not found"),
        (1, NO_MAINTENANCE_OUT, ""),
        (0, "Assumed agent name 'ocf:heartbeat:IPaddr2'", ""),
        (1, NO_MAINTENANCE_OUT, ""),
        (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Stopped", ""),
        (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started", ""),
        (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started", ""),
    ]

    # CmdRunner invokes ``module.run_command(cmd, ...)`` with the command list
    # as a positional argument, so the stub must accept positional args too.
    def side_effect(self_, *args, **kwargs):
        return run_command_calls.pop(0)

    mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect)
    with pytest.raises(SystemExit):
        with pytest.MonkeyPatch().context() as mp:
            mp.setattr(
                "ansible.module_utils.basic._ANSIBLE_ARGS",
                json.dumps(
                    {
                        "ANSIBLE_MODULE_ARGS": {
                            "state": "present",
                            "name": "virtual-ip",
                            "resource_type": {"resource_name": "IPaddr2"},
                            "resource_option": ["ip=192.168.2.1"],
                            "wait": 30,
                        }
                    }
                ).encode(),
            )
            mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False)
            pacemaker_resource.main()
    out, _err = capfd.readouterr()
    result = json.loads(out)
    assert result["changed"] is True
    assert result.get("failed") is not True
    assert "Started" in result["value"]
@pytest.mark.usefixtures("patch_bin")
def test_present_wait_timeout_raises(mocker, capfd):
    """Resource never starts within the wait window — must fail with a timeout message."""
    mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep")
    # Simulate time advancing past the deadline immediately on the first poll
    monotonic_values = iter([0.0, 999.0])
    mocker.patch(
        "ansible_collections.community.general.plugins.module_utils.pacemaker.time.monotonic",
        side_effect=lambda: next(monotonic_values),
    )
    run_command_calls = [
        (1, "", "Error: resource or tag id 'virtual-ip' not found"),
        (1, NO_MAINTENANCE_OUT, ""),
        (0, "Assumed agent name 'ocf:heartbeat:IPaddr2'", ""),
        (1, NO_MAINTENANCE_OUT, ""),
        (0, " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Stopped", ""),
    ]

    # CmdRunner invokes ``module.run_command(cmd, ...)`` with the command list
    # as a positional argument, so the stub must accept positional args too.
    def side_effect(self_, *args, **kwargs):
        return run_command_calls.pop(0)

    mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect)
    with pytest.raises(SystemExit):
        with pytest.MonkeyPatch().context() as mp:
            mp.setattr(
                "ansible.module_utils.basic._ANSIBLE_ARGS",
                json.dumps(
                    {
                        "ANSIBLE_MODULE_ARGS": {
                            "state": "present",
                            "name": "virtual-ip",
                            "resource_type": {"resource_name": "IPaddr2"},
                            "resource_option": ["ip=192.168.2.1"],
                            "wait": 10,
                        }
                    }
                ).encode(),
            )
            mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False)
            pacemaker_resource.main()
    out, _err = capfd.readouterr()
    result = json.loads(out)
    assert result.get("failed") is True
    assert "Timed out" in result["msg"]
    assert "virtual-ip" in result["msg"]

View File

@@ -40,11 +40,26 @@ test_cases:
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300]
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"]
environ: *env-def
rc: 0
out: "Assumed agent name 'ocf:heartbeat:IPaddr2' (deduced from 'IPAddr2')"
err: ""
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 1
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure=corosync
cluster-name=hacluster
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started"
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
@@ -103,11 +118,26 @@ test_cases:
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, create, virtual-ip, ocf:heartbeat:IPaddr2, "ip=[192.168.2.1]", op, start, timeout=1200, op, stop, timeout=1200, op, monitor, timeout=1200, meta, test_meta1=123, meta, test_meta2=456, --group, test_group, clone, test_clone, test_clone_meta1=789, --wait=200]
- command: [/testbin/pcs, resource, create, virtual-ip, ocf:heartbeat:IPaddr2, "ip=[192.168.2.1]", op, start, timeout=1200, op, stop, timeout=1200, op, monitor, timeout=1200, meta, test_meta1=123, meta, test_meta2=456, --group, test_group, clone, test_clone, test_clone_meta1=789]
environ: *env-def
rc: 0
out: "Assumed agent name 'ocf:heartbeat:IPaddr2'"
err: ""
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 1
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure=corosync
cluster-name=hacluster
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started"
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
@@ -142,11 +172,26 @@ test_cases:
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300]
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"]
environ: *env-def
rc: 1
out: ""
err: "Error: 'virtual-ip' already exists\n"
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 1
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure=corosync
cluster-name=hacluster
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
out: " * virtual-ip\t(ocf:heartbeat:IPAddr2):\t Started"
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
@@ -182,11 +227,22 @@ test_cases:
have-watchdog=false
maintenance-mode=true
err: ""
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300]
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"]
environ: *env-def
rc: 1
rc: 0
out: ""
err: "Error: resource 'virtual-ip' is not running on any node"
err: ""
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 0
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure=corosync
cluster-name=hacluster
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
maintenance-mode=true
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
@@ -222,11 +278,22 @@ test_cases:
have-watchdog: false
maintenance-mode: true
err: ""
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]", --wait=300]
- command: [/testbin/pcs, resource, create, virtual-ip, IPaddr2, "ip=[192.168.2.1]"]
environ: *env-def
rc: 1
rc: 0
out: ""
err: "Error: resource 'virtual-ip' is not running on any node"
err: ""
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 0
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure: corosync
cluster-name: hacluster
dc-version: 2.1.9-1.fc41-7188dbf
have-watchdog: false
maintenance-mode: true
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
@@ -523,7 +590,7 @@ test_cases:
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, clone, virtual-ip, --wait=300]
- command: [/testbin/pcs, resource, clone, virtual-ip]
environ: *env-def
rc: 1
out: ""
@@ -558,11 +625,26 @@ test_cases:
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, clone, virtual-ip, test_clone, meta, test_clone_meta1=789, --wait=200]
- command: [/testbin/pcs, resource, clone, virtual-ip, test_clone, meta, test_clone_meta1=789]
environ: *env-def
rc: 0
out: ""
err: ""
- command: [/testbin/pcs, property, config]
environ: *env-def
rc: 1
out: |
Cluster Properties: cib-bootstrap-options
cluster-infrastructure=corosync
cluster-name=hacluster
dc-version=2.1.9-1.fc41-7188dbf
have-watchdog=false
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0
out: " * Clone Set: virtual-ip-clone [virtual-ip]\t(ocf:heartbeat:IPAddr2):\t Started"
err: ""
- command: [/testbin/pcs, resource, status, virtual-ip]
environ: *env-def
rc: 0

View File

@@ -10,8 +10,125 @@
from __future__ import annotations
import json
import pytest
from ansible_collections.community.general.plugins.modules import pacemaker_stonith
from .uthelper import RunCommandMock, UTHelper
UTHelper.from_module(pacemaker_stonith, __name__, mocks=[RunCommandMock])
# ---------------------------------------------------------------------------
# Race condition tests: resource starts after one or more Stopped polls
# ---------------------------------------------------------------------------
@pytest.fixture
def patch_bin(mocker):
    """Make ``AnsibleModule.get_bin_path`` resolve every binary under /testbin."""
    mocker.patch(
        "ansible.module_utils.basic.AnsibleModule.get_bin_path",
        lambda self_, path, *args, **kwargs: f"/testbin/{path}",
    )
@pytest.mark.usefixtures("patch_bin")
def test_present_race_condition_stopped_then_started(mocker, capfd):
    """Resource reports Stopped on the first poll then Started on the second — must succeed."""
    mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep")
    # Sequence of run_command calls:
    # 1. initial _get(): stonith status → not found (rc=1)
    # 2. state_present create → rc=0
    # 3. wait_for_resource poll 1: status → Stopped (not yet running)
    # 4. wait_for_resource poll 2: status → Started
    # 5. __quit_module__ _get(): status → Started
    run_command_calls = [
        (1, "", ""),
        (0, "", ""),
        (0, " * virtual-stonith\t(stonith:fence_virt):\t Stopped", ""),
        (0, " * virtual-stonith\t(stonith:fence_virt):\t Started", ""),
        (0, " * virtual-stonith\t(stonith:fence_virt):\t Started", ""),
    ]

    # CmdRunner invokes ``module.run_command(cmd, ...)`` with the command list
    # as a positional argument, so the stub must accept positional args too.
    def side_effect(self_, *args, **kwargs):
        return run_command_calls.pop(0)

    mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect)
    with pytest.raises(SystemExit):
        with pytest.MonkeyPatch().context() as mp:
            mp.setattr(
                "ansible.module_utils.basic._ANSIBLE_ARGS",
                json.dumps(
                    {
                        "ANSIBLE_MODULE_ARGS": {
                            "state": "present",
                            "name": "virtual-stonith",
                            "stonith_type": "fence_virt",
                            "stonith_options": ["pcmk_host_list=f1"],
                            "wait": 30,
                        }
                    }
                ).encode(),
            )
            mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False)
            pacemaker_stonith.main()
    out, _err = capfd.readouterr()
    result = json.loads(out)
    assert result["changed"] is True
    assert result.get("failed") is not True
    assert "Started" in result["value"]
@pytest.mark.usefixtures("patch_bin")
def test_present_wait_timeout_raises(mocker, capfd):
    """Resource never starts within the wait window — must fail with a timeout message."""
    mocker.patch("ansible_collections.community.general.plugins.module_utils.pacemaker.time.sleep")
    # Simulate time advancing past the deadline immediately on the first poll
    monotonic_values = iter([0.0, 999.0])
    mocker.patch(
        "ansible_collections.community.general.plugins.module_utils.pacemaker.time.monotonic",
        side_effect=lambda: next(monotonic_values),
    )
    # Sequence: initial _get() → not found, create → rc=0, poll → Stopped (deadline exceeded)
    run_command_calls = [
        (1, "", ""),
        (0, "", ""),
        (0, " * virtual-stonith\t(stonith:fence_virt):\t Stopped", ""),
    ]

    # CmdRunner invokes ``module.run_command(cmd, ...)`` with the command list
    # as a positional argument, so the stub must accept positional args too.
    def side_effect(self_, *args, **kwargs):
        return run_command_calls.pop(0)

    mocker.patch("ansible.module_utils.basic.AnsibleModule.run_command", side_effect=side_effect)
    with pytest.raises(SystemExit):
        with pytest.MonkeyPatch().context() as mp:
            mp.setattr(
                "ansible.module_utils.basic._ANSIBLE_ARGS",
                json.dumps(
                    {
                        "ANSIBLE_MODULE_ARGS": {
                            "state": "present",
                            "name": "virtual-stonith",
                            "stonith_type": "fence_virt",
                            "stonith_options": ["pcmk_host_list=f1"],
                            "wait": 10,
                        }
                    }
                ).encode(),
            )
            mp.setattr("ansible.module_utils.basic._ANSIBLE_PROFILE", "legacy", raising=False)
            pacemaker_stonith.main()
    out, _err = capfd.readouterr()
    result = json.loads(out)
    assert result.get("failed") is True
    assert "Timed out" in result["msg"]
    assert "virtual-stonith" in result["msg"]

View File

@@ -33,7 +33,7 @@ test_cases:
rc: 1
out: ""
err: ""
- command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s", "--wait=300"]
- command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s"]
environ: *env-def
rc: 0
out: ""
@@ -43,6 +43,11 @@ test_cases:
rc: 0
out: " * virtual-stonith\t(stonith:fence_virt):\t Started"
err: ""
- command: ["/testbin/pcs", stonith, status, virtual-stonith]
environ: *env-def
rc: 0
out: " * virtual-stonith\t(stonith:fence_virt):\t Started"
err: ""
- id: test_present_minimal_input_stonith_exists
input:
state: present
@@ -65,7 +70,7 @@ test_cases:
rc: 0
out: " * virtual-stonith\t(stonith:fence_virt):\t Started"
err: ""
- command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s", "--wait=300"]
- command: ["/testbin/pcs", stonith, create, virtual-stonith, fence_virt, "pcmk_host_list=f1", "op", "monitor", "interval=30s"]
environ: *env-def
rc: 0
out: ""
@@ -75,6 +80,11 @@ test_cases:
rc: 0
out: " * virtual-stonith\t(stonith:fence_virt):\t Started"
err: ""
- command: ["/testbin/pcs", stonith, status, virtual-stonith]
environ: *env-def
rc: 0
out: " * virtual-stonith\t(stonith:fence_virt):\t Started"
err: ""
- id: test_absent_minimal_input_stonith_not_exists
input:
state: absent