From 1a722f1d62b3ad583cce4f6041704ba7fe532384 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Wed, 22 Apr 2026 15:11:35 +0100 Subject: [PATCH 1/4] Perform a second agent inspection boot to commit BIOS settings change We're seeing obscure errors from Ironic like: failed step {'interface': 'raid', 'step': 'delete_configuration', 'abortable': False, 'priority': 0}: Unable to connect to /redfish/v1/TaskService/Tasks/JID_768614980495. Error: Timeout waiting for task monitor /redfish/v1/TaskService/Tasks/JID_768614980495 (timeout = 500) To clear this up, we are completing each operation with a separate reboot. --- .../tests/test_enroll_server.py | 45 +++++++++++++++++-- .../main/enroll_server.py | 5 ++- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/python/understack-workflows/tests/test_enroll_server.py b/python/understack-workflows/tests/test_enroll_server.py index 8e3f35306..dd8318a38 100644 --- a/python/understack-workflows/tests/test_enroll_server.py +++ b/python/understack-workflows/tests/test_enroll_server.py @@ -202,8 +202,12 @@ def test_enrol_happy_path_uses_virtual_media_inspect_and_flips_back(mocker): ] fake_ironic, created_node = make_ironic_client( node_name="Dell-ABC123", - # OOB inspect, agent inspect, OOB inspect (post-RAID). - inspect_interfaces=["idrac-redfish", "idrac-redfish", "idrac-redfish"], + inspect_interfaces=[ + "idrac-redfish", + "idrac-redfish", + "idrac-redfish", + "idrac-redfish", + ], inventory=inventory, ports=ports, ) @@ -216,7 +220,7 @@ def test_enrol_happy_path_uses_virtual_media_inspect_and_flips_back(mocker): ) bmc_set_hostname = mocker.patch.object(enroll_server, "bmc_set_hostname") update_dell_bios_settings = mocker.patch.object( - enroll_server, "update_dell_bios_settings" + enroll_server, "update_dell_bios_settings", return_value={"changed": True} ) mocker.patch( "understack_workflows.ironic.client.get_ironic_client", @@ -297,6 +301,20 @@ def test_enrol_happy_path_uses_virtual_media_inspect_and_flips_back(mocker): runbook=None, disable_ramdisk=None, ), + call( + created_node.uuid, + "clean", + cleansteps=[{"interface": "management", "step": "clear_job_queue"}], + runbook=None, + disable_ramdisk=True, + ), + call( + created_node.uuid, + "inspect", # second agent inspect to apply BIOS changes + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), call( created_node.uuid, "clean", @@ -334,6 +352,9 @@ def test_enrol_happy_path_uses_virtual_media_inspect_and_flips_back(mocker): call(created_node.uuid, expected_ipxe_boot), call(created_node.uuid, expected_agent), call(created_node.uuid, expected_ipxe_boot), + call(created_node.uuid, expected_ipxe_boot), + call(created_node.uuid, expected_agent), + call(created_node.uuid, expected_ipxe_boot), call(created_node.uuid, expected_reset), # Post-RAID OOB inspect prep ] @@ -371,7 +392,9 @@ def test_enrol_existing_failed_node_recovers_and_updates(mocker): mocker.patch.object(enroll_server, "set_bmc_password") mocker.patch.object(enroll_server, "update_dell_drac_settings") mocker.patch.object(enroll_server, "bmc_set_hostname") - mocker.patch.object(enroll_server, "update_dell_bios_settings") + mocker.patch.object( + enroll_server, "update_dell_bios_settings", return_value={"changed": True} + ) mocker.patch( "understack_workflows.ironic.client.get_ironic_client", return_value=fake_ironic, @@ -410,6 +433,20 @@ def test_enrol_existing_failed_node_recovers_and_updates(mocker): runbook=None, disable_ramdisk=None, ), + call( + existing_node.uuid, + "clean", + cleansteps=[{"interface": "management", "step": "clear_job_queue"}], + runbook=None, + disable_ramdisk=True, + ), + call( + existing_node.uuid, + "inspect", # second agent inspect to apply BIOS changes + cleansteps=None, + runbook=None, + disable_ramdisk=None, + ), call( existing_node.uuid, "provide", diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index c9f34731d..8fffa6e1c 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -121,7 +121,10 @@ def enroll( ) logger.info("[node:%s] Selected PXE interface %s", node.uuid, pxe_interface) - update_dell_bios_settings(bmc, pxe_interface=pxe_interface) + # This sets the boot device to use for all future HTTP boots: + if update_dell_bios_settings(bmc, pxe_interface=pxe_interface): + logger.info("%s performing second inspection write BIOS settings", node.uuid) + agent_inspection(node) if raid_configure: configure_raid(node, bmc) From 2552777f1f33e6946e638c72e069de6c43498aeb Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Wed, 22 Apr 2026 18:02:47 +0100 Subject: [PATCH 2/4] Clear any pending BIOS jobs before continuing with enrolment --- .../understack_workflows/ironic_node.py | 11 +++++++++++ .../understack_workflows/main/enroll_server.py | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/understack-workflows/understack_workflows/ironic_node.py b/python/understack-workflows/understack_workflows/ironic_node.py index 95821e3f4..9de5138a2 100644 --- a/python/understack-workflows/understack_workflows/ironic_node.py +++ b/python/understack-workflows/understack_workflows/ironic_node.py @@ -162,6 +162,17 @@ def create_ironic_node( return client.create_node(node_data) +def clear_pending_idrac_jobs(node: Node): + logger.info("%s performing clear_job_queue clean step", node.uuid) + transition( + node, + target_state="clean", + expected_state="manageable", + clean_steps=[{"interface": "management", "step": "clear_job_queue"}], + disable_ramdisk=True, + ) + + def _driver_for(manufacturer: str) -> tuple[str, str]: """Answer the (driver, inspect_interface) for this server.""" if manufacturer.startswith("Dell"): diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index 8fffa6e1c..b9a4b9bdb 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -109,7 +109,6 @@ def enroll( # Therefore, we only use virtual media during our "enroll" phase, when the # port data is set up in a manner that suits the Neutron algorithm. If a # normal PXE/HTTP port is available then we use it instead: - virtual_media = not bool(ironic_node.pxe_enabled_bios_name(node)) agent_inspection(node, virtual_media=virtual_media) @@ -121,6 +120,9 @@ def enroll( ) logger.info("[node:%s] Selected PXE interface %s", node.uuid, pxe_interface) + # Clear the job queue - stale jobs can conflict with the ones we create: + ironic_node.clear_pending_idrac_jobs(node) + # This sets the boot device to use for all future HTTP boots: if update_dell_bios_settings(bmc, pxe_interface=pxe_interface): logger.info("%s performing second inspection write BIOS settings", node.uuid) From 3e1c22f945640f56d47b3b0a57787dbfe7c1d926 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Thu, 23 Apr 2026 16:31:04 +0100 Subject: [PATCH 3/4] Disable any old PXE devices that might be lying around --- python/understack-workflows/understack_workflows/bmc_bios.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/understack-workflows/understack_workflows/bmc_bios.py b/python/understack-workflows/understack_workflows/bmc_bios.py index c6c20b48a..66dca8ecf 100644 --- a/python/understack-workflows/understack_workflows/bmc_bios.py +++ b/python/understack-workflows/understack_workflows/bmc_bios.py @@ -24,6 +24,9 @@ def required_bios_settings(pxe_interface: str) -> dict[str, str]: "IPMILan.1.Enable": "Disabled", # PXE is enabled by default on DELL, but we don't use it: "PxeDev1EnDis": "Disabled", + "PxeDev2EnDis": "Disabled", + "PxeDev3EnDis": "Disabled", + "PxeDev4EnDis": "Disabled", # Configure exactly one HTTP port for booting: "HttpDev1Interface": pxe_interface, "HttpDev1EnDis": "Enabled", From e533231321de380a7f9e55edc6729bb48ca9ca19 Mon Sep 17 00:00:00 2001 From: Steve Keay Date: Fri, 24 Apr 2026 07:54:47 +0100 Subject: [PATCH 4/4] Clear idrac jobs earlier in enroll and optionally reset drac completely --- .../tests/test_enroll_server.py | 28 +++++++++---------- .../understack_workflows/ironic_node.py | 11 ++++++++ .../main/enroll_server.py | 18 ++++++++++-- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/python/understack-workflows/tests/test_enroll_server.py b/python/understack-workflows/tests/test_enroll_server.py index dd8318a38..24b3e0084 100644 --- a/python/understack-workflows/tests/test_enroll_server.py +++ b/python/understack-workflows/tests/test_enroll_server.py @@ -289,24 +289,24 @@ def test_enrol_happy_path_uses_virtual_media_inspect_and_flips_back(mocker): ), call( created_node.uuid, - "inspect", # OOB redfish inspect for bios_name / basic info - cleansteps=None, + "clean", + cleansteps=[{"interface": "management", "step": "clear_job_queue"}], runbook=None, - disable_ramdisk=None, + disable_ramdisk=True, ), call( created_node.uuid, - "inspect", # agent inspect via virtual media + "inspect", # OOB redfish inspect for bios_name / basic info cleansteps=None, runbook=None, disable_ramdisk=None, ), call( created_node.uuid, - "clean", - cleansteps=[{"interface": "management", "step": "clear_job_queue"}], + "inspect", # agent inspect + cleansteps=None, runbook=None, - disable_ramdisk=True, + disable_ramdisk=None, ), call( created_node.uuid, @@ -421,24 +421,24 @@ def test_enrol_existing_failed_node_recovers_and_updates(mocker): ), call( existing_node.uuid, - "inspect", # OOB inspect - cleansteps=None, + "clean", + cleansteps=[{"interface": "management", "step": "clear_job_queue"}], runbook=None, - disable_ramdisk=None, + disable_ramdisk=True, ), call( existing_node.uuid, - "inspect", # Agent inspect via virtual media + "inspect", # OOB inspect cleansteps=None, runbook=None, disable_ramdisk=None, ), call( existing_node.uuid, - "clean", - cleansteps=[{"interface": "management", "step": "clear_job_queue"}], + "inspect", # Agent inspect + cleansteps=None, runbook=None, - disable_ramdisk=True, + disable_ramdisk=None, ), call( existing_node.uuid, diff --git a/python/understack-workflows/understack_workflows/ironic_node.py b/python/understack-workflows/understack_workflows/ironic_node.py index 9de5138a2..b8ce8b230 100644 --- a/python/understack-workflows/understack_workflows/ironic_node.py +++ b/python/understack-workflows/understack_workflows/ironic_node.py @@ -173,6 +173,17 @@ def clear_pending_idrac_jobs(node: Node): ) +def reset_idrac_to_known_good_state(node: Node): + logger.info("%s performing known_good_state clean step", node.uuid) + transition( + node, + target_state="clean", + expected_state="manageable", + clean_steps=[{"interface": "management", "step": "known_good_state"}], + disable_ramdisk=True, + ) + + def _driver_for(manufacturer: str) -> tuple[str, str]: """Answer the (driver, inspect_interface) for this server.""" if manufacturer.startswith("Dell"): diff --git a/python/understack-workflows/understack_workflows/main/enroll_server.py b/python/understack-workflows/understack_workflows/main/enroll_server.py index b9a4b9bdb..5efc9616f 100644 --- a/python/understack-workflows/understack_workflows/main/enroll_server.py +++ b/python/understack-workflows/understack_workflows/main/enroll_server.py @@ -65,6 +65,7 @@ def main() -> None: firmware_update=args.firmware_update, raid_configure=args.raid_configure, external_cmdb_id=args.external_cmdb_id, + reset_idrac=args.reset_idrac, ) @@ -74,6 +75,7 @@ def enroll( raid_configure: bool, old_password: str | None, external_cmdb_id: str | None = None, + reset_idrac: bool = False, ) -> None: logger.info("Starting enroll workflow for bmc_ip_address=%s", ip_address) @@ -90,6 +92,13 @@ def enroll( external_cmdb_id=external_cmdb_id, ) + # Clear stale iDRAC jobs before virtual-media inspection, or optionally + # reset the controller to a broader known-good state. + if reset_idrac: + ironic_node.reset_idrac_to_known_good_state(node) + else: + ironic_node.clear_pending_idrac_jobs(node) + # Out-of-band redfish inspection populates data including baremetal ports. # # Our hooks augment the ironic baremetal port with the BMC-reported @@ -120,9 +129,6 @@ def enroll( ) logger.info("[node:%s] Selected PXE interface %s", node.uuid, pxe_interface) - # Clear the job queue - stale jobs can conflict with the ones we create: - ironic_node.clear_pending_idrac_jobs(node) - # This sets the boot device to use for all future HTTP boots: if update_dell_bios_settings(bmc, pxe_interface=pxe_interface): logger.info("%s performing second inspection write BIOS settings", node.uuid) @@ -278,6 +284,12 @@ def argument_parser(): default=True, help="Configure RAID before inspection", ) + parser.add_argument( + "--reset-idrac", + type=parse_bool, + default=False, + help="Reset iDRAC to known_good_state instead of clear_job_queue", + ) parser.add_argument( "--external-cmdb-id", required=False,