From 2bffc8bbd9035ce9209e698baebaf82da362e690 Mon Sep 17 00:00:00 2001 From: falfab Date: Wed, 29 Apr 2026 15:41:14 +0200 Subject: [PATCH] fix(RosActionNode): swallow UnknownGoalHandleError in cancelGoal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the action server completes the cancel handshake before the BT thread reaches async_get_result / async_cancel_goal, the rclcpp_action client's result callback has already erased the goal handle from its internal registry. Both calls then throw UnknownGoalHandleError, which propagates out of halt() and causes TreeExecutionServer to abort() the outer goal instead of canceling it. Catch only UnknownGoalHandleError so other rclcpp_action errors (network, invalid handle, etc.) still propagate as real failures. The pre-accept path (goal_handle_ not yet populated) is untouched — it continues to hit the existing "cancelGoal called on an empty goal_handle" branch. Adds a gtest regression under behaviortree_ros2/test/ that reproduces the race by synchronizing the server's canceled() call with receipt of the cancel request. Without the fix the test terminates with "Goal handle is not known to this client." Closes #18 --- behaviortree_ros2/CMakeLists.txt | 9 + .../behaviortree_ros2/bt_action_node.hpp | 37 ++- behaviortree_ros2/package.xml | 2 + .../test/test_cancel_goal_race.cpp | 221 ++++++++++++++++++ 4 files changed, 258 insertions(+), 11 deletions(-) create mode 100644 behaviortree_ros2/test/test_cancel_goal_race.cpp diff --git a/behaviortree_ros2/CMakeLists.txt b/behaviortree_ros2/CMakeLists.txt index 314edd0..c2e8586 100644 --- a/behaviortree_ros2/CMakeLists.txt +++ b/behaviortree_ros2/CMakeLists.txt @@ -62,4 +62,13 @@ install( ament_export_include_directories(include) ament_export_libraries(${PROJECT_NAME}) +if(BUILD_TESTING) + find_package(ament_cmake_gtest REQUIRED) + + ament_add_gtest(test_cancel_goal_race + test/test_cancel_goal_race.cpp) + target_link_libraries(test_cancel_goal_race + ${PROJECT_NAME}) +endif() + ament_package() diff --git a/behaviortree_ros2/include/behaviortree_ros2/bt_action_node.hpp b/behaviortree_ros2/include/behaviortree_ros2/bt_action_node.hpp index 11bfc96..dad092e 100644 --- a/behaviortree_ros2/include/behaviortree_ros2/bt_action_node.hpp +++ b/behaviortree_ros2/include/behaviortree_ros2/bt_action_node.hpp @@ -23,6 +23,7 @@ #include "behaviortree_cpp/action_node.h" #include "behaviortree_cpp/bt_factory.h" #include "rclcpp_action/rclcpp_action.hpp" +#include "rclcpp_action/exceptions.hpp" #include "behaviortree_ros2/ros_node_params.hpp" @@ -558,21 +559,35 @@ inline void RosActionNode::cancelGoal() auto& action_client = client_instance_->action_client; - auto future_result = action_client->async_get_result(goal_handle_); - auto future_cancel = action_client->async_cancel_goal(goal_handle_); + try + { + auto future_result = action_client->async_get_result(goal_handle_); + auto future_cancel = action_client->async_cancel_goal(goal_handle_); - constexpr auto SUCCESS = rclcpp::FutureReturnCode::SUCCESS; + constexpr auto SUCCESS = rclcpp::FutureReturnCode::SUCCESS; - if(executor.spin_until_future_complete(future_cancel, server_timeout_) != SUCCESS) - { - RCLCPP_ERROR(logger(), "Failed to cancel action server for [%s]", - action_name_.c_str()); - } + if(executor.spin_until_future_complete(future_cancel, server_timeout_) != SUCCESS) + { + RCLCPP_ERROR(logger(), "Failed to cancel action server for [%s]", + action_name_.c_str()); + } - if(executor.spin_until_future_complete(future_result, server_timeout_) != SUCCESS) + if(executor.spin_until_future_complete(future_result, server_timeout_) != SUCCESS) + { + RCLCPP_ERROR(logger(), "Failed to get result call failed :( for [%s]", + action_name_.c_str()); + } + } + catch(const rclcpp_action::exceptions::UnknownGoalHandleError& e) { - RCLCPP_ERROR(logger(), "Failed to get result call failed :( for [%s]", - action_name_.c_str()); + // The action server's terminal result arrived and the rclcpp_action + // client's result callback has already erased the goal from its + // internal registry before we reached async_get_result / + // async_cancel_goal. This is a benign race: the goal has already + // reached a terminal state, which is exactly what cancelGoal is + // trying to achieve. + RCLCPP_DEBUG(logger(), "Goal already terminal at cancel time for [%s]: %s", + action_name_.c_str(), e.what()); } } diff --git a/behaviortree_ros2/package.xml b/behaviortree_ros2/package.xml index a7e6bd4..a6d7018 100644 --- a/behaviortree_ros2/package.xml +++ b/behaviortree_ros2/package.xml @@ -20,6 +20,8 @@ generate_parameter_library btcpp_ros2_interfaces + ament_cmake_gtest + ament_cmake diff --git a/behaviortree_ros2/test/test_cancel_goal_race.cpp b/behaviortree_ros2/test/test_cancel_goal_race.cpp new file mode 100644 index 0000000..bcee685 --- /dev/null +++ b/behaviortree_ros2/test/test_cancel_goal_race.cpp @@ -0,0 +1,221 @@ +// Copyright (c) 2026 Davide Faconti +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Regression test for the fast-cancel race between +// `RosActionNode::cancelGoal()` and the `rclcpp_action` client's result +// callback. +// +// Background: `cancelGoal()` calls `async_get_result()` and +// `async_cancel_goal()` on the action client. If the server produces a +// terminal result (including one accepted via cancel) before those calls +// run on the BT thread, the client's result-callback has already erased +// the goal from its internal registry and both calls throw +// `rclcpp_action::exceptions::UnknownGoalHandleError`. The exception used +// to propagate out of `halt()` and cause `TreeExecutionServer` to abort +// the outer goal instead of canceling it. + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "behaviortree_ros2/bt_action_node.hpp" +#include "btcpp_ros2_interfaces/action/sleep.hpp" + +using namespace std::chrono_literals; +using Sleep = btcpp_ros2_interfaces::action::Sleep; + +namespace +{ + +class SleepLeaf : public BT::RosActionNode +{ +public: + SleepLeaf(const std::string& name, const BT::NodeConfig& conf, + const BT::RosNodeParams& params) + : BT::RosActionNode(name, conf, params) + {} + + bool setGoal(Goal& goal) override + { + goal.msec_timeout = 10000; // deliberately long: server must not finish normally + return true; + } + + BT::NodeStatus onResultReceived(const WrappedResult& /*wr*/) override + { + return BT::NodeStatus::SUCCESS; + } + + BT::NodeStatus onFeedback(const std::shared_ptr /*fb*/) override + { + return BT::NodeStatus::RUNNING; + } + + BT::NodeStatus onFailure(BT::ActionNodeErrorCode /*err*/) override + { + return BT::NodeStatus::FAILURE; + } +}; + +class FastCancelRaceTest : public ::testing::Test +{ +protected: + static void SetUpTestSuite() + { + if(!rclcpp::ok()) + { + rclcpp::init(0, nullptr); + } + } + static void TearDownTestSuite() + { + if(rclcpp::ok()) + { + rclcpp::shutdown(); + } + } + + void SetUp() override + { + bt_node_ = std::make_shared("bt_cancel_race_test"); + server_node_ = std::make_shared("server_cancel_race_test"); + + using GoalHandle = rclcpp_action::ServerGoalHandle; + + action_server_ = rclcpp_action::create_server( + server_node_, "/sleep", + [this](const rclcpp_action::GoalUUID&, std::shared_ptr) { + goal_received_ = true; + return rclcpp_action::GoalResponse::ACCEPT_AND_EXECUTE; + }, + [this](const std::shared_ptr) { + cancel_received_ = true; + return rclcpp_action::CancelResponse::ACCEPT; + }, + [this](const std::shared_ptr goal_handle) { + // Detach a thread that waits up to ~50 ms for a cancel request, + // then completes the goal synchronously once it sees one. This + // is the shape of the race: the server's `canceled()` call on + // the client side fires the result callback, which erases the + // goal from `goal_handles_` on the rclcpp_action client + // registry. If `halt()` -> `cancelGoal()` reaches + // `async_get_result` / `async_cancel_goal` after that erase, + // both calls used to throw `UnknownGoalHandleError`. + std::thread([goal_handle, this]() { + for(int i = 0; i < 50 && !cancel_received_.load(); ++i) + { + std::this_thread::sleep_for(5ms); + } + auto result = std::make_shared(); + result->done = false; + if(cancel_received_.load()) + { + goal_handle->canceled(result); + } + else + { + result->done = true; + goal_handle->succeed(result); + } + }).detach(); + }); + + exec_ = std::make_unique( + rclcpp::ExecutorOptions(), 2, false, std::chrono::milliseconds(50)); + exec_->add_node(bt_node_); + exec_->add_node(server_node_); + spinner_ = std::thread([this]() { exec_->spin(); }); + } + + void TearDown() override + { + if(exec_) + { + exec_->cancel(); + } + if(spinner_.joinable()) + { + spinner_.join(); + } + exec_.reset(); + } + + BT::BehaviorTreeFactory makeFactory() + { + BT::RosNodeParams params; + params.nh = bt_node_; + params.server_timeout = 1000ms; + params.wait_for_server_timeout = 2000ms; + params.default_port_value = "/sleep"; + + BT::BehaviorTreeFactory factory; + factory.registerNodeType("SleepLeaf", params); + return factory; + } + + std::shared_ptr bt_node_; + std::shared_ptr server_node_; + rclcpp_action::Server::SharedPtr action_server_; + std::unique_ptr exec_; + std::thread spinner_; + + std::atomic goal_received_{ false }; + std::atomic cancel_received_{ false }; +}; + +} // namespace + +// Regression test for the UnknownGoalHandleError fast-cancel race. +// `tree.haltTree()` must not throw when the server completes the cancel +// handshake synchronously from its own thread while the BT thread is +// still reaching the `cancelGoal()` body. +TEST_F(FastCancelRaceTest, HaltDoesNotThrowOnFastCancel) +{ + auto factory = makeFactory(); + auto tree = factory.createTreeFromText(R"( + + + +)"); + + // Tick until the server has accepted the goal. This ensures + // `goal_handle_` is populated inside the BT leaf before we halt. + tree.tickOnce(); + for(int i = 0; i < 100 && !goal_received_.load(); ++i) + { + std::this_thread::sleep_for(5ms); + tree.tickOnce(); + } + ASSERT_TRUE(goal_received_.load()) << "Server never received the goal; precondition " + "for the race " + "reproducer was not met."; + + // The halt must not leak `UnknownGoalHandleError` (or any other + // exception). Before the fix, this throws with + // "Goal handle is not known to this client.". + EXPECT_NO_THROW(tree.haltTree()); + + // The server must have actually received the cancel request — otherwise + // we'd be testing the non-racing path. + EXPECT_TRUE(cancel_received_.load()); +}