[scheduler] Fix retry race condition on cancellation (#9788)

This commit is contained in:
J. Nick Koston 2025-07-25 08:14:15 -10:00 committed by GitHub
parent 9ac10d7276
commit 88ccde4ba1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 104 additions and 33 deletions

View File

@ -65,7 +65,7 @@ static void validate_static_string(const char *name) {
// Common implementation for both timeout and interval
void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string,
const void *name_ptr, uint32_t delay, std::function<void()> func) {
const void *name_ptr, uint32_t delay, std::function<void()> func, bool is_retry) {
// Get the name as const char*
const char *name_cstr = this->get_name_cstr_(is_static_string, name_ptr);
@ -130,6 +130,18 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
#endif /* ESPHOME_DEBUG_SCHEDULER */
LockGuard guard{this->lock_};
// For retries, check if there's a cancelled timeout first
if (is_retry && name_cstr != nullptr && type == SchedulerItem::TIMEOUT &&
(has_cancelled_timeout_in_container_(this->items_, component, name_cstr) ||
has_cancelled_timeout_in_container_(this->to_add_, component, name_cstr))) {
// Skip scheduling - the retry was cancelled
#ifdef ESPHOME_DEBUG_SCHEDULER
ESP_LOGD(TAG, "Skipping retry '%s' - found cancelled item", name_cstr);
#endif
return;
}
// If name is provided, do atomic cancel-and-add
// Cancel existing items
this->cancel_item_locked_(component, name_cstr, type);
@ -178,12 +190,14 @@ struct RetryArgs {
Scheduler *scheduler;
};
static void retry_handler(const std::shared_ptr<RetryArgs> &args) {
void retry_handler(const std::shared_ptr<RetryArgs> &args) {
RetryResult const retry_result = args->func(--args->retry_countdown);
if (retry_result == RetryResult::DONE || args->retry_countdown <= 0)
return;
// second execution of `func` happens after `initial_wait_time`
args->scheduler->set_timeout(args->component, args->name, args->current_interval, [args]() { retry_handler(args); });
args->scheduler->set_timer_common_(
args->component, Scheduler::SchedulerItem::TIMEOUT, false, &args->name, args->current_interval,
[args]() { retry_handler(args); }, true);
// backoff_increase_factor applied to third & later executions
args->current_interval *= args->backoff_increase_factor;
}

View File

@ -15,8 +15,15 @@
namespace esphome {
class Component;
struct RetryArgs;
// Forward declaration of retry_handler - needs to be non-static for friend declaration
void retry_handler(const std::shared_ptr<RetryArgs> &args);
class Scheduler {
// Allow retry_handler to access protected members
friend void ::esphome::retry_handler(const std::shared_ptr<RetryArgs> &args);
public:
// Public API - accepts std::string for backward compatibility
void set_timeout(Component *component, const std::string &name, uint32_t timeout, std::function<void()> func);
@ -147,7 +154,7 @@ class Scheduler {
// Common implementation for both timeout and interval
void set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string, const void *name_ptr,
uint32_t delay, std::function<void()> func);
uint32_t delay, std::function<void()> func, bool is_retry = false);
uint64_t millis_64_(uint32_t now);
// Cleanup logically deleted items from the scheduler
@ -170,8 +177,8 @@ class Scheduler {
// Helper function to check if item matches criteria for cancellation
inline bool HOT matches_item_(const std::unique_ptr<SchedulerItem> &item, Component *component, const char *name_cstr,
SchedulerItem::Type type) {
if (item->component != component || item->type != type || item->remove) {
SchedulerItem::Type type, bool skip_removed = true) const {
if (item->component != component || item->type != type || (skip_removed && item->remove)) {
return false;
}
const char *item_name = item->get_name();
@ -197,6 +204,18 @@ class Scheduler {
return item->remove || (item->component != nullptr && item->component->is_failed());
}
// Template helper: scans `container` and reports whether it holds a TIMEOUT item
// for `component` / `name_cstr` that is already flagged with `remove`.
// Returns true on the first such item, false if none is found.
template<typename Container>
bool has_cancelled_timeout_in_container_(const Container &container, Component *component,
                                         const char *name_cstr) const {
  for (const auto &entry : container) {
    // Only items already flagged for removal are of interest here
    if (!entry->remove) {
      continue;
    }
    // skip_removed=false so matches_item_ does not reject the entry for being removed
    if (this->matches_item_(entry, component, name_cstr, SchedulerItem::TIMEOUT, false)) {
      return true;
    }
  }
  return false;
}
Mutex lock_;
std::vector<std::unique_ptr<SchedulerItem>> items_;
std::vector<std::unique_ptr<SchedulerItem>> to_add_;

View File

@ -10,7 +10,7 @@ esphome:
host:
api:
logger:
level: VERBOSE
level: VERY_VERBOSE
globals:
- id: simple_retry_counter
@ -19,6 +19,9 @@ globals:
- id: backoff_retry_counter
type: int
initial_value: '0'
- id: backoff_last_attempt_time
type: uint32_t
initial_value: '0'
- id: immediate_done_counter
type: int
initial_value: '0'
@ -35,20 +38,55 @@ globals:
type: int
initial_value: '0'
# Using different component types for each test to ensure isolation
sensor:
- platform: template
name: Test Sensor
id: test_sensor
name: Simple Retry Test Sensor
id: simple_retry_sensor
lambda: return 1.0;
update_interval: never
- platform: template
name: Backoff Retry Test Sensor
id: backoff_retry_sensor
lambda: return 2.0;
update_interval: never
- platform: template
name: Immediate Done Test Sensor
id: immediate_done_sensor
lambda: return 3.0;
update_interval: never
binary_sensor:
- platform: template
name: Cancel Retry Test Binary Sensor
id: cancel_retry_binary_sensor
lambda: return false;
- platform: template
name: Empty Name Test Binary Sensor
id: empty_name_binary_sensor
lambda: return true;
switch:
- platform: template
name: Script Retry Test Switch
id: script_retry_switch
optimistic: true
- platform: template
name: Multiple Same Name Test Switch
id: multiple_same_name_switch
optimistic: true
script:
- id: run_all_tests
then:
# Test 1: Simple retry
- logger.log: "=== Test 1: Simple retry ==="
- lambda: |-
auto *component = id(test_sensor);
auto *component = id(simple_retry_sensor);
App.scheduler.set_retry(component, "simple_retry", 50, 3,
[](uint8_t retry_countdown) {
id(simple_retry_counter)++;
@ -65,19 +103,19 @@ script:
# Test 2: Backoff retry
- logger.log: "=== Test 2: Retry with backoff ==="
- lambda: |-
auto *component = id(test_sensor);
static uint32_t backoff_start_time = 0;
static uint32_t last_attempt_time = 0;
backoff_start_time = millis();
last_attempt_time = backoff_start_time;
auto *component = id(backoff_retry_sensor);
App.scheduler.set_retry(component, "backoff_retry", 50, 4,
[](uint8_t retry_countdown) {
id(backoff_retry_counter)++;
uint32_t now = millis();
uint32_t interval = now - last_attempt_time;
last_attempt_time = now;
uint32_t interval = 0;
// Only calculate interval after first attempt
if (id(backoff_retry_counter) > 1) {
interval = now - id(backoff_last_attempt_time);
}
id(backoff_last_attempt_time) = now;
ESP_LOGI("test", "Backoff retry attempt %d (countdown=%d, interval=%dms)",
id(backoff_retry_counter), retry_countdown, interval);
@ -100,7 +138,7 @@ script:
# Test 3: Immediate done
- logger.log: "=== Test 3: Immediate done ==="
- lambda: |-
auto *component = id(test_sensor);
auto *component = id(immediate_done_sensor);
App.scheduler.set_retry(component, "immediate_done", 50, 5,
[](uint8_t retry_countdown) {
id(immediate_done_counter)++;
@ -111,8 +149,8 @@ script:
# Test 4: Cancel retry
- logger.log: "=== Test 4: Cancel retry ==="
- lambda: |-
auto *component = id(test_sensor);
App.scheduler.set_retry(component, "cancel_test", 25, 10,
auto *component = id(cancel_retry_binary_sensor);
App.scheduler.set_retry(component, "cancel_test", 30, 10,
[](uint8_t retry_countdown) {
id(cancel_retry_counter)++;
ESP_LOGI("test", "Cancel test retry attempt %d", id(cancel_retry_counter));
@ -121,7 +159,7 @@ script:
// Cancel it after 100ms
App.scheduler.set_timeout(component, "cancel_timer", 100, []() {
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "cancel_test");
bool cancelled = App.scheduler.cancel_retry(id(cancel_retry_binary_sensor), "cancel_test");
ESP_LOGI("test", "Retry cancellation result: %s", cancelled ? "true" : "false");
ESP_LOGI("test", "Cancel retry ran %d times before cancellation", id(cancel_retry_counter));
});
@ -129,7 +167,7 @@ script:
# Test 5: Empty name retry
- logger.log: "=== Test 5: Empty name retry ==="
- lambda: |-
auto *component = id(test_sensor);
auto *component = id(empty_name_binary_sensor);
App.scheduler.set_retry(component, "", 100, 5,
[](uint8_t retry_countdown) {
id(empty_name_retry_counter)++;
@ -139,7 +177,7 @@ script:
// Try to cancel after 150ms
App.scheduler.set_timeout(component, "empty_cancel_timer", 150, []() {
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "");
bool cancelled = App.scheduler.cancel_retry(id(empty_name_binary_sensor), "");
ESP_LOGI("test", "Empty name retry cancel result: %s",
cancelled ? "true" : "false");
ESP_LOGI("test", "Empty name retry ran %d times", id(empty_name_retry_counter));
@ -169,7 +207,7 @@ script:
# Test 7: Multiple same name
- logger.log: "=== Test 7: Multiple retries with same name ==="
- lambda: |-
auto *component = id(test_sensor);
auto *component = id(multiple_same_name_switch);
// Set first retry
App.scheduler.set_retry(component, "duplicate_retry", 100, 5,
@ -200,7 +238,7 @@ script:
ESP_LOGI("test", "Simple retry counter: %d (expected 2)", id(simple_retry_counter));
ESP_LOGI("test", "Backoff retry counter: %d (expected 4)", id(backoff_retry_counter));
ESP_LOGI("test", "Immediate done counter: %d (expected 1)", id(immediate_done_counter));
ESP_LOGI("test", "Cancel retry counter: %d (expected ~3-4)", id(cancel_retry_counter));
ESP_LOGI("test", "Cancel retry counter: %d (expected 2-4)", id(cancel_retry_counter));
ESP_LOGI("test", "Empty name retry counter: %d (expected 1-2)", id(empty_name_retry_counter));
ESP_LOGI("test", "Component retry counter: %d (expected 2)", id(script_retry_counter));
ESP_LOGI("test", "Multiple same name counter: %d (expected 20+)", id(multiple_same_name_counter));

View File

@ -148,16 +148,16 @@ async def test_scheduler_retry_test(
f"Expected at least 2 intervals, got {len(backoff_intervals)}"
)
if len(backoff_intervals) >= 3:
# First interval should be ~50ms
assert 30 <= backoff_intervals[0] <= 70, (
# First interval should be ~50ms (very wide tolerance for heavy system load)
assert 20 <= backoff_intervals[0] <= 150, (
f"First interval {backoff_intervals[0]}ms not ~50ms"
)
# Second interval should be ~100ms (50ms * 2.0)
assert 80 <= backoff_intervals[1] <= 120, (
assert 50 <= backoff_intervals[1] <= 250, (
f"Second interval {backoff_intervals[1]}ms not ~100ms"
)
# Third interval should be ~200ms (100ms * 2.0)
assert 180 <= backoff_intervals[2] <= 220, (
assert 100 <= backoff_intervals[2] <= 500, (
f"Third interval {backoff_intervals[2]}ms not ~200ms"
)
@ -175,7 +175,7 @@ async def test_scheduler_retry_test(
# Wait for cancel retry test
try:
await asyncio.wait_for(cancel_retry_done.wait(), timeout=2.0)
await asyncio.wait_for(cancel_retry_done.wait(), timeout=3.0)
except TimeoutError:
pytest.fail(
f"Cancel retry test did not complete. Count: {cancel_retry_count}"
@ -195,8 +195,8 @@ async def test_scheduler_retry_test(
)
# Empty name retry should run at least once before being cancelled
assert 1 <= empty_name_retry_count <= 2, (
f"Expected 1-2 empty name retry attempts, got {empty_name_retry_count}"
assert 1 <= empty_name_retry_count <= 3, (
f"Expected 1-3 empty name retry attempts, got {empty_name_retry_count}"
)
assert empty_cancel_result is True, (
"Empty name retry cancel should have succeeded"