mirror of
https://github.com/esphome/esphome.git
synced 2025-07-29 06:36:45 +00:00
[scheduler] Fix retry race condition on cancellation (#9788)
This commit is contained in:
parent
9ac10d7276
commit
88ccde4ba1
@ -65,7 +65,7 @@ static void validate_static_string(const char *name) {
|
|||||||
|
|
||||||
// Common implementation for both timeout and interval
|
// Common implementation for both timeout and interval
|
||||||
void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string,
|
void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string,
|
||||||
const void *name_ptr, uint32_t delay, std::function<void()> func) {
|
const void *name_ptr, uint32_t delay, std::function<void()> func, bool is_retry) {
|
||||||
// Get the name as const char*
|
// Get the name as const char*
|
||||||
const char *name_cstr = this->get_name_cstr_(is_static_string, name_ptr);
|
const char *name_cstr = this->get_name_cstr_(is_static_string, name_ptr);
|
||||||
|
|
||||||
@ -130,6 +130,18 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
|
|||||||
#endif /* ESPHOME_DEBUG_SCHEDULER */
|
#endif /* ESPHOME_DEBUG_SCHEDULER */
|
||||||
|
|
||||||
LockGuard guard{this->lock_};
|
LockGuard guard{this->lock_};
|
||||||
|
|
||||||
|
// For retries, check if there's a cancelled timeout first
|
||||||
|
if (is_retry && name_cstr != nullptr && type == SchedulerItem::TIMEOUT &&
|
||||||
|
(has_cancelled_timeout_in_container_(this->items_, component, name_cstr) ||
|
||||||
|
has_cancelled_timeout_in_container_(this->to_add_, component, name_cstr))) {
|
||||||
|
// Skip scheduling - the retry was cancelled
|
||||||
|
#ifdef ESPHOME_DEBUG_SCHEDULER
|
||||||
|
ESP_LOGD(TAG, "Skipping retry '%s' - found cancelled item", name_cstr);
|
||||||
|
#endif
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// If name is provided, do atomic cancel-and-add
|
// If name is provided, do atomic cancel-and-add
|
||||||
// Cancel existing items
|
// Cancel existing items
|
||||||
this->cancel_item_locked_(component, name_cstr, type);
|
this->cancel_item_locked_(component, name_cstr, type);
|
||||||
@ -178,12 +190,14 @@ struct RetryArgs {
|
|||||||
Scheduler *scheduler;
|
Scheduler *scheduler;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void retry_handler(const std::shared_ptr<RetryArgs> &args) {
|
void retry_handler(const std::shared_ptr<RetryArgs> &args) {
|
||||||
RetryResult const retry_result = args->func(--args->retry_countdown);
|
RetryResult const retry_result = args->func(--args->retry_countdown);
|
||||||
if (retry_result == RetryResult::DONE || args->retry_countdown <= 0)
|
if (retry_result == RetryResult::DONE || args->retry_countdown <= 0)
|
||||||
return;
|
return;
|
||||||
// second execution of `func` happens after `initial_wait_time`
|
// second execution of `func` happens after `initial_wait_time`
|
||||||
args->scheduler->set_timeout(args->component, args->name, args->current_interval, [args]() { retry_handler(args); });
|
args->scheduler->set_timer_common_(
|
||||||
|
args->component, Scheduler::SchedulerItem::TIMEOUT, false, &args->name, args->current_interval,
|
||||||
|
[args]() { retry_handler(args); }, true);
|
||||||
// backoff_increase_factor applied to third & later executions
|
// backoff_increase_factor applied to third & later executions
|
||||||
args->current_interval *= args->backoff_increase_factor;
|
args->current_interval *= args->backoff_increase_factor;
|
||||||
}
|
}
|
||||||
|
@ -15,8 +15,15 @@
|
|||||||
namespace esphome {
|
namespace esphome {
|
||||||
|
|
||||||
class Component;
|
class Component;
|
||||||
|
struct RetryArgs;
|
||||||
|
|
||||||
|
// Forward declaration of retry_handler - needs to be non-static for friend declaration
|
||||||
|
void retry_handler(const std::shared_ptr<RetryArgs> &args);
|
||||||
|
|
||||||
class Scheduler {
|
class Scheduler {
|
||||||
|
// Allow retry_handler to access protected members
|
||||||
|
friend void ::esphome::retry_handler(const std::shared_ptr<RetryArgs> &args);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Public API - accepts std::string for backward compatibility
|
// Public API - accepts std::string for backward compatibility
|
||||||
void set_timeout(Component *component, const std::string &name, uint32_t timeout, std::function<void()> func);
|
void set_timeout(Component *component, const std::string &name, uint32_t timeout, std::function<void()> func);
|
||||||
@ -147,7 +154,7 @@ class Scheduler {
|
|||||||
|
|
||||||
// Common implementation for both timeout and interval
|
// Common implementation for both timeout and interval
|
||||||
void set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string, const void *name_ptr,
|
void set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string, const void *name_ptr,
|
||||||
uint32_t delay, std::function<void()> func);
|
uint32_t delay, std::function<void()> func, bool is_retry = false);
|
||||||
|
|
||||||
uint64_t millis_64_(uint32_t now);
|
uint64_t millis_64_(uint32_t now);
|
||||||
// Cleanup logically deleted items from the scheduler
|
// Cleanup logically deleted items from the scheduler
|
||||||
@ -170,8 +177,8 @@ class Scheduler {
|
|||||||
|
|
||||||
// Helper function to check if item matches criteria for cancellation
|
// Helper function to check if item matches criteria for cancellation
|
||||||
inline bool HOT matches_item_(const std::unique_ptr<SchedulerItem> &item, Component *component, const char *name_cstr,
|
inline bool HOT matches_item_(const std::unique_ptr<SchedulerItem> &item, Component *component, const char *name_cstr,
|
||||||
SchedulerItem::Type type) {
|
SchedulerItem::Type type, bool skip_removed = true) const {
|
||||||
if (item->component != component || item->type != type || item->remove) {
|
if (item->component != component || item->type != type || (skip_removed && item->remove)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const char *item_name = item->get_name();
|
const char *item_name = item->get_name();
|
||||||
@ -197,6 +204,18 @@ class Scheduler {
|
|||||||
return item->remove || (item->component != nullptr && item->component->is_failed());
|
return item->remove || (item->component != nullptr && item->component->is_failed());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Template helper to check if any item in a container matches our criteria
|
||||||
|
template<typename Container>
|
||||||
|
bool has_cancelled_timeout_in_container_(const Container &container, Component *component,
|
||||||
|
const char *name_cstr) const {
|
||||||
|
for (const auto &item : container) {
|
||||||
|
if (item->remove && this->matches_item_(item, component, name_cstr, SchedulerItem::TIMEOUT, false)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
Mutex lock_;
|
Mutex lock_;
|
||||||
std::vector<std::unique_ptr<SchedulerItem>> items_;
|
std::vector<std::unique_ptr<SchedulerItem>> items_;
|
||||||
std::vector<std::unique_ptr<SchedulerItem>> to_add_;
|
std::vector<std::unique_ptr<SchedulerItem>> to_add_;
|
||||||
|
@ -10,7 +10,7 @@ esphome:
|
|||||||
host:
|
host:
|
||||||
api:
|
api:
|
||||||
logger:
|
logger:
|
||||||
level: VERBOSE
|
level: VERY_VERBOSE
|
||||||
|
|
||||||
globals:
|
globals:
|
||||||
- id: simple_retry_counter
|
- id: simple_retry_counter
|
||||||
@ -19,6 +19,9 @@ globals:
|
|||||||
- id: backoff_retry_counter
|
- id: backoff_retry_counter
|
||||||
type: int
|
type: int
|
||||||
initial_value: '0'
|
initial_value: '0'
|
||||||
|
- id: backoff_last_attempt_time
|
||||||
|
type: uint32_t
|
||||||
|
initial_value: '0'
|
||||||
- id: immediate_done_counter
|
- id: immediate_done_counter
|
||||||
type: int
|
type: int
|
||||||
initial_value: '0'
|
initial_value: '0'
|
||||||
@ -35,20 +38,55 @@ globals:
|
|||||||
type: int
|
type: int
|
||||||
initial_value: '0'
|
initial_value: '0'
|
||||||
|
|
||||||
|
# Using different component types for each test to ensure isolation
|
||||||
sensor:
|
sensor:
|
||||||
- platform: template
|
- platform: template
|
||||||
name: Test Sensor
|
name: Simple Retry Test Sensor
|
||||||
id: test_sensor
|
id: simple_retry_sensor
|
||||||
lambda: return 1.0;
|
lambda: return 1.0;
|
||||||
update_interval: never
|
update_interval: never
|
||||||
|
|
||||||
|
- platform: template
|
||||||
|
name: Backoff Retry Test Sensor
|
||||||
|
id: backoff_retry_sensor
|
||||||
|
lambda: return 2.0;
|
||||||
|
update_interval: never
|
||||||
|
|
||||||
|
- platform: template
|
||||||
|
name: Immediate Done Test Sensor
|
||||||
|
id: immediate_done_sensor
|
||||||
|
lambda: return 3.0;
|
||||||
|
update_interval: never
|
||||||
|
|
||||||
|
binary_sensor:
|
||||||
|
- platform: template
|
||||||
|
name: Cancel Retry Test Binary Sensor
|
||||||
|
id: cancel_retry_binary_sensor
|
||||||
|
lambda: return false;
|
||||||
|
|
||||||
|
- platform: template
|
||||||
|
name: Empty Name Test Binary Sensor
|
||||||
|
id: empty_name_binary_sensor
|
||||||
|
lambda: return true;
|
||||||
|
|
||||||
|
switch:
|
||||||
|
- platform: template
|
||||||
|
name: Script Retry Test Switch
|
||||||
|
id: script_retry_switch
|
||||||
|
optimistic: true
|
||||||
|
|
||||||
|
- platform: template
|
||||||
|
name: Multiple Same Name Test Switch
|
||||||
|
id: multiple_same_name_switch
|
||||||
|
optimistic: true
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- id: run_all_tests
|
- id: run_all_tests
|
||||||
then:
|
then:
|
||||||
# Test 1: Simple retry
|
# Test 1: Simple retry
|
||||||
- logger.log: "=== Test 1: Simple retry ==="
|
- logger.log: "=== Test 1: Simple retry ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(simple_retry_sensor);
|
||||||
App.scheduler.set_retry(component, "simple_retry", 50, 3,
|
App.scheduler.set_retry(component, "simple_retry", 50, 3,
|
||||||
[](uint8_t retry_countdown) {
|
[](uint8_t retry_countdown) {
|
||||||
id(simple_retry_counter)++;
|
id(simple_retry_counter)++;
|
||||||
@ -65,19 +103,19 @@ script:
|
|||||||
# Test 2: Backoff retry
|
# Test 2: Backoff retry
|
||||||
- logger.log: "=== Test 2: Retry with backoff ==="
|
- logger.log: "=== Test 2: Retry with backoff ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(backoff_retry_sensor);
|
||||||
static uint32_t backoff_start_time = 0;
|
|
||||||
static uint32_t last_attempt_time = 0;
|
|
||||||
|
|
||||||
backoff_start_time = millis();
|
|
||||||
last_attempt_time = backoff_start_time;
|
|
||||||
|
|
||||||
App.scheduler.set_retry(component, "backoff_retry", 50, 4,
|
App.scheduler.set_retry(component, "backoff_retry", 50, 4,
|
||||||
[](uint8_t retry_countdown) {
|
[](uint8_t retry_countdown) {
|
||||||
id(backoff_retry_counter)++;
|
id(backoff_retry_counter)++;
|
||||||
uint32_t now = millis();
|
uint32_t now = millis();
|
||||||
uint32_t interval = now - last_attempt_time;
|
uint32_t interval = 0;
|
||||||
last_attempt_time = now;
|
|
||||||
|
// Only calculate interval after first attempt
|
||||||
|
if (id(backoff_retry_counter) > 1) {
|
||||||
|
interval = now - id(backoff_last_attempt_time);
|
||||||
|
}
|
||||||
|
id(backoff_last_attempt_time) = now;
|
||||||
|
|
||||||
ESP_LOGI("test", "Backoff retry attempt %d (countdown=%d, interval=%dms)",
|
ESP_LOGI("test", "Backoff retry attempt %d (countdown=%d, interval=%dms)",
|
||||||
id(backoff_retry_counter), retry_countdown, interval);
|
id(backoff_retry_counter), retry_countdown, interval);
|
||||||
@ -100,7 +138,7 @@ script:
|
|||||||
# Test 3: Immediate done
|
# Test 3: Immediate done
|
||||||
- logger.log: "=== Test 3: Immediate done ==="
|
- logger.log: "=== Test 3: Immediate done ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(immediate_done_sensor);
|
||||||
App.scheduler.set_retry(component, "immediate_done", 50, 5,
|
App.scheduler.set_retry(component, "immediate_done", 50, 5,
|
||||||
[](uint8_t retry_countdown) {
|
[](uint8_t retry_countdown) {
|
||||||
id(immediate_done_counter)++;
|
id(immediate_done_counter)++;
|
||||||
@ -111,8 +149,8 @@ script:
|
|||||||
# Test 4: Cancel retry
|
# Test 4: Cancel retry
|
||||||
- logger.log: "=== Test 4: Cancel retry ==="
|
- logger.log: "=== Test 4: Cancel retry ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(cancel_retry_binary_sensor);
|
||||||
App.scheduler.set_retry(component, "cancel_test", 25, 10,
|
App.scheduler.set_retry(component, "cancel_test", 30, 10,
|
||||||
[](uint8_t retry_countdown) {
|
[](uint8_t retry_countdown) {
|
||||||
id(cancel_retry_counter)++;
|
id(cancel_retry_counter)++;
|
||||||
ESP_LOGI("test", "Cancel test retry attempt %d", id(cancel_retry_counter));
|
ESP_LOGI("test", "Cancel test retry attempt %d", id(cancel_retry_counter));
|
||||||
@ -121,7 +159,7 @@ script:
|
|||||||
|
|
||||||
// Cancel it after 100ms
|
// Cancel it after 100ms
|
||||||
App.scheduler.set_timeout(component, "cancel_timer", 100, []() {
|
App.scheduler.set_timeout(component, "cancel_timer", 100, []() {
|
||||||
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "cancel_test");
|
bool cancelled = App.scheduler.cancel_retry(id(cancel_retry_binary_sensor), "cancel_test");
|
||||||
ESP_LOGI("test", "Retry cancellation result: %s", cancelled ? "true" : "false");
|
ESP_LOGI("test", "Retry cancellation result: %s", cancelled ? "true" : "false");
|
||||||
ESP_LOGI("test", "Cancel retry ran %d times before cancellation", id(cancel_retry_counter));
|
ESP_LOGI("test", "Cancel retry ran %d times before cancellation", id(cancel_retry_counter));
|
||||||
});
|
});
|
||||||
@ -129,7 +167,7 @@ script:
|
|||||||
# Test 5: Empty name retry
|
# Test 5: Empty name retry
|
||||||
- logger.log: "=== Test 5: Empty name retry ==="
|
- logger.log: "=== Test 5: Empty name retry ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(empty_name_binary_sensor);
|
||||||
App.scheduler.set_retry(component, "", 100, 5,
|
App.scheduler.set_retry(component, "", 100, 5,
|
||||||
[](uint8_t retry_countdown) {
|
[](uint8_t retry_countdown) {
|
||||||
id(empty_name_retry_counter)++;
|
id(empty_name_retry_counter)++;
|
||||||
@ -139,7 +177,7 @@ script:
|
|||||||
|
|
||||||
// Try to cancel after 150ms
|
// Try to cancel after 150ms
|
||||||
App.scheduler.set_timeout(component, "empty_cancel_timer", 150, []() {
|
App.scheduler.set_timeout(component, "empty_cancel_timer", 150, []() {
|
||||||
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "");
|
bool cancelled = App.scheduler.cancel_retry(id(empty_name_binary_sensor), "");
|
||||||
ESP_LOGI("test", "Empty name retry cancel result: %s",
|
ESP_LOGI("test", "Empty name retry cancel result: %s",
|
||||||
cancelled ? "true" : "false");
|
cancelled ? "true" : "false");
|
||||||
ESP_LOGI("test", "Empty name retry ran %d times", id(empty_name_retry_counter));
|
ESP_LOGI("test", "Empty name retry ran %d times", id(empty_name_retry_counter));
|
||||||
@ -169,7 +207,7 @@ script:
|
|||||||
# Test 7: Multiple same name
|
# Test 7: Multiple same name
|
||||||
- logger.log: "=== Test 7: Multiple retries with same name ==="
|
- logger.log: "=== Test 7: Multiple retries with same name ==="
|
||||||
- lambda: |-
|
- lambda: |-
|
||||||
auto *component = id(test_sensor);
|
auto *component = id(multiple_same_name_switch);
|
||||||
|
|
||||||
// Set first retry
|
// Set first retry
|
||||||
App.scheduler.set_retry(component, "duplicate_retry", 100, 5,
|
App.scheduler.set_retry(component, "duplicate_retry", 100, 5,
|
||||||
@ -200,7 +238,7 @@ script:
|
|||||||
ESP_LOGI("test", "Simple retry counter: %d (expected 2)", id(simple_retry_counter));
|
ESP_LOGI("test", "Simple retry counter: %d (expected 2)", id(simple_retry_counter));
|
||||||
ESP_LOGI("test", "Backoff retry counter: %d (expected 4)", id(backoff_retry_counter));
|
ESP_LOGI("test", "Backoff retry counter: %d (expected 4)", id(backoff_retry_counter));
|
||||||
ESP_LOGI("test", "Immediate done counter: %d (expected 1)", id(immediate_done_counter));
|
ESP_LOGI("test", "Immediate done counter: %d (expected 1)", id(immediate_done_counter));
|
||||||
ESP_LOGI("test", "Cancel retry counter: %d (expected ~3-4)", id(cancel_retry_counter));
|
ESP_LOGI("test", "Cancel retry counter: %d (expected 2-4)", id(cancel_retry_counter));
|
||||||
ESP_LOGI("test", "Empty name retry counter: %d (expected 1-2)", id(empty_name_retry_counter));
|
ESP_LOGI("test", "Empty name retry counter: %d (expected 1-2)", id(empty_name_retry_counter));
|
||||||
ESP_LOGI("test", "Component retry counter: %d (expected 2)", id(script_retry_counter));
|
ESP_LOGI("test", "Component retry counter: %d (expected 2)", id(script_retry_counter));
|
||||||
ESP_LOGI("test", "Multiple same name counter: %d (expected 20+)", id(multiple_same_name_counter));
|
ESP_LOGI("test", "Multiple same name counter: %d (expected 20+)", id(multiple_same_name_counter));
|
||||||
|
@ -148,16 +148,16 @@ async def test_scheduler_retry_test(
|
|||||||
f"Expected at least 2 intervals, got {len(backoff_intervals)}"
|
f"Expected at least 2 intervals, got {len(backoff_intervals)}"
|
||||||
)
|
)
|
||||||
if len(backoff_intervals) >= 3:
|
if len(backoff_intervals) >= 3:
|
||||||
# First interval should be ~50ms
|
# First interval should be ~50ms (very wide tolerance for heavy system load)
|
||||||
assert 30 <= backoff_intervals[0] <= 70, (
|
assert 20 <= backoff_intervals[0] <= 150, (
|
||||||
f"First interval {backoff_intervals[0]}ms not ~50ms"
|
f"First interval {backoff_intervals[0]}ms not ~50ms"
|
||||||
)
|
)
|
||||||
# Second interval should be ~100ms (50ms * 2.0)
|
# Second interval should be ~100ms (50ms * 2.0)
|
||||||
assert 80 <= backoff_intervals[1] <= 120, (
|
assert 50 <= backoff_intervals[1] <= 250, (
|
||||||
f"Second interval {backoff_intervals[1]}ms not ~100ms"
|
f"Second interval {backoff_intervals[1]}ms not ~100ms"
|
||||||
)
|
)
|
||||||
# Third interval should be ~200ms (100ms * 2.0)
|
# Third interval should be ~200ms (100ms * 2.0)
|
||||||
assert 180 <= backoff_intervals[2] <= 220, (
|
assert 100 <= backoff_intervals[2] <= 500, (
|
||||||
f"Third interval {backoff_intervals[2]}ms not ~200ms"
|
f"Third interval {backoff_intervals[2]}ms not ~200ms"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -175,7 +175,7 @@ async def test_scheduler_retry_test(
|
|||||||
|
|
||||||
# Wait for cancel retry test
|
# Wait for cancel retry test
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(cancel_retry_done.wait(), timeout=2.0)
|
await asyncio.wait_for(cancel_retry_done.wait(), timeout=3.0)
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
pytest.fail(
|
pytest.fail(
|
||||||
f"Cancel retry test did not complete. Count: {cancel_retry_count}"
|
f"Cancel retry test did not complete. Count: {cancel_retry_count}"
|
||||||
@ -195,8 +195,8 @@ async def test_scheduler_retry_test(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Empty name retry should run at least once before being cancelled
|
# Empty name retry should run at least once before being cancelled
|
||||||
assert 1 <= empty_name_retry_count <= 2, (
|
assert 1 <= empty_name_retry_count <= 3, (
|
||||||
f"Expected 1-2 empty name retry attempts, got {empty_name_retry_count}"
|
f"Expected 1-3 empty name retry attempts, got {empty_name_retry_count}"
|
||||||
)
|
)
|
||||||
assert empty_cancel_result is True, (
|
assert empty_cancel_result is True, (
|
||||||
"Empty name retry cancel should have succeeded"
|
"Empty name retry cancel should have succeeded"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user