mirror of
https://github.com/esphome/esphome.git
synced 2025-07-28 14:16:40 +00:00
[scheduler] Fix retry race condition on cancellation (#9788)
This commit is contained in:
parent
9ac10d7276
commit
88ccde4ba1
@ -65,7 +65,7 @@ static void validate_static_string(const char *name) {
|
||||
|
||||
// Common implementation for both timeout and interval
|
||||
void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string,
|
||||
const void *name_ptr, uint32_t delay, std::function<void()> func) {
|
||||
const void *name_ptr, uint32_t delay, std::function<void()> func, bool is_retry) {
|
||||
// Get the name as const char*
|
||||
const char *name_cstr = this->get_name_cstr_(is_static_string, name_ptr);
|
||||
|
||||
@ -130,6 +130,18 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
|
||||
#endif /* ESPHOME_DEBUG_SCHEDULER */
|
||||
|
||||
LockGuard guard{this->lock_};
|
||||
|
||||
// For retries, check if there's a cancelled timeout first
|
||||
if (is_retry && name_cstr != nullptr && type == SchedulerItem::TIMEOUT &&
|
||||
(has_cancelled_timeout_in_container_(this->items_, component, name_cstr) ||
|
||||
has_cancelled_timeout_in_container_(this->to_add_, component, name_cstr))) {
|
||||
// Skip scheduling - the retry was cancelled
|
||||
#ifdef ESPHOME_DEBUG_SCHEDULER
|
||||
ESP_LOGD(TAG, "Skipping retry '%s' - found cancelled item", name_cstr);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
// If name is provided, do atomic cancel-and-add
|
||||
// Cancel existing items
|
||||
this->cancel_item_locked_(component, name_cstr, type);
|
||||
@ -178,12 +190,14 @@ struct RetryArgs {
|
||||
Scheduler *scheduler;
|
||||
};
|
||||
|
||||
static void retry_handler(const std::shared_ptr<RetryArgs> &args) {
|
||||
void retry_handler(const std::shared_ptr<RetryArgs> &args) {
|
||||
RetryResult const retry_result = args->func(--args->retry_countdown);
|
||||
if (retry_result == RetryResult::DONE || args->retry_countdown <= 0)
|
||||
return;
|
||||
// second execution of `func` happens after `initial_wait_time`
|
||||
args->scheduler->set_timeout(args->component, args->name, args->current_interval, [args]() { retry_handler(args); });
|
||||
args->scheduler->set_timer_common_(
|
||||
args->component, Scheduler::SchedulerItem::TIMEOUT, false, &args->name, args->current_interval,
|
||||
[args]() { retry_handler(args); }, true);
|
||||
// backoff_increase_factor applied to third & later executions
|
||||
args->current_interval *= args->backoff_increase_factor;
|
||||
}
|
||||
|
@ -15,8 +15,15 @@
|
||||
namespace esphome {
|
||||
|
||||
class Component;
|
||||
struct RetryArgs;
|
||||
|
||||
// Forward declaration of retry_handler - needs to be non-static for friend declaration
|
||||
void retry_handler(const std::shared_ptr<RetryArgs> &args);
|
||||
|
||||
class Scheduler {
|
||||
// Allow retry_handler to access protected members
|
||||
friend void ::esphome::retry_handler(const std::shared_ptr<RetryArgs> &args);
|
||||
|
||||
public:
|
||||
// Public API - accepts std::string for backward compatibility
|
||||
void set_timeout(Component *component, const std::string &name, uint32_t timeout, std::function<void()> func);
|
||||
@ -147,7 +154,7 @@ class Scheduler {
|
||||
|
||||
// Common implementation for both timeout and interval
|
||||
void set_timer_common_(Component *component, SchedulerItem::Type type, bool is_static_string, const void *name_ptr,
|
||||
uint32_t delay, std::function<void()> func);
|
||||
uint32_t delay, std::function<void()> func, bool is_retry = false);
|
||||
|
||||
uint64_t millis_64_(uint32_t now);
|
||||
// Cleanup logically deleted items from the scheduler
|
||||
@ -170,8 +177,8 @@ class Scheduler {
|
||||
|
||||
// Helper function to check if item matches criteria for cancellation
|
||||
inline bool HOT matches_item_(const std::unique_ptr<SchedulerItem> &item, Component *component, const char *name_cstr,
|
||||
SchedulerItem::Type type) {
|
||||
if (item->component != component || item->type != type || item->remove) {
|
||||
SchedulerItem::Type type, bool skip_removed = true) const {
|
||||
if (item->component != component || item->type != type || (skip_removed && item->remove)) {
|
||||
return false;
|
||||
}
|
||||
const char *item_name = item->get_name();
|
||||
@ -197,6 +204,18 @@ class Scheduler {
|
||||
return item->remove || (item->component != nullptr && item->component->is_failed());
|
||||
}
|
||||
|
||||
// Template helper to check if any item in a container matches our criteria
|
||||
template<typename Container>
|
||||
bool has_cancelled_timeout_in_container_(const Container &container, Component *component,
|
||||
const char *name_cstr) const {
|
||||
for (const auto &item : container) {
|
||||
if (item->remove && this->matches_item_(item, component, name_cstr, SchedulerItem::TIMEOUT, false)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
Mutex lock_;
|
||||
std::vector<std::unique_ptr<SchedulerItem>> items_;
|
||||
std::vector<std::unique_ptr<SchedulerItem>> to_add_;
|
||||
|
@ -10,7 +10,7 @@ esphome:
|
||||
host:
|
||||
api:
|
||||
logger:
|
||||
level: VERBOSE
|
||||
level: VERY_VERBOSE
|
||||
|
||||
globals:
|
||||
- id: simple_retry_counter
|
||||
@ -19,6 +19,9 @@ globals:
|
||||
- id: backoff_retry_counter
|
||||
type: int
|
||||
initial_value: '0'
|
||||
- id: backoff_last_attempt_time
|
||||
type: uint32_t
|
||||
initial_value: '0'
|
||||
- id: immediate_done_counter
|
||||
type: int
|
||||
initial_value: '0'
|
||||
@ -35,20 +38,55 @@ globals:
|
||||
type: int
|
||||
initial_value: '0'
|
||||
|
||||
# Using different component types for each test to ensure isolation
|
||||
sensor:
|
||||
- platform: template
|
||||
name: Test Sensor
|
||||
id: test_sensor
|
||||
name: Simple Retry Test Sensor
|
||||
id: simple_retry_sensor
|
||||
lambda: return 1.0;
|
||||
update_interval: never
|
||||
|
||||
- platform: template
|
||||
name: Backoff Retry Test Sensor
|
||||
id: backoff_retry_sensor
|
||||
lambda: return 2.0;
|
||||
update_interval: never
|
||||
|
||||
- platform: template
|
||||
name: Immediate Done Test Sensor
|
||||
id: immediate_done_sensor
|
||||
lambda: return 3.0;
|
||||
update_interval: never
|
||||
|
||||
binary_sensor:
|
||||
- platform: template
|
||||
name: Cancel Retry Test Binary Sensor
|
||||
id: cancel_retry_binary_sensor
|
||||
lambda: return false;
|
||||
|
||||
- platform: template
|
||||
name: Empty Name Test Binary Sensor
|
||||
id: empty_name_binary_sensor
|
||||
lambda: return true;
|
||||
|
||||
switch:
|
||||
- platform: template
|
||||
name: Script Retry Test Switch
|
||||
id: script_retry_switch
|
||||
optimistic: true
|
||||
|
||||
- platform: template
|
||||
name: Multiple Same Name Test Switch
|
||||
id: multiple_same_name_switch
|
||||
optimistic: true
|
||||
|
||||
script:
|
||||
- id: run_all_tests
|
||||
then:
|
||||
# Test 1: Simple retry
|
||||
- logger.log: "=== Test 1: Simple retry ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
auto *component = id(simple_retry_sensor);
|
||||
App.scheduler.set_retry(component, "simple_retry", 50, 3,
|
||||
[](uint8_t retry_countdown) {
|
||||
id(simple_retry_counter)++;
|
||||
@ -65,19 +103,19 @@ script:
|
||||
# Test 2: Backoff retry
|
||||
- logger.log: "=== Test 2: Retry with backoff ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
static uint32_t backoff_start_time = 0;
|
||||
static uint32_t last_attempt_time = 0;
|
||||
|
||||
backoff_start_time = millis();
|
||||
last_attempt_time = backoff_start_time;
|
||||
auto *component = id(backoff_retry_sensor);
|
||||
|
||||
App.scheduler.set_retry(component, "backoff_retry", 50, 4,
|
||||
[](uint8_t retry_countdown) {
|
||||
id(backoff_retry_counter)++;
|
||||
uint32_t now = millis();
|
||||
uint32_t interval = now - last_attempt_time;
|
||||
last_attempt_time = now;
|
||||
uint32_t interval = 0;
|
||||
|
||||
// Only calculate interval after first attempt
|
||||
if (id(backoff_retry_counter) > 1) {
|
||||
interval = now - id(backoff_last_attempt_time);
|
||||
}
|
||||
id(backoff_last_attempt_time) = now;
|
||||
|
||||
ESP_LOGI("test", "Backoff retry attempt %d (countdown=%d, interval=%dms)",
|
||||
id(backoff_retry_counter), retry_countdown, interval);
|
||||
@ -100,7 +138,7 @@ script:
|
||||
# Test 3: Immediate done
|
||||
- logger.log: "=== Test 3: Immediate done ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
auto *component = id(immediate_done_sensor);
|
||||
App.scheduler.set_retry(component, "immediate_done", 50, 5,
|
||||
[](uint8_t retry_countdown) {
|
||||
id(immediate_done_counter)++;
|
||||
@ -111,8 +149,8 @@ script:
|
||||
# Test 4: Cancel retry
|
||||
- logger.log: "=== Test 4: Cancel retry ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
App.scheduler.set_retry(component, "cancel_test", 25, 10,
|
||||
auto *component = id(cancel_retry_binary_sensor);
|
||||
App.scheduler.set_retry(component, "cancel_test", 30, 10,
|
||||
[](uint8_t retry_countdown) {
|
||||
id(cancel_retry_counter)++;
|
||||
ESP_LOGI("test", "Cancel test retry attempt %d", id(cancel_retry_counter));
|
||||
@ -121,7 +159,7 @@ script:
|
||||
|
||||
// Cancel it after 100ms
|
||||
App.scheduler.set_timeout(component, "cancel_timer", 100, []() {
|
||||
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "cancel_test");
|
||||
bool cancelled = App.scheduler.cancel_retry(id(cancel_retry_binary_sensor), "cancel_test");
|
||||
ESP_LOGI("test", "Retry cancellation result: %s", cancelled ? "true" : "false");
|
||||
ESP_LOGI("test", "Cancel retry ran %d times before cancellation", id(cancel_retry_counter));
|
||||
});
|
||||
@ -129,7 +167,7 @@ script:
|
||||
# Test 5: Empty name retry
|
||||
- logger.log: "=== Test 5: Empty name retry ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
auto *component = id(empty_name_binary_sensor);
|
||||
App.scheduler.set_retry(component, "", 100, 5,
|
||||
[](uint8_t retry_countdown) {
|
||||
id(empty_name_retry_counter)++;
|
||||
@ -139,7 +177,7 @@ script:
|
||||
|
||||
// Try to cancel after 150ms
|
||||
App.scheduler.set_timeout(component, "empty_cancel_timer", 150, []() {
|
||||
bool cancelled = App.scheduler.cancel_retry(id(test_sensor), "");
|
||||
bool cancelled = App.scheduler.cancel_retry(id(empty_name_binary_sensor), "");
|
||||
ESP_LOGI("test", "Empty name retry cancel result: %s",
|
||||
cancelled ? "true" : "false");
|
||||
ESP_LOGI("test", "Empty name retry ran %d times", id(empty_name_retry_counter));
|
||||
@ -169,7 +207,7 @@ script:
|
||||
# Test 7: Multiple same name
|
||||
- logger.log: "=== Test 7: Multiple retries with same name ==="
|
||||
- lambda: |-
|
||||
auto *component = id(test_sensor);
|
||||
auto *component = id(multiple_same_name_switch);
|
||||
|
||||
// Set first retry
|
||||
App.scheduler.set_retry(component, "duplicate_retry", 100, 5,
|
||||
@ -200,7 +238,7 @@ script:
|
||||
ESP_LOGI("test", "Simple retry counter: %d (expected 2)", id(simple_retry_counter));
|
||||
ESP_LOGI("test", "Backoff retry counter: %d (expected 4)", id(backoff_retry_counter));
|
||||
ESP_LOGI("test", "Immediate done counter: %d (expected 1)", id(immediate_done_counter));
|
||||
ESP_LOGI("test", "Cancel retry counter: %d (expected ~3-4)", id(cancel_retry_counter));
|
||||
ESP_LOGI("test", "Cancel retry counter: %d (expected 2-4)", id(cancel_retry_counter));
|
||||
ESP_LOGI("test", "Empty name retry counter: %d (expected 1-2)", id(empty_name_retry_counter));
|
||||
ESP_LOGI("test", "Component retry counter: %d (expected 2)", id(script_retry_counter));
|
||||
ESP_LOGI("test", "Multiple same name counter: %d (expected 20+)", id(multiple_same_name_counter));
|
||||
|
@ -148,16 +148,16 @@ async def test_scheduler_retry_test(
|
||||
f"Expected at least 2 intervals, got {len(backoff_intervals)}"
|
||||
)
|
||||
if len(backoff_intervals) >= 3:
|
||||
# First interval should be ~50ms
|
||||
assert 30 <= backoff_intervals[0] <= 70, (
|
||||
# First interval should be ~50ms (very wide tolerance for heavy system load)
|
||||
assert 20 <= backoff_intervals[0] <= 150, (
|
||||
f"First interval {backoff_intervals[0]}ms not ~50ms"
|
||||
)
|
||||
# Second interval should be ~100ms (50ms * 2.0)
|
||||
assert 80 <= backoff_intervals[1] <= 120, (
|
||||
assert 50 <= backoff_intervals[1] <= 250, (
|
||||
f"Second interval {backoff_intervals[1]}ms not ~100ms"
|
||||
)
|
||||
# Third interval should be ~200ms (100ms * 2.0)
|
||||
assert 180 <= backoff_intervals[2] <= 220, (
|
||||
assert 100 <= backoff_intervals[2] <= 500, (
|
||||
f"Third interval {backoff_intervals[2]}ms not ~200ms"
|
||||
)
|
||||
|
||||
@ -175,7 +175,7 @@ async def test_scheduler_retry_test(
|
||||
|
||||
# Wait for cancel retry test
|
||||
try:
|
||||
await asyncio.wait_for(cancel_retry_done.wait(), timeout=2.0)
|
||||
await asyncio.wait_for(cancel_retry_done.wait(), timeout=3.0)
|
||||
except TimeoutError:
|
||||
pytest.fail(
|
||||
f"Cancel retry test did not complete. Count: {cancel_retry_count}"
|
||||
@ -195,8 +195,8 @@ async def test_scheduler_retry_test(
|
||||
)
|
||||
|
||||
# Empty name retry should run at least once before being cancelled
|
||||
assert 1 <= empty_name_retry_count <= 2, (
|
||||
f"Expected 1-2 empty name retry attempts, got {empty_name_retry_count}"
|
||||
assert 1 <= empty_name_retry_count <= 3, (
|
||||
f"Expected 1-3 empty name retry attempts, got {empty_name_retry_count}"
|
||||
)
|
||||
assert empty_cancel_result is True, (
|
||||
"Empty name retry cancel should have succeeded"
|
||||
|
Loading…
x
Reference in New Issue
Block a user