From e19f1788647d47a6de34a6a51d29db738465534f Mon Sep 17 00:00:00 2001
From: Franck Nijhof <git@frenck.dev>
Date: Thu, 12 Jun 2025 13:55:26 +0200
Subject: [PATCH] Make duplicate issue detection more strict (#146633)

---
 .github/workflows/detect-duplicate-issues.yml | 39 ++++++++++++-------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/detect-duplicate-issues.yml b/.github/workflows/detect-duplicate-issues.yml
index 509868541fd..b01a0d68352 100644
--- a/.github/workflows/detect-duplicate-issues.yml
+++ b/.github/workflows/detect-duplicate-issues.yml
@@ -133,12 +133,18 @@ jobs:
 
             // Build search query for issues with any of the current integration labels
             const labelQueries = integrationLabels.map(label => `label:"${label}"`);
+
+            // Restrict the search to issues created in the last 6 months (UTC date, YYYY-MM-DD)
+            const sixMonthsAgo = new Date();
+            sixMonthsAgo.setMonth(sixMonthsAgo.getMonth() - 6);
+            const dateFilter = `created:>=${sixMonthsAgo.toISOString().split('T')[0]}`;
+
             let searchQuery;
 
             if (labelQueries.length === 1) {
-              searchQuery = `repo:${context.repo.owner}/${context.repo.repo} is:issue ${labelQueries[0]}`;
+              searchQuery = `repo:${context.repo.owner}/${context.repo.repo} is:issue ${labelQueries[0]} ${dateFilter}`;
             } else {
-              searchQuery = `repo:${context.repo.owner}/${context.repo.repo} is:issue (${labelQueries.join(' OR ')})`;
+              searchQuery = `repo:${context.repo.owner}/${context.repo.repo} is:issue (${labelQueries.join(' OR ')}) ${dateFilter}`;
             }
 
             console.log(`Search query: ${searchQuery}`);
@@ -227,29 +233,34 @@ jobs:
         if: steps.extract.outputs.should_continue == 'true' && steps.fetch_similar.outputs.has_similar == 'true'
         uses: actions/ai-inference@v1.1.0
         with:
-          model: openai/gpt-4o-mini
+          model: openai/gpt-4o
           system-prompt: |
-            You are a Home Assistant issue duplicate detector. Your task is to identify potential duplicate issues based on their content.
+            You are a Home Assistant issue duplicate detector. Your task is to identify TRUE DUPLICATES - issues that report the EXACT SAME problem, not just similar or related issues.
+
+            CRITICAL - apply these criteria strictly:
+            - An issue is ONLY a duplicate if it describes the SAME problem with the SAME root cause
+            - Issues about the same integration but different problems are NOT duplicates
+            - Issues with similar symptoms but different causes are NOT duplicates
 
             Important considerations:
             - Open issues are more relevant than closed ones for duplicate detection
             - Recently updated issues may indicate ongoing work or discussion
             - Issues with more comments are generally more relevant and active
-            - Higher comment count often indicates community engagement and importance
             - Older closed issues might be resolved differently than newer approaches
             - Consider the time between issues - very old issues may have different contexts
 
             Rules:
-            1. Compare the current issue with the provided similar issues
+            1. ONLY mark as duplicate if the issues describe IDENTICAL problems
             2. Look for issues that report the same problem or request the same functionality
-            3. Consider different wording but same underlying issue as duplicates
+            3. Different error messages = NOT a duplicate (even if same integration)
             4. For CLOSED issues, only mark as duplicate if they describe the EXACT same problem
-            5. For OPEN issues, use a lower threshold (70%+ similarity)
+            5. For OPEN issues, use a lower threshold (90%+ similarity)
             6. Prioritize issues with higher comment counts as they indicate more activity/relevance
-            7. Return ONLY a JSON array of issue numbers that are potential duplicates
-            8. If no duplicates are found, return an empty array: []
-            9. Maximum 5 potential duplicates, prioritize open issues with comments
-            10. Consider the age of issues - prefer recent duplicates over very old ones
+            7. When in doubt, do NOT mark as duplicate
+            8. Return ONLY a JSON array of issue numbers that are duplicates
+            9. If no duplicates are found, return an empty array: []
+            10. Maximum 5 duplicates, prioritize open issues with comments
+            11. Consider the age of issues - prefer recent duplicates over very old ones
 
             Example response format:
             [1234, 5678, 9012]
@@ -259,10 +270,10 @@ jobs:
             Title: ${{ steps.extract.outputs.current_title }}
             Body: ${{ steps.extract.outputs.current_body }}
 
-            Similar issues to compare against (each includes state, creation date, last update, and comment count):
+            Other issues to compare against (each includes state, creation date, last update, and comment count):
             ${{ steps.fetch_similar.outputs.similar_issues }}
 
-            Analyze these issues and identify which ones are potential duplicates of the current issue. Consider their state (open/closed), how recently they were updated, and their comment count (higher = more relevant).
+            Analyze these issues and identify which ones describe IDENTICAL problems and thus are duplicates of the current issue. When sorting them, consider their state (open/closed), how recently they were updated, and their comment count (higher = more relevant).
 
           max-tokens: 100