Add confidence breakdown display

- Shows High/Medium/Low count breakdown
- Helps verify all matching posts will be processed
- Example output:
    Filtered to 328 proposals (confidence >= Medium)
    Breakdown: High=293, Medium=35, Low=0

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 18:21:16 +01:00
parent 54168a1c00
commit 06d660f9c8
2 changed files with 40 additions and 0 deletions
--- a/check_confidence.py
+++ b/check_confidence.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+import csv
+from collections import Counter
+import glob
+# Diagnostic script: summarize one category-proposals CSV by site and by confidence label.
+files = sorted(glob.glob('output/category_proposals_*.csv'))  # paths in lexicographic order
+if files:  # when no CSV matches the pattern the script prints nothing at all
+    with open(files[-1], 'r') as f:  # last path in sorted order; presumably the newest export - confirm naming scheme
+        reader = csv.DictReader(f)
+        proposals = list(reader)  # one dict per data row, keyed by the CSV header row
+    # Overall row count.
+    print("=== All Proposals ===")
+    print(f"Total: {len(proposals)}\n")
+    # Rows per value of the 'current_site' column (raises KeyError if the column is absent).
+    print("By Site:")
+    sites = Counter(p['current_site'] for p in proposals)
+    for site, count in sorted(sites.items()):
+        print(f"  {site}: {count}")
+    # Rows per value of 'category_confidence'; labels are printed in alphabetical order.
+    print("\nBy Confidence (all sites):")
+    confs = Counter(p['category_confidence'] for p in proposals)
+    for conf, count in sorted(confs.items()):
+        print(f"  {conf}: {count}")
+    # Per-site breakdown; only the hard-coded sites below are reported, any other site is omitted.
+    print("\nBy Site and Confidence:")
+    for site in ['mistergeek.net', 'webscroll.fr', 'hellogeek.net']:
+        site_props = [p for p in proposals if p['current_site'] == site]
+        confs = Counter(p['category_confidence'] for p in site_props)  # rebinds the all-sites counter (already printed)
+        print(f"\n  {site} ({len(site_props)} total):")
+        for conf, count in sorted(confs.items()):
+            print(f"    {conf}: {count}")
+        # Exact, case-sensitive match on 'High'/'Medium'; rows with any other or empty label are not counted.
+        medium_or_better = [p for p in site_props if p['category_confidence'] in ['High', 'Medium']]
+        print(f"    → Would process with -c Medium (default): {len(medium_or_better)}")  # NOTE(review): "-c Medium" being the default is asserted by the message only - confirm against the CLI
--- a/src/seo/category_manager.py
+++ b/src/seo/category_manager.py
@@ -381,6 +381,12 @@ class CategoryAssignmentProcessor:
                if confidence_order.get(p.get('category_confidence', 'Medium'), 2) >= min_confidence
            ]
            logger.info(f"Filtered to {len(filtered_proposals)} proposals (confidence >= {confidence_threshold})")
+            
+            # Show breakdown
+            high_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'High')
+            medium_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'Medium')
+            low_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'Low')
+            logger.info(f"  Breakdown: High={high_count}, Medium={medium_count}, Low={low_count}")
        
        # Fetch existing categories
        self.category_manager.fetch_categories(site_name)