From 06d660f9c859e303afcf018de5df974c02ad767b Mon Sep 17 00:00:00 2001
From: Kevin Bataille
Date: Mon, 16 Feb 2026 18:21:16 +0100
Subject: [PATCH] Add confidence breakdown display

- Shows High/Medium/Low count breakdown
- Helps verify all matching posts will be processed
- Example output:
  Filtered to 328 proposals (confidence >= Medium)
  Breakdown: High=293, Medium=35, Low=0

Co-authored-by: Qwen-Coder
---
Review note: check_confidence.py below was revised after this patch was
generated, so the blob id on its "index" line predates the revision.

 check_confidence.py         | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 src/seo/category_manager.py |  6 ++++
 2 files changed, 86 insertions(+)
 create mode 100644 check_confidence.py

diff --git a/check_confidence.py b/check_confidence.py
new file mode 100644
index 0000000..b017549
--- /dev/null
+++ b/check_confidence.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Summarise the newest category-proposal CSV by site and confidence.
+
+Reads the lexicographically last ``output/category_proposals_*.csv`` and
+prints row counts per site, per confidence level, and per site/confidence
+pair, plus how many rows the default ``-c Medium`` threshold would keep.
+"""
+import csv
+from collections import Counter
+import glob
+
+PROPOSALS_GLOB = 'output/category_proposals_*.csv'
+# Sites always listed in the per-site section, even with zero proposals.
+KNOWN_SITES = ['mistergeek.net', 'webscroll.fr', 'hellogeek.net']
+# Confidence levels kept by the default "-c Medium" threshold.
+DEFAULT_LEVELS = ('High', 'Medium')
+# Label for rows whose site/confidence cell is absent or empty.
+MISSING = '(missing)'
+
+
+def load_proposals(path):
+    """Return every row of the CSV at *path* as a list of dicts."""
+    # newline='' is what the csv module asks for; an explicit encoding keeps
+    # the result independent of the platform's locale default.
+    with open(path, 'r', newline='', encoding='utf-8') as f:
+        return list(csv.DictReader(f))
+
+
+def field(row, name):
+    """Return ``row[name]``, or MISSING when absent, None or empty.
+
+    DictReader yields None for cells missing from short rows; mixing None
+    with str keys would make the sorted() calls below raise TypeError.
+    """
+    return row.get(name) or MISSING
+
+
+def print_counts(counts):
+    """Print ``key: count`` lines for *counts*, ordered by key."""
+    for key, count in sorted(counts.items()):
+        print(f" {key}: {count}")
+
+
+def report(proposals):
+    """Print the overall, per-site and per-confidence breakdowns."""
+    print("=== All Proposals ===")
+    print(f"Total: {len(proposals)}\n")
+
+    print("By Site:")
+    sites = Counter(field(p, 'current_site') for p in proposals)
+    print_counts(sites)
+
+    print("\nBy Confidence (all sites):")
+    print_counts(Counter(field(p, 'category_confidence') for p in proposals))
+
+    print("\nBy Site and Confidence:")
+    # Known sites first, then any other site present in the data, so that
+    # proposals for an unlisted site are not silently left out.
+    extra_sites = sorted(s for s in sites if s not in KNOWN_SITES)
+    for site in KNOWN_SITES + extra_sites:
+        site_props = [p for p in proposals if field(p, 'current_site') == site]
+        confs = Counter(field(p, 'category_confidence') for p in site_props)
+        print(f"\n {site} ({len(site_props)} total):")
+        print_counts(confs)
+
+        kept = sum(confs[level] for level in DEFAULT_LEVELS)
+        print(f" → Would process with -c Medium (default): {kept}")
+
+
+def main():
+    """Report on the newest proposals file, if there is one."""
+    files = sorted(glob.glob(PROPOSALS_GLOB))
+    if not files:
+        print(f"No proposal files found matching {PROPOSALS_GLOB}")
+        return
+    report(load_proposals(files[-1]))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/seo/category_manager.py b/src/seo/category_manager.py
index 
d67baa5..ddbc7e6 100644 --- a/src/seo/category_manager.py +++ b/src/seo/category_manager.py @@ -381,6 +381,12 @@ class CategoryAssignmentProcessor: if confidence_order.get(p.get('category_confidence', 'Medium'), 2) >= min_confidence ] logger.info(f"Filtered to {len(filtered_proposals)} proposals (confidence >= {confidence_threshold})") + + # Show breakdown + high_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'High') + medium_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'Medium') + low_count = sum(1 for p in filtered_proposals if p.get('category_confidence') == 'Low') + logger.info(f" Breakdown: High={high_count}, Medium={medium_count}, Low={low_count}") # Fetch existing categories self.category_manager.fetch_categories(site_name)