Add existing category fetching to category_propose

New Feature:
- Fetch existing categories from WordPress sites before AI proposals
- AI now prefers existing categories to avoid duplicates
- Shows existing categories in AI prompt for better suggestions
- Tracks whether each proposed category already exists or is new

### Changes:
- fetch_existing_categories() method - Gets categories from all sites
- Updated AI prompt includes existing categories list
- New CSV column: is_existing_category (Yes/No)
- Statistics showing existing vs new categories

### Benefits:
- Reduces category duplication
- Maintains consistency across posts
- AI makes smarter category suggestions
- Users can see which proposed categories already exist and which are new

### AI Prompt Enhancement:
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):

mistergeek.net:
  - VPN
  - Software
  - Gaming
  ...

webscroll.fr:
  - Torrenting
  - File-Sharing
  ...

IMPORTANT: Use existing categories when possible...

### Output:
Category Statistics:
  Using existing categories: 145 posts
  Proposing new categories: 12 posts

Usage:
./seo category_propose  # Now fetches existing categories automatically

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
Kevin Bataille
2026-02-16 15:58:51 +01:00
parent b5c586e7ad
commit 6a574bf07c
2 changed files with 117 additions and 15 deletions

View File

@@ -64,13 +64,15 @@ class SEOApp:
analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
return analyzer.run(output_file=output, update_input=update)
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None,
fetch_existing: bool = True) -> Tuple[str, str]:
"""
Propose categories for posts with editorial strategy alignment.
Args:
csv_file: Path to CSV file (uses latest export if not provided)
output: Custom output file path
fetch_existing: If True, fetch existing categories from WordPress
Returns:
Tuple of (proposals_file, migrations_file)
@@ -85,7 +87,8 @@ class SEOApp:
logger.info(f"Using file: {csv_file}")
proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
proposer = CategoryProposer(csv_file, use_editorial_strategy=True,
fetch_existing_categories=fetch_existing)
return proposer.run(output_file=output)
def category_apply(self, proposals_csv: str, site_name: str,

View File

@@ -1,6 +1,6 @@
"""
Category Proposer - AI-powered category suggestions with editorial strategy alignment
Proposes categories based on content AND site editorial lines
Fetches existing categories from WordPress sites and uses them in proposals
"""
import csv
@@ -10,6 +10,7 @@ from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
import requests
from requests.auth import HTTPBasicAuth
from .config import Config
from .editorial_strategy import EditorialStrategyAnalyzer
@@ -58,13 +59,15 @@ EDITORIAL_LINES = {
class CategoryProposer:
"""Propose categories for posts using AI with editorial strategy alignment."""
def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
def __init__(self, csv_file: str, use_editorial_strategy: bool = True,
fetch_existing_categories: bool = True):
"""
Initialize proposer.
Args:
csv_file: Path to CSV file
use_editorial_strategy: If True, align proposals with editorial lines
fetch_existing_categories: If True, fetch existing categories from WordPress
"""
self.csv_file = Path(csv_file)
self.openrouter_api_key = Config.OPENROUTER_API_KEY
@@ -75,7 +78,63 @@ class CategoryProposer:
self.api_calls = 0
self.ai_cost = 0.0
self.use_editorial_strategy = use_editorial_strategy
self.fetch_existing_categories = fetch_existing_categories
self.site_analysis = {}
self.existing_categories_by_site = {} # Fetched from WordPress
self.sites = Config.WORDPRESS_SITES # Add sites config
def fetch_existing_categories(self) -> Dict[str, List[str]]:
    """
    Fetch existing categories from all WordPress sites.

    For every entry in ``self.sites`` this pages through the site's
    ``/wp-json/wp/v2/categories`` endpoint with HTTP basic auth and collects
    the ``name`` of each returned category. The result is also stored on
    ``self.existing_categories_by_site``.

    The fetch is best-effort: a site whose request raises is mapped to an
    empty list, and a site that answers with a non-200 status keeps whatever
    pages were fetched before the failure. Neither case raises to the caller.

    Returns:
        Dict mapping site name to list of category names
    """
    # NOTE(review): __init__ assigns the boolean flag to
    # ``self.fetch_existing_categories``, which shadows this method on every
    # instance. ``self.fetch_existing_categories()`` in propose_categories()
    # therefore calls a bool and raises TypeError. Rename the flag attribute
    # (or this method) so the two no longer collide.
    logger.info("\n📁 Fetching existing categories from WordPress sites...")

    # Page size sent to WordPress; a shorter page means it was the last one.
    # Kept in one place so the request and the stop condition cannot drift.
    per_page = 100
    categories_by_site = {}

    for site_name, site_config in self.sites.items():
        all_categories = []
        fetched_all_pages = False
        try:
            base_url = site_config['url'].rstrip('/')
            auth = HTTPBasicAuth(site_config['username'], site_config['password'])
            logger.info(f" Fetching from {site_name}...")

            page = 1
            while True:
                response = requests.get(
                    f"{base_url}/wp-json/wp/v2/categories",
                    params={'per_page': per_page, 'page': page},
                    auth=auth,
                    timeout=10
                )
                if response.status_code != 200:
                    logger.warning(f" Could not fetch categories: {response.status_code}")
                    break

                page_categories = response.json()
                if not isinstance(page_categories, list):
                    # An error payload (JSON object) instead of a category
                    # array: report it explicitly rather than tripping over
                    # it while reading 'name'.
                    logger.warning(
                        f" Unexpected categories response from {site_name}: "
                        f"{type(page_categories).__name__}"
                    )
                    break
                if not page_categories:
                    fetched_all_pages = True
                    break

                all_categories.extend(cat['name'] for cat in page_categories)
                if len(page_categories) < per_page:
                    fetched_all_pages = True
                    break
                page += 1
        except Exception as e:
            # Deliberate best-effort boundary: one unreachable or
            # misconfigured site must not abort the whole proposal run.
            logger.warning(f" Error fetching from {site_name}: {e}")
            all_categories = []
        else:
            if fetched_all_pages:
                logger.info(f" ✓ Found {len(all_categories)} categories")
            else:
                # Do not report success when pagination stopped on an error.
                logger.warning(
                    f" Incomplete category list for {site_name}: "
                    f"kept {len(all_categories)} categories"
                )

        categories_by_site[site_name] = all_categories

    self.existing_categories_by_site = categories_by_site
    return categories_by_site
def load_csv(self) -> bool:
"""Load posts from CSV."""
@@ -159,29 +218,53 @@ class CategoryProposer:
return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
"""Get AI category proposals with editorial strategy context."""
"""Get AI category proposals with editorial strategy context and existing categories."""
if not self.openrouter_api_key:
logger.error("OPENROUTER_API_KEY not set")
return None
# Build editorial context
# Build editorial context with existing categories
editorial_context = ""
if self.use_editorial_strategy:
editorial_context = """
EDITORIAL STRATEGY GUIDELINES:
mistergeek.net (High-value tech):
- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
- Ideal Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
- Focus: Professional, high-traffic tech content
webscroll.fr (Torrenting niche):
- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
- Ideal Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
- Focus: Torrenting and file-sharing only
hellogeek.net (Catch-all):
- Categories: Experimental, Low-Traffic, Off-Brand, Testing
- Ideal Categories: Experimental, Low-Traffic, Off-Brand, Testing
- Focus: Everything else, low-traffic content
"""
# Add existing categories from WordPress
existing_cats_context = ""
if self.existing_categories_by_site:
existing_cats_context = """
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):
"""
for site_name, categories in self.existing_categories_by_site.items():
if categories:
existing_cats_context += f"{site_name}:\n"
for cat in categories[:20]: # Show first 20
existing_cats_context += f" - {cat}\n"
if len(categories) > 20:
existing_cats_context += f" ... and {len(categories) - 20} more\n"
existing_cats_context += "\n"
existing_cats_context += """
IMPORTANT: Use existing categories when possible. Only propose new categories if:
1. The post doesn't fit any existing category
2. The new category would have multiple posts (not one-off)
3. It aligns with the site's editorial strategy
"""
# Format posts for AI
@@ -200,6 +283,7 @@ hellogeek.net (Catch-all):
prompt = f"""Analyze these blog posts and propose optimal categories.
{editorial_context}
{existing_cats_context}
POSTS TO ANALYZE:
{posts_text}
@@ -208,19 +292,20 @@ For EACH post, provide:
{{
"post_id": <id>,
"current_categories": "<current>",
"proposed_category": "<best category from editorial lines above>",
"proposed_category": "<best category - PREFER EXISTING ONES FROM LIST ABOVE>",
"alternative_categories": ["<alt1>", "<alt2>"],
"recommended_site": "<best site for this post>",
"reason": "<brief explanation referencing editorial strategy>",
"confidence": "<High|Medium|Low>",
"should_migrate": <true/false>,
"migration_reason": "<reason if should_migrate is true>"
"migration_reason": "<reason if should_migrate is true>",
"is_existing_category": <true/false - true if proposed_category exists on the site>
}}
Return ONLY a JSON array with one object per post."""
try:
logger.info(f" Getting category proposals with editorial alignment...")
logger.info(f" Getting category proposals with existing categories...")
response = requests.post(
"https://openrouter.ai/api/v1/chat/completions",
@@ -267,11 +352,15 @@ Return ONLY a JSON array with one object per post."""
return []
def propose_categories(self, batch_size: int = 10) -> bool:
"""Propose categories with editorial strategy alignment."""
"""Propose categories with editorial strategy alignment and existing categories."""
logger.info("\n" + "="*70)
logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
logger.info("="*70 + "\n")
# Fetch existing categories from WordPress
if self.fetch_existing_categories:
self.fetch_existing_categories()
# Analyze editorial strategy first
if self.use_editorial_strategy:
self.analyze_editorial_strategy()
@@ -311,6 +400,7 @@ Return ONLY a JSON array with one object per post."""
proposed_category = proposal.get('proposed_category', post.get('categories', ''))
recommended_site = proposal.get('recommended_site', current_site)
should_migrate = proposal.get('should_migrate', False)
is_existing = proposal.get('is_existing_category', True)
# If AI didn't specify, use editorial strategy
if not recommended_site or recommended_site == current_site:
@@ -331,7 +421,8 @@ Return ONLY a JSON array with one object per post."""
'recommended_site': recommended_site,
'should_migrate': 'Yes' if should_migrate else 'No',
'migration_reason': migration_reason,
'current_site': current_site
'current_site': current_site,
'is_existing_category': 'Yes' if is_existing else 'No'
}
self.proposed_categories.append(proposal_record)
@@ -357,6 +448,13 @@ Return ONLY a JSON array with one object per post."""
by_site[site] = by_site.get(site, 0) + 1
for site, count in by_site.items():
logger.info(f" To {site}: {count} posts")
# Summary of existing vs new categories
existing_count = sum(1 for p in self.proposed_categories if p.get('is_existing_category') == 'Yes')
new_count = len(self.proposed_categories) - existing_count
logger.info(f"\n📊 Category Statistics:")
logger.info(f" Using existing categories: {existing_count} posts")
logger.info(f" Proposing new categories: {new_count} posts")
return True
@@ -375,7 +473,8 @@ Return ONLY a JSON array with one object per post."""
'post_id', 'title', 'current_site', 'current_categories',
'proposed_category', 'alternative_categories',
'category_reason', 'category_confidence',
'recommended_site', 'should_migrate', 'migration_reason'
'recommended_site', 'should_migrate', 'migration_reason',
'is_existing_category'
]
logger.info(f"\nExporting to: {output_file}")