Add existing category fetching to category_propose

New Feature:
- Fetch existing categories from WordPress sites before AI proposals
- AI now prefers existing categories to avoid duplicates
- Shows existing categories in AI prompt for better suggestions
- Tracks whether proposed categories are existing or new

### Changes:
- fetch_existing_categories() method - Gets categories from all sites
- Updated AI prompt includes existing categories list
- New CSV column: is_existing_category (Yes/No)
- Statistics showing existing vs new categories

### Benefits:
- Reduces category duplication
- Maintains consistency across posts
- AI makes smarter category suggestions
- Users can see which proposed categories already exist and which are new

### AI Prompt Enhancement:
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):

mistergeek.net:
  - VPN
  - Software
  - Gaming
  ...

webscroll.fr:
  - Torrenting
  - File-Sharing
  ...

IMPORTANT: Use existing categories when possible...

### Output:
Category Statistics:
  Using existing categories: 145 posts
  Proposing new categories: 12 posts

Usage:
./seo category_propose  # Now fetches existing categories automatically

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
Kevin Bataille
2026-02-16 15:58:51 +01:00
parent b5c586e7ad
commit 6a574bf07c
2 changed files with 117 additions and 15 deletions

View File

@@ -64,13 +64,15 @@ class SEOApp:
analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields) analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
return analyzer.run(output_file=output, update_input=update) return analyzer.run(output_file=output, update_input=update)
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]: def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None,
fetch_existing: bool = True) -> Tuple[str, str]:
""" """
Propose categories for posts with editorial strategy alignment. Propose categories for posts with editorial strategy alignment.
Args: Args:
csv_file: Path to CSV file (uses latest export if not provided) csv_file: Path to CSV file (uses latest export if not provided)
output: Custom output file path output: Custom output file path
fetch_existing: If True, fetch existing categories from WordPress
Returns: Returns:
Tuple of (proposals_file, migrations_file) Tuple of (proposals_file, migrations_file)
@@ -85,7 +87,8 @@ class SEOApp:
logger.info(f"Using file: {csv_file}") logger.info(f"Using file: {csv_file}")
proposer = CategoryProposer(csv_file, use_editorial_strategy=True) proposer = CategoryProposer(csv_file, use_editorial_strategy=True,
fetch_existing_categories=fetch_existing)
return proposer.run(output_file=output) return proposer.run(output_file=output)
def category_apply(self, proposals_csv: str, site_name: str, def category_apply(self, proposals_csv: str, site_name: str,

View File

@@ -1,6 +1,6 @@
""" """
Category Proposer - AI-powered category suggestions with editorial strategy alignment Category Proposer - AI-powered category suggestions with editorial strategy alignment
Proposes categories based on content AND site editorial lines Fetches existing categories from WordPress sites and uses them in proposals
""" """
import csv import csv
@@ -10,6 +10,7 @@ from pathlib import Path
from datetime import datetime from datetime import datetime
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
import requests import requests
from requests.auth import HTTPBasicAuth
from .config import Config from .config import Config
from .editorial_strategy import EditorialStrategyAnalyzer from .editorial_strategy import EditorialStrategyAnalyzer
@@ -58,13 +59,15 @@ EDITORIAL_LINES = {
class CategoryProposer: class CategoryProposer:
"""Propose categories for posts using AI with editorial strategy alignment.""" """Propose categories for posts using AI with editorial strategy alignment."""
def __init__(self, csv_file: str, use_editorial_strategy: bool = True): def __init__(self, csv_file: str, use_editorial_strategy: bool = True,
fetch_existing_categories: bool = True):
""" """
Initialize proposer. Initialize proposer.
Args: Args:
csv_file: Path to CSV file csv_file: Path to CSV file
use_editorial_strategy: If True, align proposals with editorial lines use_editorial_strategy: If True, align proposals with editorial lines
fetch_existing_categories: If True, fetch existing categories from WordPress
""" """
self.csv_file = Path(csv_file) self.csv_file = Path(csv_file)
self.openrouter_api_key = Config.OPENROUTER_API_KEY self.openrouter_api_key = Config.OPENROUTER_API_KEY
@@ -75,7 +78,63 @@ class CategoryProposer:
self.api_calls = 0 self.api_calls = 0
self.ai_cost = 0.0 self.ai_cost = 0.0
self.use_editorial_strategy = use_editorial_strategy self.use_editorial_strategy = use_editorial_strategy
self.fetch_existing_categories = fetch_existing_categories
self.site_analysis = {} self.site_analysis = {}
self.existing_categories_by_site = {} # Fetched from WordPress
self.sites = Config.WORDPRESS_SITES # Add sites config
def fetch_existing_categories(self) -> Dict[str, List[str]]:
    """
    Fetch existing category names from every configured WordPress site.

    Walks the paginated ``/wp-json/wp/v2/categories`` endpoint of each site
    in ``self.sites`` using HTTP basic auth. The result is stored on
    ``self.existing_categories_by_site`` and also returned.

    Fetching is best-effort: a non-200 response stops pagination for that
    site and keeps whatever was collected so far; any exception is logged
    as a warning and leaves that site with an empty list.

    Returns:
        Dict mapping site name to list of category names
    """
    # NOTE(review): __init__ assigns the boolean flag to
    # ``self.fetch_existing_categories``, which shadows this method on the
    # instance. ``propose_categories`` then does
    # ``if self.fetch_existing_categories: self.fetch_existing_categories()``,
    # which would call a bool and raise TypeError. The flag (or this method)
    # needs a different name; that cannot be fixed from inside this method.

    # Local import so this method stays self-contained without touching the
    # module-level import block.
    import html

    # Page size is used both for the request and for detecting the last
    # page, so keep it in one place.
    per_page = 100

    logger.info("\n📁 Fetching existing categories from WordPress sites...")
    categories_by_site = {}

    for site_name, site_config in self.sites.items():
        try:
            base_url = site_config['url'].rstrip('/')
            auth = HTTPBasicAuth(site_config['username'], site_config['password'])
            logger.info(f" Fetching from {site_name}...")

            all_categories = []
            page = 1
            while True:
                response = requests.get(
                    f"{base_url}/wp-json/wp/v2/categories",
                    params={'per_page': per_page, 'page': page},
                    auth=auth,
                    timeout=10
                )
                if response.status_code != 200:
                    logger.warning(f" Could not fetch categories: {response.status_code}")
                    break

                page_categories = response.json()
                if not page_categories:
                    break

                # The REST API returns term names HTML-escaped
                # (e.g. "Tips &amp; Tricks"); decode them so the names shown
                # to the AI and compared against proposals are the real ones.
                all_categories.extend(
                    html.unescape(cat['name']) for cat in page_categories
                )

                # A short page means there is nothing left to fetch.
                if len(page_categories) < per_page:
                    break
                page += 1

            categories_by_site[site_name] = all_categories
            logger.info(f" ✓ Found {len(all_categories)} categories")
        except Exception as e:
            # Deliberate best-effort: one unreachable or misconfigured site
            # must not abort the whole proposal run.
            logger.warning(f" Error fetching from {site_name}: {e}")
            categories_by_site[site_name] = []

    self.existing_categories_by_site = categories_by_site
    return categories_by_site
def load_csv(self) -> bool: def load_csv(self) -> bool:
"""Load posts from CSV.""" """Load posts from CSV."""
@@ -159,29 +218,53 @@ class CategoryProposer:
return ('hellogeek.net', "Low-traffic/off-brand content", 0.4) return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
def get_category_proposals(self, batch: List[Dict]) -> Optional[str]: def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
"""Get AI category proposals with editorial strategy context.""" """Get AI category proposals with editorial strategy context and existing categories."""
if not self.openrouter_api_key: if not self.openrouter_api_key:
logger.error("OPENROUTER_API_KEY not set") logger.error("OPENROUTER_API_KEY not set")
return None return None
# Build editorial context # Build editorial context with existing categories
editorial_context = "" editorial_context = ""
if self.use_editorial_strategy: if self.use_editorial_strategy:
editorial_context = """ editorial_context = """
EDITORIAL STRATEGY GUIDELINES: EDITORIAL STRATEGY GUIDELINES:
mistergeek.net (High-value tech): mistergeek.net (High-value tech):
- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews - Ideal Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
- Focus: Professional, high-traffic tech content - Focus: Professional, high-traffic tech content
webscroll.fr (Torrenting niche): webscroll.fr (Torrenting niche):
- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox - Ideal Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
- Focus: Torrenting and file-sharing only - Focus: Torrenting and file-sharing only
hellogeek.net (Catch-all): hellogeek.net (Catch-all):
- Categories: Experimental, Low-Traffic, Off-Brand, Testing - Ideal Categories: Experimental, Low-Traffic, Off-Brand, Testing
- Focus: Everything else, low-traffic content - Focus: Everything else, low-traffic content
"""
# Add existing categories from WordPress
existing_cats_context = ""
if self.existing_categories_by_site:
existing_cats_context = """
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):
"""
for site_name, categories in self.existing_categories_by_site.items():
if categories:
existing_cats_context += f"{site_name}:\n"
for cat in categories[:20]: # Show first 20
existing_cats_context += f" - {cat}\n"
if len(categories) > 20:
existing_cats_context += f" ... and {len(categories) - 20} more\n"
existing_cats_context += "\n"
existing_cats_context += """
IMPORTANT: Use existing categories when possible. Only propose new categories if:
1. The post doesn't fit any existing category
2. The new category would have multiple posts (not one-off)
3. It aligns with the site's editorial strategy
""" """
# Format posts for AI # Format posts for AI
@@ -200,6 +283,7 @@ hellogeek.net (Catch-all):
prompt = f"""Analyze these blog posts and propose optimal categories. prompt = f"""Analyze these blog posts and propose optimal categories.
{editorial_context} {editorial_context}
{existing_cats_context}
POSTS TO ANALYZE: POSTS TO ANALYZE:
{posts_text} {posts_text}
@@ -208,19 +292,20 @@ For EACH post, provide:
{{ {{
"post_id": <id>, "post_id": <id>,
"current_categories": "<current>", "current_categories": "<current>",
"proposed_category": "<best category from editorial lines above>", "proposed_category": "<best category - PREFER EXISTING ONES FROM LIST ABOVE>",
"alternative_categories": ["<alt1>", "<alt2>"], "alternative_categories": ["<alt1>", "<alt2>"],
"recommended_site": "<best site for this post>", "recommended_site": "<best site for this post>",
"reason": "<brief explanation referencing editorial strategy>", "reason": "<brief explanation referencing editorial strategy>",
"confidence": "<High|Medium|Low>", "confidence": "<High|Medium|Low>",
"should_migrate": <true/false>, "should_migrate": <true/false>,
"migration_reason": "<reason if should_migrate is true>" "migration_reason": "<reason if should_migrate is true>",
"is_existing_category": <true/false - true if proposed_category exists on the site>
}} }}
Return ONLY a JSON array with one object per post.""" Return ONLY a JSON array with one object per post."""
try: try:
logger.info(f" Getting category proposals with editorial alignment...") logger.info(f" Getting category proposals with existing categories...")
response = requests.post( response = requests.post(
"https://openrouter.ai/api/v1/chat/completions", "https://openrouter.ai/api/v1/chat/completions",
@@ -267,11 +352,15 @@ Return ONLY a JSON array with one object per post."""
return [] return []
def propose_categories(self, batch_size: int = 10) -> bool: def propose_categories(self, batch_size: int = 10) -> bool:
"""Propose categories with editorial strategy alignment.""" """Propose categories with editorial strategy alignment and existing categories."""
logger.info("\n" + "="*70) logger.info("\n" + "="*70)
logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY") logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
logger.info("="*70 + "\n") logger.info("="*70 + "\n")
# Fetch existing categories from WordPress
if self.fetch_existing_categories:
self.fetch_existing_categories()
# Analyze editorial strategy first # Analyze editorial strategy first
if self.use_editorial_strategy: if self.use_editorial_strategy:
self.analyze_editorial_strategy() self.analyze_editorial_strategy()
@@ -311,6 +400,7 @@ Return ONLY a JSON array with one object per post."""
proposed_category = proposal.get('proposed_category', post.get('categories', '')) proposed_category = proposal.get('proposed_category', post.get('categories', ''))
recommended_site = proposal.get('recommended_site', current_site) recommended_site = proposal.get('recommended_site', current_site)
should_migrate = proposal.get('should_migrate', False) should_migrate = proposal.get('should_migrate', False)
is_existing = proposal.get('is_existing_category', True)
# If AI didn't specify, use editorial strategy # If AI didn't specify, use editorial strategy
if not recommended_site or recommended_site == current_site: if not recommended_site or recommended_site == current_site:
@@ -331,7 +421,8 @@ Return ONLY a JSON array with one object per post."""
'recommended_site': recommended_site, 'recommended_site': recommended_site,
'should_migrate': 'Yes' if should_migrate else 'No', 'should_migrate': 'Yes' if should_migrate else 'No',
'migration_reason': migration_reason, 'migration_reason': migration_reason,
'current_site': current_site 'current_site': current_site,
'is_existing_category': 'Yes' if is_existing else 'No'
} }
self.proposed_categories.append(proposal_record) self.proposed_categories.append(proposal_record)
@@ -357,6 +448,13 @@ Return ONLY a JSON array with one object per post."""
by_site[site] = by_site.get(site, 0) + 1 by_site[site] = by_site.get(site, 0) + 1
for site, count in by_site.items(): for site, count in by_site.items():
logger.info(f" To {site}: {count} posts") logger.info(f" To {site}: {count} posts")
# Summary of existing vs new categories
existing_count = sum(1 for p in self.proposed_categories if p.get('is_existing_category') == 'Yes')
new_count = len(self.proposed_categories) - existing_count
logger.info(f"\n📊 Category Statistics:")
logger.info(f" Using existing categories: {existing_count} posts")
logger.info(f" Proposing new categories: {new_count} posts")
return True return True
@@ -375,7 +473,8 @@ Return ONLY a JSON array with one object per post."""
'post_id', 'title', 'current_site', 'current_categories', 'post_id', 'title', 'current_site', 'current_categories',
'proposed_category', 'alternative_categories', 'proposed_category', 'alternative_categories',
'category_reason', 'category_confidence', 'category_reason', 'category_confidence',
'recommended_site', 'should_migrate', 'migration_reason' 'recommended_site', 'should_migrate', 'migration_reason',
'is_existing_category'
] ]
logger.info(f"\nExporting to: {output_file}") logger.info(f"\nExporting to: {output_file}")