diff --git a/src/seo/app.py b/src/seo/app.py
index e1ea387..387aa76 100644
--- a/src/seo/app.py
+++ b/src/seo/app.py
@@ -64,13 +64,15 @@ class SEOApp:
         analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
         return analyzer.run(output_file=output, update_input=update)
 
-    def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
+    def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None,
+                         fetch_existing: bool = True) -> Tuple[str, str]:
         """
         Propose categories for posts with editorial strategy alignment.
 
         Args:
             csv_file: Path to CSV file (uses latest export if not provided)
             output: Custom output file path
+            fetch_existing: If True, fetch existing categories from WordPress
 
         Returns:
             Tuple of (proposals_file, migrations_file)
@@ -85,7 +87,8 @@ class SEOApp:
 
         logger.info(f"Using file: {csv_file}")
 
-        proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
+        proposer = CategoryProposer(csv_file, use_editorial_strategy=True,
+                                    fetch_existing_categories=fetch_existing)
         return proposer.run(output_file=output)
 
     def category_apply(self, proposals_csv: str, site_name: str,
diff --git a/src/seo/category_proposer.py b/src/seo/category_proposer.py
index 853ab84..66965e4 100644
--- a/src/seo/category_proposer.py
+++ b/src/seo/category_proposer.py
@@ -1,6 +1,6 @@
 """
 Category Proposer - AI-powered category suggestions with editorial strategy alignment
-Proposes categories based on content AND site editorial lines
+Fetches existing categories from WordPress sites and uses them in proposals
 """
 
 import csv
@@ -10,6 +10,7 @@ from pathlib import Path
 from datetime import datetime
 from typing import Dict, List, Optional, Tuple
 import requests
+from requests.auth import HTTPBasicAuth
 
 from .config import Config
 from .editorial_strategy import EditorialStrategyAnalyzer
@@ -58,13 +59,15 @@ EDITORIAL_LINES = {
 class CategoryProposer:
     """Propose categories for posts using AI with editorial strategy alignment."""
 
-    def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
+    def __init__(self, csv_file: str, use_editorial_strategy: bool = True,
+                 fetch_existing_categories: bool = True):
         """
         Initialize proposer.
 
         Args:
             csv_file: Path to CSV file
             use_editorial_strategy: If True, align proposals with editorial lines
+            fetch_existing_categories: If True, fetch existing categories from WordPress
         """
         self.csv_file = Path(csv_file)
         self.openrouter_api_key = Config.OPENROUTER_API_KEY
@@ -75,7 +78,63 @@ class CategoryProposer:
         self.api_calls = 0
         self.ai_cost = 0.0
         self.use_editorial_strategy = use_editorial_strategy
+        self.should_fetch_existing_categories = fetch_existing_categories  # flag only: sharing the method's name would shadow fetch_existing_categories()
         self.site_analysis = {}
+        self.existing_categories_by_site = {}  # site name -> category names, filled by fetch_existing_categories()
+        self.sites = Config.WORDPRESS_SITES  # site name -> settings (url, username, password are read below)
+
+    def fetch_existing_categories(self) -> Dict[str, List[str]]:
+        """
+        Fetch existing categories from all WordPress sites.
+
+        Returns:
+            Dict mapping site name to list of category names
+        """
+        logger.info("\nšŸ“ Fetching existing categories from WordPress sites...")
+
+        categories_by_site = {}
+
+        for site_name, site_config in self.sites.items():
+            try:
+                base_url = site_config['url'].rstrip('/')
+                auth = HTTPBasicAuth(site_config['username'], site_config['password'])
+
+                logger.info(f" Fetching from {site_name}...")
+
+                all_categories = []
+                page = 1
+
+                while True:
+                    response = requests.get(
+                        f"{base_url}/wp-json/wp/v2/categories",
+                        params={'per_page': 100, 'page': page},
+                        auth=auth,
+                        timeout=10
+                    )
+
+                    if response.status_code != 200:
+                        logger.warning(f" Could not fetch categories: {response.status_code}")
+                        break
+
+                    page_categories = response.json()
+                    if not page_categories:
+                        break
+
+                    all_categories.extend([cat['name'] for cat in page_categories])
+
+                    if len(page_categories) < 100:
+                        break
+                    page += 1
+
+                categories_by_site[site_name] = all_categories
+                logger.info(f" āœ“ Found {len(all_categories)} categories")
+
+            except Exception as e:
+                logger.warning(f" Error fetching from {site_name}: {e}")
+                categories_by_site[site_name] = []
+
+        self.existing_categories_by_site = categories_by_site
+        return categories_by_site
 
     def load_csv(self) -> bool:
         """Load posts from CSV."""
@@ -159,29 +218,53 @@ class CategoryProposer:
             return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
 
     def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
-        """Get AI category proposals with editorial strategy context."""
+        """Get AI category proposals with editorial strategy context and existing categories."""
 
         if not self.openrouter_api_key:
             logger.error("OPENROUTER_API_KEY not set")
             return None
 
-        # Build editorial context
+        # Build editorial context with existing categories
         editorial_context = ""
         if self.use_editorial_strategy:
             editorial_context = """
 EDITORIAL STRATEGY GUIDELINES:
 
 mistergeek.net (High-value tech):
-- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
+- Ideal Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
 - Focus: Professional, high-traffic tech content
 
 webscroll.fr (Torrenting niche):
-- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
+- Ideal Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
 - Focus: Torrenting and file-sharing only
 
 hellogeek.net (Catch-all):
-- Categories: Experimental, Low-Traffic, Off-Brand, Testing
+- Ideal Categories: Experimental, Low-Traffic, Off-Brand, Testing
 - Focus: Everything else, low-traffic content
+"""
+
+        # Add existing categories from WordPress
+        existing_cats_context = ""
+        if self.existing_categories_by_site:
+            existing_cats_context = """
+EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):
+
+"""
+            for site_name, categories in self.existing_categories_by_site.items():
+                if categories:
+                    existing_cats_context += f"{site_name}:\n"
+                    for cat in categories[:20]:  # Show first 20
+                        existing_cats_context += f" - {cat}\n"
+                    if len(categories) > 20:
+                        existing_cats_context += f" ... and {len(categories) - 20} more\n"
+                    existing_cats_context += "\n"
+
+            existing_cats_context += """
+IMPORTANT: Use existing categories when possible. Only propose new categories if:
+1. The post doesn't fit any existing category
+2. The new category would have multiple posts (not one-off)
+3. It aligns with the site's editorial strategy
+
 """
 
         # Format posts for AI
@@ -200,6 +283,7 @@ hellogeek.net (Catch-all):
         prompt = f"""Analyze these blog posts and propose optimal categories.
 
 {editorial_context}
+{existing_cats_context}
 
 POSTS TO ANALYZE:
 {posts_text}
@@ -208,19 +292,20 @@ For EACH post, provide:
 {{
   "post_id": ,
   "current_categories": "",
-  "proposed_category": "",
+  "proposed_category": "",
   "alternative_categories": ["", ""],
   "recommended_site": "",
   "reason": "",
   "confidence": "",
   "should_migrate": ,
-  "migration_reason": ""
+  "migration_reason": "",
+  "is_existing_category": 
 }}
 
 Return ONLY a JSON array with one object per post."""
 
         try:
-            logger.info(f" Getting category proposals with editorial alignment...")
+            logger.info(f" Getting category proposals with existing categories...")
 
             response = requests.post(
                 "https://openrouter.ai/api/v1/chat/completions",
@@ -267,11 +352,15 @@ Return ONLY a JSON array with one object per post."""
             return []
 
     def propose_categories(self, batch_size: int = 10) -> bool:
-        """Propose categories with editorial strategy alignment."""
+        """Propose categories with editorial strategy alignment and existing categories."""
         logger.info("\n" + "="*70)
         logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
         logger.info("="*70 + "\n")
 
+        # Fetch existing categories from WordPress (the flag is a separate attribute from the method)
+        if self.should_fetch_existing_categories:
+            self.fetch_existing_categories()
+
         # Analyze editorial strategy first
         if self.use_editorial_strategy:
             self.analyze_editorial_strategy()
@@ -311,6 +400,7 @@ Return ONLY a JSON array with one object per post."""
                 proposed_category = proposal.get('proposed_category', post.get('categories', ''))
                 recommended_site = proposal.get('recommended_site', current_site)
                 should_migrate = proposal.get('should_migrate', False)
+                is_existing = proposal.get('is_existing_category', True)
 
                 # If AI didn't specify, use editorial strategy
                 if not recommended_site or recommended_site == current_site:
@@ -331,7 +421,8 @@ Return ONLY a JSON array with one object per post."""
                     'recommended_site': recommended_site,
                     'should_migrate': 'Yes' if should_migrate else 'No',
                     'migration_reason': migration_reason,
-                    'current_site': current_site
+                    'current_site': current_site,
+                    'is_existing_category': 'Yes' if is_existing else 'No'
                 }
 
                 self.proposed_categories.append(proposal_record)
@@ -357,6 +448,13 @@ Return ONLY a JSON array with one object per post."""
             by_site[site] = by_site.get(site, 0) + 1
         for site, count in by_site.items():
             logger.info(f" To {site}: {count} posts")
+
+        # Summary of existing vs new categories
+        existing_count = sum(1 for p in self.proposed_categories if p.get('is_existing_category') == 'Yes')
+        new_count = len(self.proposed_categories) - existing_count
+        logger.info(f"\nšŸ“Š Category Statistics:")
+        logger.info(f" Using existing categories: {existing_count} posts")
+        logger.info(f" Proposing new categories: {new_count} posts")
 
         return True
 
@@ -375,7 +473,8 @@ Return ONLY a JSON array with one object per post."""
             'post_id', 'title', 'current_site', 'current_categories',
             'proposed_category', 'alternative_categories', 'category_reason',
             'category_confidence',
-            'recommended_site', 'should_migrate', 'migration_reason'
+            'recommended_site', 'should_migrate', 'migration_reason',
+            'is_existing_category'
         ]
 
         logger.info(f"\nExporting to: {output_file}")