Refactor category_propose to align with editorial strategy

Major refactoring of the category proposal system:

### Changes:
- Integrated editorial strategy into category proposals
- Added site migration recommendations to category proposals
- AI now considers editorial lines when suggesting categories
- Automatic detection of posts that should migrate between sites

### New Features:
- Editorial line definitions for each site
- Topic-based site matching algorithm
- Migration recommendations alongside category proposals
- Dual output: category_proposals_*.csv + migration_recommendations_*.csv

### Editorial Lines:
- mistergeek.net: VPN, Software, Gaming, SEO, Tech (high-value)
- webscroll.fr: Torrenting, File-Sharing, Tracker Guides (niche)
- hellogeek.net: Experimental, Low-Traffic, Off-Brand (catch-all)

### Output Files:
1. category_proposals_*.csv - Categories + site recommendations
2. migration_recommendations_*.csv - Posts to migrate between sites

### CSV Columns Added:
- recommended_site - Best site for the post
- should_migrate - Yes/No flag
- migration_reason - Why migration is recommended
- current_site - Original site for comparison

### Benefits:
- Categories aligned with site strategy
- Automatic migration detection
- Smarter AI prompts with editorial context
- Unified category + migration workflow

Usage:
    ./seo category_propose  # Generates both category and migration files

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 15:43:26 +01:00
parent c980edf047
commit b5c586e7ad
3 changed files with 270 additions and 34 deletions
--- a/src/seo/app.py
+++ b/src/seo/app.py
@@ -5,7 +5,7 @@ SEO Application Core - Integrated SEO automation functionality
 import logging
 from pathlib import Path
 from datetime import datetime
-from typing import Optional, List
+from typing import Optional, List, Tuple

 from .exporter import PostExporter
 from .analyzer import EnhancedPostAnalyzer
@@ -64,9 +64,18 @@ class SEOApp:
        analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
        return analyzer.run(output_file=output, update_input=update)
    
-    def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> str:
-        """Propose categories for posts."""
-        logger.info("🏷️  Proposing categories with AI...")
+    def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
+        """
+        Propose categories for posts with editorial strategy alignment.
+        
+        Args:
+            csv_file: Path to CSV file (uses latest export if not provided)
+            output: Custom output file path
+            
+        Returns:
+            Tuple of (proposals_file, migrations_file)
+        """
+        logger.info("🏷️  Proposing categories with AI (editorial strategy aligned)...")
        
        if not csv_file:
            csv_file = self._find_latest_export()
@@ -76,7 +85,7 @@ class SEOApp:
        
        logger.info(f"Using file: {csv_file}")
        
-        proposer = CategoryProposer(csv_file)
+        proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
        return proposer.run(output_file=output)
    
    def category_apply(self, proposals_csv: str, site_name: str,
--- a/src/seo/category_proposer.py
+++ b/src/seo/category_proposer.py
@@ -1,5 +1,6 @@
 """
-Category Proposer - AI-powered category suggestions
+Category Proposer - AI-powered category suggestions with editorial strategy alignment
+Proposes categories based on content AND site editorial lines
 """

 import csv
@@ -7,26 +8,74 @@ import json
 import logging
 from pathlib import Path
 from datetime import datetime
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 import requests

 from .config import Config
+from .editorial_strategy import EditorialStrategyAnalyzer

 logger = logging.getLogger(__name__)


-class CategoryProposer:
-    """Propose categories for posts using AI."""
+# Editorial line definitions for each site
+EDITORIAL_LINES = {
+    'mistergeek.net': {
+        'focus': 'High-value tech content',
+        'ideal_categories': [
+            'VPN', 'Software/Tools', 'Gaming', 'SEO', 
+            'Content Marketing', 'Tech Reviews', 'Tutorials'
+        ],
+        'topic_keywords': {
+            'VPN': ['vpn', 'proxy', 'privacy', 'security', 'encryption'],
+            'Software': ['software', 'app', 'tool', 'download', 'install'],
+            'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'],
+            'SEO': ['seo', 'ranking', 'google', 'search', 'optimization'],
+            'Tech': ['tech', 'technology', 'review', 'device', 'hardware'],
+        }
+    },
+    'webscroll.fr': {
+        'focus': 'Torrenting and file-sharing niche',
+        'ideal_categories': [
+            'Torrenting', 'File-Sharing', 'Tracker Guides', 
+            'VPN for Torrenting', 'Seedbox'
+        ],
+        'topic_keywords': {
+            'Torrenting': ['torrent', 'download', 'upload', 'tracker', 'seed'],
+            'File-Sharing': ['file-sharing', 'ddl', 'hosting', 'upload'],
+            'Tracker Guides': ['tracker', 'ratio', 'invite', 'private'],
+        }
+    },
+    'hellogeek.net': {
+        'focus': 'Low-traffic, experimental, off-brand content',
+        'ideal_categories': [
+            'Experimental', 'Low-Traffic', 'Off-Brand', 'Testing'
+        ],
+        'topic_keywords': {}  # Catch-all for everything else
+    }
+}

-    def __init__(self, csv_file: str):
-        """Initialize proposer with CSV file."""
+
+class CategoryProposer:
+    """Propose categories for posts using AI with editorial strategy alignment."""
+
+    def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
+        """
+        Initialize proposer.
+        
+        Args:
+            csv_file: Path to CSV file
+            use_editorial_strategy: If True, align proposals with editorial lines
+        """
        self.csv_file = Path(csv_file)
        self.openrouter_api_key = Config.OPENROUTER_API_KEY
        self.ai_model = Config.AI_MODEL
        self.posts = []
        self.proposed_categories = []
+        self.migration_recommendations = []
        self.api_calls = 0
        self.ai_cost = 0.0
+        self.use_editorial_strategy = use_editorial_strategy
+        self.site_analysis = {}

    def load_csv(self) -> bool:
        """Load posts from CSV."""
@@ -48,15 +97,98 @@ class CategoryProposer:
            logger.error(f"Error loading CSV: {e}")
            return False

+    def analyze_editorial_strategy(self) -> Dict:
+        """Analyze editorial strategy to inform category proposals."""
+        if not self.use_editorial_strategy:
+            return {}
+
+        logger.info("\n📊 Analyzing editorial strategy to inform category proposals...")
+        
+        analyzer = EditorialStrategyAnalyzer()
+        analyzer.load_csv(str(self.csv_file))
+        self.site_analysis = analyzer.analyze_site_content()
+        
+        logger.info("✓ Editorial strategy analysis complete")
+        return self.site_analysis
+
+    def determine_best_site_for_post(self, post: Dict) -> Tuple[str, str, float]:
+        """
+        Determine the best site for a post based on content.
+        
+        Returns:
+            Tuple of (site_name, reason, confidence)
+        """
+        title = (post.get('title', '') + ' ' + post.get('content_preview', '')).lower()
+        current_site = post.get('site', '')
+        
+        # Check topic match for each site
+        site_scores = {}
+        
+        for site_name, editorial in EDITORIAL_LINES.items():
+            score = 0
+            matched_topics = []
+            
+            for topic, keywords in editorial['topic_keywords'].items():
+                if any(kw in title for kw in keywords):
+                    score += 1
+                    matched_topics.append(topic)
+            
+            # Bonus for staying on current site (avoid unnecessary moves)
+            if site_name == current_site:
+                score += 0.5
+            
+            site_scores[site_name] = {
+                'score': score,
+                'topics': matched_topics
+            }
+        
+        # Find best match
+        best_site = max(site_scores.items(), key=lambda x: x[1]['score'])
+        
+        if best_site[1]['score'] >= 1:
+            return (
+                best_site[0],
+                f"Content matches {best_site[0]} editorial line ({', '.join(best_site[1]['topics'])})",
+                min(1.0, best_site[1]['score'] / 3.0)  # Normalize confidence
+            )
+        else:
+            # No strong match, keep on current site or move to hellogeek
+            if current_site in ['mistergeek.net', 'webscroll.fr']:
+                return (current_site, "Keep on current site (no better match)", 0.5)
+            else:
+                return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
+
    def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
-        """Get AI category proposals for a batch of posts."""
+        """Get AI category proposals with editorial strategy context."""
        if not self.openrouter_api_key:
            logger.error("OPENROUTER_API_KEY not set")
            return None

+        # Build editorial context
+        editorial_context = ""
+        if self.use_editorial_strategy:
+            editorial_context = """
+EDITORIAL STRATEGY GUIDELINES:
+
+mistergeek.net (High-value tech):
+- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
+- Focus: Professional, high-traffic tech content
+
+webscroll.fr (Torrenting niche):
+- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
+- Focus: Torrenting and file-sharing only
+
+hellogeek.net (Catch-all):
+- Categories: Experimental, Low-Traffic, Off-Brand, Testing
+- Focus: Everything else, low-traffic content
+
+"""
+
+        # Format posts for AI
        formatted = []
        for i, post in enumerate(batch, 1):
            text = f"{i}. ID: {post['post_id']}\n"
+            text += f"   Site: {post.get('site', '')}\n"
            text += f"   Title: {post.get('title', '')}\n"
            text += f"   Current Categories: {post.get('categories', '')}\n"
            if 'content_preview' in post:
@@ -67,22 +199,28 @@ class CategoryProposer:

        prompt = f"""Analyze these blog posts and propose optimal categories.

+{editorial_context}
+POSTS TO ANALYZE:
+
 {posts_text}

 For EACH post, provide:
 {{
  "post_id": <id>,
  "current_categories": "<current>",
-  "proposed_category": "<best category>",
+  "proposed_category": "<best category from editorial lines above>",
  "alternative_categories": ["<alt1>", "<alt2>"],
-  "reason": "<brief explanation>",
-  "confidence": "<High|Medium|Low>"
+  "recommended_site": "<best site for this post>",
+  "reason": "<brief explanation referencing editorial strategy>",
+  "confidence": "<High|Medium|Low>",
+  "should_migrate": <true/false>,
+  "migration_reason": "<reason if should_migrate is true>"
 }}

 Return ONLY a JSON array with one object per post."""

        try:
-            logger.info(f"  Getting category proposals...")
+            logger.info(f"  Getting category proposals with editorial alignment...")

            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
@@ -129,11 +267,15 @@ Return ONLY a JSON array with one object per post."""
            return []

    def propose_categories(self, batch_size: int = 10) -> bool:
-        """Propose categories for all posts."""
+        """Propose categories with editorial strategy alignment."""
        logger.info("\n" + "="*70)
-        logger.info("PROPOSING CATEGORIES WITH AI")
+        logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
        logger.info("="*70 + "\n")

+        # Analyze editorial strategy first
+        if self.use_editorial_strategy:
+            self.analyze_editorial_strategy()
+
        batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)]
        logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n")

@@ -158,23 +300,68 @@ Return ONLY a JSON array with one object per post."""
        logger.info(f"  API calls: {self.api_calls}")
        logger.info(f"  Cost: ${self.ai_cost:.4f}")

+        # Process proposals with editorial alignment
        for post in self.posts:
            post_id = str(post['post_id'])
            proposal = all_proposals.get(post_id, {})
-
-            self.proposed_categories.append({
+            
+            current_site = post.get('site', '')
+            
+            # Get AI recommendation or use editorial strategy
+            proposed_category = proposal.get('proposed_category', post.get('categories', ''))
+            recommended_site = proposal.get('recommended_site', current_site)
+            should_migrate = proposal.get('should_migrate', False)
+            
+            # If AI didn't specify, use editorial strategy
+            if not recommended_site or recommended_site == current_site:
+                recommended_site, migration_reason, confidence = self.determine_best_site_for_post(post)
+                should_migrate = (recommended_site != current_site)
+            else:
+                migration_reason = proposal.get('migration_reason', '')
+                confidence = proposal.get('confidence', 'Medium')
+            
+            # Build proposal record
+            proposal_record = {
                **post,
-                'proposed_category': proposal.get('proposed_category', post.get('categories', '')),
+                'proposed_category': proposed_category,
                'alternative_categories': ', '.join(proposal.get('alternative_categories', [])),
                'category_reason': proposal.get('reason', ''),
                'category_confidence': proposal.get('confidence', 'Medium'),
-                'current_categories': post.get('categories', '')
-            })
+                'current_categories': post.get('categories', ''),
+                'recommended_site': recommended_site,
+                'should_migrate': 'Yes' if should_migrate else 'No',
+                'migration_reason': migration_reason,
+                'current_site': current_site
+            }
+            
+            self.proposed_categories.append(proposal_record)
+            
+            # Track migration recommendations
+            if should_migrate:
+                self.migration_recommendations.append({
+                    'post_id': post_id,
+                    'title': post.get('title', '')[:80],
+                    'from_site': current_site,
+                    'to_site': recommended_site,
+                    'reason': migration_reason,
+                    'category': proposed_category
+                })
+
+        # Summary
+        migration_count = len(self.migration_recommendations)
+        logger.info(f"\n📊 Migration Recommendations: {migration_count} posts")
+        if migration_count > 0:
+            by_site = {}
+            for mig in self.migration_recommendations:
+                site = mig['to_site']
+                by_site[site] = by_site.get(site, 0) + 1
+            for site, count in by_site.items():
+                logger.info(f"  To {site}: {count} posts")

        return True

    def export_proposals(self, output_file: Optional[str] = None) -> str:
-        """Export category proposals to CSV."""
+        """Export category proposals with migration data."""
        if not output_file:
            output_dir = Path(__file__).parent.parent.parent / 'output'
            output_dir.mkdir(parents=True, exist_ok=True)
@@ -185,9 +372,10 @@ Return ONLY a JSON array with one object per post."""
        output_file.parent.mkdir(parents=True, exist_ok=True)

        fieldnames = [
-            'post_id', 'title', 'site', 'current_categories',
+            'post_id', 'title', 'current_site', 'current_categories',
            'proposed_category', 'alternative_categories',
-            'category_reason', 'category_confidence'
+            'category_reason', 'category_confidence',
+            'recommended_site', 'should_migrate', 'migration_reason'
        ]

        logger.info(f"\nExporting to: {output_file}")
@@ -200,13 +388,45 @@ Return ONLY a JSON array with one object per post."""
        logger.info(f"✓ Exported {len(self.proposed_categories)} proposals")
        return str(output_file)

-    def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> str:
-        """Run complete category proposal process."""
-        if not self.load_csv():
+    def export_migrations(self, output_file: Optional[str] = None) -> str:
+        """Export migration recommendations separately."""
+        if not self.migration_recommendations:
+            logger.info("No migration recommendations to export")
            return ""

+        if not output_file:
+            output_dir = Path(__file__).parent.parent.parent / 'output'
+            output_dir.mkdir(parents=True, exist_ok=True)
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+            output_file = output_dir / f'migration_recommendations_{timestamp}.csv'
+
+        output_file = Path(output_file)
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+
+        fieldnames = [
+            'post_id', 'title', 'from_site', 'to_site', 'reason', 'category'
+        ]
+
+        logger.info(f"\nExporting migrations to: {output_file}")
+
+        with open(output_file, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(self.migration_recommendations)
+
+        logger.info(f"✓ Exported {len(self.migration_recommendations)} migration recommendations")
+        return str(output_file)
+
+    def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> Tuple[str, str]:
+        """Run complete category proposal process with editorial strategy."""
+        if not self.load_csv():
+            return "", ""
+
        if not self.propose_categories(batch_size=batch_size):
            logger.error("Failed to propose categories")
-            return ""
+            return "", ""

-        return self.export_proposals(output_file)
+        proposals_file = self.export_proposals(output_file)
+        migrations_file = self.export_migrations()
+
+        return proposals_file, migrations_file
--- a/src/seo/cli.py
+++ b/src/seo/cli.py
@@ -140,10 +140,17 @@ def cmd_category_propose(app, args):
    
    csv_file = args.args[0] if args.args else None
    
-    result = app.category_propose(csv_file=csv_file, output=args.output)
+    proposals_file, migrations_file = app.category_propose(csv_file=csv_file, output=args.output)
    
-    if result:
-        print(f"✅ Category proposals saved to: {result}")
+    if proposals_file:
+        print(f"\n✅ Category proposals complete!")
+        print(f"  Proposals: {proposals_file}")
+        if migrations_file:
+            print(f"  Migrations: {migrations_file}")
+        print(f"\nReview the files to see:")
+        print(f"  1. Proposed categories for each post")
+        print(f"  2. Site migration recommendations")
+        print(f"  3. Editorial strategy alignment")
    return 0