seo/scripts/content_strategy_analyzer.py

"""
Multi-Site Content Strategy Analyzer
Analyzes all content (published + drafts) across 3 websites.
Recommends optimal distribution and consolidation strategy.
"""

import csv
import json
import argparse
from pathlib import Path
from collections import defaultdict
from datetime import datetime


class ContentStrategyAnalyzer:
    """Analyze and optimize content distribution across multiple sites."""

    def __init__(self):
        """Initialize analyzer."""
        self.output_dir = Path('output')
        self.output_dir.mkdir(exist_ok=True)
        (self.output_dir / 'analysis').mkdir(exist_ok=True)
        (self.output_dir / 'reports').mkdir(exist_ok=True)
        (self.output_dir / 'logs').mkdir(exist_ok=True)

        self.logs = []

    def log(self, message):
        """Log message."""
        self.logs.append(message)
        print(message)

    def load_wordpress_posts(self, csv_path):
        """Load published WordPress posts."""
        posts = {}
        if not csv_path.exists():
            self.log(f"⚠️  WordPress posts file not found: {csv_path}")
            return posts

        try:
            with open(csv_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    post_id = row.get('ID') or row.get('post_id')
                    if not post_id:
                        continue

                    posts[post_id] = {
                        'source': 'wordpress',
                        'status': 'published',
                        'title': row.get('Title') or row.get('title') or row.get('post_title') or '',
                        'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
                        'author': row.get('Author') or row.get('author') or 'Unknown',
                        'traffic': int(row.get('traffic', 0) or 0),
                        'impressions': int(row.get('impressions', 0) or 0),
                        'position': float(row.get('avg_position', 0) or 0),
                        'category': row.get('Category') or row.get('category') or '',
                    }

            self.log(f"✓ Loaded {len(posts)} published WordPress posts")
        except Exception as e:
            self.log(f"❌ Error reading WordPress posts: {e}")

        return posts

    def load_draft_posts(self, csv_path):
        """Load draft/unpublished posts."""
        posts = {}
        if not csv_path.exists():
            self.log(f"⚠️  Draft posts file not found: {csv_path}")
            return posts

        try:
            with open(csv_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    post_id = row.get('ID') or row.get('post_id')
                    if not post_id:
                        continue

                    posts[post_id] = {
                        'source': 'draft',
                        'status': 'draft',
                        'title': row.get('Title') or row.get('title') or row.get('post_title') or '',
                        'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
                        'author': row.get('Author') or row.get('author') or 'Unknown',
                        'traffic': 0,  # Drafts have no traffic
                        'impressions': 0,
                        'position': 0,
                        'category': row.get('Category') or row.get('category') or '',
                    }

            self.log(f"✓ Loaded {len(posts)} draft posts")
        except Exception as e:
            self.log(f"❌ Error reading draft posts: {e}")

        return posts

    def classify_post_topic(self, post):
        """Classify post into topic area."""
        title = post['title'].lower()
        category = post['category'].lower()
        content = f"{title} {category}"

        # Topic classification based on keywords
        topic_keywords = {
            'torrent': ['torrent', 'ygg', 'ratio', 'tracker', 'magnet', 'seedbox', 'upload'],
            'streaming': ['stream', 'film', 'série', 'netflix', 'disney', 'platforma'],
            'vpn': ['vpn', 'proxy', 'anonyme', 'privacy', 'chiffr'],
            'software': ['software', 'tool', 'app', 'logiciel', 'outil', 'program'],
            'gaming': ['game', 'jeu', 'gaming', 'emula', 'console', 'retro'],
            'download': ['download', 'télécharge', 'ddl', 'upload'],
            'tech': ['tech', 'informatique', 'code', 'programming', 'developer'],
            'other': [],
        }

        for topic, keywords in topic_keywords.items():
            if topic == 'other':
                continue
            for keyword in keywords:
                if keyword in content:
                    return topic

        return 'other'

    def classify_website(self, post):
        """Determine which website this post should be on."""
        topic = self.classify_post_topic(post)
        author = post.get('author', '').strip()
        is_sponsored = author == 'Expert'

        # Website assignment rules
        if topic == 'torrent' or topic == 'download':
            return {
                'site': 'webscroll.fr',
                'reason': f'Torrent/file-sharing content',
                'priority': 'HIGH' if post['traffic'] > 100 else 'MEDIUM'
            }

        if topic in ['vpn', 'software', 'gaming', 'tech']:
            return {
                'site': 'mistergeek.net',
                'reason': f'{topic.capitalize()} - core content',
                'priority': 'HIGH' if post['traffic'] > 50 else 'MEDIUM'
            }

        if topic == 'streaming' and post['traffic'] < 100:
            return {
                'site': 'hellogeek.net',
                'reason': 'Low-traffic streaming content',
                'priority': 'LOW'
            }

        if topic == 'other' or post['traffic'] < 10:
            return {
                'site': 'hellogeek.net',
                'reason': 'Off-brand or low-traffic content',
                'priority': 'LOW'
            }

        # Default to main site
        return {
            'site': 'mistergeek.net',
            'reason': 'Core content',
            'priority': 'MEDIUM'
        }

    def classify_content_action(self, post):
        """Determine what action to take with this post."""
        topic = self.classify_post_topic(post)
        traffic = post.get('traffic', 0)
        impressions = post.get('impressions', 0)
        position = post.get('position', 0)
        status = post.get('status', 'published')

        # Determine action
        if status == 'draft':
            if traffic == 0:
                return 'REVIEW_PUBLISH_OR_DELETE'  # Unpublished draft
            else:
                return 'REPUBLISH'  # Was published, now draft

        if traffic < 5 and impressions < 20:
            return 'DELETE_OR_CONSOLIDATE'

        if traffic > 0 and position > 0 and position < 11:
            return 'KEEP_OPTIMIZE'

        if position > 11 and position < 30:
            return 'KEEP_OPTIMIZE'

        if position > 30 or traffic < 10:
            return 'MOVE_TO_OTHER_SITE'

        return 'KEEP_MONITOR'

    def analyze_all_content(self, posts):
        """Analyze and classify all posts."""
        analysis = {
            'total_posts': len(posts),
            'by_site': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
            'by_topic': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
            'by_action': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
            'sponsored_posts': {'count': 0, 'traffic': 0, 'posts': []},
            'draft_posts': {'count': 0, 'posts': []},
        }

        for post_id, post in posts.items():
            topic = self.classify_post_topic(post)
            site_assignment = self.classify_website(post)
            action = self.classify_content_action(post)
            is_sponsored = post.get('author', '').strip() == 'Expert'
            is_draft = post.get('status') == 'draft'

            # Record in analysis
            analysis['by_site'][site_assignment['site']]['count'] += 1
            analysis['by_site'][site_assignment['site']]['traffic'] += post['traffic']
            analysis['by_site'][site_assignment['site']]['posts'].append({
                'id': post_id,
                'title': post['title'],
                'traffic': post['traffic'],
                'reason': site_assignment['reason']
            })

            analysis['by_topic'][topic]['count'] += 1
            analysis['by_topic'][topic]['traffic'] += post['traffic']

            analysis['by_action'][action]['count'] += 1
            analysis['by_action'][action]['traffic'] += post['traffic']

            if is_sponsored:
                analysis['sponsored_posts']['count'] += 1
                analysis['sponsored_posts']['traffic'] += post['traffic']
                analysis['sponsored_posts']['posts'].append({
                    'id': post_id,
                    'title': post['title'],
                    'traffic': post['traffic']
                })

            if is_draft:
                analysis['draft_posts']['count'] += 1
                analysis['draft_posts']['posts'].append({
                    'id': post_id,
                    'title': post['title'],
                    'status': 'draft'
                })

        return analysis

    def generate_content_distribution_csv(self, posts, output_path):
        """Export detailed content distribution plan."""
        try:
            fieldnames = [
                'post_id', 'title', 'topic', 'status', 'author',
                'traffic', 'impressions', 'position',
                'recommended_site', 'reason', 'action',
                'priority', 'notes'
            ]

            rows = []
            for post_id, post in posts.items():
                topic = self.classify_post_topic(post)
                site_assignment = self.classify_website(post)
                action = self.classify_content_action(post)
                author = post.get('author', '').strip()
                is_sponsored = author == 'Expert'

                rows.append({
                    'post_id': post_id,
                    'title': post['title'][:80],
                    'topic': topic,
                    'status': post.get('status', 'published'),
                    'author': author,
                    'traffic': post.get('traffic', 0),
                    'impressions': post.get('impressions', 0),
                    'position': post.get('position', 0),
                    'recommended_site': site_assignment['site'],
                    'reason': site_assignment['reason'],
                    'action': action,
                    'priority': site_assignment['priority'],
                    'notes': 'SPONSORED' if is_sponsored else ''
                })

            rows.sort(key=lambda x: x['traffic'], reverse=True)

            with open(output_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(rows)

            self.log(f"✓ Exported {len(rows)} posts to {output_path}")
        except Exception as e:
            self.log(f"❌ Error exporting CSV: {e}")

    def generate_strategy_report(self, analysis, output_path):
        """Generate comprehensive strategy report."""
        try:
            report = []
            report.append("# Multi-Site Content Strategy Report\n")
            report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*\n\n")

            # Executive Summary
            report.append("## Executive Summary\n\n")
            report.append(f"**Total Content Analyzed:** {analysis['total_posts']} posts\n")
            report.append(f"- Published: {analysis['total_posts'] - analysis['draft_posts']['count']}\n")
            report.append(f"- Drafts: {analysis['draft_posts']['count']}\n")
            report.append(f"- Sponsored: {analysis['sponsored_posts']['count']}\n\n")

            # Distribution Strategy
            report.append("## Recommended Site Distribution\n\n")
            for site, data in sorted(analysis['by_site'].items(),
                                    key=lambda x: x[1]['traffic'], reverse=True):
                report.append(f"### {site}\n")
                report.append(f"- Posts: {data['count']}\n")
                report.append(f"- Total Traffic: {data['traffic']:,} visits/month\n")
                report.append(f"- Top Posts:\n")
                for post in sorted(data['posts'], key=lambda x: x['traffic'], reverse=True)[:5]:
                    report.append(f"  - {post['title'][:60]} ({post['traffic']} visits)\n")
                report.append(f"\n")

            # Topic Distribution
            report.append("## Content by Topic\n\n")
            for topic, data in sorted(analysis['by_topic'].items(),
                                     key=lambda x: x[1]['traffic'], reverse=True):
                report.append(f"- **{topic.title()}:** {data['count']} posts ({data['traffic']:,} visits)\n")
            report.append("\n")

            # Actions Required
            report.append("## Required Actions\n\n")
            for action, data in sorted(analysis['by_action'].items(),
                                      key=lambda x: x[1]['count'], reverse=True):
                report.append(f"- **{action}:** {data['count']} posts ({data['traffic']:,} visits)\n")
            report.append("\n")

            # Sponsored Content
            if analysis['sponsored_posts']['count'] > 0:
                report.append("## Sponsored Content (by 'Expert')\n\n")
                report.append(f"Total: {analysis['sponsored_posts']['count']} posts\n")
                report.append(f"Traffic: {analysis['sponsored_posts']['traffic']:,} visits/month\n\n")
                for post in sorted(analysis['sponsored_posts']['posts'],
                                 key=lambda x: x['traffic'], reverse=True)[:10]:
                    report.append(f"- {post['title'][:70]} ({post['traffic']} visits)\n")
                report.append("\n")

            # Draft Posts
            if analysis['draft_posts']['count'] > 0:
                report.append("## Draft Posts (Unpublished)\n\n")
                report.append(f"Total: {analysis['draft_posts']['count']} posts\n")
                report.append("*Decision needed: Publish, delete, or move to other site?*\n\n")
                for post in analysis['draft_posts']['posts'][:15]:
                    report.append(f"- {post['title'][:70]}\n")
                report.append("\n")

            # Recommendations
            report.append("## Strategic Recommendations\n\n")
            report.append("1. **Consolidate on mistergeek.net:**\n")
            report.append("   - Keep only VPN, software, gaming, tech content\n")
            report.append("   - Focus on high-traffic posts (>50 visits/month)\n\n")

            report.append("2. **Move to webscroll.fr:**\n")
            report.append("   - All torrent/file-sharing content\n")
            report.append("   - File-specific guides\n\n")

            report.append("3. **Move to hellogeek.net:**\n")
            report.append("   - Low-traffic content (<50 visits)\n")
            report.append("   - Off-brand content\n")
            report.append("   - Experimental/niche posts\n\n")

            report.append("4. **Delete:**\n")
            report.append(f"   - Posts with <5 visits and <20 impressions\n")
            report.append("   - Duplicates/thin content\n\n")

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(''.join(report))

            self.log(f"✓ Generated strategy report: {output_path}")
        except Exception as e:
            self.log(f"❌ Error generating report: {e}")

    def run(self, wordpress_csv, drafts_csv):
        """Run complete content strategy analysis."""
        self.log("\n" + "="*70)
        self.log("Multi-Site Content Strategy Analyzer")
        self.log("="*70 + "\n")

        # Load posts
        self.log("📚 Loading content...\n")
        wordpress_posts = self.load_wordpress_posts(wordpress_csv)
        draft_posts = self.load_draft_posts(drafts_csv)

        # Combine all posts
        all_posts = {**wordpress_posts, **draft_posts}
        self.log(f"Total posts: {len(all_posts)}\n")

        # Analyze
        self.log("🔍 Analyzing content distribution...\n")
        analysis = self.analyze_all_content(all_posts)

        # Generate outputs
        self.log("📊 Generating outputs...\n")

        output_csv = self.output_dir / 'analysis' / 'content_distribution.csv'
        self.generate_content_distribution_csv(all_posts, output_csv)

        output_md = self.output_dir / 'reports' / 'content_strategy_report.md'
        self.generate_strategy_report(analysis, output_md)

        # Export analysis JSON
        analysis_json = self.output_dir / 'analysis' / 'analysis_summary.json'
        try:
            with open(analysis_json, 'w', encoding='utf-8') as f:
                # Convert defaultdict to regular dict for JSON serialization
                analysis_clean = {
                    'total_posts': analysis['total_posts'],
                    'by_site': dict(analysis['by_site']),
                    'by_topic': {k: {'count': v['count'], 'traffic': v['traffic']}
                               for k, v in analysis['by_topic'].items()},
                    'by_action': {k: {'count': v['count'], 'traffic': v['traffic']}
                                for k, v in analysis['by_action'].items()},
                    'sponsored_posts': {
                        'count': analysis['sponsored_posts']['count'],
                        'traffic': analysis['sponsored_posts']['traffic']
                    },
                    'draft_posts': {
                        'count': analysis['draft_posts']['count']
                    }
                }
                json.dump(analysis_clean, f, indent=2, ensure_ascii=False)
            self.log(f"✓ Exported analysis JSON: {analysis_json}\n")
        except Exception as e:
            self.log(f"❌ Error exporting JSON: {e}\n")

        # Summary
        self.log("\n" + "="*70)
        self.log("ANALYSIS COMPLETE")
        self.log("="*70)
        self.log(f"\nOutputs:")
        self.log(f"  Distribution: {output_csv}")
        self.log(f"  Strategy: {output_md}")
        self.log(f"  Summary: {analysis_json}\n")

        self.log("Next steps:")
        self.log("  1. Review content_strategy_report.md")
        self.log("  2. Review content_distribution.csv")
        self.log("  3. Decide: which posts go to which site?")
        self.log("  4. Plan content consolidation")


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(description='Analyze content across multiple sites')
    parser.add_argument('--wordpress-csv', type=Path,
                       default=Path('input/wordpress/new-propositions.csv'),
                       help='WordPress posts CSV')
    parser.add_argument('--drafts-csv', type=Path,
                       default=Path('input/drafts/drafts.csv'),
                       help='Draft posts CSV')

    args = parser.parse_args()

    analyzer = ContentStrategyAnalyzer()
    analyzer.run(args.wordpress_csv, args.drafts_csv)


if __name__ == '__main__':
    main()