Files
seo/scripts/content_gap_analyzer.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

349 lines
12 KiB
Python

"""
Content gap analyzer for SEO strategy.
Identifies missing topics and content opportunities using AI analysis.
"""
import csv
import json
import argparse
import time
from pathlib import Path
from collections import defaultdict
from openai import OpenAI
from config import Config
class ContentGapAnalyzer:
    """Identify content gaps and opportunities.

    Loads exported WordPress posts plus Google Search Console query data,
    uses an OpenRouter-hosted model to cluster existing topics and suggest
    missing ones, then exports the results (gaps CSV, topic JSON, text log).
    """

    # Sort order for exported gap suggestions; unknown priorities sort last.
    _PRIORITY_RANK = {'high': 0, 'medium': 1, 'low': 2}

    def __init__(self):
        """Initialize analyzer.

        The AI client stays None when no OPENROUTER_API_KEY is configured;
        data loading still works in that case but the AI steps are skipped.
        """
        self.config = Config
        self.output_dir = self.config.OUTPUT_DIR
        self.logs = []
        self.client = None
        if self.config.OPENROUTER_API_KEY:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.config.OPENROUTER_API_KEY,
            )

    def log(self, message):
        """Add message to the report log and echo it to stdout."""
        self.logs.append(message)
        print(message)

    @staticmethod
    def _to_int(value):
        """Best-effort int conversion; None/empty/invalid values become 0."""
        try:
            return int(value or 0)
        except (ValueError, TypeError):
            return 0

    @staticmethod
    def _extract_json(text):
        """Parse the outermost {...} JSON object embedded in *text*.

        LLM responses often wrap JSON in prose or markdown, so slice from
        the first '{' to the last '}'. Returns the parsed object, or None
        when no braces are found or the payload is not valid JSON.
        """
        start_idx = text.find('{')
        end_idx = text.rfind('}') + 1
        if start_idx >= 0 and end_idx > start_idx:
            try:
                return json.loads(text[start_idx:end_idx])
            except json.JSONDecodeError:
                return None
        return None

    def load_posts(self, posts_csv):
        """Load post titles and data.

        Args:
            posts_csv: Path to a CSV with ID/Title/URL/traffic/impressions/
                top_keywords columns.

        Returns:
            List of post dicts; empty when the file is missing or unreadable.
        """
        posts = []
        if not posts_csv.exists():
            self.log(f"❌ File not found: {posts_csv}")
            return posts
        try:
            with open(posts_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    posts.append({
                        'id': row.get('ID', ''),
                        'title': row.get('Title', ''),
                        'url': row.get('URL', ''),
                        # Guarded conversion: one malformed number no longer
                        # aborts the whole load with a partial result.
                        'traffic': self._to_int(row.get('traffic', 0)),
                        'impressions': self._to_int(row.get('impressions', 0)),
                        'top_keywords': row.get('top_keywords', '')
                    })
            self.log(f"✓ Loaded {len(posts)} posts")
        except Exception as e:
            self.log(f"❌ Error reading posts: {e}")
        return posts

    def load_gsc_data(self, gsc_csv):
        """Load Search Console queries for gap analysis.

        Keeps only queries that have impressions but a CTR below 5% —
        search demand the site is not capturing.

        Returns:
            List of dicts with query/impressions/clicks/ctr keys.
        """
        queries = []
        if not gsc_csv.exists():
            self.log(f"⚠️ GSC file not found: {gsc_csv}")
            return queries
        try:
            with open(gsc_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        query = row.get('Query', '').strip()
                        if not query:
                            continue
                        impressions = int(row.get('Impressions', 0) or 0)
                        clicks = int(row.get('Clicks', 0) or 0)
                        # Only include queries with impressions but low clicks
                        if impressions > 0 and (clicks / impressions < 0.05):
                            queries.append({
                                'query': query,
                                'impressions': impressions,
                                'clicks': clicks,
                                # impressions > 0 is guaranteed by the guard above.
                                'ctr': clicks / impressions
                            })
                    except (ValueError, TypeError):
                        # Skip rows whose metrics are not numeric.
                        continue
            self.log(f"✓ Loaded {len(queries)} underperforming queries")
        except Exception as e:
            self.log(f"⚠️ Error reading GSC file: {e}")
        return queries

    def extract_topics(self, posts):
        """Extract topic clusters from post titles using AI.

        Returns:
            Parsed analysis dict (post_topics, topic_clusters, coverage_gaps,
            niche), or {} when the AI client is missing, the call fails, or
            the response cannot be parsed.
        """
        if not self.client or not posts:
            self.log("⚠️ Cannot extract topics without AI client or posts")
            return {}
        try:
            self.log("🤖 Extracting topic clusters from post titles...")
            # Limit to the first 100 titles to keep the prompt within budget.
            titles = [p['title'] for p in posts][:100]
            prompt = f"""Analyze these {len(titles)} blog post titles and identify topic clusters:
Titles:
{chr(10).join(f'{i+1}. {t}' for i, t in enumerate(titles))}
Extract for each post:
1. Primary topic category
2. Subtopics covered
3. Content type (guide, tutorial, review, comparison, etc.)
Then identify:
1. Top 10 topic clusters with post counts
2. Most common subtopics
3. Over/under-represented topics
Return JSON:
{{
"post_topics": {{
"1": {{"primary": "...", "subtopics": ["..."], "type": "..."}},
...
}},
"topic_clusters": [
{{"cluster": "...", "post_count": 0, "importance": "high/medium/low"}}
],
"coverage_gaps": ["topic 1", "topic 2", ...],
"niche": "detected niche or industry"
}}"""
            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=1500
            )
            result = self._extract_json(response.choices[0].message.content)
            if result is not None:
                return result
            self.log("⚠️ Could not parse topic extraction response")
        except Exception as e:
            self.log(f"⚠️ Topic extraction failed: {e}")
        return {}

    def identify_content_gaps(self, topic_analysis, queries):
        """Use AI to identify content gaps and suggest new topics.

        Combines the topic clusters / coverage gaps from extract_topics with
        the top underperforming GSC queries.

        Returns:
            List of opportunity dicts (possibly empty).
        """
        if not self.client:
            return []
        try:
            self.log("🤖 Identifying content gaps and opportunities...")
            clusters = topic_analysis.get('topic_clusters', [])
            gaps = topic_analysis.get('coverage_gaps', [])
            niche = topic_analysis.get('niche', 'general')
            # Prepare query analysis: highest-demand queries first.
            top_queries = sorted(queries, key=lambda x: x['impressions'], reverse=True)[:20]
            queries_str = '\n'.join([f"- {q['query']} ({q['impressions']} impr, {q['ctr']:.1%} CTR)"
                                     for q in top_queries])
            prompt = f"""Based on content analysis and search demand, identify content gaps:
Existing Topics: {', '.join([c.get('cluster', '') for c in clusters[:10]])}
Coverage Gaps: {', '.join(gaps[:5])}
Niche: {niche}
Top Underperforming Queries (low CTR despite impressions):
{queries_str}
Identify high-value missing topics that could:
1. Fill coverage gaps
2. Target underperforming queries (CTR improvement)
3. Capitalize on search demand
4. Complement existing content
For each suggestion:
- Topic title
- Why it's valuable (search demand + intent)
- Search volume estimate (high/medium/low)
- How it complements existing content
- Recommended content format
- Estimated traffic potential
Prioritize by traffic opportunity. Max 20 ideas.
Return JSON:
{{
"content_opportunities": [
{{
"title": "...",
"why_valuable": "...",
"search_volume": "high/medium/low",
"complements": "existing topic",
"format": "guide/tutorial/comparison/review/list",
"traffic_potential": number,
"priority": "high/medium/low"
}}
]
}}"""
            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=2000
            )
            result = self._extract_json(response.choices[0].message.content)
            if result is not None:
                return result.get('content_opportunities', [])
            self.log("⚠️ Could not parse gap analysis response")
        except Exception as e:
            self.log(f"⚠️ Gap analysis failed: {e}")
        return []

    def export_gaps_csv(self, gaps, output_csv):
        """Export content gaps to CSV, highest priority first.

        Extra keys in the gap dicts are dropped (extrasaction='ignore');
        priorities outside high/medium/low sort last.
        """
        if not gaps:
            self.log("⚠️ No gaps to export")
            return
        try:
            fieldnames = [
                'priority', 'title', 'why_valuable', 'search_volume',
                'complements', 'format', 'traffic_potential'
            ]
            # Fix: rank high > medium > low instead of only floating 'high'
            # rows to the top and leaving medium/low interleaved.
            rank = self._PRIORITY_RANK
            ordered = sorted(gaps, key=lambda g: rank.get(g.get('priority'), len(rank)))
            with open(output_csv, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(ordered)
            self.log(f"✓ Exported {len(gaps)} content gaps to {output_csv}")
        except Exception as e:
            self.log(f"❌ Error exporting CSV: {e}")

    def export_topic_clusters_json(self, topic_analysis, output_json):
        """Export topic analysis to JSON (no-op when analysis is empty)."""
        if not topic_analysis:
            return
        try:
            with open(output_json, 'w', encoding='utf-8') as f:
                json.dump(topic_analysis, f, indent=2)
            self.log(f"✓ Exported topic analysis to {output_json}")
        except Exception as e:
            self.log(f"❌ Error exporting JSON: {e}")

    def export_log(self, log_file):
        """Write every accumulated log message to a plain-text report."""
        try:
            with open(log_file, 'w', encoding='utf-8') as f:
                f.write("Content Gap Analysis Report\n")
                f.write("=" * 60 + "\n\n")
                for msg in self.logs:
                    f.write(msg + "\n")
            self.log(f"✓ Exported log to {log_file}")
        except Exception as e:
            self.log(f"❌ Error exporting log: {e}")

    def run(self, posts_csv, gsc_csv, output_csv):
        """Run complete analysis workflow.

        Args:
            posts_csv: Path to the posts-with-analytics CSV.
            gsc_csv: Path to the Search Console queries CSV.
            output_csv: Destination path for the content-gaps CSV.
        """
        self.log("📊 Starting content gap analysis...")
        self.log(f"Posts: {posts_csv}")
        self.log(f"GSC queries: {gsc_csv}\n")
        # Load data
        posts = self.load_posts(posts_csv)
        queries = self.load_gsc_data(gsc_csv)
        if not posts:
            return
        # Extract topics
        topic_analysis = self.extract_topics(posts)
        if topic_analysis:
            self.log(f"✓ Identified {len(topic_analysis.get('topic_clusters', []))} topic clusters")
        # Identify gaps
        gaps = self.identify_content_gaps(topic_analysis, queries)
        if gaps:
            self.log(f"✓ Identified {len(gaps)} content opportunities")
        # Export
        self.log("\n📁 Exporting results...")
        self.export_gaps_csv(gaps, output_csv)
        topic_json = self.output_dir / 'topic_clusters.json'
        self.export_topic_clusters_json(topic_analysis, topic_json)
        # Export log
        log_dir = self.output_dir / 'logs'
        # Fix: parents=True so a missing output/ tree doesn't crash the run.
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / 'content_gap_analysis_log.txt'
        self.export_log(log_file)
        self.log("\n✓ Content gap analysis complete!")
def main():
    """Command-line entry point: parse arguments and launch the analysis."""
    arg_parser = argparse.ArgumentParser(description='Analyze content gaps')
    arg_parser.add_argument(
        '--posts-csv',
        type=Path,
        default=Path('output/results/posts_with_analytics.csv'),
        help='Posts CSV',
    )
    arg_parser.add_argument(
        '--gsc-queries',
        type=Path,
        default=Path('input/analytics/gsc/Requêtes.csv'),
        help='GSC queries CSV',
    )
    arg_parser.add_argument(
        '--output',
        type=Path,
        default=Path('output/results/content_gaps.csv'),
        help='Output gaps CSV',
    )
    options = arg_parser.parse_args()
    ContentGapAnalyzer().run(options.posts_csv, options.gsc_queries, options.output)


if __name__ == '__main__':
    main()