Major refactoring to create a clean, integrated CLI application: ### New Features: - Unified CLI executable (./seo) with simple command structure - All commands accept optional CSV file arguments - Auto-detection of latest files when no arguments provided - Simplified output directory structure (output/ instead of output/reports/) - Cleaner export filename format (all_posts_YYYY-MM-DD.csv) ### Commands: - export: Export all posts from WordPress sites - analyze [csv]: Analyze posts with AI (optional CSV input) - recategorize [csv]: Recategorize posts with AI - seo_check: Check SEO quality - categories: Manage categories across sites - approve [files]: Review and approve recommendations - full_pipeline: Run complete workflow - analytics, gaps, opportunities, report, status ### Changes: - Moved all scripts to scripts/ directory - Created config.yaml for configuration - Updated all scripts to use output/ directory - Deprecated old seo-cli.py in favor of new ./seo - Added AGENTS.md and CHANGELOG.md documentation - Consolidated README.md with updated usage ### Technical: - Added PyYAML dependency - Removed hardcoded configuration values - All scripts now properly integrated - Better error handling and user feedback Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
379 lines
12 KiB
Python
Executable File
379 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Export All Posts to CSV for AI Decision Making

Fetches complete post data from all 3 WordPress sites and exports to CSV
for AI-powered categorization and movement recommendations.

Uses credentials from .env file for secure authentication.
"""

import csv
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional
import requests
from requests.auth import HTTPBasicAuth
import time
from datetime import datetime
import re
# Project-local configuration (WordPress site credentials/URLs).
from config import Config

# Setup logging: timestamped INFO-level messages to stderr for progress output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||
|
||
class PostExporter:
    """Export posts from WordPress sites to CSV for AI analysis."""

    def __init__(self):
        """Initialize the exporter with sites from Config."""
        # Mapping of site name -> site config dict (url/username/password).
        self.sites = Config.WORDPRESS_SITES
        # Flattened post rows accumulated across all sites (filled by run()).
        self.all_posts = []
        self.category_cache = {}  # Cache category names by site
|
||
|
||
def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
|
||
"""
|
||
Fetch ALL posts from a site with full details.
|
||
|
||
Args:
|
||
site_name: Website name
|
||
site_config: Site configuration dict
|
||
|
||
Returns:
|
||
List of posts with full metadata
|
||
"""
|
||
logger.info(f"\nFetching posts from {site_name}...")
|
||
|
||
posts = []
|
||
page = 1
|
||
base_url = site_config['url'].rstrip('/')
|
||
api_url = f"{base_url}/wp-json/wp/v2/posts"
|
||
auth = HTTPBasicAuth(site_config['username'], site_config['password'])
|
||
|
||
for status in ['publish', 'draft']:
|
||
page = 1
|
||
status_count = 0
|
||
|
||
while True:
|
||
params = {
|
||
'page': page,
|
||
'per_page': 100,
|
||
'status': status,
|
||
}
|
||
|
||
try:
|
||
logger.info(f" Fetching page {page} ({status} posts)...")
|
||
response = requests.get(api_url, params=params, auth=auth, timeout=10)
|
||
response.raise_for_status()
|
||
|
||
page_posts = response.json()
|
||
if not page_posts:
|
||
break
|
||
|
||
posts.extend(page_posts)
|
||
status_count += len(page_posts)
|
||
logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})")
|
||
|
||
page += 1
|
||
time.sleep(0.5)
|
||
|
||
except requests.exceptions.HTTPError as e:
|
||
if response.status_code == 400:
|
||
logger.info(f" ℹ API limit reached (got {status_count} {status} posts)")
|
||
break
|
||
else:
|
||
logger.error(f"Error on page {page}: {e}")
|
||
break
|
||
|
||
except requests.exceptions.RequestException as e:
|
||
logger.error(f"Error fetching from {site_name}: {e}")
|
||
break
|
||
|
||
if status_count > 0:
|
||
logger.info(f" ✓ Total {status} posts: {status_count}")
|
||
|
||
logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n")
|
||
return posts
|
||
|
||
def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, str]:
|
||
"""
|
||
Fetch category names and slugs from a WordPress site.
|
||
|
||
Args:
|
||
site_name: Website name
|
||
site_config: Site configuration dict
|
||
|
||
Returns:
|
||
Dict mapping category IDs to category names
|
||
"""
|
||
if site_name in self.category_cache:
|
||
return self.category_cache[site_name]
|
||
|
||
logger.info(f" Fetching categories from {site_name}...")
|
||
categories = {}
|
||
base_url = site_config['url'].rstrip('/')
|
||
api_url = f"{base_url}/wp-json/wp/v2/categories"
|
||
auth = HTTPBasicAuth(site_config['username'], site_config['password'])
|
||
|
||
try:
|
||
# Fetch all categories (per_page=100)
|
||
params = {'per_page': 100}
|
||
response = requests.get(api_url, params=params, auth=auth, timeout=10)
|
||
response.raise_for_status()
|
||
|
||
cat_list = response.json()
|
||
for cat in cat_list:
|
||
categories[cat['id']] = {
|
||
'name': cat.get('name', ''),
|
||
'slug': cat.get('slug', ''),
|
||
}
|
||
logger.info(f" ✓ Fetched {len(categories)} categories")
|
||
except Exception as e:
|
||
logger.warning(f" Could not fetch categories from {site_name}: {e}")
|
||
|
||
self.category_cache[site_name] = categories
|
||
return categories
|
||
|
||
def extract_post_details(self, post: Dict, site_name: str, category_map: Dict[int, Dict]) -> Dict:
|
||
"""
|
||
Extract all relevant details from a post for AI analysis.
|
||
|
||
Args:
|
||
post: WordPress post object
|
||
site_name: Website name
|
||
category_map: Dict mapping category IDs to names
|
||
|
||
Returns:
|
||
Dict with extracted post details
|
||
"""
|
||
# Title
|
||
title = post.get('title', {})
|
||
if isinstance(title, dict):
|
||
title = title.get('rendered', '')
|
||
|
||
# Content (first 500 chars for context)
|
||
content = post.get('content', {})
|
||
if isinstance(content, dict):
|
||
content = content.get('rendered', '')
|
||
# Strip HTML tags for readability
|
||
content_text = re.sub('<[^<]+?>', '', content)[:500]
|
||
|
||
# Excerpt
|
||
excerpt = post.get('excerpt', {})
|
||
if isinstance(excerpt, dict):
|
||
excerpt = excerpt.get('rendered', '')
|
||
excerpt_text = re.sub('<[^<]+?>', '', excerpt)
|
||
|
||
# Meta descriptions and SEO data
|
||
meta_dict = post.get('meta', {}) if isinstance(post.get('meta'), dict) else {}
|
||
|
||
rank_math_title = meta_dict.get('rank_math_title', '')
|
||
rank_math_description = meta_dict.get('rank_math_description', '')
|
||
rank_math_keyword = meta_dict.get('rank_math_focus_keyword', '')
|
||
yoast_description = meta_dict.get('_yoast_wpseo_metadesc', '')
|
||
|
||
meta_description = rank_math_description or yoast_description or ''
|
||
|
||
# Categories - convert IDs to names using category_map
|
||
category_ids = post.get('categories', [])
|
||
category_names = ', '.join([
|
||
category_map.get(cat_id, {}).get('name', str(cat_id))
|
||
for cat_id in category_ids
|
||
]) if category_ids else ''
|
||
|
||
# Tags
|
||
tags = post.get('tags', [])
|
||
tag_names = ', '.join([str(t) for t in tags]) if tags else ''
|
||
|
||
# Author
|
||
author_id = post.get('author', '')
|
||
|
||
# Date
|
||
date_published = post.get('date', '')
|
||
date_modified = post.get('modified', '')
|
||
|
||
# Status
|
||
status = post.get('status', 'publish')
|
||
|
||
# URL
|
||
url = post.get('link', '')
|
||
|
||
return {
|
||
'site': site_name,
|
||
'post_id': post['id'],
|
||
'status': status,
|
||
'title': title.strip(),
|
||
'slug': post.get('slug', ''),
|
||
'url': url,
|
||
'author_id': author_id,
|
||
'date_published': date_published,
|
||
'date_modified': date_modified,
|
||
'categories': category_names,
|
||
'tags': tag_names,
|
||
'excerpt': excerpt_text.strip(),
|
||
'content_preview': content_text.strip(),
|
||
'seo_title': rank_math_title,
|
||
'meta_description': meta_description,
|
||
'focus_keyword': rank_math_keyword,
|
||
'word_count': len(content_text.split()),
|
||
}
|
||
|
||
def export_to_csv(self, output_file: Optional[str] = None) -> str:
|
||
"""
|
||
Export all posts to CSV.
|
||
|
||
Args:
|
||
output_file: Optional custom output path
|
||
|
||
Returns:
|
||
Path to exported CSV file
|
||
"""
|
||
if not output_file:
|
||
output_dir = Path(__file__).parent.parent / 'output'
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
date_str = datetime.now().strftime('%Y-%m-%d')
|
||
output_file = output_dir / f'all_posts_{date_str}.csv'
|
||
|
||
output_file = Path(output_file)
|
||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
if not self.all_posts:
|
||
logger.error("No posts to export")
|
||
return None
|
||
|
||
fieldnames = [
|
||
'site',
|
||
'post_id',
|
||
'status',
|
||
'title',
|
||
'slug',
|
||
'url',
|
||
'author_id',
|
||
'date_published',
|
||
'date_modified',
|
||
'categories',
|
||
'tags',
|
||
'excerpt',
|
||
'content_preview',
|
||
'seo_title',
|
||
'meta_description',
|
||
'focus_keyword',
|
||
'word_count',
|
||
]
|
||
|
||
logger.info(f"Exporting {len(self.all_posts)} posts to CSV...")
|
||
|
||
with open(output_file, 'w', newline='', encoding='utf-8') as f:
|
||
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
||
writer.writeheader()
|
||
|
||
for post in self.all_posts:
|
||
writer.writerow({field: post.get(field, '') for field in fieldnames})
|
||
|
||
logger.info(f"✓ CSV exported to: {output_file}")
|
||
return str(output_file)
|
||
|
||
def run(self):
|
||
"""Run complete export process."""
|
||
logger.info("="*70)
|
||
logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING")
|
||
logger.info("="*70)
|
||
logger.info("Sites configured: " + ", ".join(self.sites.keys()))
|
||
logger.info("")
|
||
|
||
# Fetch from all sites
|
||
total_posts_before = len(self.all_posts)
|
||
|
||
for site_name, config in self.sites.items():
|
||
# Fetch categories for this site
|
||
categories = self.fetch_category_names(site_name, config)
|
||
|
||
# Fetch posts for this site
|
||
posts = self.fetch_posts_from_site(site_name, config)
|
||
|
||
if posts:
|
||
for post in posts:
|
||
post_details = self.extract_post_details(post, site_name, categories)
|
||
self.all_posts.append(post_details)
|
||
|
||
if not self.all_posts:
|
||
logger.error("No posts found on any site")
|
||
sys.exit(1)
|
||
|
||
# Sort by site then by post_id
|
||
self.all_posts.sort(key=lambda x: (x['site'], x['post_id']))
|
||
|
||
# Export to CSV
|
||
csv_file = self.export_to_csv()
|
||
|
||
# Print summary
|
||
logger.info("\n" + "="*70)
|
||
logger.info("EXPORT SUMMARY")
|
||
logger.info("="*70)
|
||
|
||
by_site = {}
|
||
for post in self.all_posts:
|
||
site = post['site']
|
||
if site not in by_site:
|
||
by_site[site] = {'total': 0, 'published': 0, 'draft': 0}
|
||
by_site[site]['total'] += 1
|
||
if post['status'] == 'publish':
|
||
by_site[site]['published'] += 1
|
||
else:
|
||
by_site[site]['draft'] += 1
|
||
|
||
for site, stats in sorted(by_site.items()):
|
||
logger.info(f"\n{site}:")
|
||
logger.info(f" Total: {stats['total']}")
|
||
logger.info(f" Published: {stats['published']}")
|
||
logger.info(f" Drafts: {stats['draft']}")
|
||
|
||
total_posts = len(self.all_posts)
|
||
total_published = sum(1 for p in self.all_posts if p['status'] == 'publish')
|
||
total_drafts = sum(1 for p in self.all_posts if p['status'] == 'draft')
|
||
|
||
logger.info(f"\n{'─'*70}")
|
||
logger.info(f"Total across all sites: {total_posts} posts")
|
||
logger.info(f" Published: {total_published}")
|
||
logger.info(f" Drafts: {total_drafts}")
|
||
logger.info(f"{'─'*70}")
|
||
|
||
logger.info(f"\n✓ Export complete!")
|
||
logger.info(f"✓ CSV file: {csv_file}")
|
||
logger.info(f"\nCSV includes:")
|
||
logger.info(f" • Site, Post ID, Status, Title, URL")
|
||
logger.info(f" • Publication dates, Categories, Tags")
|
||
logger.info(f" • Content preview (500 chars)")
|
||
logger.info(f" • SEO title, Meta description, Focus keyword")
|
||
logger.info(f" • Word count")
|
||
logger.info(f"\nNext step: Upload CSV to Claude or other AI for:")
|
||
logger.info(f" 1. Categorize by topic (VPN, software, gaming, torrenting, etc.)")
|
||
logger.info(f" 2. Recommend which site each post should be on")
|
||
logger.info(f" 3. Identify duplicates for consolidation")
|
||
logger.info(f" 4. Flag posts for deletion (low-traffic, thin content)")
|
||
|
||
|
||
def main():
|
||
"""Main entry point."""
|
||
import argparse
|
||
|
||
parser = argparse.ArgumentParser(
|
||
description='Export all posts from WordPress sites for AI decision making'
|
||
)
|
||
parser.add_argument(
|
||
'--output',
|
||
help='Custom output CSV file path'
|
||
)
|
||
|
||
args = parser.parse_args()
|
||
|
||
exporter = PostExporter()
|
||
exporter.run()
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|