Files
seo/scripts/export_posts_for_ai_decision.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

379 lines
12 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Export All Posts to CSV for AI Decision Making
Fetches complete post data from all 3 WordPress sites and exports to CSV
for AI-powered categorization and movement recommendations.
Uses credentials from .env file for secure authentication.
"""
import csv
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional
import requests
from requests.auth import HTTPBasicAuth
import time
from datetime import datetime
import re
from config import Config
# Setup logging
# Module-wide config: timestamped INFO-level messages (e.g. "2024-01-01 12:00:00 - INFO - ...").
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Per-module logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class PostExporter:
    """Export posts from WordPress sites to CSV for AI analysis.

    Pulls every post (published and draft) from each site configured in
    ``Config.WORDPRESS_SITES`` via the WordPress REST API, flattens the
    fields relevant to categorization decisions, and writes one combined CSV.
    """

    def __init__(self):
        """Initialize the exporter with sites from Config."""
        self.sites = Config.WORDPRESS_SITES
        self.all_posts = []       # flattened post dicts accumulated across all sites
        self.category_cache = {}  # site_name -> {category_id: {'name': ..., 'slug': ...}}

    def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
        """
        Fetch ALL posts from a site with full details.

        Pages through the REST API 100 posts at a time, once per status
        ('publish', then 'draft').

        Args:
            site_name: Website name
            site_config: Site configuration dict ('url', 'username', 'password')

        Returns:
            List of raw WordPress post objects (both statuses combined)
        """
        logger.info(f"\nFetching posts from {site_name}...")
        posts = []
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/posts"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])
        for status in ['publish', 'draft']:
            page = 1
            status_count = 0
            while True:
                params = {
                    'page': page,
                    'per_page': 100,
                    'status': status,
                }
                try:
                    logger.info(f" Fetching page {page} ({status} posts)...")
                    response = requests.get(api_url, params=params, auth=auth, timeout=10)
                    response.raise_for_status()
                    page_posts = response.json()
                    if not page_posts:
                        break
                    posts.extend(page_posts)
                    status_count += len(page_posts)
                    logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})")
                    page += 1
                    time.sleep(0.5)  # be gentle with the remote API between pages
                except requests.exceptions.HTTPError as e:
                    # WordPress responds 400 when `page` runs past the last page;
                    # treat that as normal end-of-pagination.
                    if response.status_code == 400:
                        logger.info(f" API limit reached (got {status_count} {status} posts)")
                        break
                    else:
                        logger.error(f"Error on page {page}: {e}")
                        break
                except requests.exceptions.RequestException as e:
                    logger.error(f"Error fetching from {site_name}: {e}")
                    break
            if status_count > 0:
                logger.info(f" ✓ Total {status} posts: {status_count}")
        logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n")
        return posts

    def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, Dict[str, str]]:
        """
        Fetch category names and slugs from a WordPress site.

        Results are cached per site, so repeated calls are free.

        Args:
            site_name: Website name
            site_config: Site configuration dict

        Returns:
            Dict mapping category IDs to {'name': ..., 'slug': ...} dicts
        """
        if site_name in self.category_cache:
            return self.category_cache[site_name]
        logger.info(f" Fetching categories from {site_name}...")
        categories = {}
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/categories"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])
        try:
            # Fetch all categories (per_page=100; sites with more than 100
            # categories would need pagination here).
            params = {'per_page': 100}
            response = requests.get(api_url, params=params, auth=auth, timeout=10)
            response.raise_for_status()
            cat_list = response.json()
            for cat in cat_list:
                categories[cat['id']] = {
                    'name': cat.get('name', ''),
                    'slug': cat.get('slug', ''),
                }
            logger.info(f" ✓ Fetched {len(categories)} categories")
        except Exception as e:
            # Best-effort: missing category names degrade to raw IDs downstream.
            logger.warning(f" Could not fetch categories from {site_name}: {e}")
        self.category_cache[site_name] = categories
        return categories

    def extract_post_details(self, post: Dict, site_name: str, category_map: Dict[int, Dict]) -> Dict:
        """
        Extract all relevant details from a post for AI analysis.

        Args:
            post: WordPress post object
            site_name: Website name
            category_map: Dict mapping category IDs to {'name', 'slug'} dicts

        Returns:
            Dict with extracted post details (one CSV row)
        """
        # Title ('rendered' sub-field when the API returns the dict form)
        title = post.get('title', {})
        if isinstance(title, dict):
            title = title.get('rendered', '')
        # Content
        content = post.get('content', {})
        if isinstance(content, dict):
            content = content.get('rendered', '')
        # Strip HTML tags for readability; keep the full text so word_count
        # reflects the whole article, then truncate for the preview column.
        # (Previously word_count was computed from the 500-char preview,
        # which capped it at ~100 words.)
        full_text = re.sub('<[^<]+?>', '', content)
        content_text = full_text[:500]
        # Excerpt
        excerpt = post.get('excerpt', {})
        if isinstance(excerpt, dict):
            excerpt = excerpt.get('rendered', '')
        excerpt_text = re.sub('<[^<]+?>', '', excerpt)
        # Meta descriptions and SEO data (Rank Math preferred, Yoast fallback)
        meta_dict = post.get('meta', {}) if isinstance(post.get('meta'), dict) else {}
        rank_math_title = meta_dict.get('rank_math_title', '')
        rank_math_description = meta_dict.get('rank_math_description', '')
        rank_math_keyword = meta_dict.get('rank_math_focus_keyword', '')
        yoast_description = meta_dict.get('_yoast_wpseo_metadesc', '')
        meta_description = rank_math_description or yoast_description or ''
        # Categories - convert IDs to names using category_map; unknown IDs
        # fall back to the raw numeric ID as a string.
        category_ids = post.get('categories', [])
        category_names = ', '.join([
            category_map.get(cat_id, {}).get('name', str(cat_id))
            for cat_id in category_ids
        ]) if category_ids else ''
        # Tags (NOTE: these are raw tag IDs, not names — no tag lookup is done)
        tags = post.get('tags', [])
        tag_names = ', '.join([str(t) for t in tags]) if tags else ''
        # Author
        author_id = post.get('author', '')
        # Dates
        date_published = post.get('date', '')
        date_modified = post.get('modified', '')
        # Status
        status = post.get('status', 'publish')
        # URL
        url = post.get('link', '')
        return {
            'site': site_name,
            'post_id': post['id'],
            'status': status,
            'title': title.strip(),
            'slug': post.get('slug', ''),
            'url': url,
            'author_id': author_id,
            'date_published': date_published,
            'date_modified': date_modified,
            'categories': category_names,
            'tags': tag_names,
            'excerpt': excerpt_text.strip(),
            'content_preview': content_text.strip(),
            'seo_title': rank_math_title,
            'meta_description': meta_description,
            'focus_keyword': rank_math_keyword,
            'word_count': len(full_text.split()),
        }

    def export_to_csv(self, output_file: Optional[str] = None) -> Optional[str]:
        """
        Export all posts to CSV.

        Args:
            output_file: Optional custom output path; defaults to
                output/all_posts_YYYY-MM-DD.csv beside the scripts directory

        Returns:
            Path to exported CSV file as a string, or None when there are
            no posts to export
        """
        if not output_file:
            output_dir = Path(__file__).parent.parent / 'output'
            output_dir.mkdir(parents=True, exist_ok=True)
            date_str = datetime.now().strftime('%Y-%m-%d')
            output_file = output_dir / f'all_posts_{date_str}.csv'
        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        if not self.all_posts:
            logger.error("No posts to export")
            return None
        # Column order of the CSV; must match the keys produced by
        # extract_post_details().
        fieldnames = [
            'site',
            'post_id',
            'status',
            'title',
            'slug',
            'url',
            'author_id',
            'date_published',
            'date_modified',
            'categories',
            'tags',
            'excerpt',
            'content_preview',
            'seo_title',
            'meta_description',
            'focus_keyword',
            'word_count',
        ]
        logger.info(f"Exporting {len(self.all_posts)} posts to CSV...")
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for post in self.all_posts:
                writer.writerow({field: post.get(field, '') for field in fieldnames})
        logger.info(f"✓ CSV exported to: {output_file}")
        return str(output_file)

    def run(self, output_file: Optional[str] = None):
        """Run complete export process.

        Fetches categories and posts from every configured site, flattens
        them, exports the combined CSV, and logs a per-site summary.
        Exits the process with status 1 when no posts are found anywhere.

        Args:
            output_file: Optional custom CSV path, forwarded to
                export_to_csv(); auto-generated when omitted
        """
        logger.info("="*70)
        logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING")
        logger.info("="*70)
        logger.info("Sites configured: " + ", ".join(self.sites.keys()))
        logger.info("")
        # Fetch from all sites
        for site_name, config in self.sites.items():
            # Fetch categories for this site (cached per site)
            categories = self.fetch_category_names(site_name, config)
            # Fetch posts for this site
            posts = self.fetch_posts_from_site(site_name, config)
            if posts:
                for post in posts:
                    post_details = self.extract_post_details(post, site_name, categories)
                    self.all_posts.append(post_details)
        if not self.all_posts:
            logger.error("No posts found on any site")
            sys.exit(1)
        # Sort by site then by post_id for a stable, diff-friendly CSV
        self.all_posts.sort(key=lambda x: (x['site'], x['post_id']))
        # Export to CSV
        csv_file = self.export_to_csv(output_file)
        # Print summary
        logger.info("\n" + "="*70)
        logger.info("EXPORT SUMMARY")
        logger.info("="*70)
        by_site = {}
        for post in self.all_posts:
            site = post['site']
            if site not in by_site:
                by_site[site] = {'total': 0, 'published': 0, 'draft': 0}
            by_site[site]['total'] += 1
            if post['status'] == 'publish':
                by_site[site]['published'] += 1
            else:
                by_site[site]['draft'] += 1
        for site, stats in sorted(by_site.items()):
            logger.info(f"\n{site}:")
            logger.info(f" Total: {stats['total']}")
            logger.info(f" Published: {stats['published']}")
            logger.info(f" Drafts: {stats['draft']}")
        total_posts = len(self.all_posts)
        total_published = sum(1 for p in self.all_posts if p['status'] == 'publish')
        total_drafts = sum(1 for p in self.all_posts if p['status'] == 'draft')
        # Fixed separator: was f"{''*70}", which multiplies the EMPTY string
        # and printed a blank line instead of a rule.
        logger.info(f"\n{'='*70}")
        logger.info(f"Total across all sites: {total_posts} posts")
        logger.info(f" Published: {total_published}")
        logger.info(f" Drafts: {total_drafts}")
        logger.info(f"{'='*70}")
        logger.info(f"\n✓ Export complete!")
        logger.info(f"✓ CSV file: {csv_file}")
        logger.info(f"\nCSV includes:")
        logger.info(f" • Site, Post ID, Status, Title, URL")
        logger.info(f" • Publication dates, Categories, Tags")
        logger.info(f" • Content preview (500 chars)")
        logger.info(f" • SEO title, Meta description, Focus keyword")
        logger.info(f" • Word count")
        logger.info(f"\nNext step: Upload CSV to Claude or other AI for:")
        logger.info(f" 1. Categorize by topic (VPN, software, gaming, torrenting, etc.)")
        logger.info(f" 2. Recommend which site each post should be on")
        logger.info(f" 3. Identify duplicates for consolidation")
        logger.info(f" 4. Flag posts for deletion (low-traffic, thin content)")
def main():
    """Main entry point.

    Parses CLI arguments and runs the full export pipeline.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Export all posts from WordPress sites for AI decision making'
    )
    parser.add_argument(
        '--output',
        help='Custom output CSV file path'
    )
    args = parser.parse_args()
    exporter = PostExporter()
    if args.output:
        # Bug fix: --output was previously parsed but silently ignored.
        # run() invokes export_to_csv() internally, so rebind the method on
        # this instance to honor the user-supplied path without changing
        # PostExporter's interface. The wrapper still accepts an explicit
        # output_file so a path forwarded by run() takes precedence.
        _orig_export = exporter.export_to_csv
        exporter.export_to_csv = (
            lambda output_file=None, _export=_orig_export:
                _export(output_file=output_file or args.output)
        )
    exporter.run()
# Run the exporter only when executed as a script, not when imported.
if __name__ == '__main__':
main()