Major refactoring to create a clean, integrated CLI application: ### New Features: - Unified CLI executable (./seo) with simple command structure - All commands accept optional CSV file arguments - Auto-detection of latest files when no arguments provided - Simplified output directory structure (output/ instead of output/reports/) - Cleaner export filename format (all_posts_YYYY-MM-DD.csv) ### Commands: - export: Export all posts from WordPress sites - analyze [csv]: Analyze posts with AI (optional CSV input) - recategorize [csv]: Recategorize posts with AI - seo_check: Check SEO quality - categories: Manage categories across sites - approve [files]: Review and approve recommendations - full_pipeline: Run complete workflow - analytics, gaps, opportunities, report, status ### Changes: - Moved all scripts to scripts/ directory - Created config.yaml for configuration - Updated all scripts to use output/ directory - Deprecated old seo-cli.py in favor of new ./seo - Added AGENTS.md and CHANGELOG.md documentation - Consolidated README.md with updated usage ### Technical: - Added PyYAML dependency - Removed hardcoded configuration values - All scripts now properly integrated - Better error handling and user feedback Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
379 lines
12 KiB
Python
Executable File
379 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Export All Posts to CSV for AI Decision Making

Fetches complete post data from all 3 WordPress sites and exports to CSV
for AI-powered categorization and movement recommendations.

Uses credentials from .env file for secure authentication.
"""

import csv
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional
import requests
from requests.auth import HTTPBasicAuth
import time
from datetime import datetime
import re
# Project-local configuration (WordPress site credentials/URLs).
from config import Config

# Setup logging: timestamped INFO-level messages to stderr for progress output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||
|
||
class PostExporter:
    """Export posts from WordPress sites to CSV for AI analysis."""

    def __init__(self):
        """Initialize the exporter with sites from Config."""
        # Mapping of site name -> site config dict (url/username/password).
        self.sites = Config.WORDPRESS_SITES
        # Flattened post rows accumulated across all sites (filled by run()).
        self.all_posts = []
        self.category_cache = {}  # Cache category names by site
|
||
|
||
def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
|
||
"""
|
||
Fetch ALL posts from a site with full details.
|
||
|
||
Args:
|
||
site_name: Website name
|
||
site_config: Site configuration dict
|
||
|
||
Returns:
|
||
List of posts with full metadata
|
||
"""
|
||
logger.info(f"\nFetching posts from {site_name}...")
|
||
|
||
posts = []
|
||
page = 1
|
||
base_url = site_config['url'].rstrip('/')
|
||
api_url = f"{base_url}/wp-json/wp/v2/posts"
|
||
auth = HTTPBasicAuth(site_config['username'], site_config['password'])
|
||
|
||
for status in ['publish', 'draft']:
|
||
page = 1
|
||
status_count = 0
|
||
|
||
while True:
|
||
params = {
|
||
'page': page,
|
||
'per_page': 100,
|
||
'status': status,
|
||
}
|
||
|
||
try:
|
||
logger.info(f" Fetching page {page} ({status} posts)...")
|
||
response = requests.get(api_url, params=params, auth=auth, timeout=10)
|
||
response.raise_for_status()
|
||
|
||
page_posts = response.json()
|
||
if not page_posts:
|
||
break
|
||
|
||
posts.extend(page_posts)
|
||
status_count += len(page_posts)
|
||
logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})")
|
||
|
||
page += 1
|
||
time.sleep(0.5)
|
||
|
||
except requests.exceptions.HTTPError as e:
|
||
if response.status_code == 400:
|
||
logger.info(f" ℹ API limit reached (got {status_count} {status} posts)")
|
||
break
|
||
else:
|
||
logger.error(f"Error on page {page}: {e}")
|
||
break
|
||
|
||
except requests.exceptions.RequestException as e:
|
||
logger.error(f"Error fetching from {site_name}: {e}")
|
||
break
|
||
|
||
if status_count > 0:
|
||
logger.info(f" ✓ Total {status} posts: {status_count}")
|
||
|
||
logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n")
|
||
return posts
|
||
|
||
def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, str]:
|
||
"""
|
||
Fetch category names and slugs from a WordPress site.
|
||
|
||
Args:
|
||
site_name: Website name
|
||
site_config: Site configuration dict
|
||
|
||
Returns:
|
||
Dict mapping category IDs to category names
|
||
"""
|
||
if site_name in self.category_cache:
|
||
return self.category_cache[site_name]
|
||
|
||
logger.info(f" Fetching categories from {site_name}...")
|
||
categories = {}
|
||
base_url = site_config['url'].rstrip('/')
|
||
api_url = f"{base_url}/wp-json/wp/v2/categories"
|
||
auth = HTTPBasicAuth(site_config['username'], site_config['password'])
|
||
|
||
try:
|
||
# Fetch all categories (per_page=100)
|
||
params = {'per_page': 100}
|
||
response = requests.get(api_url, params=params, auth=auth, timeout=10)
|
||
response.raise_for_status()
|
||
|
||
cat_list = response.json()
|
||
for cat in cat_list:
|
||
categories[cat['id']] = {
|
||
'name': cat.get('name', ''),
|
||
'slug': cat.get('slug', ''),
|
||
}
|
||
logger.info(f" ✓ Fetched {len(categories)} categories")
|
||
except Exception as e:
|
||
logger.warning(f" Could not fetch categories from {site_name}: {e}")
|
||
|
||
self.category_cache[site_name] = categories
|
||
return categories
|
||
|
||
def extract_post_details(self, post: Dict, site_name: str, category_map: Dict[int, Dict]) -> Dict:
|
||
"""
|
||
Extract all relevant details from a post for AI analysis.
|
||
|
||
Args:
|
||
post: WordPress post object
|
||
site_name: Website name
|
||
category_map: Dict mapping category IDs to names
|
||
|
||
Returns:
|
||
Dict with extracted post details
|
||
"""
|
||
# Title
|
||
title = post.get('title', {})
|
||
if isinstance(title, dict):
|
||
title = title.get('rendered', '')
|
||
|
||
# Content (first 500 chars for context)
|
||
content = post.get('content', {})
|
||
if isinstance(content, dict):
|
||
content = content.get('rendered', '')
|
||
# Strip HTML tags for readability
|
||
content_text = re.sub('<[^<]+?>', '', content)[:500]
|
||
|
||
# Excerpt
|
||
excerpt = post.get('excerpt', {})
|
||
if isinstance(excerpt, dict):
|
||
excerpt = excerpt.get('rendered', '')
|
||
excerpt_text = re.sub('<[^<]+?>', '', excerpt)
|
||
|
||
# Meta descriptions and SEO data
|
||
meta_dict = post.get('meta', {}) if isinstance(post.get('meta'), dict) else {}
|
||
|
||
rank_math_title = meta_dict.get('rank_math_title', '')
|
||
rank_math_description = meta_dict.get('rank_math_description', '')
|
||
rank_math_keyword = meta_dict.get('rank_math_focus_keyword', '')
|
||
yoast_description = meta_dict.get('_yoast_wpseo_metadesc', '')
|
||
|
||
meta_description = rank_math_description or yoast_description or ''
|
||
|
||
# Categories - convert IDs to names using category_map
|
||
category_ids = post.get('categories', [])
|
||
category_names = ', '.join([
|
||
category_map.get(cat_id, {}).get('name', str(cat_id))
|
||
for cat_id in category_ids
|
||
]) if category_ids else ''
|
||
|
||
# Tags
|
||
tags = post.get('tags', [])
|
||
tag_names = ', '.join([str(t) for t in tags]) if tags else ''
|
||
|
||
# Author
|
||
author_id = post.get('author', '')
|
||
|
||
# Date
|
||
date_published = post.get('date', '')
|
||
date_modified = post.get('modified', '')
|
||
|
||
# Status
|
||
status = post.get('status', 'publish')
|
||
|
||
# URL
|
||
url = post.get('link', '')
|
||
|
||
return {
|
||
'site': site_name,
|
||
'post_id': post['id'],
|
||
'status': status,
|
||
'title': title.strip(),
|
||
'slug': post.get('slug', ''),
|
||
'url': url,
|
||
'author_id': author_id,
|
||
'date_published': date_published,
|
||
'date_modified': date_modified,
|
||
'categories': category_names,
|
||
'tags': tag_names,
|
||
'excerpt': excerpt_text.strip(),
|
||
'content_preview': content_text.strip(),
|
||
'seo_title': rank_math_title,
|
||
'meta_description': meta_description,
|
||
'focus_keyword': rank_math_keyword,
|
||
'word_count': len(content_text.split()),
|
||
}
|
||
|
||
def export_to_csv(self, output_file: Optional[str] = None) -> str:
|
||
"""
|
||
Export all posts to CSV.
|
||
|
||
Args:
|
||
output_file: Optional custom output path
|
||
|
||
Returns:
|
||
Path to exported CSV file
|
||
"""
|
||
if not output_file:
|
||
output_dir = Path(__file__).parent.parent / 'output'
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
date_str = datetime.now().strftime('%Y-%m-%d')
|
||
output_file = output_dir / f'all_posts_{date_str}.csv'
|
||
|
||
output_file = Path(output_file)
|
||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||
|
||
if not self.all_posts:
|
||
logger.error("No posts to export")
|
||
return None
|
||
|
||
fieldnames = [
|
||
'site',
|
||
'post_id',
|
||
'status',
|
||
'title',
|
||
'slug',
|
||
'url',
|
||
'author_id',
|
||
'date_published',
|
||
'date_modified',
|
||
'categories',
|
||
'tags',
|
||
'excerpt',
|
||
'content_preview',
|
||
'seo_title',
|
||
'meta_description',
|
||
'focus_keyword',
|
||
'word_count',
|
||
]
|
||
|
||
logger.info(f"Exporting {len(self.all_posts)} posts to CSV...")
|
||
|
||
with open(output_file, 'w', newline='', encoding='utf-8') as f:
|
||
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
||
writer.writeheader()
|
||
|
||
for post in self.all_posts:
|
||
writer.writerow({field: post.get(field, '') for field in fieldnames})
|
||
|
||
logger.info(f"✓ CSV exported to: {output_file}")
|
||
return str(output_file)
|
||
|
||
def run(self):
|
||
"""Run complete export process."""
|
||
logger.info("="*70)
|
||
logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING")
|
||
logger.info("="*70)
|
||
logger.info("Sites configured: " + ", ".join(self.sites.keys()))
|
||
logger.info("")
|
||
|
||
# Fetch from all sites
|
||
total_posts_before = len(self.all_posts)
|
||
|
||
for site_name, config in self.sites.items():
|
||
# Fetch categories for this site
|
||
categories = self.fetch_category_names(site_name, config)
|
||
|
||
# Fetch posts for this site
|
||
posts = self.fetch_posts_from_site(site_name, config)
|
||
|
||
if posts:
|
||
for post in posts:
|
||
post_details = self.extract_post_details(post, site_name, categories)
|
||
self.all_posts.append(post_details)
|
||
|
||
if not self.all_posts:
|
||
logger.error("No posts found on any site")
|
||
sys.exit(1)
|
||
|
||
# Sort by site then by post_id
|
||
self.all_posts.sort(key=lambda x: (x['site'], x['post_id']))
|
||
|
||
# Export to CSV
|
||
csv_file = self.export_to_csv()
|
||
|
||
# Print summary
|
||
logger.info("\n" + "="*70)
|
||
logger.info("EXPORT SUMMARY")
|
||
logger.info("="*70)
|
||
|
||
by_site = {}
|
||
for post in self.all_posts:
|
||
site = post['site']
|
||
if site not in by_site:
|
||
by_site[site] = {'total': 0, 'published': 0, 'draft': 0}
|
||
by_site[site]['total'] += 1
|
||
if post['status'] == 'publish':
|
||
by_site[site]['published'] += 1
|
||
else:
|
||
by_site[site]['draft'] += 1
|
||
|
||
for site, stats in sorted(by_site.items()):
|
||
logger.info(f"\n{site}:")
|
||
logger.info(f" Total: {stats['total']}")
|
||
logger.info(f" Published: {stats['published']}")
|
||
logger.info(f" Drafts: {stats['draft']}")
|
||
|
||
total_posts = len(self.all_posts)
|
||
total_published = sum(1 for p in self.all_posts if p['status'] == 'publish')
|
||
total_drafts = sum(1 for p in self.all_posts if p['status'] == 'draft')
|
||
|
||
logger.info(f"\n{'─'*70}")
|
||
logger.info(f"Total across all sites: {total_posts} posts")
|
||
logger.info(f" Published: {total_published}")
|
||
logger.info(f" Drafts: {total_drafts}")
|
||
logger.info(f"{'─'*70}")
|
||
|
||
logger.info(f"\n✓ Export complete!")
|
||
logger.info(f"✓ CSV file: {csv_file}")
|
||
logger.info(f"\nCSV includes:")
|
||
logger.info(f" • Site, Post ID, Status, Title, URL")
|
||
logger.info(f" • Publication dates, Categories, Tags")
|
||
logger.info(f" • Content preview (500 chars)")
|
||
logger.info(f" • SEO title, Meta description, Focus keyword")
|
||
logger.info(f" • Word count")
|
||
logger.info(f"\nNext step: Upload CSV to Claude or other AI for:")
|
||
logger.info(f" 1. Categorize by topic (VPN, software, gaming, torrenting, etc.)")
|
||
logger.info(f" 2. Recommend which site each post should be on")
|
||
logger.info(f" 3. Identify duplicates for consolidation")
|
||
logger.info(f" 4. Flag posts for deletion (low-traffic, thin content)")
|
||
|
||
|
||
def main():
|
||
"""Main entry point."""
|
||
import argparse
|
||
|
||
parser = argparse.ArgumentParser(
|
||
description='Export all posts from WordPress sites for AI decision making'
|
||
)
|
||
parser.add_argument(
|
||
'--output',
|
||
help='Custom output CSV file path'
|
||
)
|
||
|
||
args = parser.parse_args()
|
||
|
||
exporter = PostExporter()
|
||
exporter.run()
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|