Major refactoring to create a clean, integrated CLI application: ### New Features: - Unified CLI executable (./seo) with simple command structure - All commands accept optional CSV file arguments - Auto-detection of latest files when no arguments provided - Simplified output directory structure (output/ instead of output/reports/) - Cleaner export filename format (all_posts_YYYY-MM-DD.csv) ### Commands: - export: Export all posts from WordPress sites - analyze [csv]: Analyze posts with AI (optional CSV input) - recategorize [csv]: Recategorize posts with AI - seo_check: Check SEO quality - categories: Manage categories across sites - approve [files]: Review and approve recommendations - full_pipeline: Run complete workflow - analytics, gaps, opportunities, report, status ### Changes: - Moved all scripts to scripts/ directory - Created config.yaml for configuration - Updated all scripts to use output/ directory - Deprecated old seo-cli.py in favor of new ./seo - Added AGENTS.md and CHANGELOG.md documentation - Consolidated README.md with updated usage ### Technical: - Added PyYAML dependency - Removed hardcoded configuration values - All scripts now properly integrated - Better error handling and user feedback Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
383 lines
14 KiB
Python
383 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
AI-Powered Post Re-categorization
|
||
Analyzes exported posts using Claude AI via OpenRouter and provides
|
||
category recommendations for better content organization.
|
||
"""
|
||
|
||
import csv
|
||
import json
|
||
import logging
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional, Tuple
|
||
import requests
|
||
from datetime import datetime
|
||
from config import Config
|
||
|
||
# Module-wide logging: timestamped INFO-level messages to the default stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class PostRecategorizer:
    """Re-categorize posts using Claude AI via OpenRouter.

    Workflow: load an exported-posts CSV, send the posts to Claude in
    batches for category recommendations, merge the recommendations back
    onto each post dict, and export the annotated results to CSV.
    """

    def __init__(self, csv_file: str):
        """Initialize recategorizer with the path to an exported-posts CSV."""
        self.csv_file = Path(csv_file)
        self.openrouter_api_key = Config.OPENROUTER_API_KEY
        self.posts: List[Dict] = []                # raw rows loaded from the CSV
        self.recategorized_posts: List[Dict] = []  # rows annotated with AI recommendations
        self.api_calls = 0                         # number of OpenRouter requests made
        self.ai_cost = 0.0                         # estimated spend in USD

    def load_csv(self) -> bool:
        """Load posts from the CSV file into ``self.posts``.

        Returns:
            True on success, False if the file is missing or unreadable.
        """
        logger.info(f"Loading CSV: {self.csv_file}")

        if not self.csv_file.exists():
            logger.error(f"CSV file not found: {self.csv_file}")
            return False

        try:
            with open(self.csv_file, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                self.posts = list(reader)

            logger.info(f"✓ Loaded {len(self.posts)} posts from CSV")

            # Group by site purely for informational stats in the log.
            by_site: Dict[str, int] = {}
            for post in self.posts:
                site = post.get('site', '')
                by_site[site] = by_site.get(site, 0) + 1

            for site, count in by_site.items():
                logger.info(f"  {site}: {count} posts")

            return True

        except Exception as e:
            # Best-effort loader: report the problem and let the caller decide.
            logger.error(f"Error loading CSV: {e}")
            return False

    def batch_posts_for_analysis(self, batch_size: int = 10) -> List[List[Dict]]:
        """Split ``self.posts`` into batches to keep each AI prompt within token limits.

        Args:
            batch_size: Maximum number of posts per batch.

        Returns:
            List of post batches (the last batch may be shorter).
        """
        return [
            self.posts[i:i + batch_size]
            for i in range(0, len(self.posts), batch_size)
        ]

    def format_batch_for_ai(self, batch: List[Dict]) -> str:
        """Render a batch of posts as the plain-text listing embedded in the AI prompt.

        Requires each post dict to have 'post_id', 'site', and 'title';
        'categories', 'content_preview', and 'word_count' are optional.
        """
        formatted = "POSTS TO RECATEGORIZE:\n\n"

        for i, post in enumerate(batch, 1):
            formatted += f"{i}. POST ID: {post['post_id']}\n"
            formatted += f"   Site: {post['site']}\n"
            formatted += f"   Title: {post['title']}\n"
            formatted += f"   Current Categories: {post.get('categories', 'None')}\n"
            formatted += f"   Content: {post.get('content_preview', '')}...\n"
            formatted += f"   Word Count: {post.get('word_count', '0')}\n"
            formatted += "\n"

        return formatted

    def get_ai_recommendations(self, batch: List[Dict]) -> Optional[str]:
        """Request category recommendations for one batch from Claude via OpenRouter.

        Returns:
            The raw model response text (expected to contain a JSON array),
            or None if the API key is missing or the request fails.
        """
        if not self.openrouter_api_key:
            logger.error("OPENROUTER_API_KEY not set")
            return None

        batch_text = self.format_batch_for_ai(batch)

        prompt = f"""Analyze these blog posts and recommend optimal categories.

Website Strategy:
- mistergeek.net: VPN, Software/Tools, Gaming, General Tech, SEO, Content Marketing
- webscroll.fr: Torrenting, File-Sharing, Tracker Guides
- hellogeek.net: Experimental, Low-traffic, Off-brand content

{batch_text}

For EACH post, provide a JSON object with:
{{
  "post_id": <id>,
  "current_categories": "<current>",
  "recommended_categories": "<comma-separated categories>",
  "reason": "<Brief reason for recommendation>",
  "confidence": "High|Medium|Low"
}}

Return ONLY a JSON array. Example:
[
  {{"post_id": 2845, "current_categories": "VPN", "recommended_categories": "VPN, Security", "reason": "Add security angle", "confidence": "High"}},
  {{"post_id": 1234, "current_categories": "Other", "recommended_categories": "Torrenting, Guides", "reason": "Torrent-specific content", "confidence": "Medium"}}
]

Analyze all posts and provide recommendations for EVERY post in the batch."""

        try:
            logger.info("  Sending batch to Claude for recategorization...")

            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "anthropic/claude-3.5-sonnet",
                    "messages": [
                        {"role": "user", "content": prompt}
                    ],
                    # Low temperature for consistent, conservative recommendations.
                    "temperature": 0.3,
                },
                timeout=60
            )
            response.raise_for_status()

            result = response.json()
            self.api_calls += 1

            # Track estimated cost — assumes Claude 3.5 Sonnet pricing of
            # $3/M input tokens and $15/M output tokens (TODO: confirm current rates).
            usage = result.get('usage', {})
            input_tokens = usage.get('prompt_tokens', 0)
            output_tokens = usage.get('completion_tokens', 0)
            self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000

            recommendations_text = result['choices'][0]['message']['content'].strip()
            logger.info(f"  ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})")

            return recommendations_text

        except Exception as e:
            logger.error(f"Error getting AI recommendations: {e}")
            return None

    def parse_recommendations(self, recommendations_json: str) -> List[Dict]:
        """Extract and parse the JSON array from a raw model response.

        The model sometimes wraps the array in prose, so this scans for the
        outermost ``[...]`` span before parsing.

        Returns:
            The parsed list of recommendation dicts, or [] on any parse failure.
        """
        try:
            start_idx = recommendations_json.find('[')
            end_idx = recommendations_json.rfind(']') + 1

            if start_idx == -1 or end_idx == 0:
                logger.error("Could not find JSON array in response")
                return []

            json_str = recommendations_json[start_idx:end_idx]
            recommendations = json.loads(json_str)

            return recommendations

        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON recommendations: {e}")
            logger.debug(f"Response was: {recommendations_json[:500]}")
            return []

    def analyze_all_posts(self) -> bool:
        """Analyze all posts in batches and merge recommendations onto each post.

        Posts with no returned recommendation keep their current categories
        and are marked with 'Unknown' confidence. Failed batches are skipped
        rather than aborting the run.

        Returns:
            True if at least one post was processed.
        """
        logger.info("\n" + "="*70)
        logger.info("RECATEGORIZING POSTS WITH AI")
        logger.info("="*70 + "\n")

        batches = self.batch_posts_for_analysis(batch_size=10)
        logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches of 10...\n")

        # Keyed by str(post_id) so lookups match the string IDs from the CSV.
        all_recommendations: Dict[str, Dict] = {}

        for batch_num, batch in enumerate(batches, 1):
            logger.info(f"Batch {batch_num}/{len(batches)}: Analyzing {len(batch)} posts...")

            recommendations_json = self.get_ai_recommendations(batch)

            if not recommendations_json:
                logger.error(f"  Failed to get recommendations for batch {batch_num}")
                continue

            recommendations = self.parse_recommendations(recommendations_json)

            for rec in recommendations:
                all_recommendations[str(rec.get('post_id', ''))] = rec

            logger.info(f"  ✓ Got {len(recommendations)} recommendations")

        logger.info("\n✓ Analysis complete!")
        logger.info(f"  Total recommendations: {len(all_recommendations)}")
        logger.info(f"  API calls: {self.api_calls}")
        logger.info(f"  Estimated cost: ${self.ai_cost:.4f}")

        # Map recommendations to posts (mutates the post dicts in place).
        for post in self.posts:
            post_id = str(post['post_id'])
            if post_id in all_recommendations:
                rec = all_recommendations[post_id]
                post['recommended_categories'] = rec.get('recommended_categories', post.get('categories', ''))
                post['recategorization_reason'] = rec.get('reason', '')
                post['recategorization_confidence'] = rec.get('confidence', 'Medium')
            else:
                # No recommendation received — keep the existing categories.
                post['recommended_categories'] = post.get('categories', '')
                post['recategorization_reason'] = 'No recommendation'
                post['recategorization_confidence'] = 'Unknown'

            self.recategorized_posts.append(post)

        return len(self.recategorized_posts) > 0

    def export_with_recommendations(self) -> Tuple[str, Optional[str]]:
        """Export CSV files with recategorization recommendations.

        Writes a main file containing every post, plus a changes-only file
        when at least one post's recommended categories differ.

        Returns:
            (main_file_path, changes_file_path) — the second element is None
            when no category changes were recommended.
        """
        output_dir = Path(__file__).parent.parent / 'output'
        output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Main file with all recommendations
        main_file = output_dir / f'posts_with_recategorization_{timestamp}.csv'

        # Differences file (only posts with different recommendations)
        changes_file = output_dir / f'category_changes_only_{timestamp}.csv'

        # analyze_all_posts() already injected the recommendation keys into
        # every post dict, so appending them blindly would duplicate columns
        # in the header and rows. dict.fromkeys dedupes while preserving
        # order, and still guarantees the three columns exist.
        fieldnames = list(dict.fromkeys(
            list(self.recategorized_posts[0].keys()) + [
                'recommended_categories',
                'recategorization_reason',
                'recategorization_confidence',
            ]
        ))

        logger.info("\nExporting recategorization recommendations to CSV...")

        # Export main file with all posts
        with open(main_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.recategorized_posts)

        logger.info(f"✓ Main file: {main_file}")

        # Export changes file (only posts where category changed)
        posts_with_changes = [
            p for p in self.recategorized_posts
            if p.get('categories', '') != p.get('recommended_categories', '')
        ]

        if posts_with_changes:
            with open(changes_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(posts_with_changes)
            logger.info(f"✓ Changes file ({len(posts_with_changes)} posts): {changes_file}")
        else:
            logger.info("ℹ No category changes recommended")

        return (str(main_file), str(changes_file) if posts_with_changes else None)

    def print_summary(self):
        """Log a per-site and per-confidence summary of the recommended changes."""
        logger.info("\n" + "="*70)
        logger.info("RECATEGORIZATION SUMMARY")
        logger.info("="*70 + "\n")

        # Guard against division by zero if called before any analysis ran.
        if not self.recategorized_posts:
            logger.info("No posts analyzed")
            return

        # Count changes by site
        by_site: Dict[str, Dict[str, int]] = {}
        total_changes = 0

        for post in self.recategorized_posts:
            site = post.get('site', 'Unknown')
            if site not in by_site:
                by_site[site] = {'total': 0, 'changed': 0}

            by_site[site]['total'] += 1

            if post.get('categories', '') != post.get('recommended_categories', ''):
                by_site[site]['changed'] += 1
                total_changes += 1

        logger.info("CHANGES BY SITE:")
        for site in sorted(by_site.keys()):
            stats = by_site[site]
            logger.info(f"  {site}: {stats['changed']} changes out of {stats['total']} posts")

        logger.info(f"\nTOTAL CHANGES: {total_changes} out of {len(self.recategorized_posts)} posts")
        logger.info(f"  ({(total_changes/len(self.recategorized_posts)*100):.1f}% of posts)")

        # Confidence breakdown
        logger.info("\nRECOMMENDATION CONFIDENCE:")
        confidence_counts: Dict[str, int] = {}
        for post in self.recategorized_posts:
            conf = post.get('recategorization_confidence', 'Unknown')
            confidence_counts[conf] = confidence_counts.get(conf, 0) + 1

        for conf in ['High', 'Medium', 'Low', 'Unknown']:
            count = confidence_counts.get(conf, 0)
            if count > 0:
                logger.info(f"  {conf}: {count} posts ({(count/len(self.recategorized_posts)*100):.1f}%)")

    def run(self):
        """Run the complete recategorization analysis: load, analyze, summarize, export.

        Exits the process with status 1 if loading or analysis fails.
        """
        logger.info("="*70)
        logger.info("AI-POWERED POST RECATEGORIZATION")
        logger.info("="*70)

        # Load CSV
        if not self.load_csv():
            sys.exit(1)

        # Analyze posts
        if not self.analyze_all_posts():
            logger.error("Failed to analyze posts")
            sys.exit(1)

        # Print summary
        self.print_summary()

        # Export results
        logger.info("\n" + "="*70)
        logger.info("EXPORTING RESULTS")
        logger.info("="*70)

        main_file, changes_file = self.export_with_recommendations()

        logger.info("\n" + "="*70)
        logger.info("NEXT STEPS")
        logger.info("="*70)
        logger.info("\n1. Review recategorization recommendations:")
        logger.info(f"   {main_file}")
        logger.info("\n2. Review only posts with category changes:")
        if changes_file:
            logger.info(f"   {changes_file}")
        else:
            logger.info("   No changes recommended")
        logger.info("\n3. Apply recommendations:")
        logger.info("   Use categorization automation script (coming soon)")
        logger.info("   Or manually update categories in WordPress")

        logger.info("\n✓ Recategorization analysis complete!")
||
|
||
def main():
    """CLI entry point: parse the CSV path argument and run the recategorizer."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description='Re-categorize posts using Claude AI for better organization'
    )
    arg_parser.add_argument('csv_file', help='Path to exported posts CSV file')
    options = arg_parser.parse_args()

    PostRecategorizer(options.csv_file).run()


if __name__ == '__main__':
    main()
|