Files
seo/scripts/category_manager.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

614 lines
25 KiB
Python

#!/usr/bin/env python3
"""
WordPress Category Management Script
Fetches all categories from WordPress sites, proposes new categories,
and allows assigning posts to categories or websites using AI recommendations.
"""
import csv
import json
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional
import requests
from requests.auth import HTTPBasicAuth
import time
from datetime import datetime
from config import Config
# Configure process-wide logging once, at import time: INFO and above,
# each record prefixed with its timestamp and level name.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every class and function in this script.
logger = logging.getLogger(__name__)
class AICategoryAdvisor:
    """AI-powered advisor for category and site recommendations.

    Sends batches of WordPress posts to the OpenRouter chat-completions
    endpoint and extracts the JSON array of per-post recommendations from
    the reply. Running totals of API calls and estimated cost are kept on
    the instance so the caller can report them.
    """

    # OpenRouter chat-completions endpoint used for every request.
    API_URL = "https://openrouter.ai/api/v1/chat/completions"
    # Seconds to wait for the AI reply before giving up on a batch.
    REQUEST_TIMEOUT = 60
    # Characters of rendered post content included in the prompt per post.
    CONTENT_PREVIEW_CHARS = 500
    # Cost estimate in USD per 1M tokens (Claude 3.5 Sonnet pricing).
    # NOTE: applied whatever model Config.AI_MODEL selects, so the reported
    # cost is only an estimate for other models.
    INPUT_COST_PER_MTOK = 3
    OUTPUT_COST_PER_MTOK = 15

    def __init__(self):
        """Read the API key and model name from Config and reset counters."""
        self.openrouter_api_key = Config.OPENROUTER_API_KEY
        self.ai_model = Config.AI_MODEL
        self.api_calls = 0  # number of API responses received
        self.ai_cost = 0.0  # estimated spend in USD

    def get_ai_category_recommendations(self, posts_batch: List[Dict]) -> Optional[List[Dict]]:
        """
        Get AI recommendations for category assignments.

        Args:
            posts_batch: List of posts to analyze (dicts with 'id', 'title',
                'content' and 'categories' keys, as read below)

        Returns:
            List of recommendation dicts, one per post the AI answered for.
            The list is empty when the batch is empty or the reply held no
            parseable JSON array. None is returned when the API key is not
            set or the request/response handling failed.
        """
        if not self.openrouter_api_key:
            logger.error("OPENROUTER_API_KEY not set")
            return None
        if not posts_batch:
            # Nothing to analyze: do not pay for a request with an empty prompt.
            return []

        prompt = self._build_prompt(self._format_posts(posts_batch))

        try:
            logger.info(" Sending batch to AI for category recommendations...")
            response = requests.post(
                self.API_URL,
                headers={
                    "Authorization": f"Bearer {self.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.ai_model,
                    "messages": [
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": 0.3,  # Lower temp for more consistent recommendations
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            result = response.json()
            self.api_calls += 1

            # Track cost; tolerate a missing or null 'usage' object.
            usage = result.get('usage') or {}
            input_tokens = usage.get('prompt_tokens', 0)
            output_tokens = usage.get('completion_tokens', 0)
            self.ai_cost += (
                input_tokens * self.INPUT_COST_PER_MTOK
                + output_tokens * self.OUTPUT_COST_PER_MTOK
            ) / 1_000_000

            recommendations_text = result['choices'][0]['message']['content'].strip()
            logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})")
            return self._parse_recommendations(recommendations_text)
        except Exception as e:
            # Deliberate best-effort boundary: any network, HTTP or
            # response-shape failure is logged and reported as None so the
            # caller can fall back to keyword-based suggestions.
            logger.error(f"Error getting AI recommendations: {e}")
            return None

    def _format_posts(self, posts_batch: List[Dict]) -> str:
        """Render the batch as the numbered plain-text list used in the prompt."""
        formatted_posts = []
        for position, post in enumerate(posts_batch, 1):
            title = post.get('title', {}).get('rendered', 'Untitled')
            content = post.get('content', {}).get('rendered', '')[:self.CONTENT_PREVIEW_CHARS]
            current_categories = post.get('categories', [])
            formatted_posts.append(
                # .get('id') rather than ['id']: one malformed post must not
                # abort the whole run with an uncaught KeyError.
                f"{position}. POST ID: {post.get('id')}\n"
                f" Title: {title}\n"
                f" Content Preview: {content}...\n"
                f" Current Categories: {current_categories}\n"
            )
        return "\n".join(formatted_posts)

    @staticmethod
    def _build_prompt(posts_text: str) -> str:
        """Return the full instruction prompt with the formatted posts embedded."""
        return f"""Analyze these blog posts and provide category recommendations.
Website Strategy:
- mistergeek.net: High-value topics (VPN, Software, Gaming, General Tech, SEO, Content Marketing)
- webscroll.fr: Torrenting, File-Sharing, Tracker guides (niche audience)
- hellogeek.net: Low-traffic, experimental, off-brand, or niche content
{posts_text}
For EACH post, provide a JSON object with:
{{
"post_id": <id>,
"recommended_category": "<SUGGESTED_CATEGORY>",
"recommended_site": "<SITE_NAME>",
"reason": "<Brief reason for recommendation>",
"confidence": "<High|Medium|Low>"
}}
Return ONLY a JSON array. Example:
[
{{"post_id": 2845, "recommended_category": "VPN", "recommended_site": "mistergeek.net", "reason": "Core VPN topic", "confidence": "High"}},
{{"post_id": 1234, "recommended_category": "Torrenting", "recommended_site": "webscroll.fr", "reason": "Torrent tracker content", "confidence": "High"}}
]
Analyze all posts and provide recommendations for EVERY post in the batch."""

    def _parse_recommendations(self, recommendations_json: str) -> List[Dict]:
        """Parse JSON recommendations from AI.

        The reply may wrap the array in prose or a Markdown code fence, so
        the text between the first '[' and the last ']' is parsed.

        Args:
            recommendations_json: Raw text of the AI reply

        Returns:
            The dict elements of the parsed array; an empty list when no
            array is found or it is not valid JSON.
        """
        start_idx = recommendations_json.find('[')
        end_idx = recommendations_json.rfind(']')
        if start_idx == -1 or end_idx < start_idx:
            logger.error("Could not find JSON array in response")
            return []

        try:
            parsed = json.loads(recommendations_json[start_idx:end_idx + 1])
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON recommendations: {e}")
            logger.debug(f"Response was: {recommendations_json[:500]}")
            return []

        # Callers use dict access on every entry; drop anything else the
        # model may have put in the array (strings, numbers, nulls).
        recommendations = [rec for rec in parsed if isinstance(rec, dict)]
        dropped = len(parsed) - len(recommendations)
        if dropped:
            logger.warning(f"Ignored {dropped} non-object entries in AI recommendations")
        return recommendations
class CategoryManager:
    """Manage WordPress categories across multiple sites.

    Fetches categories and published posts from every site in
    Config.WORDPRESS_SITES, proposes new categories, asks the AI advisor
    (with a keyword fallback) where each post belongs, and exports the
    results as timestamped CSV files for manual review.
    """

    # Number of posts sent to the AI advisor per request.
    AI_BATCH_SIZE = 10
    # WordPress REST API paging and politeness settings.
    PER_PAGE = 100
    REQUEST_TIMEOUT = 10
    PAGE_DELAY = 0.5  # seconds to pause between consecutive pages

    # Column order of the category assignment CSV.
    ASSIGNMENT_FIELDS = [
        'site', 'post_id', 'post_title', 'current_categories',
        'proposed_category', 'proposed_site', 'reason', 'confidence',
        'content_preview', 'status',
    ]

    # Static category proposals per site.
    CATEGORY_PROPOSALS = {
        'mistergeek.net': [
            {'name': 'VPN Reviews', 'description': 'Reviews of VPN services', 'parent': 0},
            {'name': 'Software Tutorials', 'description': 'Step-by-step software guides', 'parent': 0},
            {'name': 'Tech News', 'description': 'Latest technology news', 'parent': 0},
            {'name': 'Cybersecurity', 'description': 'Security tips and tools', 'parent': 0},
        ],
        'webscroll.fr': [
            {'name': 'Torrent Clients', 'description': 'Reviews of torrent clients', 'parent': 0},
            {'name': 'Privacy Tools', 'description': 'Privacy-focused tools and services', 'parent': 0},
            {'name': 'File Sharing Guide', 'description': 'Guides on file sharing methods', 'parent': 0},
        ],
        'hellogeek.net': [
            {'name': 'Experimental Tech', 'description': 'New and experimental tech', 'parent': 0},
            {'name': 'Random Thoughts', 'description': 'Opinion and commentary posts', 'parent': 0},
            {'name': 'Testing Zone', 'description': 'Posts for testing purposes', 'parent': 0},
        ],
    }

    # Site-specific keyword lists for the fallback suggestion. Order matters:
    # the first category with a matching keyword wins.
    CATEGORY_KEYWORDS = {
        'mistergeek.net': {
            'VPN': ['vpn', 'proxy', 'privacy', 'secure', 'encryption'],
            'Software': ['software', 'app', 'tool', 'download', 'install'],
            'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'],
            'Tech News': ['news', 'update', 'release', 'announced'],
            'Cybersecurity': ['security', 'malware', 'antivirus', 'hacking', 'breach'],
        },
        'webscroll.fr': {
            'Torrent': ['torrent', 'download', 'upload', 'client', 'tracker'],
            'Privacy': ['privacy', 'anonymous', 'tor', 'vpn'],
            'File Sharing': ['share', 'sharing', 'ddl', 'upload'],
        },
        'hellogeek.net': {
            'Opinion': ['think', 'believe', 'opinion', 'view', 'perspective'],
            'Tutorial': ['how to', 'guide', 'tutorial', 'steps', 'instructions'],
            'Review': ['review', 'rating', 'comparison', 'test'],
        },
    }

    def __init__(self):
        """Initialize the category manager with sites from Config."""
        self.sites = Config.WORDPRESS_SITES
        self.categories_by_site = {}    # site name -> category dicts
        self.posts_by_site = {}         # site name -> post dicts
        self.proposed_categories = {}   # site name -> proposed category dicts
        self.category_assignments = []  # one proposal dict per analyzed post
        self.ai_advisor = AICategoryAdvisor()

    # ------------------------------------------------------------------
    # Fetching
    # ------------------------------------------------------------------
    def _fetch_paginated(self, site_name: str, site_config: Dict, resource: str,
                         extra_params: Optional[Dict] = None) -> List[Dict]:
        """Fetch every page of a wp/v2 collection ('categories' or 'posts').

        Paging stops on an empty page, when the Link header no longer
        advertises rel="next", or on a 401/403 reply (items collected so far
        are returned). Any requests exception discards partial results and
        yields an empty list.
        """
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/{resource}"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])
        items = []
        try:
            page = 1
            while True:
                params = {'page': page, 'per_page': self.PER_PAGE}
                if extra_params:
                    params.update(extra_params)
                response = requests.get(api_url, params=params, auth=auth,
                                        timeout=self.REQUEST_TIMEOUT)
                if response.status_code == 401:
                    logger.error(f"Unauthorized access to {site_name}. Check credentials.")
                    break
                if response.status_code == 403:
                    logger.error(f"Forbidden access to {site_name}. Check permissions.")
                    break
                response.raise_for_status()
                page_items = response.json()
                if not page_items:
                    break
                items.extend(page_items)
                logger.info(f" Page {page}: Got {len(page_items)} {resource}")
                # Check if there are more pages
                if 'rel="next"' not in response.headers.get('Link', ''):
                    break
                page += 1
                time.sleep(self.PAGE_DELAY)
            logger.info(f"✓ Total {resource} from {site_name}: {len(items)}")
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching {resource} from {site_name}: {e}")
            return []
        return items

    def fetch_categories_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
        """
        Fetch all categories from a WordPress site.

        Args:
            site_name: Website name
            site_config: Site configuration dict ('url', 'username', 'password')

        Returns:
            List of categories with metadata; empty on request failure
        """
        logger.info(f"Fetching categories from {site_name}...")
        return self._fetch_paginated(site_name, site_config, 'categories')

    def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
        """
        Fetch posts from a WordPress site to see current category assignments.

        Args:
            site_name: Website name
            site_config: Site configuration dict ('url', 'username', 'password')

        Returns:
            List of published posts with category information; empty on
            request failure
        """
        logger.info(f"Fetching posts from {site_name} to analyze category assignments...")
        return self._fetch_paginated(site_name, site_config, 'posts', {'status': 'publish'})

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------
    def analyze_categories(self):
        """Fetch categories and posts for every site and log a summary."""
        logger.info("\n" + "="*70)
        logger.info("ANALYZING CURRENT CATEGORIES")
        logger.info("="*70)
        for site_name, config in self.sites.items():
            categories = self.fetch_categories_from_site(site_name, config)
            posts = self.fetch_posts_from_site(site_name, config)
            self.categories_by_site[site_name] = categories
            self.posts_by_site[site_name] = posts
            logger.info(f"\n{site_name}:")
            logger.info(f" Categories: {len(categories)}")
            logger.info(f" Posts: {len(posts)}")
            # Show top categories by post count
            if categories:
                logger.info(" Top 10 categories by post count:")
                sorted_cats = sorted(categories, key=lambda c: c.get('count', 0), reverse=True)
                for rank, cat in enumerate(sorted_cats[:10], 1):
                    # .get() keeps this consistent with the sort key above and
                    # tolerates category records missing a field.
                    logger.info(f" {rank}. {cat.get('name', '')} ({cat.get('count', 0)} posts)")

    def propose_new_categories(self):
        """Record and log the static category proposals for each configured site."""
        logger.info("\n" + "="*70)
        logger.info("PROPOSING NEW CATEGORIES")
        logger.info("="*70)
        for site_name in self.sites:
            proposals = self.CATEGORY_PROPOSALS.get(site_name)
            if proposals is None:
                continue
            # Copy so per-instance edits never leak into the class-level table.
            self.proposed_categories[site_name] = [dict(cat) for cat in proposals]
            logger.info(f"\n{site_name} - Proposed categories:")
            for cat in proposals:
                logger.info(f" - {cat['name']}: {cat['description']}")

    def create_category_assignment_proposals(self):
        """Create proposals for assigning posts to categories or websites.

        Posts are sent to the AI advisor in batches. A post without a
        matching AI recommendation (or a whole batch when the AI is
        unavailable) gets a keyword-based suggestion instead.
        """
        logger.info("\n" + "="*70)
        logger.info("CREATING CATEGORY ASSIGNMENT PROPOSALS")
        logger.info("="*70)
        for site_name, posts in self.posts_by_site.items():
            logger.info(f"\nAnalyzing posts from {site_name} for category assignments...")
            for start in range(0, len(posts), self.AI_BATCH_SIZE):
                batch = posts[start:start + self.AI_BATCH_SIZE]
                recommendations = self.ai_advisor.get_ai_category_recommendations(batch)
                by_post_id = self._index_recommendations(recommendations)
                for post in batch:
                    ai_rec = by_post_id.get(self._normalize_post_id(post.get('id')))
                    self.category_assignments.append(
                        self._build_assignment(site_name, post, ai_rec)
                    )
        logger.info(f"Created {len(self.category_assignments)} category assignment proposals")

    @staticmethod
    def _normalize_post_id(value):
        """Coerce a post id to int (the AI may return "2845"); None if impossible."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    @classmethod
    def _index_recommendations(cls, recommendations: Optional[List[Dict]]) -> Dict[int, Dict]:
        """Map normalized post id -> first recommendation for that post.

        Non-dict entries and entries without a usable 'post_id' are skipped.
        """
        index = {}
        for rec in recommendations or []:
            if not isinstance(rec, dict):
                continue
            post_id = cls._normalize_post_id(rec.get('post_id'))
            if post_id is not None:
                index.setdefault(post_id, rec)  # first match wins
        return index

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return text cut to `limit` characters, with '...' appended if cut."""
        return text[:limit] + "..." if len(text) > limit else text

    def _build_assignment(self, site_name: str, post: Dict, ai_rec: Optional[Dict]) -> Dict:
        """Build one assignment row from a post and its optional AI recommendation."""
        title = post.get('title', {}).get('rendered', 'Untitled')
        content = post.get('content', {}).get('rendered', '')[:200]  # First 200 chars
        if ai_rec is not None:
            proposed_category = ai_rec.get('recommended_category', 'Uncategorized')
            proposed_site = ai_rec.get('recommended_site', site_name)
            reason = ai_rec.get('reason', '')
            confidence = ai_rec.get('confidence', 'Low')
        else:
            # Fallback to keyword-based suggestion if no AI recommendation
            proposed_category = self._suggest_category_by_content(title + " " + content, site_name)
            proposed_site = site_name
            reason = 'Keyword-based suggestion'
            confidence = 'Low'
        return {
            'site': site_name,
            'post_id': post.get('id'),
            'post_title': self._truncate(title, 50),
            'current_categories': post.get('categories', []),
            'proposed_category': proposed_category,
            'proposed_site': proposed_site,
            'reason': reason,
            'confidence': confidence,
            'content_preview': self._truncate(content, 100),
            'status': 'pending_approval',
        }

    def _suggest_category_by_content(self, content: str, site_name: str) -> str:
        """Suggest a category based on content keywords.

        Matching is a case-insensitive substring test; the first category
        (in table order) with any matching keyword is returned, otherwise
        'Uncategorized'.
        """
        content_lower = content.lower()
        for category, keywords in self.CATEGORY_KEYWORDS.get(site_name, {}).items():
            if any(keyword in content_lower for keyword in keywords):
                return category
        return 'Uncategorized'

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    def _export_path(self, prefix: str) -> Path:
        """Return output/<prefix>_<timestamp>.csv, creating output/ if needed."""
        output_dir = Path(__file__).parent.parent / 'output'
        output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return output_dir / f'{prefix}_{timestamp}.csv'

    def _write_csv(self, prefix: str, fieldnames: List[str], rows, label: str) -> str:
        """Write rows (an iterable of dicts) to a new CSV and return its path."""
        csv_file = self._export_path(prefix)
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        logger.info(f"✓ {label} exported to: {csv_file}")
        return str(csv_file)

    def export_categories_csv(self) -> str:
        """Export current categories to CSV and return the file path."""
        fieldnames = ['site', 'category_id', 'name', 'slug', 'description', 'post_count', 'parent_id']
        rows = (
            {
                'site': site_name,
                'category_id': cat.get('id', ''),
                'name': cat.get('name', ''),
                'slug': cat.get('slug', ''),
                'description': cat.get('description', ''),
                'post_count': cat.get('count', 0),
                'parent_id': cat.get('parent', 0),
            }
            for site_name, categories in self.categories_by_site.items()
            for cat in categories
        )
        return self._write_csv('current_categories', fieldnames, rows, 'Current categories')

    def export_proposed_categories_csv(self) -> str:
        """Export proposed new categories to CSV and return the file path."""
        fieldnames = ['site', 'proposed_category', 'description', 'parent_category', 'reason']
        rows = (
            {
                'site': site_name,
                'proposed_category': cat.get('name', ''),
                'description': cat.get('description', ''),
                'parent_category': cat.get('parent', 0),
                'reason': 'Content analysis and organization improvement',
            }
            for site_name, categories in self.proposed_categories.items()
            for cat in categories
        )
        return self._write_csv('proposed_categories', fieldnames, rows, 'Proposed categories')

    def export_category_assignments_csv(self) -> str:
        """Export category assignment proposals to CSV and return the file path."""
        return self._write_csv('category_assignments', self.ASSIGNMENT_FIELDS,
                               self.category_assignments, 'Category assignments')

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run(self):
        """Run complete category management process."""
        banner = "="*70
        rule = "-"*70  # the original ''*70 was an empty string and printed nothing

        logger.info(banner)
        logger.info("WORDPRESS CATEGORY MANAGEMENT")
        logger.info(banner)
        logger.info("Sites configured: " + ", ".join(self.sites.keys()))
        logger.info("")

        # Analyze current categories
        self.analyze_categories()
        # Propose new categories
        self.propose_new_categories()
        # Create category assignment proposals
        self.create_category_assignment_proposals()

        # Export all data
        logger.info("\n" + banner)
        logger.info("EXPORTING RESULTS")
        logger.info(banner)
        categories_csv = self.export_categories_csv()
        proposed_csv = self.export_proposed_categories_csv()
        assignments_csv = self.export_category_assignments_csv()

        # Print summary
        logger.info("\n" + banner)
        logger.info("CATEGORY MANAGEMENT SUMMARY")
        logger.info(banner)
        total_categories = sum(len(cats) for cats in self.categories_by_site.values())
        logger.info(f"Total current categories: {total_categories}")
        total_proposed = sum(len(props) for props in self.proposed_categories.values())
        logger.info(f"Total proposed categories: {total_proposed}")
        logger.info(f"Category assignment proposals: {len(self.category_assignments)}")

        # AI Advisor stats
        logger.info(f"AI API calls made: {self.ai_advisor.api_calls}")
        logger.info(f"AI cost: ${self.ai_advisor.ai_cost:.4f}")

        logger.info(f"\n{rule}")
        logger.info("Exported files:")
        logger.info(f" • Current categories: {categories_csv}")
        logger.info(f" • Proposed categories: {proposed_csv}")
        logger.info(f" • Category assignments: {assignments_csv}")
        logger.info(rule)
        logger.info("\n✓ Category management complete!")
        logger.info("\nNext steps:")
        # Name the files actually written (they carry a timestamp suffix).
        logger.info(f" 1. Review {Path(proposed_csv).name} for new categories to add")
        logger.info(f" 2. Review {Path(assignments_csv).name} for posts that need re-categorization")
        logger.info(" 3. Manually approve or modify proposals before applying changes")
def main():
    """Command-line entry point: parse arguments, then run the manager."""
    import argparse

    cli = argparse.ArgumentParser(
        description='Manage WordPress categories across multiple sites'
    )
    # No options are defined yet; parsing still provides -h/--help and
    # rejects unexpected arguments before any network work starts.
    cli.parse_args()

    CategoryManager().run()


if __name__ == '__main__':
    main()