Files
seo/scripts/multi_site_seo_analyzer.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

779 lines
29 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Multi-Site WordPress SEO Analyzer
Fetches posts from 3 WordPress sites, analyzes titles and meta descriptions,
and provides AI-powered optimization recommendations.
"""
import os
import csv
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import requests
from requests.auth import HTTPBasicAuth
import time
from config import Config
import sys
# Setup logging: timestamped INFO-level messages on stderr for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by the class and the helper functions below.
logger = logging.getLogger(__name__)
class MultiSiteSEOAnalyzer:
    """Analyzes titles and meta descriptions across multiple WordPress sites."""

    def __init__(self, progressive_csv: bool = True):
        """
        Initialize the analyzer.

        Args:
            progressive_csv: If True, write CSV progressively as posts are analyzed
        """
        # Per-site connection settings (url/username/password) from project Config.
        self.sites_config = Config.WORDPRESS_SITES
        # Raw posts keyed by site name, filled by fetch_posts_from_site().
        self.posts_data = {}
        # Flat list of per-post analysis dicts, filled by analyze_all_sites().
        self.analysis_results = []
        # Number of OpenRouter API calls made (for the summary report).
        self.api_calls = 0
        # Accumulated AI spend in USD, estimated from token usage.
        self.ai_cost = 0.0
        self.openrouter_api_key = Config.OPENROUTER_API_KEY
        self.progressive_csv = progressive_csv
        # Open file handle / csv.DictWriter, used only in progressive mode.
        self.csv_file = None
        self.csv_writer = None
    def fetch_posts_from_site(self, site_name: str, site_config: Dict,
                              include_drafts: bool = False) -> List[Dict]:
        """
        Fetch posts from a WordPress site using REST API.

        Pagination note: each status is fetched separately, and a 400
        response mid-pagination is treated as "past the last page" —
        presumably because WP returns 400 Bad Request for out-of-range
        pages (TODO confirm against the target WP versions).

        Args:
            site_name: Name of the site (domain)
            site_config: Configuration dict with url, username, password
            include_drafts: If True, fetch both published and draft posts

        Returns:
            List of posts with metadata
        """
        logger.info(f"Fetching posts from {site_name}...")
        posts = []
        base_url = site_config['url'].rstrip('/')
        api_url = f"{base_url}/wp-json/wp/v2/posts"
        auth = HTTPBasicAuth(site_config['username'], site_config['password'])
        # Determine which statuses to fetch
        statuses = ['publish', 'draft'] if include_drafts else ['publish']
        status_str = ', '.join(statuses).replace('publish', 'published').replace('draft', 'drafts')
        # Fetch each status separately to avoid 400 Bad Request on pagination
        for status in statuses:
            page = 1
            status_count = 0
            use_fields = True  # Try with _fields first, fallback without if 400
            while True:
                params = {
                    'page': page,
                    'per_page': 100,
                    'status': status,  # Single status per request
                }
                # Add _fields only if not getting 400 errors
                if use_fields:
                    params['_fields'] = 'id,title,slug,link,meta,status'
                try:
                    response = requests.get(api_url, params=params, auth=auth, timeout=10)
                    response.raise_for_status()
                    page_posts = response.json()
                    if not page_posts:
                        # Empty page: clean end of pagination.
                        break
                    posts.extend(page_posts)
                    status_count += len(page_posts)
                    logger.info(f" ✓ Fetched {len(page_posts)} {status} posts (page {page})")
                    page += 1
                    time.sleep(Config.API_DELAY_SECONDS)
                except requests.exceptions.HTTPError as e:
                    # Handle 400 errors gracefully
                    if response.status_code == 400 and use_fields and page == 1:
                        # Retry page 1 without _fields parameter
                        # (some sites/plugins reject the _fields query).
                        logger.info(f" ⓘ Retrying without _fields parameter...")
                        use_fields = False
                        continue
                    elif response.status_code == 400:
                        # Pagination or API limit reached
                        logger.info(f" ⓘ API limit reached (fetched {status_count} {status} posts)")
                        break
                    else:
                        logger.error(f"Error fetching page {page} from {site_name}: {e}")
                        break
                except requests.exceptions.RequestException as e:
                    # Network-level failure (timeout, DNS, connection reset).
                    logger.error(f"Error fetching from {site_name}: {e}")
                    break
            if status_count > 0:
                logger.info(f" ✓ Total {status} posts: {status_count}")
        logger.info(f"✓ Total posts from {site_name} ({status_str}): {len(posts)}")
        return posts
def extract_seo_data(self, post: Dict, site_name: str) -> Dict:
"""
Extract SEO-relevant data from a post.
Args:
post: Post data from WordPress API
site_name: Name of the site
Returns:
Dict with extracted SEO data
"""
title = post.get('title', {})
if isinstance(title, dict):
title = title.get('rendered', '')
# Get meta description from various SEO plugins
# Check multiple possible locations where different plugins store meta descriptions
meta_desc = ''
if isinstance(post.get('meta'), dict):
meta_dict = post['meta']
# Try various SEO plugin fields (order matters - most specific first)
meta_desc = (
meta_dict.get('_yoast_wpseo_metadesc', '') or # Yoast SEO
meta_dict.get('_rank_math_description', '') or # Rank Math
meta_dict.get('_aioseo_description', '') or # All in One SEO
meta_dict.get('description', '') or # Standard field
meta_dict.get('_meta_description', '') or # Alternative
meta_dict.get('metadesc', '') # Alternative
)
# Get post status
status = post.get('status', 'publish')
return {
'site': site_name,
'post_id': post['id'],
'title': title.strip(),
'slug': post.get('slug', ''),
'url': post.get('link', ''),
'meta_description': meta_desc.strip(),
'status': status,
}
def analyze_title(self, title: str) -> Dict:
"""
Analyze title for SEO best practices.
Args:
title: Post title
Returns:
Dict with analysis results
"""
length = len(title)
# SEO best practices
issues = []
recommendations = []
score = 100
if length < 30:
issues.append(f"Too short ({length})")
recommendations.append("Expand title to 50-60 characters")
score -= 20
elif length < 50:
recommendations.append("Could be slightly longer (target 50-60)")
score -= 5
elif length > 70:
issues.append(f"Too long ({length})")
recommendations.append("Consider shortening to 50-70 characters")
score -= 15
# Check for power words
power_words = ['best', 'ultimate', 'complete', 'essential', 'proven',
'effective', 'powerful', 'expert', 'guide', 'tutorial',
'how to', 'step by step', 'top 10', 'ultimate guide']
has_power_word = any(word.lower() in title.lower() for word in power_words)
if not has_power_word:
recommendations.append("Consider adding a power word (best, complete, guide, etc.)")
score -= 10
# Check for numbers
if not any(c.isdigit() for c in title):
recommendations.append("Consider adding a number (e.g., 'Top 5', '2025')")
score -= 5
# Check for emojis or special chars that might break rendering
special_chars = set(title) - set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 -:')
if special_chars:
recommendations.append(f"Check special characters: {special_chars}")
score -= 5
return {
'length': length,
'issues': issues,
'recommendations': recommendations,
'score': max(0, score),
'has_power_word': has_power_word,
'has_number': any(c.isdigit() for c in title)
}
def analyze_meta_description(self, meta_desc: str) -> Dict:
"""
Analyze meta description for SEO best practices.
Args:
meta_desc: Meta description text
Returns:
Dict with analysis results
"""
length = len(meta_desc)
issues = []
recommendations = []
score = 100
if not meta_desc or length == 0:
issues.append("Missing meta description")
recommendations.append("Write a 120-160 character meta description")
score = 0
else:
if length < 100:
issues.append(f"Too short ({length})")
recommendations.append("Expand to 120-160 characters")
score -= 20
elif length < 120:
recommendations.append("Could be slightly longer (target 120-160)")
score -= 5
elif length > 160:
issues.append(f"Too long ({length})")
recommendations.append("Shorten to 120-160 characters")
score -= 15
# Check for CTA
cta_words = ['learn', 'discover', 'read', 'explore', 'find', 'get',
'download', 'check', 'see', 'watch', 'try', 'start']
has_cta = any(word.lower() in meta_desc.lower() for word in cta_words)
if not has_cta:
recommendations.append("Consider adding a call-to-action")
score -= 5
return {
'length': length,
'is_missing': not meta_desc,
'issues': issues,
'recommendations': recommendations,
'score': max(0, score),
}
def calculate_overall_score(self, title_analysis: Dict, meta_analysis: Dict) -> float:
"""Calculate overall SEO score (0-100)."""
title_weight = 0.4
meta_weight = 0.6
return (title_analysis['score'] * title_weight) + (meta_analysis['score'] * meta_weight)
    def generate_ai_recommendations(self, post_data: Dict, title_analysis: Dict,
                                    meta_analysis: Dict) -> Optional[str]:
        """
        Use Claude AI to generate specific optimization recommendations.

        Sends one chat-completion request to OpenRouter and accumulates the
        estimated cost on ``self.ai_cost``.

        Args:
            post_data: Post data
            title_analysis: Title analysis results
            meta_analysis: Meta description analysis

        Returns:
            AI-generated recommendations or None if AI disabled
        """
        # No API key configured -> AI features are disabled.
        if not self.openrouter_api_key:
            return None
        prompt = f"""Analyze this blog post and provide specific SEO optimization recommendations:
Post Title: "{post_data['title']}"
Current Meta Description: "{post_data['meta_description'] or 'MISSING'}"
URL: {post_data['url']}
Title Analysis:
- Length: {title_analysis['length']} characters (target: 50-70)
- Issues: {', '.join(title_analysis['issues']) or 'None'}
Meta Description Analysis:
- Length: {meta_analysis['length']} characters (target: 120-160)
- Issues: {', '.join(meta_analysis['issues']) or 'None'}
Provide 2-3 specific, actionable recommendations to improve SEO. Focus on:
1. If title needs improvement: suggest a better title
2. If meta description is missing: write one
3. If both are weak: provide both improved versions
Format as:
- Recommendation 1: [specific action]
- Recommendation 2: [specific action]
etc.
Be concise and specific."""
        try:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "anthropic/claude-3.5-sonnet",
                    "messages": [
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": 0.7,
                },
                timeout=30
            )
            response.raise_for_status()
            result = response.json()
            self.api_calls += 1
            # Track cost (Claude 3.5 Sonnet: $3/$15 per 1M tokens)
            usage = result.get('usage', {})
            input_tokens = usage.get('prompt_tokens', 0)
            output_tokens = usage.get('completion_tokens', 0)
            self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000
            recommendations = result['choices'][0]['message']['content'].strip()
            return recommendations
        except Exception as e:
            # Best-effort: any failure (network, auth, unexpected payload)
            # is logged and the caller simply gets no recommendations.
            logger.warning(f"AI recommendation failed: {e}")
            return None
def _setup_progressive_csv(self) -> Optional[Tuple]:
"""
Setup CSV file for progressive writing.
Returns:
Tuple of (file_handle, writer) or None if progressive_csv is False
"""
if not self.progressive_csv:
return None
output_dir = Path(__file__).parent.parent / 'output'
output_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_path = output_dir / f'seo_analysis_{timestamp}.csv'
fieldnames = [
'site', 'post_id', 'status', 'title', 'slug', 'url',
'meta_description', 'title_score', 'title_issues',
'title_recommendations', 'meta_score', 'meta_issues',
'meta_recommendations', 'overall_score', 'ai_recommendations',
]
csv_file = open(csv_path, 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
csv_file.flush()
logger.info(f"✓ CSV file created: {csv_path}")
self.csv_file = csv_file
self.csv_writer = writer
return csv_path
def _write_result_to_csv(self, result: Dict) -> None:
"""Write a single result row to CSV file."""
if self.progressive_csv and self.csv_writer:
self.csv_writer.writerow(result)
self.csv_file.flush()
    def analyze_all_sites(self, use_ai: bool = True, top_n: int = 10,
                          include_drafts: bool = False):
        """
        Analyze all configured sites.

        Pipeline: fetch posts per site, score every title/meta description,
        optionally send the ``top_n`` lowest-scoring posts to the AI, then
        sort results by overall score and close the progressive CSV.

        Args:
            use_ai: Whether to use AI for recommendations
            top_n: Number of top priority posts to get AI recommendations for
            include_drafts: If True, include draft posts in analysis
        """
        logger.info(f"Starting analysis of {len(self.sites_config)} sites...")
        if include_drafts:
            logger.info("(Including draft posts)")
        logger.info("")
        all_posts = []
        # Fetch posts from all sites
        for site_name, config in self.sites_config.items():
            posts = self.fetch_posts_from_site(site_name, config, include_drafts=include_drafts)
            if posts:
                self.posts_data[site_name] = posts
                all_posts.extend(posts)
        if not all_posts:
            logger.error("No posts found on any site")
            return
        logger.info(f"\nAnalyzing {len(all_posts)} posts...\n")
        # Setup progressive CSV if enabled
        # (csv_path is unused here; the open handle/writer live on self).
        csv_path = self._setup_progressive_csv()
        # Analyze each post
        for site_name, posts in self.posts_data.items():
            logger.info(f"Analyzing {len(posts)} posts from {site_name}...")
            for idx, post in enumerate(posts, 1):
                seo_data = self.extract_seo_data(post, site_name)
                title_analysis = self.analyze_title(seo_data['title'])
                meta_analysis = self.analyze_meta_description(seo_data['meta_description'])
                overall_score = self.calculate_overall_score(title_analysis, meta_analysis)
                # Flatten list fields with '|' so the row is CSV-friendly.
                result = {
                    **seo_data,
                    'title_score': title_analysis['score'],
                    'title_issues': '|'.join(title_analysis['issues']) or 'None',
                    'title_recommendations': '|'.join(title_analysis['recommendations']),
                    'meta_score': meta_analysis['score'],
                    'meta_issues': '|'.join(meta_analysis['issues']) or 'None',
                    'meta_recommendations': '|'.join(meta_analysis['recommendations']),
                    'overall_score': overall_score,
                    'ai_recommendations': '',
                }
                self.analysis_results.append(result)
                # Write to CSV progressively (before AI recommendations)
                if self.progressive_csv:
                    self._write_result_to_csv(result)
                    logger.debug(f" [{idx}/{len(posts)}] Written: {seo_data['title'][:40]}")
        # Sort by priority (lowest scores first) and get AI recommendations for top posts
        if use_ai:
            self.analysis_results.sort(key=lambda x: x['overall_score'])
            logger.info(f"\nGenerating AI recommendations for top {top_n} posts...\n")
            for idx, result in enumerate(self.analysis_results[:top_n], 1):
                logger.info(f" [{idx}/{top_n}] {result['title'][:50]}...")
                # Rebuild minimal analysis dicts from the flattened row.
                ai_recs = self.generate_ai_recommendations(
                    result,
                    {
                        'score': result['title_score'],
                        'issues': result['title_issues'].split('|'),
                        'length': len(result['title'])
                    },
                    {
                        'score': result['meta_score'],
                        'issues': result['meta_issues'].split('|'),
                        'length': len(result['meta_description'])
                    }
                )
                result['ai_recommendations'] = ai_recs or ''
                # Update CSV with AI recommendations if using progressive CSV
                if self.progressive_csv and self.csv_writer:
                    # Find and update the row in the CSV by re-writing it
                    # This is a limitation of CSV - we'll update in final export instead
                    pass
                time.sleep(0.5)  # Rate limiting
        # Sort by overall score for final export
        self.analysis_results.sort(key=lambda x: x['overall_score'])
        # Close progressive CSV if open (will be re-written with final data including AI recs)
        if self.progressive_csv and self.csv_file:
            self.csv_file.close()
            self.csv_file = None
            self.csv_writer = None
    def export_results(self, output_file: Optional[str] = None):
        """
        Export analysis results to CSV.

        Args:
            output_file: Output file path (optional). When omitted and
                progressive mode is on, the most recent progressive CSV is
                overwritten with the final data (so AI recommendations are
                included); otherwise a new timestamped file is created.
        """
        if not output_file:
            output_dir = Path(__file__).parent.parent / 'output'
            output_dir.mkdir(parents=True, exist_ok=True)
            if self.progressive_csv:
                # Timestamp is only a fallback name; normally we reuse the
                # progressive file found by the glob below.
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                # Find the most recent seo_analysis file
                # NOTE(review): glob also matches *_final.csv files — the
                # lexicographic sort is assumed to keep the newest last.
                files = sorted(output_dir.glob('seo_analysis_*.csv'))
                if files:
                    output_file = files[-1]  # Use the most recent one
                else:
                    output_file = output_dir / f'seo_analysis_{timestamp}_final.csv'
            else:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                output_file = output_dir / f'seo_analysis_{timestamp}.csv'
        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        if not self.analysis_results:
            logger.error("No results to export")
            return
        fieldnames = [
            'site',
            'post_id',
            'status',
            'title',
            'slug',
            'url',
            'meta_description',
            'title_score',
            'title_issues',
            'title_recommendations',
            'meta_score',
            'meta_issues',
            'meta_recommendations',
            'overall_score',
            'ai_recommendations',
        ]
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for result in self.analysis_results:
                # Missing keys become empty cells rather than raising.
                writer.writerow({field: result.get(field, '') for field in fieldnames})
        if self.progressive_csv:
            logger.info(f"\n✓ Final results saved to: {output_file}")
        else:
            logger.info(f"\n✓ Results exported to: {output_file}")
        # Also export as a summary report
        self.export_summary_report(output_file)
def export_summary_report(self, csv_file: Path):
"""Export a markdown summary report."""
report_file = csv_file.parent / f"{csv_file.stem}_summary.md"
# Group by site
by_site = {}
for result in self.analysis_results:
site = result['site']
if site not in by_site:
by_site[site] = []
by_site[site].append(result)
with open(report_file, 'w', encoding='utf-8') as f:
f.write("# Multi-Site SEO Analysis Report\n\n")
f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
# Summary stats
total_posts = len(self.analysis_results)
published = sum(1 for r in self.analysis_results if r['status'] == 'publish')
drafts = sum(1 for r in self.analysis_results if r['status'] == 'draft')
avg_score = sum(r['overall_score'] for r in self.analysis_results) / total_posts if total_posts > 0 else 0
f.write("## Summary\n\n")
f.write(f"- **Total Posts:** {total_posts}\n")
if published > 0:
f.write(f" - Published: {published}\n")
if drafts > 0:
f.write(f" - Drafts: {drafts}\n")
f.write(f"- **Average SEO Score:** {avg_score:.1f}/100\n")
f.write(f"- **API Calls Made:** {self.api_calls}\n")
f.write(f"- **AI Cost:** ${self.ai_cost:.4f}\n")
f.write(f"- **Sites Analyzed:** {len(by_site)}\n\n")
# Priority issues
missing_meta = sum(1 for r in self.analysis_results if r['meta_score'] == 0)
weak_titles = sum(1 for r in self.analysis_results if r['title_score'] < 50)
weak_meta = sum(1 for r in self.analysis_results if r['meta_score'] < 50 and r['meta_score'] > 0)
f.write("## Priority Issues\n\n")
f.write(f"- **Missing Meta Descriptions:** {missing_meta} posts\n")
f.write(f"- **Weak Titles (Score < 50):** {weak_titles} posts\n")
f.write(f"- **Weak Meta (Score < 50):** {weak_meta} posts\n\n")
# By site
for site_name, posts in by_site.items():
avg = sum(p['overall_score'] for p in posts) / len(posts)
f.write(f"## {site_name}\n\n")
f.write(f"- **Posts:** {len(posts)}\n")
f.write(f"- **Avg Score:** {avg:.1f}/100\n")
f.write(f"- **Missing Meta:** {sum(1 for p in posts if p['meta_score'] == 0)}\n\n")
# Top 5 to optimize
f.write("### Top 5 Posts to Optimize\n\n")
for idx, post in enumerate(posts[:5], 1):
f.write(f"{idx}. **{post['title']}** (Score: {post['overall_score']:.0f})\n")
f.write(f" - URL: {post['url']}\n")
if post['meta_issues'] != 'None':
f.write(f" - Meta Issues: {post['meta_issues']}\n")
if post['ai_recommendations']:
f.write(f" - Recommendations: {post['ai_recommendations'].split(chr(10))[0]}\n")
f.write("\n")
f.write("\n## Legend\n\n")
f.write("- **Title Score:** Evaluates length, power words, numbers, readability\n")
f.write("- **Meta Score:** Evaluates presence, length, call-to-action\n")
f.write("- **Overall Score:** 40% title + 60% meta description\n")
f.write("- **Optimal Ranges:**\n")
f.write(" - Title: 50-70 characters\n")
f.write(" - Meta: 120-160 characters\n")
logger.info(f"✓ Summary report: {report_file}")
    def run(self, use_ai: bool = True, top_n: int = 10, include_drafts: bool = False):
        """Run complete analysis.

        Fetches, analyzes and exports in one step, then logs a short
        completion summary. Exits the process with status 1 on any
        unhandled error.

        Args:
            use_ai: Whether to generate AI recommendations
            top_n: Number of lowest-scoring posts to send to the AI
            include_drafts: If True, analyze drafts as well as published posts
        """
        try:
            self.analyze_all_sites(use_ai=use_ai, top_n=top_n, include_drafts=include_drafts)
            self.export_results()
            logger.info("\n" + "="*60)
            logger.info("ANALYSIS COMPLETE")
            logger.info("="*60)
            logger.info(f"Total posts analyzed: {len(self.analysis_results)}")
            published = sum(1 for r in self.analysis_results if r['status'] == 'publish')
            drafts = sum(1 for r in self.analysis_results if r['status'] == 'draft')
            if published > 0:
                logger.info(f" - Published: {published}")
            if drafts > 0:
                logger.info(f" - Drafts: {drafts}")
            logger.info(f"AI recommendations: {sum(1 for r in self.analysis_results if r['ai_recommendations'])}")
            logger.info(f"AI cost: ${self.ai_cost:.4f}")
        except Exception as e:
            # Broad catch is intentional: this is the top-level CLI boundary.
            logger.error(f"Analysis failed: {e}", exc_info=True)
            sys.exit(1)
def check_meta_fields(site_url: str, username: str, password: str) -> None:
    """
    Diagnostic function to check what meta fields are available on a site.

    Fetches one published post and logs every key found in its ``meta``
    object, so you can see which SEO-plugin fields the REST API exposes.

    Args:
        site_url: WordPress site URL
        username: WordPress username
        password: WordPress app password
    """
    logger.info(f"\n{'='*60}")
    logger.info("META FIELD DIAGNOSTIC")
    logger.info(f"{'='*60}\n")
    logger.info(f"Site: {site_url}")
    logger.info("Checking available meta fields in first post...\n")

    endpoint = f"{site_url.rstrip('/')}/wp-json/wp/v2/posts"
    credentials = HTTPBasicAuth(username, password)

    try:
        response = requests.get(
            endpoint,
            params={'per_page': 1, 'status': 'publish'},
            auth=credentials,
            timeout=10,
        )
        response.raise_for_status()
        posts = response.json()
        if not posts:
            logger.error("No posts found")
            return

        sample = posts[0]
        logger.info(f"Post: {sample.get('title', {}).get('rendered', 'N/A')}")
        logger.info(f"\nAvailable meta fields:")

        meta = sample.get('meta')
        if isinstance(meta, dict):
            if meta:
                # Sorted for stable, readable output; values truncated.
                for key, value in sorted(meta.items()):
                    logger.info(f"{key}: {str(value)[:60]}")
            else:
                logger.info(" (No meta fields found)")
        else:
            logger.info(" (Meta is not a dictionary)")

        logger.info(f"\nFull meta object:")
        logger.info(json.dumps(sample.get('meta', {}), indent=2)[:500])
    except Exception as e:
        logger.error(f"Error: {e}")
def main():
    """Main entry point: parse CLI arguments and run the analyzer.

    In ``--diagnose`` mode the script only inspects one site's meta fields
    and exits; otherwise it runs the full multi-site analysis.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Analyze SEO across multiple WordPress sites'
    )
    parser.add_argument(
        '--no-ai',
        action='store_true',
        help='Skip AI recommendations to save cost'
    )
    parser.add_argument(
        '--top-n',
        type=int,
        default=10,
        help='Number of top posts to get AI recommendations for'
    )
    # NOTE(review): --output is parsed but currently unused; export_results()
    # picks its own path. Kept for CLI backward compatibility.
    parser.add_argument(
        '--output',
        help='Output CSV file path'
    )
    parser.add_argument(
        '--include-drafts',
        action='store_true',
        help='Include draft posts in analysis (published + drafts)'
    )
    parser.add_argument(
        '--no-progressive',
        action='store_true',
        help='Disable real-time CSV writing (write only at end)'
    )
    parser.add_argument(
        '--diagnose',
        help='Diagnose meta fields for a site (URL). Example: --diagnose https://www.mistergeek.net'
    )
    args = parser.parse_args()

    # Diagnostic mode: inspect one site's meta fields and exit.
    # (Removed an unused `from getpass import getpass` import here.)
    if args.diagnose:
        username = Config.WORDPRESS_USERNAME
        password = Config.WORDPRESS_APP_PASSWORD
        if not username or not password:
            logger.error("WORDPRESS_USERNAME and WORDPRESS_APP_PASSWORD must be set in .env")
            sys.exit(1)
        check_meta_fields(args.diagnose, username, password)
        sys.exit(0)

    analyzer = MultiSiteSEOAnalyzer(progressive_csv=not args.no_progressive)
    analyzer.run(use_ai=not args.no_ai, top_n=args.top_n, include_drafts=args.include_drafts)


if __name__ == '__main__':
    main()