Files
seo/scripts/content_strategy_analyzer.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

467 lines
19 KiB
Python

"""
Multi-Site Content Strategy Analyzer
Analyzes all content (published + drafts) across 3 websites.
Recommends optimal distribution and consolidation strategy.
"""
import csv
import json
import argparse
from pathlib import Path
from collections import defaultdict
from datetime import datetime
class ContentStrategyAnalyzer:
    """Analyze and optimize content distribution across multiple sites."""

    def __init__(self):
        """Initialize analyzer.

        Creates the output directory tree (output/analysis, output/reports,
        output/logs) relative to the current working directory and an
        in-memory message buffer used by log().
        """
        self.output_dir = Path('output')
        # parents=True makes each call self-sufficient: a subdirectory can be
        # created even if the root 'output' directory does not exist yet,
        # instead of depending on a prior mkdir having run first.
        for subdir in ('analysis', 'reports', 'logs'):
            (self.output_dir / subdir).mkdir(parents=True, exist_ok=True)
        # Transcript of everything printed via log(), kept for later export.
        self.logs = []
def log(self, message):
    """Echo *message* to stdout and keep a copy in the in-memory buffer."""
    print(message)
    self.logs.append(message)
def load_wordpress_posts(self, csv_path):
    """Load published WordPress posts from *csv_path*.

    Accepts several header spellings (ID/post_id, Title/title/post_title,
    URL/url/post_url, ...) so exports from different tools can be ingested
    unchanged. Rows without an id are skipped.

    Returns:
        dict: post_id -> post record; empty when the file is missing or
        unreadable. Errors are logged, never raised.
    """

    def to_int(value):
        # A single malformed cell (e.g. "N/A") must not abort the whole
        # import; fall back to 0 instead of letting ValueError escape.
        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    def to_float(value):
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return 0.0

    posts = {}
    if not csv_path.exists():
        self.log(f"⚠️ WordPress posts file not found: {csv_path}")
        return posts
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                post_id = row.get('ID') or row.get('post_id')
                if not post_id:
                    continue
                posts[post_id] = {
                    'source': 'wordpress',
                    'status': 'published',
                    'title': row.get('Title') or row.get('title') or row.get('post_title') or '',
                    'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
                    'author': row.get('Author') or row.get('author') or 'Unknown',
                    'traffic': to_int(row.get('traffic', 0)),
                    'impressions': to_int(row.get('impressions', 0)),
                    'position': to_float(row.get('avg_position', 0)),
                    'category': row.get('Category') or row.get('category') or '',
                }
        self.log(f"✓ Loaded {len(posts)} published WordPress posts")
    except Exception as e:
        self.log(f"❌ Error reading WordPress posts: {e}")
    return posts
def load_draft_posts(self, csv_path):
    """Load draft/unpublished posts into a dict keyed by post id."""

    def first_of(row, *keys, fallback=''):
        # Return the first truthy value among the candidate column names.
        for key in keys:
            value = row.get(key)
            if value:
                return value
        return fallback

    posts = {}
    if not csv_path.exists():
        self.log(f"⚠️ Draft posts file not found: {csv_path}")
        return posts
    try:
        with open(csv_path, 'r', encoding='utf-8') as handle:
            for row in csv.DictReader(handle):
                post_id = first_of(row, 'ID', 'post_id', fallback=None)
                if not post_id:
                    continue
                posts[post_id] = {
                    'source': 'draft',
                    'status': 'draft',
                    'title': first_of(row, 'Title', 'title', 'post_title'),
                    'url': first_of(row, 'URL', 'url', 'post_url'),
                    'author': first_of(row, 'Author', 'author', fallback='Unknown'),
                    'traffic': 0,  # drafts have no search traffic
                    'impressions': 0,
                    'position': 0,
                    'category': first_of(row, 'Category', 'category'),
                }
        self.log(f"✓ Loaded {len(posts)} draft posts")
    except Exception as exc:
        self.log(f"❌ Error reading draft posts: {exc}")
    return posts
def classify_post_topic(self, post):
    """Bucket a post into one topic area from its title and category text."""
    haystack = f"{post['title'].lower()} {post['category'].lower()}"
    # Order matters: the first topic with a keyword hit wins, so e.g. a
    # title containing 'upload' is classed 'torrent' before 'download'
    # ever gets a chance.
    topic_keywords = (
        ('torrent', ('torrent', 'ygg', 'ratio', 'tracker', 'magnet', 'seedbox', 'upload')),
        ('streaming', ('stream', 'film', 'série', 'netflix', 'disney', 'platforma')),
        ('vpn', ('vpn', 'proxy', 'anonyme', 'privacy', 'chiffr')),
        ('software', ('software', 'tool', 'app', 'logiciel', 'outil', 'program')),
        ('gaming', ('game', 'jeu', 'gaming', 'emula', 'console', 'retro')),
        ('download', ('download', 'télécharge', 'ddl', 'upload')),
        ('tech', ('tech', 'informatique', 'code', 'programming', 'developer')),
    )
    for topic, keywords in topic_keywords:
        if any(keyword in haystack for keyword in keywords):
            return topic
    # Nothing matched: catch-all bucket.
    return 'other'
def classify_website(self, post):
    """Recommend a target website for *post*.

    Routing rules, first match wins:
      - torrent/download topics        -> webscroll.fr
      - vpn/software/gaming/tech       -> mistergeek.net (core content)
      - streaming under 100 visits     -> hellogeek.net
      - 'other' topic or <10 visits    -> hellogeek.net
      - anything else (e.g. high-traffic streaming) -> mistergeek.net

    Returns:
        dict with keys 'site', 'reason' and 'priority' (HIGH/MEDIUM/LOW).
    """
    # NOTE: the original computed an unused is_sponsored flag here; sponsor
    # status does not influence routing.
    topic = self.classify_post_topic(post)
    traffic = post['traffic']
    if topic in ('torrent', 'download'):
        return {
            'site': 'webscroll.fr',
            'reason': 'Torrent/file-sharing content',
            'priority': 'HIGH' if traffic > 100 else 'MEDIUM'
        }
    if topic in ['vpn', 'software', 'gaming', 'tech']:
        return {
            'site': 'mistergeek.net',
            'reason': f'{topic.capitalize()} - core content',
            'priority': 'HIGH' if traffic > 50 else 'MEDIUM'
        }
    if topic == 'streaming' and traffic < 100:
        return {
            'site': 'hellogeek.net',
            'reason': 'Low-traffic streaming content',
            'priority': 'LOW'
        }
    if topic == 'other' or traffic < 10:
        return {
            'site': 'hellogeek.net',
            'reason': 'Off-brand or low-traffic content',
            'priority': 'LOW'
        }
    # Default to main site
    return {
        'site': 'mistergeek.net',
        'reason': 'Core content',
        'priority': 'MEDIUM'
    }
def classify_content_action(self, post):
    """Pick a follow-up action for *post* based on its search performance.

    Returns one of: REVIEW_PUBLISH_OR_DELETE, REPUBLISH,
    DELETE_OR_CONSOLIDATE, KEEP_OPTIMIZE, MOVE_TO_OTHER_SITE, KEEP_MONITOR.
    """
    # The original also computed the post topic here but never used it;
    # dropping that call avoids a pointless keyword scan per post.
    traffic = post.get('traffic', 0)
    impressions = post.get('impressions', 0)
    position = post.get('position', 0)
    # Drafts first: traffic on a draft implies it was published at some point.
    if post.get('status', 'published') == 'draft':
        return 'REVIEW_PUBLISH_OR_DELETE' if traffic == 0 else 'REPUBLISH'
    if traffic < 5 and impressions < 20:
        return 'DELETE_OR_CONSOLIDATE'
    # NOTE(review): positions exactly 11 or 30 fall through these open
    # ranges to the later rules — confirm whether that is intended.
    if traffic > 0 and 0 < position < 11:
        return 'KEEP_OPTIMIZE'
    if 11 < position < 30:
        return 'KEEP_OPTIMIZE'
    if position > 30 or traffic < 10:
        return 'MOVE_TO_OTHER_SITE'
    return 'KEEP_MONITOR'
def analyze_all_content(self, posts):
    """Classify every post and aggregate counts/traffic by site, topic and action.

    Also collects sponsored posts (author == 'Expert') and drafts into
    dedicated inventories. Returns the aggregate dict.
    """
    def _bucket():
        return {'count': 0, 'traffic': 0, 'posts': []}

    analysis = {
        'total_posts': len(posts),
        'by_site': defaultdict(_bucket),
        'by_topic': defaultdict(_bucket),
        'by_action': defaultdict(_bucket),
        'sponsored_posts': {'count': 0, 'traffic': 0, 'posts': []},
        'draft_posts': {'count': 0, 'posts': []},
    }
    for post_id, post in posts.items():
        traffic = post['traffic']
        assignment = self.classify_website(post)

        site_bucket = analysis['by_site'][assignment['site']]
        site_bucket['count'] += 1
        site_bucket['traffic'] += traffic
        site_bucket['posts'].append({
            'id': post_id,
            'title': post['title'],
            'traffic': traffic,
            'reason': assignment['reason'],
        })

        # Topic and action buckets only track totals (no per-post list).
        topic_bucket = analysis['by_topic'][self.classify_post_topic(post)]
        topic_bucket['count'] += 1
        topic_bucket['traffic'] += traffic

        action_bucket = analysis['by_action'][self.classify_content_action(post)]
        action_bucket['count'] += 1
        action_bucket['traffic'] += traffic

        if post.get('author', '').strip() == 'Expert':
            sponsored = analysis['sponsored_posts']
            sponsored['count'] += 1
            sponsored['traffic'] += traffic
            sponsored['posts'].append({
                'id': post_id,
                'title': post['title'],
                'traffic': traffic,
            })

        if post.get('status') == 'draft':
            drafts = analysis['draft_posts']
            drafts['count'] += 1
            drafts['posts'].append({
                'id': post_id,
                'title': post['title'],
                'status': 'draft',
            })
    return analysis
def generate_content_distribution_csv(self, posts, output_path):
    """Write one CSV row per post with its classification and recommendation.

    Rows are sorted by traffic (descending). Errors are logged, not raised.
    """
    columns = [
        'post_id', 'title', 'topic', 'status', 'author',
        'traffic', 'impressions', 'position',
        'recommended_site', 'reason', 'action',
        'priority', 'notes'
    ]
    try:
        rows = []
        for post_id, post in posts.items():
            assignment = self.classify_website(post)
            author = post.get('author', '').strip()
            rows.append({
                'post_id': post_id,
                'title': post['title'][:80],  # truncate to keep cells readable
                'topic': self.classify_post_topic(post),
                'status': post.get('status', 'published'),
                'author': author,
                'traffic': post.get('traffic', 0),
                'impressions': post.get('impressions', 0),
                'position': post.get('position', 0),
                'recommended_site': assignment['site'],
                'reason': assignment['reason'],
                'action': self.classify_content_action(post),
                'priority': assignment['priority'],
                'notes': 'SPONSORED' if author == 'Expert' else ''
            })
        # Highest-traffic posts first so the biggest decisions sit on top.
        rows.sort(key=lambda row: row['traffic'], reverse=True)
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)
        self.log(f"✓ Exported {len(rows)} posts to {output_path}")
    except Exception as e:
        self.log(f"❌ Error exporting CSV: {e}")
def generate_strategy_report(self, analysis, output_path):
    """Generate comprehensive strategy report.

    Renders the aggregates built by analyze_all_content() into a Markdown
    document at *output_path*: executive summary, per-site distribution,
    topic/action breakdowns, sponsored and draft inventories, and a fixed
    block of strategic recommendations. Errors are logged, not raised.
    """
    try:
        report = []
        report.append("# Multi-Site Content Strategy Report\n")
        report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*\n\n")
        # Executive Summary: published count is derived (total minus drafts).
        report.append("## Executive Summary\n\n")
        report.append(f"**Total Content Analyzed:** {analysis['total_posts']} posts\n")
        report.append(f"- Published: {analysis['total_posts'] - analysis['draft_posts']['count']}\n")
        report.append(f"- Drafts: {analysis['draft_posts']['count']}\n")
        report.append(f"- Sponsored: {analysis['sponsored_posts']['count']}\n\n")
        # Distribution Strategy: sites ordered by total traffic, descending.
        report.append("## Recommended Site Distribution\n\n")
        for site, data in sorted(analysis['by_site'].items(),
                                 key=lambda x: x[1]['traffic'], reverse=True):
            report.append(f"### {site}\n")
            report.append(f"- Posts: {data['count']}\n")
            report.append(f"- Total Traffic: {data['traffic']:,} visits/month\n")
            report.append(f"- Top Posts:\n")
            # Only the five highest-traffic posts per site are listed.
            for post in sorted(data['posts'], key=lambda x: x['traffic'], reverse=True)[:5]:
                report.append(f" - {post['title'][:60]} ({post['traffic']} visits)\n")
            report.append(f"\n")
        # Topic Distribution, ordered by traffic.
        report.append("## Content by Topic\n\n")
        for topic, data in sorted(analysis['by_topic'].items(),
                                  key=lambda x: x[1]['traffic'], reverse=True):
            report.append(f"- **{topic.title()}:** {data['count']} posts ({data['traffic']:,} visits)\n")
        report.append("\n")
        # Actions Required, most common action first.
        report.append("## Required Actions\n\n")
        for action, data in sorted(analysis['by_action'].items(),
                                   key=lambda x: x[1]['count'], reverse=True):
            report.append(f"- **{action}:** {data['count']} posts ({data['traffic']:,} visits)\n")
        report.append("\n")
        # Sponsored Content: top 10 by traffic, only when any exist.
        if analysis['sponsored_posts']['count'] > 0:
            report.append("## Sponsored Content (by 'Expert')\n\n")
            report.append(f"Total: {analysis['sponsored_posts']['count']} posts\n")
            report.append(f"Traffic: {analysis['sponsored_posts']['traffic']:,} visits/month\n\n")
            for post in sorted(analysis['sponsored_posts']['posts'],
                               key=lambda x: x['traffic'], reverse=True)[:10]:
                report.append(f"- {post['title'][:70]} ({post['traffic']} visits)\n")
            report.append("\n")
        # Draft Posts: first 15 titles only, to keep the report short.
        if analysis['draft_posts']['count'] > 0:
            report.append("## Draft Posts (Unpublished)\n\n")
            report.append(f"Total: {analysis['draft_posts']['count']} posts\n")
            report.append("*Decision needed: Publish, delete, or move to other site?*\n\n")
            for post in analysis['draft_posts']['posts'][:15]:
                report.append(f"- {post['title'][:70]}\n")
            report.append("\n")
        # Recommendations: static advice block, independent of the data.
        report.append("## Strategic Recommendations\n\n")
        report.append("1. **Consolidate on mistergeek.net:**\n")
        report.append(" - Keep only VPN, software, gaming, tech content\n")
        report.append(" - Focus on high-traffic posts (>50 visits/month)\n\n")
        report.append("2. **Move to webscroll.fr:**\n")
        report.append(" - All torrent/file-sharing content\n")
        report.append(" - File-specific guides\n\n")
        report.append("3. **Move to hellogeek.net:**\n")
        report.append(" - Low-traffic content (<50 visits)\n")
        report.append(" - Off-brand content\n")
        report.append(" - Experimental/niche posts\n\n")
        report.append("4. **Delete:**\n")
        report.append(f" - Posts with <5 visits and <20 impressions\n")
        report.append(" - Duplicates/thin content\n\n")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(''.join(report))
        self.log(f"✓ Generated strategy report: {output_path}")
    except Exception as e:
        self.log(f"❌ Error generating report: {e}")
def run(self, wordpress_csv, drafts_csv):
    """Run complete content strategy analysis.

    Loads published and draft posts from the two CSVs, classifies them, and
    writes three artifacts under output/: a per-post distribution CSV, a
    Markdown strategy report, and a JSON summary of the aggregates.
    """
    self.log("\n" + "="*70)
    self.log("Multi-Site Content Strategy Analyzer")
    self.log("="*70 + "\n")
    # Load posts
    self.log("📚 Loading content...\n")
    wordpress_posts = self.load_wordpress_posts(wordpress_csv)
    draft_posts = self.load_draft_posts(drafts_csv)
    # Combine all posts; drafts are merged last, so on a duplicate post id
    # the draft record overwrites the published one.
    all_posts = {**wordpress_posts, **draft_posts}
    self.log(f"Total posts: {len(all_posts)}\n")
    # Analyze
    self.log("🔍 Analyzing content distribution...\n")
    analysis = self.analyze_all_content(all_posts)
    # Generate outputs
    self.log("📊 Generating outputs...\n")
    output_csv = self.output_dir / 'analysis' / 'content_distribution.csv'
    self.generate_content_distribution_csv(all_posts, output_csv)
    output_md = self.output_dir / 'reports' / 'content_strategy_report.md'
    self.generate_strategy_report(analysis, output_md)
    # Export analysis JSON
    analysis_json = self.output_dir / 'analysis' / 'analysis_summary.json'
    try:
        with open(analysis_json, 'w', encoding='utf-8') as f:
            # Convert defaultdict to regular dict for JSON serialization.
            # Per-post lists are dropped from by_topic/by_action (and from
            # the sponsored/draft sections) to keep the summary small.
            analysis_clean = {
                'total_posts': analysis['total_posts'],
                'by_site': dict(analysis['by_site']),
                'by_topic': {k: {'count': v['count'], 'traffic': v['traffic']}
                             for k, v in analysis['by_topic'].items()},
                'by_action': {k: {'count': v['count'], 'traffic': v['traffic']}
                              for k, v in analysis['by_action'].items()},
                'sponsored_posts': {
                    'count': analysis['sponsored_posts']['count'],
                    'traffic': analysis['sponsored_posts']['traffic']
                },
                'draft_posts': {
                    'count': analysis['draft_posts']['count']
                }
            }
            json.dump(analysis_clean, f, indent=2, ensure_ascii=False)
        self.log(f"✓ Exported analysis JSON: {analysis_json}\n")
    except Exception as e:
        self.log(f"❌ Error exporting JSON: {e}\n")
    # Summary banner with the paths of everything that was written.
    self.log("\n" + "="*70)
    self.log("ANALYSIS COMPLETE")
    self.log("="*70)
    self.log(f"\nOutputs:")
    self.log(f" Distribution: {output_csv}")
    self.log(f" Strategy: {output_md}")
    self.log(f" Summary: {analysis_json}\n")
    self.log("Next steps:")
    self.log(" 1. Review content_strategy_report.md")
    self.log(" 2. Review content_distribution.csv")
    self.log(" 3. Decide: which posts go to which site?")
    self.log(" 4. Plan content consolidation")
def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(description='Analyze content across multiple sites')
    parser.add_argument('--wordpress-csv', type=Path,
                        default=Path('input/wordpress/new-propositions.csv'),
                        help='WordPress posts CSV')
    parser.add_argument('--drafts-csv', type=Path,
                        default=Path('input/drafts/drafts.csv'),
                        help='Draft posts CSV')
    args = parser.parse_args()
    # Build the analyzer and run the full pipeline in one step.
    ContentStrategyAnalyzer().run(args.wordpress_csv, args.drafts_csv)


if __name__ == '__main__':
    main()