Major refactoring to create a clean, integrated CLI application: ### New Features: - Unified CLI executable (./seo) with simple command structure - All commands accept optional CSV file arguments - Auto-detection of latest files when no arguments provided - Simplified output directory structure (output/ instead of output/reports/) - Cleaner export filename format (all_posts_YYYY-MM-DD.csv) ### Commands: - export: Export all posts from WordPress sites - analyze [csv]: Analyze posts with AI (optional CSV input) - recategorize [csv]: Recategorize posts with AI - seo_check: Check SEO quality - categories: Manage categories across sites - approve [files]: Review and approve recommendations - full_pipeline: Run complete workflow - analytics, gaps, opportunities, report, status ### Changes: - Moved all scripts to scripts/ directory - Created config.yaml for configuration - Updated all scripts to use output/ directory - Deprecated old seo-cli.py in favor of new ./seo - Added AGENTS.md and CHANGELOG.md documentation - Consolidated README.md with updated usage ### Technical: - Added PyYAML dependency - Removed hardcoded configuration values - All scripts now properly integrated - Better error handling and user feedback Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
467 lines
19 KiB
Python
467 lines
19 KiB
Python
"""
|
|
Multi-Site Content Strategy Analyzer
|
|
Analyzes all content (published + drafts) across 3 websites.
|
|
Recommends optimal distribution and consolidation strategy.
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import argparse
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
|
|
|
|
class ContentStrategyAnalyzer:
|
|
"""Analyze and optimize content distribution across multiple sites."""
|
|
|
|
def __init__(self):
|
|
"""Initialize analyzer."""
|
|
self.output_dir = Path('output')
|
|
self.output_dir.mkdir(exist_ok=True)
|
|
(self.output_dir / 'analysis').mkdir(exist_ok=True)
|
|
(self.output_dir / 'reports').mkdir(exist_ok=True)
|
|
(self.output_dir / 'logs').mkdir(exist_ok=True)
|
|
|
|
self.logs = []
|
|
|
|
def log(self, message):
|
|
"""Log message."""
|
|
self.logs.append(message)
|
|
print(message)
|
|
|
|
def load_wordpress_posts(self, csv_path):
|
|
"""Load published WordPress posts."""
|
|
posts = {}
|
|
if not csv_path.exists():
|
|
self.log(f"⚠️ WordPress posts file not found: {csv_path}")
|
|
return posts
|
|
|
|
try:
|
|
with open(csv_path, 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
post_id = row.get('ID') or row.get('post_id')
|
|
if not post_id:
|
|
continue
|
|
|
|
posts[post_id] = {
|
|
'source': 'wordpress',
|
|
'status': 'published',
|
|
'title': row.get('Title') or row.get('title') or row.get('post_title') or '',
|
|
'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
|
|
'author': row.get('Author') or row.get('author') or 'Unknown',
|
|
'traffic': int(row.get('traffic', 0) or 0),
|
|
'impressions': int(row.get('impressions', 0) or 0),
|
|
'position': float(row.get('avg_position', 0) or 0),
|
|
'category': row.get('Category') or row.get('category') or '',
|
|
}
|
|
|
|
self.log(f"✓ Loaded {len(posts)} published WordPress posts")
|
|
except Exception as e:
|
|
self.log(f"❌ Error reading WordPress posts: {e}")
|
|
|
|
return posts
|
|
|
|
def load_draft_posts(self, csv_path):
|
|
"""Load draft/unpublished posts."""
|
|
posts = {}
|
|
if not csv_path.exists():
|
|
self.log(f"⚠️ Draft posts file not found: {csv_path}")
|
|
return posts
|
|
|
|
try:
|
|
with open(csv_path, 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
post_id = row.get('ID') or row.get('post_id')
|
|
if not post_id:
|
|
continue
|
|
|
|
posts[post_id] = {
|
|
'source': 'draft',
|
|
'status': 'draft',
|
|
'title': row.get('Title') or row.get('title') or row.get('post_title') or '',
|
|
'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
|
|
'author': row.get('Author') or row.get('author') or 'Unknown',
|
|
'traffic': 0, # Drafts have no traffic
|
|
'impressions': 0,
|
|
'position': 0,
|
|
'category': row.get('Category') or row.get('category') or '',
|
|
}
|
|
|
|
self.log(f"✓ Loaded {len(posts)} draft posts")
|
|
except Exception as e:
|
|
self.log(f"❌ Error reading draft posts: {e}")
|
|
|
|
return posts
|
|
|
|
def classify_post_topic(self, post):
|
|
"""Classify post into topic area."""
|
|
title = post['title'].lower()
|
|
category = post['category'].lower()
|
|
content = f"{title} {category}"
|
|
|
|
# Topic classification based on keywords
|
|
topic_keywords = {
|
|
'torrent': ['torrent', 'ygg', 'ratio', 'tracker', 'magnet', 'seedbox', 'upload'],
|
|
'streaming': ['stream', 'film', 'série', 'netflix', 'disney', 'platforma'],
|
|
'vpn': ['vpn', 'proxy', 'anonyme', 'privacy', 'chiffr'],
|
|
'software': ['software', 'tool', 'app', 'logiciel', 'outil', 'program'],
|
|
'gaming': ['game', 'jeu', 'gaming', 'emula', 'console', 'retro'],
|
|
'download': ['download', 'télécharge', 'ddl', 'upload'],
|
|
'tech': ['tech', 'informatique', 'code', 'programming', 'developer'],
|
|
'other': [],
|
|
}
|
|
|
|
for topic, keywords in topic_keywords.items():
|
|
if topic == 'other':
|
|
continue
|
|
for keyword in keywords:
|
|
if keyword in content:
|
|
return topic
|
|
|
|
return 'other'
|
|
|
|
def classify_website(self, post):
|
|
"""Determine which website this post should be on."""
|
|
topic = self.classify_post_topic(post)
|
|
author = post.get('author', '').strip()
|
|
is_sponsored = author == 'Expert'
|
|
|
|
# Website assignment rules
|
|
if topic == 'torrent' or topic == 'download':
|
|
return {
|
|
'site': 'webscroll.fr',
|
|
'reason': f'Torrent/file-sharing content',
|
|
'priority': 'HIGH' if post['traffic'] > 100 else 'MEDIUM'
|
|
}
|
|
|
|
if topic in ['vpn', 'software', 'gaming', 'tech']:
|
|
return {
|
|
'site': 'mistergeek.net',
|
|
'reason': f'{topic.capitalize()} - core content',
|
|
'priority': 'HIGH' if post['traffic'] > 50 else 'MEDIUM'
|
|
}
|
|
|
|
if topic == 'streaming' and post['traffic'] < 100:
|
|
return {
|
|
'site': 'hellogeek.net',
|
|
'reason': 'Low-traffic streaming content',
|
|
'priority': 'LOW'
|
|
}
|
|
|
|
if topic == 'other' or post['traffic'] < 10:
|
|
return {
|
|
'site': 'hellogeek.net',
|
|
'reason': 'Off-brand or low-traffic content',
|
|
'priority': 'LOW'
|
|
}
|
|
|
|
# Default to main site
|
|
return {
|
|
'site': 'mistergeek.net',
|
|
'reason': 'Core content',
|
|
'priority': 'MEDIUM'
|
|
}
|
|
|
|
def classify_content_action(self, post):
|
|
"""Determine what action to take with this post."""
|
|
topic = self.classify_post_topic(post)
|
|
traffic = post.get('traffic', 0)
|
|
impressions = post.get('impressions', 0)
|
|
position = post.get('position', 0)
|
|
status = post.get('status', 'published')
|
|
|
|
# Determine action
|
|
if status == 'draft':
|
|
if traffic == 0:
|
|
return 'REVIEW_PUBLISH_OR_DELETE' # Unpublished draft
|
|
else:
|
|
return 'REPUBLISH' # Was published, now draft
|
|
|
|
if traffic < 5 and impressions < 20:
|
|
return 'DELETE_OR_CONSOLIDATE'
|
|
|
|
if traffic > 0 and position > 0 and position < 11:
|
|
return 'KEEP_OPTIMIZE'
|
|
|
|
if position > 11 and position < 30:
|
|
return 'KEEP_OPTIMIZE'
|
|
|
|
if position > 30 or traffic < 10:
|
|
return 'MOVE_TO_OTHER_SITE'
|
|
|
|
return 'KEEP_MONITOR'
|
|
|
|
def analyze_all_content(self, posts):
|
|
"""Analyze and classify all posts."""
|
|
analysis = {
|
|
'total_posts': len(posts),
|
|
'by_site': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
|
|
'by_topic': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
|
|
'by_action': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}),
|
|
'sponsored_posts': {'count': 0, 'traffic': 0, 'posts': []},
|
|
'draft_posts': {'count': 0, 'posts': []},
|
|
}
|
|
|
|
for post_id, post in posts.items():
|
|
topic = self.classify_post_topic(post)
|
|
site_assignment = self.classify_website(post)
|
|
action = self.classify_content_action(post)
|
|
is_sponsored = post.get('author', '').strip() == 'Expert'
|
|
is_draft = post.get('status') == 'draft'
|
|
|
|
# Record in analysis
|
|
analysis['by_site'][site_assignment['site']]['count'] += 1
|
|
analysis['by_site'][site_assignment['site']]['traffic'] += post['traffic']
|
|
analysis['by_site'][site_assignment['site']]['posts'].append({
|
|
'id': post_id,
|
|
'title': post['title'],
|
|
'traffic': post['traffic'],
|
|
'reason': site_assignment['reason']
|
|
})
|
|
|
|
analysis['by_topic'][topic]['count'] += 1
|
|
analysis['by_topic'][topic]['traffic'] += post['traffic']
|
|
|
|
analysis['by_action'][action]['count'] += 1
|
|
analysis['by_action'][action]['traffic'] += post['traffic']
|
|
|
|
if is_sponsored:
|
|
analysis['sponsored_posts']['count'] += 1
|
|
analysis['sponsored_posts']['traffic'] += post['traffic']
|
|
analysis['sponsored_posts']['posts'].append({
|
|
'id': post_id,
|
|
'title': post['title'],
|
|
'traffic': post['traffic']
|
|
})
|
|
|
|
if is_draft:
|
|
analysis['draft_posts']['count'] += 1
|
|
analysis['draft_posts']['posts'].append({
|
|
'id': post_id,
|
|
'title': post['title'],
|
|
'status': 'draft'
|
|
})
|
|
|
|
return analysis
|
|
|
|
def generate_content_distribution_csv(self, posts, output_path):
|
|
"""Export detailed content distribution plan."""
|
|
try:
|
|
fieldnames = [
|
|
'post_id', 'title', 'topic', 'status', 'author',
|
|
'traffic', 'impressions', 'position',
|
|
'recommended_site', 'reason', 'action',
|
|
'priority', 'notes'
|
|
]
|
|
|
|
rows = []
|
|
for post_id, post in posts.items():
|
|
topic = self.classify_post_topic(post)
|
|
site_assignment = self.classify_website(post)
|
|
action = self.classify_content_action(post)
|
|
author = post.get('author', '').strip()
|
|
is_sponsored = author == 'Expert'
|
|
|
|
rows.append({
|
|
'post_id': post_id,
|
|
'title': post['title'][:80],
|
|
'topic': topic,
|
|
'status': post.get('status', 'published'),
|
|
'author': author,
|
|
'traffic': post.get('traffic', 0),
|
|
'impressions': post.get('impressions', 0),
|
|
'position': post.get('position', 0),
|
|
'recommended_site': site_assignment['site'],
|
|
'reason': site_assignment['reason'],
|
|
'action': action,
|
|
'priority': site_assignment['priority'],
|
|
'notes': 'SPONSORED' if is_sponsored else ''
|
|
})
|
|
|
|
rows.sort(key=lambda x: x['traffic'], reverse=True)
|
|
|
|
with open(output_path, 'w', newline='', encoding='utf-8') as f:
|
|
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
writer.writeheader()
|
|
writer.writerows(rows)
|
|
|
|
self.log(f"✓ Exported {len(rows)} posts to {output_path}")
|
|
except Exception as e:
|
|
self.log(f"❌ Error exporting CSV: {e}")
|
|
|
|
def generate_strategy_report(self, analysis, output_path):
|
|
"""Generate comprehensive strategy report."""
|
|
try:
|
|
report = []
|
|
report.append("# Multi-Site Content Strategy Report\n")
|
|
report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*\n\n")
|
|
|
|
# Executive Summary
|
|
report.append("## Executive Summary\n\n")
|
|
report.append(f"**Total Content Analyzed:** {analysis['total_posts']} posts\n")
|
|
report.append(f"- Published: {analysis['total_posts'] - analysis['draft_posts']['count']}\n")
|
|
report.append(f"- Drafts: {analysis['draft_posts']['count']}\n")
|
|
report.append(f"- Sponsored: {analysis['sponsored_posts']['count']}\n\n")
|
|
|
|
# Distribution Strategy
|
|
report.append("## Recommended Site Distribution\n\n")
|
|
for site, data in sorted(analysis['by_site'].items(),
|
|
key=lambda x: x[1]['traffic'], reverse=True):
|
|
report.append(f"### {site}\n")
|
|
report.append(f"- Posts: {data['count']}\n")
|
|
report.append(f"- Total Traffic: {data['traffic']:,} visits/month\n")
|
|
report.append(f"- Top Posts:\n")
|
|
for post in sorted(data['posts'], key=lambda x: x['traffic'], reverse=True)[:5]:
|
|
report.append(f" - {post['title'][:60]} ({post['traffic']} visits)\n")
|
|
report.append(f"\n")
|
|
|
|
# Topic Distribution
|
|
report.append("## Content by Topic\n\n")
|
|
for topic, data in sorted(analysis['by_topic'].items(),
|
|
key=lambda x: x[1]['traffic'], reverse=True):
|
|
report.append(f"- **{topic.title()}:** {data['count']} posts ({data['traffic']:,} visits)\n")
|
|
report.append("\n")
|
|
|
|
# Actions Required
|
|
report.append("## Required Actions\n\n")
|
|
for action, data in sorted(analysis['by_action'].items(),
|
|
key=lambda x: x[1]['count'], reverse=True):
|
|
report.append(f"- **{action}:** {data['count']} posts ({data['traffic']:,} visits)\n")
|
|
report.append("\n")
|
|
|
|
# Sponsored Content
|
|
if analysis['sponsored_posts']['count'] > 0:
|
|
report.append("## Sponsored Content (by 'Expert')\n\n")
|
|
report.append(f"Total: {analysis['sponsored_posts']['count']} posts\n")
|
|
report.append(f"Traffic: {analysis['sponsored_posts']['traffic']:,} visits/month\n\n")
|
|
for post in sorted(analysis['sponsored_posts']['posts'],
|
|
key=lambda x: x['traffic'], reverse=True)[:10]:
|
|
report.append(f"- {post['title'][:70]} ({post['traffic']} visits)\n")
|
|
report.append("\n")
|
|
|
|
# Draft Posts
|
|
if analysis['draft_posts']['count'] > 0:
|
|
report.append("## Draft Posts (Unpublished)\n\n")
|
|
report.append(f"Total: {analysis['draft_posts']['count']} posts\n")
|
|
report.append("*Decision needed: Publish, delete, or move to other site?*\n\n")
|
|
for post in analysis['draft_posts']['posts'][:15]:
|
|
report.append(f"- {post['title'][:70]}\n")
|
|
report.append("\n")
|
|
|
|
# Recommendations
|
|
report.append("## Strategic Recommendations\n\n")
|
|
report.append("1. **Consolidate on mistergeek.net:**\n")
|
|
report.append(" - Keep only VPN, software, gaming, tech content\n")
|
|
report.append(" - Focus on high-traffic posts (>50 visits/month)\n\n")
|
|
|
|
report.append("2. **Move to webscroll.fr:**\n")
|
|
report.append(" - All torrent/file-sharing content\n")
|
|
report.append(" - File-specific guides\n\n")
|
|
|
|
report.append("3. **Move to hellogeek.net:**\n")
|
|
report.append(" - Low-traffic content (<50 visits)\n")
|
|
report.append(" - Off-brand content\n")
|
|
report.append(" - Experimental/niche posts\n\n")
|
|
|
|
report.append("4. **Delete:**\n")
|
|
report.append(f" - Posts with <5 visits and <20 impressions\n")
|
|
report.append(" - Duplicates/thin content\n\n")
|
|
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
f.write(''.join(report))
|
|
|
|
self.log(f"✓ Generated strategy report: {output_path}")
|
|
except Exception as e:
|
|
self.log(f"❌ Error generating report: {e}")
|
|
|
|
def run(self, wordpress_csv, drafts_csv):
|
|
"""Run complete content strategy analysis."""
|
|
self.log("\n" + "="*70)
|
|
self.log("Multi-Site Content Strategy Analyzer")
|
|
self.log("="*70 + "\n")
|
|
|
|
# Load posts
|
|
self.log("📚 Loading content...\n")
|
|
wordpress_posts = self.load_wordpress_posts(wordpress_csv)
|
|
draft_posts = self.load_draft_posts(drafts_csv)
|
|
|
|
# Combine all posts
|
|
all_posts = {**wordpress_posts, **draft_posts}
|
|
self.log(f"Total posts: {len(all_posts)}\n")
|
|
|
|
# Analyze
|
|
self.log("🔍 Analyzing content distribution...\n")
|
|
analysis = self.analyze_all_content(all_posts)
|
|
|
|
# Generate outputs
|
|
self.log("📊 Generating outputs...\n")
|
|
|
|
output_csv = self.output_dir / 'analysis' / 'content_distribution.csv'
|
|
self.generate_content_distribution_csv(all_posts, output_csv)
|
|
|
|
output_md = self.output_dir / 'reports' / 'content_strategy_report.md'
|
|
self.generate_strategy_report(analysis, output_md)
|
|
|
|
# Export analysis JSON
|
|
analysis_json = self.output_dir / 'analysis' / 'analysis_summary.json'
|
|
try:
|
|
with open(analysis_json, 'w', encoding='utf-8') as f:
|
|
# Convert defaultdict to regular dict for JSON serialization
|
|
analysis_clean = {
|
|
'total_posts': analysis['total_posts'],
|
|
'by_site': dict(analysis['by_site']),
|
|
'by_topic': {k: {'count': v['count'], 'traffic': v['traffic']}
|
|
for k, v in analysis['by_topic'].items()},
|
|
'by_action': {k: {'count': v['count'], 'traffic': v['traffic']}
|
|
for k, v in analysis['by_action'].items()},
|
|
'sponsored_posts': {
|
|
'count': analysis['sponsored_posts']['count'],
|
|
'traffic': analysis['sponsored_posts']['traffic']
|
|
},
|
|
'draft_posts': {
|
|
'count': analysis['draft_posts']['count']
|
|
}
|
|
}
|
|
json.dump(analysis_clean, f, indent=2, ensure_ascii=False)
|
|
self.log(f"✓ Exported analysis JSON: {analysis_json}\n")
|
|
except Exception as e:
|
|
self.log(f"❌ Error exporting JSON: {e}\n")
|
|
|
|
# Summary
|
|
self.log("\n" + "="*70)
|
|
self.log("ANALYSIS COMPLETE")
|
|
self.log("="*70)
|
|
self.log(f"\nOutputs:")
|
|
self.log(f" Distribution: {output_csv}")
|
|
self.log(f" Strategy: {output_md}")
|
|
self.log(f" Summary: {analysis_json}\n")
|
|
|
|
self.log("Next steps:")
|
|
self.log(" 1. Review content_strategy_report.md")
|
|
self.log(" 2. Review content_distribution.csv")
|
|
self.log(" 3. Decide: which posts go to which site?")
|
|
self.log(" 4. Plan content consolidation")
|
|
|
|
|
|
def main():
|
|
"""CLI entry point."""
|
|
parser = argparse.ArgumentParser(description='Analyze content across multiple sites')
|
|
parser.add_argument('--wordpress-csv', type=Path,
|
|
default=Path('input/wordpress/new-propositions.csv'),
|
|
help='WordPress posts CSV')
|
|
parser.add_argument('--drafts-csv', type=Path,
|
|
default=Path('input/drafts/drafts.csv'),
|
|
help='Draft posts CSV')
|
|
|
|
args = parser.parse_args()
|
|
|
|
analyzer = ContentStrategyAnalyzer()
|
|
analyzer.run(args.wordpress_csv, args.drafts_csv)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|