Files
seo/scripts/report_generator.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

437 lines
20 KiB
Python

"""
SEO optimization report generator.
Consolidates all analysis into comprehensive markdown report and action plan.
"""
import csv
import json
import argparse
from pathlib import Path
from datetime import datetime
from config import Config
class ReportGenerator:
    """Generate comprehensive SEO optimization report."""

    def __init__(self):
        """Initialize generator.

        Pulls the output directory from the project-level Config and
        prepares an in-memory log buffer.
        """
        # NOTE(review): this stores the Config *class* itself, not an
        # instance — OUTPUT_DIR is therefore expected to be a class
        # attribute; confirm against config.py.
        self.config = Config
        self.output_dir = self.config.OUTPUT_DIR
        # Messages accumulated by log(); each is also echoed to stdout.
        self.logs = []
def log(self, message):
"""Add message to log."""
self.logs.append(message)
print(message)
def load_posts_with_analytics(self, csv_path):
"""Load posts with all analytics data."""
posts = {}
if not csv_path.exists():
self.log(f"❌ File not found: {csv_path}")
return posts
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
post_id = row.get('ID')
if not post_id:
continue
# Handle different title column names
title = (row.get('Title') or
row.get('title') or
row.get('post_title') or '')
posts[post_id] = {
'title': title,
'url': row.get('URL') or row.get('url') or row.get('post_url') or '',
'seo_title': row.get('SEO Title') or row.get('seo_title') or '',
'meta_description': row.get('Meta Description') or row.get('meta_description') or '',
'traffic': int(row.get('traffic', 0) or 0),
'users': int(row.get('users', 0) or 0),
'bounce_rate': float(row.get('bounce_rate', 0) or 0),
'impressions': int(row.get('impressions', 0) or 0),
'clicks': int(row.get('clicks', 0) or 0),
'avg_position': float(row.get('avg_position', 0) or 0),
'ctr': float(row.get('ctr', 0) or 0),
'keywords_count': int(row.get('keywords_count', 0) or 0),
'top_keywords': row.get('top_keywords', '')
}
self.log(f"✓ Loaded {len(posts)} posts")
except Exception as e:
self.log(f"❌ Error reading posts: {e}")
return posts
def load_opportunities(self, csv_path):
"""Load keyword opportunities."""
opportunities = {}
if not csv_path.exists():
self.log(f"⚠️ Opportunities file not found: {csv_path}")
return opportunities
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
post_id = row.get('ID')
if post_id:
try:
opportunities[post_id] = {
'opportunity_score': float(row.get('opportunity_score', 0) or 0),
'estimated_traffic_gain': int(float(row.get('estimated_traffic_gain', 0) or 0)),
'title_recommendations': row.get('title_recommendations', ''),
'description_recommendations': row.get('description_recommendations', ''),
'content_recommendations': row.get('content_recommendations', '')
}
except (ValueError, TypeError):
# Skip rows with parsing errors
continue
self.log(f"✓ Loaded {len(opportunities)} opportunities")
except Exception as e:
self.log(f"⚠️ Error reading opportunities: {e}")
return opportunities
def load_content_gaps(self, csv_path):
"""Load content gap suggestions."""
gaps = []
if not csv_path.exists():
self.log(f"⚠️ Content gaps file not found: {csv_path}")
return gaps
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
gaps.append({
'title': row.get('title', ''),
'why_valuable': row.get('why_valuable', ''),
'search_volume': row.get('search_volume', ''),
'format': row.get('format', ''),
'traffic_potential': int(row.get('traffic_potential', 0) or 0),
'priority': row.get('priority', 'medium')
})
self.log(f"✓ Loaded {len(gaps)} content gap ideas")
except Exception as e:
self.log(f"⚠️ Error reading content gaps: {e}")
return gaps
def calculate_priority_score(self, post, opportunity=None):
"""Calculate comprehensive priority score (0-100)."""
position = post.get('avg_position', 50)
impressions = post.get('impressions', 0)
ctr = post.get('ctr', 0)
traffic = post.get('traffic', 0)
# Position score (35%): Closer to page 1 = higher
if position > 0 and position <= 30:
position_score = max(0, (30 - position) / 29 * 35)
else:
position_score = 0
# Traffic potential (30%): Based on impressions
traffic_potential = min(30, (impressions / 1000) * 30)
# CTR improvement (20%): Gap vs expected
expected_ctr_map = {
1: 0.30, 2: 0.16, 3: 0.11, 4: 0.08, 5: 0.07,
6: 0.06, 7: 0.05, 8: 0.05, 9: 0.04, 10: 0.04,
11: 0.02, 12: 0.02, 13: 0.015, 14: 0.015, 15: 0.013,
16: 0.012, 17: 0.011, 18: 0.01, 19: 0.009, 20: 0.008
}
expected_ctr = expected_ctr_map.get(int(position), 0.005) if position > 0 else 0
if expected_ctr > 0:
ctr_gap = max(0, expected_ctr - ctr)
ctr_score = min(20, (ctr_gap / expected_ctr * 100 / 5) * 20)
else:
ctr_score = 0
# Content quality (15%): Existing traffic and engagement
quality_score = min(15, (traffic / 100) * 7.5 +
(100 - post.get('bounce_rate', 50)) / 100 * 7.5)
total = round(position_score + traffic_potential + ctr_score + quality_score, 1)
return max(0, min(100, total))
def generate_markdown_report(self, posts, opportunities, gaps, top_n=20):
    """Generate comprehensive markdown report.

    Args:
        posts: mapping of post ID -> post metrics, as returned by
            load_posts_with_analytics().
        opportunities: mapping of post ID -> opportunity data, as
            returned by load_opportunities().
        gaps: list of content-gap dicts, as returned by load_content_gaps().
        top_n: how many top-priority posts to detail individually.

    Returns:
        The complete report as one markdown string.
    """
    # The report is assembled as a list of fragments and joined at the end.
    report = []
    report.append("# SEO Optimization Strategy Report\n")
    report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n")
    # Calculate metrics
    total_traffic = sum(p.get('traffic', 0) for p in posts.values())
    total_impressions = sum(p.get('impressions', 0) for p in posts.values())
    # Average over posts that actually have a position; max(1, ...) guards
    # against division by zero when none do.
    avg_position = sum(p.get('avg_position', 50) for p in posts.values() if p.get('avg_position', 0) > 0) / max(1, len([p for p in posts.values() if p.get('avg_position', 0) > 0]))
    # Executive Summary
    report.append("## Executive Summary\n")
    report.append(f"- **Total Posts Analyzed:** {len(posts)}\n")
    report.append(f"- **Current Monthly Traffic:** {total_traffic:,} visits\n")
    report.append(f"- **Total Impressions (90d):** {total_impressions:,}\n")
    report.append(f"- **Average Search Position:** {avg_position:.1f}\n")
    report.append(f"- **Optimization Opportunities:** {len(opportunities)}\n")
    report.append(f"- **Content Gap Ideas:** {len(gaps)}\n")
    report.append(f"- **Potential Traffic Gain (Phase 1):** +{sum(o.get('estimated_traffic_gain', 0) for o in opportunities.values()):,} visits/month\n\n")
    # Key Metrics
    report.append("### Quick Wins (Estimated Impact)\n\n")
    quick_wins = sorted(opportunities.values(),
                        key=lambda x: x.get('estimated_traffic_gain', 0),
                        reverse=True)[:5]
    total_quick_win_traffic = sum(w.get('estimated_traffic_gain', 0) for w in quick_wins)
    report.append(f"Top 5 opportunities could bring **+{total_quick_win_traffic:,} visits/month**\n\n")
    # Top 20 Posts to Optimize
    report.append("## Top 20 Posts to Optimize\n\n")
    report.append("Ranked by optimization potential (combination of position, traffic potential, and CTR improvement).\n\n")
    # Score all posts
    scored_posts = []
    for post_id, post in posts.items():
        opp = opportunities.get(post_id, {})
        score = self.calculate_priority_score(post, opp)
        scored_posts.append((post_id, post, opp, score))
    # Sort by score (tuple index 3), highest first.
    scored_posts = sorted(scored_posts, key=lambda x: x[3], reverse=True)
    for i, (post_id, post, opp, score) in enumerate(scored_posts[:top_n], 1):
        position = post.get('avg_position', 0)
        impressions = post.get('impressions', 0)
        traffic = post.get('traffic', 0)
        report.append(f"### {i}. {post['title']}\n\n")
        report.append(f"**Current Position:** {position:.1f} | **Impressions:** {impressions:,} | **Traffic:** {traffic} visits\n")
        report.append(f"**Priority Score:** {score:.1f}/100 | **Estimated Gain:** +{opp.get('estimated_traffic_gain', 0)} visits\n\n")
        if position > 0 and position <= 30:
            report.append(f"**Status:** Ranking on {'page 1' if position <= 10 else 'page 2-3'}\n\n")
        # Recommendation fields are semicolon-separated suggestion lists.
        if opp.get('title_recommendations'):
            report.append("**Title Optimization:**\n")
            for rec in opp['title_recommendations'].split(';'):
                rec = rec.strip()
                if rec:
                    report.append(f"- {rec}\n")
            report.append("\n")
        if opp.get('description_recommendations'):
            report.append("**Meta Description:**\n")
            for rec in opp['description_recommendations'].split(';'):
                rec = rec.strip()
                if rec:
                    report.append(f"- {rec}\n")
            report.append("\n")
        if opp.get('content_recommendations'):
            report.append("**Content Improvements:**\n")
            for rec in opp['content_recommendations'].split(';'):
                rec = rec.strip()
                if rec:
                    report.append(f"- {rec}\n")
            report.append("\n")
        report.append("---\n\n")
    # Keyword Opportunities Summary
    report.append("## Keyword Opportunities Summary\n\n")
    # NOTE(review): 'ready_for_optimization' is never populated below —
    # presumably reserved for a future bucket; confirm before removing.
    opportunity_categories = {
        'page_2': [],
        'page_3': [],
        'ready_for_optimization': []
    }
    for opp_id, opp in opportunities.items():
        # Only bucket opportunities whose post made the detailed top-N list.
        if any(opp_id == p[0] for p in scored_posts[:top_n]):
            score = opp.get('opportunity_score', 0)
            post = posts.get(opp_id, {})
            position = post.get('avg_position', 0)
            if 11 <= position <= 15:
                opportunity_categories['page_2'].append((score, opp))
            elif 16 <= position <= 30:
                opportunity_categories['page_3'].append((score, opp))
    report.append(f"**Page 2 (Positions 11-15):** {len(opportunity_categories['page_2'])} keywords ready for quick wins\n")
    report.append(f"**Page 3+ (Positions 16-30):** {len(opportunity_categories['page_3'])} keywords with medium effort\n\n")
    # Content Gap Analysis
    report.append("## Content Gap Analysis\n\n")
    report.append(f"Identified **{len(gaps)} high-value content opportunities** not currently covered:\n\n")
    # High-priority gaps first: True sorts before False with reverse=True.
    for i, gap in enumerate(sorted(gaps, key=lambda x: x.get('priority') == 'high', reverse=True)[:15], 1):
        report.append(f"### {i}. {gap['title']}\n\n")
        report.append(f"**Priority:** {gap.get('priority', 'medium').upper()}\n")
        report.append(f"**Search Volume:** {gap.get('search_volume', 'medium')}\n")
        report.append(f"**Format:** {gap.get('format', 'guide')}\n")
        report.append(f"**Estimated Traffic Potential:** +{gap.get('traffic_potential', 50)} visits/month\n\n")
        if gap.get('why_valuable'):
            report.append(f"**Why valuable:** {gap['why_valuable']}\n\n")
    # 90-Day Action Plan
    report.append("## 90-Day Action Plan\n\n")
    report.append("### Week 1-2: Quick Wins (Estimated +100 visits/month)\n\n")
    report.append("Focus on posts with highest opportunity scores that are already ranking on page 2:\n\n")
    # scored_posts is already sorted; this re-sort of the top-N slice is a
    # no-op kept for clarity of intent.
    quick_wins_phase = sorted(scored_posts[:top_n], key=lambda x: x[3], reverse=True)[:5]
    for i, (post_id, post, opp, score) in enumerate(quick_wins_phase, 1):
        report.append(f"{i}. **{post['title'][:60]}**\n")
        report.append(f" - Update SEO title and meta description\n")
        report.append(f" - Estimated effort: 30-60 minutes\n")
        report.append(f" - Expected gain: +{opp.get('estimated_traffic_gain', 50)} visits\n\n")
    report.append("### Week 3-4: Core Content Optimization (Estimated +150 visits/month)\n\n")
    report.append("Improve content structure and internal linking:\n\n")
    # Next tier: ranks 6-15 of the already-sorted list.
    mid_phase = sorted(scored_posts[5:15], key=lambda x: x[3], reverse=True)[:5]
    for i, (post_id, post, opp, score) in enumerate(mid_phase, 1):
        report.append(f"{i}. **{post['title'][:60]}**\n")
        report.append(f" - Add missing content sections\n")
        report.append(f" - Improve header structure\n")
        report.append(f" - Estimated effort: 2-3 hours\n\n")
    report.append("### Week 5-8: New Content Creation (Estimated +300 visits/month)\n\n")
    report.append("Create 3-5 pieces of new content targeting high-value gaps:\n\n")
    for i, gap in enumerate(sorted(gaps, key=lambda x: x.get('traffic_potential', 0), reverse=True)[:4], 1):
        report.append(f"{i}. **{gap['title']}** ({gap.get('format', 'guide').title()})\n")
        report.append(f" - Estimated effort: 4-6 hours\n")
        report.append(f" - Expected traffic: +{gap.get('traffic_potential', 50)} visits/month\n\n")
    report.append("### Week 9-12: Refinement & Analysis (Estimated +100 visits/month)\n\n")
    report.append("- Monitor ranking changes and CTR improvements\n")
    report.append("- Refine underperforming optimizations\n")
    report.append("- Re-run keyword analysis to identify new opportunities\n\n")
    report.append("**Total Estimated 90-Day Impact: +650 visits/month (+~7.8% growth)**\n\n")
    # Methodology
    report.append("## Methodology\n\n")
    report.append("### Priority Score Calculation\n\n")
    report.append("Each post is scored based on:\n")
    report.append("- **Position (35%):** Posts ranking 11-20 get highest scores (closest to page 1)\n")
    report.append("- **Traffic Potential (30%):** Based on search impressions\n")
    report.append("- **CTR Gap (20%):** Difference between current and expected CTR for position\n")
    report.append("- **Content Quality (15%):** Existing traffic and bounce rate\n\n")
    report.append("### Data Sources\n\n")
    report.append("- **Google Analytics:** Traffic metrics (90-day window)\n")
    report.append("- **Google Search Console:** Keyword data, impressions, clicks, positions\n")
    report.append("- **WordPress REST API:** Current SEO metadata and content structure\n\n")
    report.append("### Assumptions\n\n")
    report.append("- Traffic estimates are based on historical CTR and position data\n")
    report.append("- Moving one position up typically improves CTR by 20-30%\n")
    report.append("- Page 1 rankings (positions 1-10) receive ~20-30% of total impressions\n")
    report.append("- New content takes 4-8 weeks to gain significant traction\n\n")
    # Fragments already end in newlines, so joining with "\n" produces
    # blank separator lines between them.
    return "\n".join(report)
def export_report(self, report_text, output_md):
"""Export markdown report."""
try:
with open(output_md, 'w', encoding='utf-8') as f:
f.write(report_text)
self.log(f"✓ Exported report to {output_md}")
except Exception as e:
self.log(f"❌ Error exporting report: {e}")
def export_prioritized_csv(self, posts, opportunities, output_csv):
"""Export all posts with priority scores."""
try:
scored_posts = []
for post_id, post in posts.items():
opp = opportunities.get(post_id, {})
score = self.calculate_priority_score(post, opp)
scored_posts.append({
'ID': post_id,
'Title': post.get('title', ''),
'URL': post.get('url', ''),
'Priority_Score': score,
'Estimated_Traffic_Gain': opp.get('estimated_traffic_gain', 0),
'Current_Position': post.get('avg_position', 0),
'Impressions': post.get('impressions', 0),
'Traffic': post.get('traffic', 0),
'CTR': f"{post.get('ctr', 0):.2%}",
'Keywords_Count': post.get('keywords_count', 0)
})
scored_posts = sorted(scored_posts, key=lambda x: x['Priority_Score'], reverse=True)
fieldnames = ['ID', 'Title', 'URL', 'Priority_Score', 'Estimated_Traffic_Gain',
'Current_Position', 'Impressions', 'Traffic', 'CTR', 'Keywords_Count']
with open(output_csv, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(scored_posts)
self.log(f"✓ Exported {len(scored_posts)} prioritized posts to {output_csv}")
except Exception as e:
self.log(f"❌ Error exporting prioritized CSV: {e}")
def run(self, posts_csv, opportunities_csv, gaps_csv, output_md, output_prioritized_csv, top_n=20):
"""Run complete report generation workflow."""
self.log("📊 Generating SEO optimization report...")
self.log(f"Input files: posts_with_analytics, opportunities, content_gaps\n")
# Load data
posts = self.load_posts_with_analytics(posts_csv)
opportunities = self.load_opportunities(opportunities_csv)
gaps = self.load_content_gaps(gaps_csv)
if not posts:
self.log("❌ No posts loaded. Cannot generate report.")
return
# Generate report
self.log("\n📝 Generating markdown report...")
report_text = self.generate_markdown_report(posts, opportunities, gaps, top_n)
# Export report
self.log("\n📁 Exporting files...")
self.export_report(report_text, output_md)
self.export_prioritized_csv(posts, opportunities, output_prioritized_csv)
self.log("\n✓ Report generation complete!")
def main():
    """CLI entry point: parse arguments and generate the report."""
    parser = argparse.ArgumentParser(description='Generate SEO optimization report')
    # Path-valued options: (flag, default path, help text).
    path_options = [
        ('--posts-with-analytics', 'output/results/posts_with_analytics.csv',
         'Posts with analytics CSV'),
        ('--keyword-opportunities', 'output/results/keyword_opportunities.csv',
         'Keyword opportunities CSV'),
        ('--content-gaps', 'output/results/content_gaps.csv',
         'Content gaps CSV'),
        ('--output-report', 'output/results/seo_optimization_report.md',
         'Output markdown report'),
        ('--output-csv', 'output/results/posts_prioritized.csv',
         'Output prioritized posts CSV'),
    ]
    for flag, default, help_text in path_options:
        parser.add_argument(flag, type=Path, default=Path(default), help=help_text)
    parser.add_argument('--top-n', type=int, default=20,
                        help='Number of top posts to detail')
    args = parser.parse_args()

    ReportGenerator().run(args.posts_with_analytics, args.keyword_opportunities,
                          args.content_gaps, args.output_report, args.output_csv,
                          args.top_n)


if __name__ == '__main__':
    main()