Files
seo/scripts/content_gap_analyzer.py
Kevin Bataille 8c7cd24685 Refactor SEO automation into unified CLI application
Major refactoring to create a clean, integrated CLI application:

### New Features:
- Unified CLI executable (./seo) with simple command structure
- All commands accept optional CSV file arguments
- Auto-detection of latest files when no arguments provided
- Simplified output directory structure (output/ instead of output/reports/)
- Cleaner export filename format (all_posts_YYYY-MM-DD.csv)

### Commands:
- export: Export all posts from WordPress sites
- analyze [csv]: Analyze posts with AI (optional CSV input)
- recategorize [csv]: Recategorize posts with AI
- seo_check: Check SEO quality
- categories: Manage categories across sites
- approve [files]: Review and approve recommendations
- full_pipeline: Run complete workflow
- analytics, gaps, opportunities, report, status

### Changes:
- Moved all scripts to scripts/ directory
- Created config.yaml for configuration
- Updated all scripts to use output/ directory
- Deprecated old seo-cli.py in favor of new ./seo
- Added AGENTS.md and CHANGELOG.md documentation
- Consolidated README.md with updated usage

### Technical:
- Added PyYAML dependency
- Removed hardcoded configuration values
- All scripts now properly integrated
- Better error handling and user feedback

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-02-16 14:24:44 +01:00

349 lines
12 KiB
Python

"""
Content gap analyzer for SEO strategy.
Identifies missing topics and content opportunities using AI analysis.
"""
import csv
import json
import argparse
import time
from pathlib import Path
from collections import defaultdict
from openai import OpenAI
from config import Config
class ContentGapAnalyzer:
    """Identify content gaps and opportunities.

    Loads exported WordPress posts plus Google Search Console query data,
    uses an OpenRouter-hosted model to cluster existing topics and suggest
    missing ones, then exports the results (gaps CSV, topic JSON, text log).
    """

    # Sort order for exported gap suggestions; unknown priorities sort last.
    _PRIORITY_RANK = {'high': 0, 'medium': 1, 'low': 2}

    def __init__(self):
        """Initialize analyzer.

        The AI client stays None when no OPENROUTER_API_KEY is configured;
        data loading still works in that case but the AI steps are skipped.
        """
        self.config = Config
        self.output_dir = self.config.OUTPUT_DIR
        self.logs = []
        self.client = None
        if self.config.OPENROUTER_API_KEY:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.config.OPENROUTER_API_KEY,
            )

    def log(self, message):
        """Add message to the report log and echo it to stdout."""
        self.logs.append(message)
        print(message)

    @staticmethod
    def _to_int(value):
        """Best-effort int conversion; None/empty/invalid values become 0."""
        try:
            return int(value or 0)
        except (ValueError, TypeError):
            return 0

    @staticmethod
    def _extract_json(text):
        """Parse the outermost {...} JSON object embedded in *text*.

        LLM responses often wrap JSON in prose or markdown, so slice from
        the first '{' to the last '}'. Returns the parsed object, or None
        when no braces are found or the payload is not valid JSON.
        """
        start_idx = text.find('{')
        end_idx = text.rfind('}') + 1
        if start_idx >= 0 and end_idx > start_idx:
            try:
                return json.loads(text[start_idx:end_idx])
            except json.JSONDecodeError:
                return None
        return None

    def load_posts(self, posts_csv):
        """Load post titles and data.

        Args:
            posts_csv: Path to a CSV with ID/Title/URL/traffic/impressions/
                top_keywords columns.

        Returns:
            List of post dicts; empty when the file is missing or unreadable.
        """
        posts = []
        if not posts_csv.exists():
            self.log(f"❌ File not found: {posts_csv}")
            return posts
        try:
            with open(posts_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    posts.append({
                        'id': row.get('ID', ''),
                        'title': row.get('Title', ''),
                        'url': row.get('URL', ''),
                        # Guarded conversion: one malformed number no longer
                        # aborts the whole load with a partial result.
                        'traffic': self._to_int(row.get('traffic', 0)),
                        'impressions': self._to_int(row.get('impressions', 0)),
                        'top_keywords': row.get('top_keywords', '')
                    })
            self.log(f"✓ Loaded {len(posts)} posts")
        except Exception as e:
            self.log(f"❌ Error reading posts: {e}")
        return posts

    def load_gsc_data(self, gsc_csv):
        """Load Search Console queries for gap analysis.

        Keeps only queries that have impressions but a CTR below 5% —
        search demand the site is not capturing.

        Returns:
            List of dicts with query/impressions/clicks/ctr keys.
        """
        queries = []
        if not gsc_csv.exists():
            self.log(f"⚠️ GSC file not found: {gsc_csv}")
            return queries
        try:
            with open(gsc_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        query = row.get('Query', '').strip()
                        if not query:
                            continue
                        impressions = int(row.get('Impressions', 0) or 0)
                        clicks = int(row.get('Clicks', 0) or 0)
                        # Only include queries with impressions but low clicks
                        if impressions > 0 and (clicks / impressions < 0.05):
                            queries.append({
                                'query': query,
                                'impressions': impressions,
                                'clicks': clicks,
                                # impressions > 0 is guaranteed by the guard above.
                                'ctr': clicks / impressions
                            })
                    except (ValueError, TypeError):
                        # Skip rows whose metrics are not numeric.
                        continue
            self.log(f"✓ Loaded {len(queries)} underperforming queries")
        except Exception as e:
            self.log(f"⚠️ Error reading GSC file: {e}")
        return queries

    def extract_topics(self, posts):
        """Extract topic clusters from post titles using AI.

        Returns:
            Parsed analysis dict (post_topics, topic_clusters, coverage_gaps,
            niche), or {} when the AI client is missing, the call fails, or
            the response cannot be parsed.
        """
        if not self.client or not posts:
            self.log("⚠️ Cannot extract topics without AI client or posts")
            return {}
        try:
            self.log("🤖 Extracting topic clusters from post titles...")
            # Limit to the first 100 titles to keep the prompt within budget.
            titles = [p['title'] for p in posts][:100]
            prompt = f"""Analyze these {len(titles)} blog post titles and identify topic clusters:
Titles:
{chr(10).join(f'{i+1}. {t}' for i, t in enumerate(titles))}
Extract for each post:
1. Primary topic category
2. Subtopics covered
3. Content type (guide, tutorial, review, comparison, etc.)
Then identify:
1. Top 10 topic clusters with post counts
2. Most common subtopics
3. Over/under-represented topics
Return JSON:
{{
"post_topics": {{
"1": {{"primary": "...", "subtopics": ["..."], "type": "..."}},
...
}},
"topic_clusters": [
{{"cluster": "...", "post_count": 0, "importance": "high/medium/low"}}
],
"coverage_gaps": ["topic 1", "topic 2", ...],
"niche": "detected niche or industry"
}}"""
            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=1500
            )
            result = self._extract_json(response.choices[0].message.content)
            if result is not None:
                return result
            self.log("⚠️ Could not parse topic extraction response")
        except Exception as e:
            self.log(f"⚠️ Topic extraction failed: {e}")
        return {}

    def identify_content_gaps(self, topic_analysis, queries):
        """Use AI to identify content gaps and suggest new topics.

        Combines the topic clusters / coverage gaps from extract_topics with
        the top underperforming GSC queries.

        Returns:
            List of opportunity dicts (possibly empty).
        """
        if not self.client:
            return []
        try:
            self.log("🤖 Identifying content gaps and opportunities...")
            clusters = topic_analysis.get('topic_clusters', [])
            gaps = topic_analysis.get('coverage_gaps', [])
            niche = topic_analysis.get('niche', 'general')
            # Prepare query analysis: highest-demand queries first.
            top_queries = sorted(queries, key=lambda x: x['impressions'], reverse=True)[:20]
            queries_str = '\n'.join([f"- {q['query']} ({q['impressions']} impr, {q['ctr']:.1%} CTR)"
                                     for q in top_queries])
            prompt = f"""Based on content analysis and search demand, identify content gaps:
Existing Topics: {', '.join([c.get('cluster', '') for c in clusters[:10]])}
Coverage Gaps: {', '.join(gaps[:5])}
Niche: {niche}
Top Underperforming Queries (low CTR despite impressions):
{queries_str}
Identify high-value missing topics that could:
1. Fill coverage gaps
2. Target underperforming queries (CTR improvement)
3. Capitalize on search demand
4. Complement existing content
For each suggestion:
- Topic title
- Why it's valuable (search demand + intent)
- Search volume estimate (high/medium/low)
- How it complements existing content
- Recommended content format
- Estimated traffic potential
Prioritize by traffic opportunity. Max 20 ideas.
Return JSON:
{{
"content_opportunities": [
{{
"title": "...",
"why_valuable": "...",
"search_volume": "high/medium/low",
"complements": "existing topic",
"format": "guide/tutorial/comparison/review/list",
"traffic_potential": number,
"priority": "high/medium/low"
}}
]
}}"""
            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=2000
            )
            result = self._extract_json(response.choices[0].message.content)
            if result is not None:
                return result.get('content_opportunities', [])
            self.log("⚠️ Could not parse gap analysis response")
        except Exception as e:
            self.log(f"⚠️ Gap analysis failed: {e}")
        return []

    def export_gaps_csv(self, gaps, output_csv):
        """Export content gaps to CSV, highest priority first.

        Extra keys in the gap dicts are dropped (extrasaction='ignore');
        priorities outside high/medium/low sort last.
        """
        if not gaps:
            self.log("⚠️ No gaps to export")
            return
        try:
            fieldnames = [
                'priority', 'title', 'why_valuable', 'search_volume',
                'complements', 'format', 'traffic_potential'
            ]
            # Fix: rank high > medium > low instead of only floating 'high'
            # rows to the top and leaving medium/low interleaved.
            rank = self._PRIORITY_RANK
            ordered = sorted(gaps, key=lambda g: rank.get(g.get('priority'), len(rank)))
            with open(output_csv, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(ordered)
            self.log(f"✓ Exported {len(gaps)} content gaps to {output_csv}")
        except Exception as e:
            self.log(f"❌ Error exporting CSV: {e}")

    def export_topic_clusters_json(self, topic_analysis, output_json):
        """Export topic analysis to JSON (no-op when analysis is empty)."""
        if not topic_analysis:
            return
        try:
            with open(output_json, 'w', encoding='utf-8') as f:
                json.dump(topic_analysis, f, indent=2)
            self.log(f"✓ Exported topic analysis to {output_json}")
        except Exception as e:
            self.log(f"❌ Error exporting JSON: {e}")

    def export_log(self, log_file):
        """Write every accumulated log message to a plain-text report."""
        try:
            with open(log_file, 'w', encoding='utf-8') as f:
                f.write("Content Gap Analysis Report\n")
                f.write("=" * 60 + "\n\n")
                for msg in self.logs:
                    f.write(msg + "\n")
            self.log(f"✓ Exported log to {log_file}")
        except Exception as e:
            self.log(f"❌ Error exporting log: {e}")

    def run(self, posts_csv, gsc_csv, output_csv):
        """Run complete analysis workflow.

        Args:
            posts_csv: Path to the posts-with-analytics CSV.
            gsc_csv: Path to the Search Console queries CSV.
            output_csv: Destination path for the content-gaps CSV.
        """
        self.log("📊 Starting content gap analysis...")
        self.log(f"Posts: {posts_csv}")
        self.log(f"GSC queries: {gsc_csv}\n")
        # Load data
        posts = self.load_posts(posts_csv)
        queries = self.load_gsc_data(gsc_csv)
        if not posts:
            return
        # Extract topics
        topic_analysis = self.extract_topics(posts)
        if topic_analysis:
            self.log(f"✓ Identified {len(topic_analysis.get('topic_clusters', []))} topic clusters")
        # Identify gaps
        gaps = self.identify_content_gaps(topic_analysis, queries)
        if gaps:
            self.log(f"✓ Identified {len(gaps)} content opportunities")
        # Export
        self.log("\n📁 Exporting results...")
        self.export_gaps_csv(gaps, output_csv)
        topic_json = self.output_dir / 'topic_clusters.json'
        self.export_topic_clusters_json(topic_analysis, topic_json)
        # Export log
        log_dir = self.output_dir / 'logs'
        # Fix: parents=True so a missing output/ tree doesn't crash the run.
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / 'content_gap_analysis_log.txt'
        self.export_log(log_file)
        self.log("\n✓ Content gap analysis complete!")
def main():
    """Command-line entry point: parse arguments and launch the analysis."""
    arg_parser = argparse.ArgumentParser(description='Analyze content gaps')
    arg_parser.add_argument(
        '--posts-csv',
        type=Path,
        default=Path('output/results/posts_with_analytics.csv'),
        help='Posts CSV',
    )
    arg_parser.add_argument(
        '--gsc-queries',
        type=Path,
        default=Path('input/analytics/gsc/Requêtes.csv'),
        help='GSC queries CSV',
    )
    arg_parser.add_argument(
        '--output',
        type=Path,
        default=Path('output/results/content_gaps.csv'),
        help='Output gaps CSV',
    )
    options = arg_parser.parse_args()
    ContentGapAnalyzer().run(options.posts_csv, options.gsc_queries, options.output)


if __name__ == '__main__':
    main()