Major refactoring to create a clean, integrated CLI application: ### New Features: - Unified CLI executable (./seo) with simple command structure - All commands accept optional CSV file arguments - Auto-detection of latest files when no arguments provided - Simplified output directory structure (output/ instead of output/reports/) - Cleaner export filename format (all_posts_YYYY-MM-DD.csv) ### Commands: - export: Export all posts from WordPress sites - analyze [csv]: Analyze posts with AI (optional CSV input) - recategorize [csv]: Recategorize posts with AI - seo_check: Check SEO quality - categories: Manage categories across sites - approve [files]: Review and approve recommendations - full_pipeline: Run complete workflow - analytics, gaps, opportunities, report, status ### Changes: - Moved all scripts to scripts/ directory - Created config.yaml for configuration - Updated all scripts to use output/ directory - Deprecated old seo-cli.py in favor of new ./seo - Added AGENTS.md and CHANGELOG.md documentation - Consolidated README.md with updated usage ### Technical: - Added PyYAML dependency - Removed hardcoded configuration values - All scripts now properly integrated - Better error handling and user feedback Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
349 lines
12 KiB
Python
349 lines
12 KiB
Python
"""
|
|
Content gap analyzer for SEO strategy.
|
|
Identifies missing topics and content opportunities using AI analysis.
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import argparse
|
|
import time
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from openai import OpenAI
|
|
from config import Config
|
|
|
|
|
|
class ContentGapAnalyzer:
    """Identify content gaps and opportunities.

    Loads existing post data and Google Search Console query data, uses an
    OpenRouter-hosted LLM to cluster topics and surface missing content
    ideas, then exports the results as CSV/JSON plus a plain-text run log.
    """

    def __init__(self):
        """Initialize analyzer.

        Reads the project-level ``Config`` and, when an OpenRouter API key
        is configured, creates an OpenAI-compatible client pointed at
        OpenRouter. Without a key, AI-dependent steps are skipped and the
        analyzer only loads/exports data.
        """
        self.config = Config
        self.output_dir = self.config.OUTPUT_DIR
        self.logs = []  # accumulated log messages, flushed by export_log()
        self.client = None

        if self.config.OPENROUTER_API_KEY:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.config.OPENROUTER_API_KEY,
            )

    def log(self, message):
        """Record *message* in the run log and echo it to stdout."""
        self.logs.append(message)
        print(message)

    def load_posts(self, posts_csv):
        """Load post titles and data.

        Args:
            posts_csv: Path to a CSV with ID/Title/URL/traffic/impressions/
                top_keywords columns.

        Returns:
            List of post dicts. Empty when the file is missing or unreadable.
            Rows with malformed numeric fields are skipped individually
            instead of aborting the whole load.
        """
        posts = []
        if not posts_csv.exists():
            self.log(f"❌ File not found: {posts_csv}")
            return posts

        try:
            with open(posts_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        posts.append({
                            'id': row.get('ID', ''),
                            'title': row.get('Title', ''),
                            'url': row.get('URL', ''),
                            'traffic': int(row.get('traffic', 0) or 0),
                            'impressions': int(row.get('impressions', 0) or 0),
                            'top_keywords': row.get('top_keywords', '')
                        })
                    except (ValueError, TypeError):
                        # Skip rows with non-numeric traffic/impressions
                        # rather than losing the entire file on one bad row.
                        continue

            self.log(f"✓ Loaded {len(posts)} posts")
        except Exception as e:
            self.log(f"❌ Error reading posts: {e}")

        return posts

    def load_gsc_data(self, gsc_csv):
        """Load Search Console queries for gap analysis.

        Args:
            gsc_csv: Path to a GSC export CSV with Query/Impressions/Clicks
                columns.

        Returns:
            List of query dicts limited to underperformers (impressions > 0
            and CTR below 5%). Missing file or bad rows are tolerated.
        """
        queries = []
        if not gsc_csv.exists():
            self.log(f"⚠️ GSC file not found: {gsc_csv}")
            return queries

        try:
            with open(gsc_csv, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    try:
                        query = row.get('Query', '').strip()
                        if not query:
                            continue

                        impressions = int(row.get('Impressions', 0) or 0)
                        clicks = int(row.get('Clicks', 0) or 0)

                        # Only include queries with impressions but low clicks
                        if impressions > 0 and (clicks / impressions < 0.05):
                            queries.append({
                                'query': query,
                                'impressions': impressions,
                                'clicks': clicks,
                                'ctr': clicks / impressions if impressions > 0 else 0
                            })
                    except (ValueError, TypeError):
                        continue

            self.log(f"✓ Loaded {len(queries)} underperforming queries")
        except Exception as e:
            self.log(f"⚠️ Error reading GSC file: {e}")

        return queries

    def extract_topics(self, posts):
        """Extract topic clusters from post titles using AI.

        Args:
            posts: Post dicts as returned by :meth:`load_posts`; only the
                first 100 titles are sent to the model.

        Returns:
            Parsed JSON dict with ``post_topics``, ``topic_clusters``,
            ``coverage_gaps`` and ``niche`` keys, or ``{}`` when there is no
            AI client, no posts, or the response cannot be parsed.
        """
        if not self.client or len(posts) == 0:
            self.log("⚠️ Cannot extract topics without AI client or posts")
            return {}

        try:
            self.log("🤖 Extracting topic clusters from post titles...")

            # Batch posts into groups
            titles = [p['title'] for p in posts][:100]  # Limit to first 100

            prompt = f"""Analyze these {len(titles)} blog post titles and identify topic clusters:

Titles:
{chr(10).join(f'{i+1}. {t}' for i, t in enumerate(titles))}

Extract for each post:
1. Primary topic category
2. Subtopics covered
3. Content type (guide, tutorial, review, comparison, etc.)

Then identify:
1. Top 10 topic clusters with post counts
2. Most common subtopics
3. Over/under-represented topics

Return JSON:
{{
  "post_topics": {{
    "1": {{"primary": "...", "subtopics": ["..."], "type": "..."}},
    ...
  }},
  "topic_clusters": [
    {{"cluster": "...", "post_count": 0, "importance": "high/medium/low"}}
  ],
  "coverage_gaps": ["topic 1", "topic 2", ...],
  "niche": "detected niche or industry"
}}"""

            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=1500
            )

            try:
                # Extract the first {...} span — models often wrap JSON in prose.
                result_text = response.choices[0].message.content
                start_idx = result_text.find('{')
                end_idx = result_text.rfind('}') + 1
                if start_idx >= 0 and end_idx > start_idx:
                    return json.loads(result_text[start_idx:end_idx])
            except json.JSONDecodeError:
                self.log("⚠️ Could not parse topic extraction response")

            return {}

        except Exception as e:
            self.log(f"⚠️ Topic extraction failed: {e}")
            return {}

    def identify_content_gaps(self, topic_analysis, queries):
        """Use AI to identify content gaps and suggest new topics.

        Args:
            topic_analysis: Dict returned by :meth:`extract_topics`.
            queries: Query dicts returned by :meth:`load_gsc_data`.

        Returns:
            List of content-opportunity dicts (title, rationale, priority,
            etc.), or ``[]`` when there is no AI client or the response
            cannot be parsed.
        """
        if not self.client:
            return []

        try:
            self.log("🤖 Identifying content gaps and opportunities...")

            clusters = topic_analysis.get('topic_clusters', [])
            gaps = topic_analysis.get('coverage_gaps', [])
            niche = topic_analysis.get('niche', 'general')

            # Prepare query analysis
            top_queries = sorted(queries, key=lambda x: x['impressions'], reverse=True)[:20]
            queries_str = '\n'.join([f"- {q['query']} ({q['impressions']} impr, {q['ctr']:.1%} CTR)"
                                     for q in top_queries])

            prompt = f"""Based on content analysis and search demand, identify content gaps:

Existing Topics: {', '.join([c.get('cluster', '') for c in clusters[:10]])}
Coverage Gaps: {', '.join(gaps[:5])}
Niche: {niche}

Top Underperforming Queries (low CTR despite impressions):
{queries_str}

Identify high-value missing topics that could:
1. Fill coverage gaps
2. Target underperforming queries (CTR improvement)
3. Capitalize on search demand
4. Complement existing content

For each suggestion:
- Topic title
- Why it's valuable (search demand + intent)
- Search volume estimate (high/medium/low)
- How it complements existing content
- Recommended content format
- Estimated traffic potential

Prioritize by traffic opportunity. Max 20 ideas.

Return JSON:
{{
  "content_opportunities": [
    {{
      "title": "...",
      "why_valuable": "...",
      "search_volume": "high/medium/low",
      "complements": "existing topic",
      "format": "guide/tutorial/comparison/review/list",
      "traffic_potential": number,
      "priority": "high/medium/low"
    }}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.config.AI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=2000
            )

            try:
                result_text = response.choices[0].message.content
                start_idx = result_text.find('{')
                end_idx = result_text.rfind('}') + 1
                if start_idx >= 0 and end_idx > start_idx:
                    result = json.loads(result_text[start_idx:end_idx])
                    return result.get('content_opportunities', [])
            except json.JSONDecodeError:
                self.log("⚠️ Could not parse gap analysis response")

            return []

        except Exception as e:
            self.log(f"⚠️ Gap analysis failed: {e}")
            return []

    def export_gaps_csv(self, gaps, output_csv):
        """Export content gaps to CSV, sorted high → medium → low priority.

        Args:
            gaps: Content-opportunity dicts from :meth:`identify_content_gaps`.
            output_csv: Destination CSV path.
        """
        if not gaps:
            self.log("⚠️ No gaps to export")
            return

        try:
            fieldnames = [
                'priority', 'title', 'why_valuable', 'search_volume',
                'complements', 'format', 'traffic_potential'
            ]

            with open(output_csv, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()

                # Rank map orders all three levels; the previous boolean key
                # only separated 'high' and left medium/low unordered.
                priority_rank = {'high': 0, 'medium': 1, 'low': 2}
                for gap in sorted(gaps, key=lambda x: priority_rank.get(x.get('priority'), 3)):
                    writer.writerow(gap)

            self.log(f"✓ Exported {len(gaps)} content gaps to {output_csv}")
        except Exception as e:
            self.log(f"❌ Error exporting CSV: {e}")

    def export_topic_clusters_json(self, topic_analysis, output_json):
        """Export topic analysis to JSON.

        Args:
            topic_analysis: Dict from :meth:`extract_topics`; no-op if falsy.
            output_json: Destination JSON path.
        """
        if not topic_analysis:
            return

        try:
            with open(output_json, 'w', encoding='utf-8') as f:
                json.dump(topic_analysis, f, indent=2)

            self.log(f"✓ Exported topic analysis to {output_json}")
        except Exception as e:
            self.log(f"❌ Error exporting JSON: {e}")

    def export_log(self, log_file):
        """Export the accumulated analysis log to *log_file*."""
        try:
            with open(log_file, 'w', encoding='utf-8') as f:
                f.write("Content Gap Analysis Report\n")
                f.write("=" * 60 + "\n\n")

                for msg in self.logs:
                    f.write(msg + "\n")

            self.log(f"✓ Exported log to {log_file}")
        except Exception as e:
            self.log(f"❌ Error exporting log: {e}")

    def run(self, posts_csv, gsc_csv, output_csv):
        """Run complete analysis workflow.

        Loads posts and GSC queries, extracts topics, identifies gaps, and
        exports CSV/JSON results plus the run log. Aborts early (after
        logging) when no posts could be loaded.

        Args:
            posts_csv: Path to the posts CSV.
            gsc_csv: Path to the GSC queries CSV.
            output_csv: Path for the content-gaps CSV output.
        """
        self.log("📊 Starting content gap analysis...")
        self.log(f"Posts: {posts_csv}")
        self.log(f"GSC queries: {gsc_csv}\n")

        # Load data
        posts = self.load_posts(posts_csv)
        queries = self.load_gsc_data(gsc_csv)

        if not posts:
            return

        # Extract topics
        topic_analysis = self.extract_topics(posts)
        if topic_analysis:
            self.log(f"✓ Identified {len(topic_analysis.get('topic_clusters', []))} topic clusters")

        # Identify gaps
        gaps = self.identify_content_gaps(topic_analysis, queries)
        if gaps:
            self.log(f"✓ Identified {len(gaps)} content opportunities")

        # Export
        self.log("\n📁 Exporting results...")
        self.export_gaps_csv(gaps, output_csv)

        topic_json = self.output_dir / 'topic_clusters.json'
        self.export_topic_clusters_json(topic_analysis, topic_json)

        # Export log — parents=True so a missing output/ dir doesn't crash.
        log_dir = self.output_dir / 'logs'
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / 'content_gap_analysis_log.txt'
        self.export_log(log_file)

        self.log("\n✓ Content gap analysis complete!")
|
|
|
|
|
|
def main():
    """Parse command-line arguments and run the content gap analysis."""
    default_posts = Path('output/results/posts_with_analytics.csv')
    default_gsc = Path('input/analytics/gsc/Requêtes.csv')
    default_output = Path('output/results/content_gaps.csv')

    parser = argparse.ArgumentParser(description='Analyze content gaps')
    parser.add_argument(
        '--posts-csv', type=Path, default=default_posts, help='Posts CSV')
    parser.add_argument(
        '--gsc-queries', type=Path, default=default_gsc, help='GSC queries CSV')
    parser.add_argument(
        '--output', type=Path, default=default_output, help='Output gaps CSV')

    args = parser.parse_args()
    ContentGapAnalyzer().run(args.posts_csv, args.gsc_queries, args.output)
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|