From c8fb141cdd236ffe3f2f52dda2335dccb1df50b4 Mon Sep 17 00:00:00 2001 From: Kevin Bataille Date: Mon, 16 Feb 2026 15:20:11 +0100 Subject: [PATCH] Refactor to single integrated package - Remove scripts folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major refactoring to create a unified, self-contained Python package: ### Architecture Changes: - Removed scripts/ directory completely - All functionality now in src/seo/ package - Single entry point: ./seo (imports from src/seo/cli) - No external dependencies on scripts folder ### New Package Structure: src/seo/ ├── __init__.py - Package exports (SEOApp, PostExporter, etc.) ├── cli.py - Command-line interface ├── app.py - Main application class ├── config.py - Configuration management ├── exporter.py - Post export functionality (self-contained) ├── analyzer.py - Enhanced analyzer with selective fields ├── category_proposer.py - AI category proposals (self-contained) ├── seo_checker.py - Placeholder for future implementation ├── categories.py - Placeholder for future implementation ├── approval.py - Placeholder for future implementation └── recategorizer.py - Placeholder for future implementation ### Features: - All modules are self-contained (no scripts dependencies) - EnhancedPostAnalyzer with selective field analysis - CategoryProposer for AI-powered category suggestions - Support for in-place CSV updates with backups - Clean, integrated codebase ### CLI Commands: - seo export - Export posts from WordPress - seo analyze - Analyze with AI (supports -f fields, -u update) - seo category_propose - Propose categories - seo status - Show output files - seo help - Show help ### Usage Examples: ./seo export ./seo analyze -f title categories ./seo analyze -u -f meta_description ./seo category_propose ./seo status ### Benefits: - Single source of truth - Easier to maintain and extend - Proper Python package structure - Can be installed with pip install -e . - Clean imports throughout - No path resolution issues Co-authored-by: Qwen-Coder --- scripts/__init__.py | 0 scripts/ai_analyze_posts_for_decisions.py | 453 ------------- scripts/ai_recategorize_posts.py | 382 ----------- scripts/analytics_importer.py | 427 ------------ scripts/category_manager.py | 614 ----------------- scripts/config.py | 110 --- scripts/content_gap_analyzer.py | 348 ---------- scripts/content_strategy_analyzer.py | 466 ------------- scripts/enhanced_analyzer.py | 375 ----------- scripts/export_posts_for_ai_decision.py | 378 ----------- scripts/multi_site_seo_analyzer.py | 778 ---------------------- scripts/opportunity_analyzer.py | 347 ---------- scripts/report_generator.py | 436 ------------ scripts/run_analysis.sh | 73 -- scripts/seo-cli.py | 388 ----------- scripts/user_approval.py | 352 ---------- seo | 2 +- src/seo/__init__.py | 11 +- src/seo/analyzer.py | 354 +++++++++- src/seo/app.py | 207 +----- src/seo/approval.py | 19 +- src/seo/categories.py | 19 +- {scripts => src/seo}/category_proposer.py | 39 +- src/seo/cli.py | 142 +--- src/seo/exporter.py | 51 +- src/seo/recategorizer.py | 20 +- src/seo/seo_checker.py | 19 +- 27 files changed, 468 insertions(+), 6342 deletions(-) delete mode 100644 scripts/__init__.py delete mode 100755 scripts/ai_analyze_posts_for_decisions.py delete mode 100644 scripts/ai_recategorize_posts.py delete mode 100644 scripts/analytics_importer.py delete mode 100644 scripts/category_manager.py delete mode 100644 scripts/config.py delete mode 100644 scripts/content_gap_analyzer.py delete mode 100644 scripts/content_strategy_analyzer.py delete mode 100644 scripts/enhanced_analyzer.py delete mode 100755 scripts/export_posts_for_ai_decision.py delete mode 100755 scripts/multi_site_seo_analyzer.py delete mode 100644 scripts/opportunity_analyzer.py delete mode 100644 scripts/report_generator.py delete mode 100755 scripts/run_analysis.sh delete mode 100755 scripts/seo-cli.py delete mode 100644 scripts/user_approval.py rename {scripts => src/seo}/category_proposer.py (88%) diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/ai_analyze_posts_for_decisions.py b/scripts/ai_analyze_posts_for_decisions.py deleted file mode 100755 index ee254c8..0000000 --- a/scripts/ai_analyze_posts_for_decisions.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python3 -""" -AI-Powered Post Analysis and Recommendation Script -Analyzes exported posts CSV using Claude via OpenRouter and provides -clear, automation-friendly recommendations for: -- Which site to move posts to -- Categories to set -- Posts to consolidate -- Posts to delete -- Posts to optimize -""" - -import csv -import json -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional, Tuple -import requests -from datetime import datetime -from config import Config - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - - -class PostAnalyzer: - """Analyze posts CSV using Claude AI via OpenRouter.""" - - def __init__(self, csv_file: str): - """Initialize analyzer with CSV file.""" - self.csv_file = Path(csv_file) - self.openrouter_api_key = Config.OPENROUTER_API_KEY - self.posts = [] - self.analyzed_posts = [] - self.api_calls = 0 - self.ai_cost = 0.0 - - def load_csv(self) -> bool: - """Load posts from CSV file.""" - logger.info(f"Loading CSV: {self.csv_file}") - - if not self.csv_file.exists(): - logger.error(f"CSV file not found: {self.csv_file}") - return False - - try: - with open(self.csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - self.posts = list(reader) - - logger.info(f"✓ Loaded {len(self.posts)} posts from CSV") - - # Group by site for stats - by_site = {} - for post in self.posts: - site = post.get('site', '') - if site not in by_site: - by_site[site] = 0 - by_site[site] += 1 - - for site, count in by_site.items(): - logger.info(f" {site}: {count} posts") - - return True - - except Exception as e: - logger.error(f"Error loading CSV: {e}") - return False - - def batch_posts_for_analysis(self, batch_size: int = 10) -> List[List[Dict]]: - """Batch posts for AI analysis to manage token usage.""" - batches = [] - for i in range(0, len(self.posts), batch_size): - batches.append(self.posts[i:i + batch_size]) - return batches - - def format_batch_for_ai(self, batch: List[Dict]) -> str: - """Format batch of posts for AI analysis.""" - formatted = "POSTS TO ANALYZE:\n\n" - - for i, post in enumerate(batch, 1): - formatted += f"{i}. POST ID: {post['post_id']}\n" - formatted += f" Site: {post['site']}\n" - formatted += f" Title: {post['title']}\n" - formatted += f" Status: {post['status']}\n" - formatted += f" Word Count: {post['word_count']}\n" - formatted += f" Content: {post['content_preview']}\n" - formatted += f" Current Categories: {post['categories']}\n" - formatted += f" Meta Description: {post['meta_description']}\n" - formatted += "\n" - - return formatted - - def get_ai_recommendations(self, batch: List[Dict]) -> Optional[str]: - """Get AI recommendations for a batch of posts.""" - if not self.openrouter_api_key: - logger.error("OPENROUTER_API_KEY not set") - return None - - batch_text = self.format_batch_for_ai(batch) - - prompt = f"""Analyze these blog posts and provide clear, actionable recommendations. - -Website Strategy: -- mistergeek.net: High-value topics (VPN, Software, Gaming, General Tech, SEO, Content Marketing) -- webscroll.fr: Torrenting, File-Sharing, Tracker guides (niche audience) -- hellogeek.net: Low-traffic, experimental, off-brand, or niche content - -{batch_text} - -For EACH post, provide a JSON object with: -{{ - "post_id": , - "decision": "" where ACTION is ONE of: - - "Keep on mistergeek.net" (high-value, high-traffic) - - "Move to webscroll.fr" (torrenting/file-sharing content) - - "Move to hellogeek.net" (low-traffic or off-brand) - - "Delete" (spam, extremely low quality, zero traffic) - - "Consolidate with post_id:" (similar content, duplicate) - "category": "" where category is ONE of: - - "VPN" - - "Software/Tools" - - "Gaming" - - "Streaming" - - "Torrenting" - - "File-Sharing" - - "SEO" - - "Content Marketing" - - "Other" - "reason": "", - "priority": "", - "notes": "" -}} - -Return ONLY a JSON array. Example: -[ - {{"post_id": 2845, "decision": "Keep on mistergeek.net", "category": "VPN", "reason": "High traffic, core topic", "priority": "High", "notes": "Already optimized"}}, - {{"post_id": 1234, "decision": "Move to webscroll.fr", "category": "Torrenting", "reason": "Torrent tracker content", "priority": "Medium", "notes": "Good SEO potential on target site"}} -] - -Analyze all posts and provide recommendations for EVERY post in the batch.""" - - try: - logger.info(f" Sending batch to Claude for analysis...") - - response = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {self.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": "anthropic/claude-3.5-sonnet", - "messages": [ - {"role": "user", "content": prompt} - ], - "temperature": 0.3, # Lower temp for more consistent recommendations - }, - timeout=60 - ) - response.raise_for_status() - - result = response.json() - self.api_calls += 1 - - # Track cost - usage = result.get('usage', {}) - input_tokens = usage.get('prompt_tokens', 0) - output_tokens = usage.get('completion_tokens', 0) - self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 - - recommendations_text = result['choices'][0]['message']['content'].strip() - logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})") - - return recommendations_text - - except Exception as e: - logger.error(f"Error getting AI recommendations: {e}") - return None - - def parse_recommendations(self, recommendations_json: str) -> List[Dict]: - """Parse JSON recommendations from AI.""" - try: - # Try to extract JSON from response - start_idx = recommendations_json.find('[') - end_idx = recommendations_json.rfind(']') + 1 - - if start_idx == -1 or end_idx == 0: - logger.error("Could not find JSON array in response") - return [] - - json_str = recommendations_json[start_idx:end_idx] - recommendations = json.loads(json_str) - - return recommendations - - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON recommendations: {e}") - logger.debug(f"Response was: {recommendations_json[:500]}") - return [] - - def analyze_all_posts(self) -> bool: - """Analyze all posts in batches.""" - logger.info("\n" + "="*70) - logger.info("ANALYZING POSTS WITH AI") - logger.info("="*70 + "\n") - - batches = self.batch_posts_for_analysis(batch_size=10) - logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches of 10...\n") - - all_recommendations = {} - - for batch_num, batch in enumerate(batches, 1): - logger.info(f"Batch {batch_num}/{len(batches)}: Analyzing {len(batch)} posts...") - - recommendations_json = self.get_ai_recommendations(batch) - - if not recommendations_json: - logger.error(f" Failed to get recommendations for batch {batch_num}") - continue - - recommendations = self.parse_recommendations(recommendations_json) - - for rec in recommendations: - all_recommendations[str(rec.get('post_id', ''))] = rec - - logger.info(f" ✓ Got {len(recommendations)} recommendations") - - logger.info(f"\n✓ Analysis complete!") - logger.info(f" Total recommendations: {len(all_recommendations)}") - logger.info(f" API calls: {self.api_calls}") - logger.info(f" Estimated cost: ${self.ai_cost:.4f}") - - # Map recommendations to posts - for post in self.posts: - post_id = str(post['post_id']) - if post_id in all_recommendations: - rec = all_recommendations[post_id] - post['decision'] = rec.get('decision', 'No decision') - post['recommended_category'] = rec.get('category', 'Other') - post['reason'] = rec.get('reason', '') - post['priority'] = rec.get('priority', 'Medium') - post['ai_notes'] = rec.get('notes', '') - else: - post['decision'] = 'Pending' - post['recommended_category'] = 'Other' - post['reason'] = 'No recommendation' - post['priority'] = 'Medium' - post['ai_notes'] = '' - - self.analyzed_posts.append(post) - - return len(self.analyzed_posts) > 0 - - def export_with_recommendations(self) -> Tuple[str, str, str, str]: - """Export CSV with recommendations and create action-specific files.""" - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - # Main file with all recommendations - main_file = output_dir / f'posts_with_ai_recommendations_{timestamp}.csv' - - # Action-specific files - moves_file = output_dir / f'posts_to_move_{timestamp}.csv' - consolidate_file = output_dir / f'posts_to_consolidate_{timestamp}.csv' - delete_file = output_dir / f'posts_to_delete_{timestamp}.csv' - - # Export main file - fieldnames = list(self.analyzed_posts[0].keys()) + [ - 'decision', - 'recommended_category', - 'reason', - 'priority', - 'ai_notes' - ] - - logger.info(f"\nExporting recommendations to CSV...") - - with open(main_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.analyzed_posts) - - logger.info(f"✓ Main file: {main_file}") - - # Export action-specific files - posts_to_move = [p for p in self.analyzed_posts if 'Move to' in p.get('decision', '')] - posts_to_consolidate = [p for p in self.analyzed_posts if 'Consolidate' in p.get('decision', '')] - posts_to_delete = [p for p in self.analyzed_posts if p.get('decision') == 'Delete'] - - # Moves file - if posts_to_move: - with open(moves_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(posts_to_move) - logger.info(f"✓ Moves file ({len(posts_to_move)} posts): {moves_file}") - - # Consolidate file - if posts_to_consolidate: - with open(consolidate_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(posts_to_consolidate) - logger.info(f"✓ Consolidate file ({len(posts_to_consolidate)} posts): {consolidate_file}") - - # Delete file - if posts_to_delete: - with open(delete_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(posts_to_delete) - logger.info(f"✓ Delete file ({len(posts_to_delete)} posts): {delete_file}") - - return ( - str(main_file), - str(moves_file) if posts_to_move else None, - str(consolidate_file) if posts_to_consolidate else None, - str(delete_file) if posts_to_delete else None - ) - - def print_summary(self): - """Print analysis summary.""" - logger.info("\n" + "="*70) - logger.info("ANALYSIS SUMMARY") - logger.info("="*70 + "\n") - - # Count decisions - decisions = {} - for post in self.analyzed_posts: - decision = post.get('decision', 'Unknown') - decisions[decision] = decisions.get(decision, 0) + 1 - - logger.info("DECISIONS:") - for decision, count in sorted(decisions.items(), key=lambda x: x[1], reverse=True): - logger.info(f" {decision}: {count} posts") - - # Count categories - categories = {} - for post in self.analyzed_posts: - cat = post.get('recommended_category', 'Other') - categories[cat] = categories.get(cat, 0) + 1 - - logger.info("\nRECOMMENDED CATEGORIES:") - for cat, count in sorted(categories.items(), key=lambda x: x[1], reverse=True): - logger.info(f" {cat}: {count} posts") - - # Count priorities - priorities = {} - for post in self.analyzed_posts: - priority = post.get('priority', 'Unknown') - priorities[priority] = priorities.get(priority, 0) + 1 - - logger.info("\nPRIORITY BREAKDOWN:") - for priority in ['High', 'Medium', 'Low']: - count = priorities.get(priority, 0) - logger.info(f" {priority}: {count} posts") - - # By site - logger.info("\nBY SITE:") - by_site = {} - for post in self.analyzed_posts: - site = post.get('site', 'Unknown') - if site not in by_site: - by_site[site] = [] - by_site[site].append(post.get('decision', 'Unknown')) - - for site in sorted(by_site.keys()): - logger.info(f"\n {site}:") - decisions_for_site = {} - for decision in by_site[site]: - decisions_for_site[decision] = decisions_for_site.get(decision, 0) + 1 - - for decision, count in sorted(decisions_for_site.items()): - logger.info(f" {decision}: {count}") - - def run(self): - """Run complete analysis.""" - logger.info("="*70) - logger.info("AI-POWERED POST ANALYSIS AND RECOMMENDATIONS") - logger.info("="*70) - - # Load CSV - if not self.load_csv(): - sys.exit(1) - - # Analyze posts - if not self.analyze_all_posts(): - logger.error("Failed to analyze posts") - sys.exit(1) - - # Print summary - self.print_summary() - - # Export results - logger.info("\n" + "="*70) - logger.info("EXPORTING RESULTS") - logger.info("="*70) - - main_file, moves_file, consol_file, delete_file = self.export_with_recommendations() - - logger.info("\n" + "="*70) - logger.info("NEXT STEPS") - logger.info("="*70) - logger.info("\n1. Review main file with all recommendations:") - logger.info(f" {main_file}") - logger.info("\n2. Execute moves (automate with script):") - if moves_file: - logger.info(f" {moves_file}") - else: - logger.info(" No posts to move") - - logger.info("\n3. Consolidate duplicates:") - if consol_file: - logger.info(f" {consol_file}") - else: - logger.info(" No posts to consolidate") - - logger.info("\n4. Delete low-quality posts:") - if delete_file: - logger.info(f" {delete_file}") - else: - logger.info(" No posts to delete") - - logger.info("\n✓ Analysis complete!") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Analyze exported posts CSV using Claude AI and provide recommendations' - ) - parser.add_argument( - 'csv_file', - help='Path to exported posts CSV file' - ) - - args = parser.parse_args() - - analyzer = PostAnalyzer(args.csv_file) - analyzer.run() - - -if __name__ == '__main__': - main() diff --git a/scripts/ai_recategorize_posts.py b/scripts/ai_recategorize_posts.py deleted file mode 100644 index 134bf7d..0000000 --- a/scripts/ai_recategorize_posts.py +++ /dev/null @@ -1,382 +0,0 @@ -#!/usr/bin/env python3 -""" -AI-Powered Post Re-categorization -Analyzes exported posts using Claude AI via OpenRouter and provides -category recommendations for better content organization. -""" - -import csv -import json -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional, Tuple -import requests -from datetime import datetime -from config import Config - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - - -class PostRecategorizer: - """Re-categorize posts using Claude AI via OpenRouter.""" - - def __init__(self, csv_file: str): - """Initialize recategorizer with CSV file.""" - self.csv_file = Path(csv_file) - self.openrouter_api_key = Config.OPENROUTER_API_KEY - self.posts = [] - self.recategorized_posts = [] - self.api_calls = 0 - self.ai_cost = 0.0 - - def load_csv(self) -> bool: - """Load posts from CSV file.""" - logger.info(f"Loading CSV: {self.csv_file}") - - if not self.csv_file.exists(): - logger.error(f"CSV file not found: {self.csv_file}") - return False - - try: - with open(self.csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - self.posts = list(reader) - - logger.info(f"✓ Loaded {len(self.posts)} posts from CSV") - - # Group by site for stats - by_site = {} - for post in self.posts: - site = post.get('site', '') - if site not in by_site: - by_site[site] = 0 - by_site[site] += 1 - - for site, count in by_site.items(): - logger.info(f" {site}: {count} posts") - - return True - - except Exception as e: - logger.error(f"Error loading CSV: {e}") - return False - - def batch_posts_for_analysis(self, batch_size: int = 10) -> List[List[Dict]]: - """Batch posts for AI analysis to manage token usage.""" - batches = [] - for i in range(0, len(self.posts), batch_size): - batches.append(self.posts[i:i + batch_size]) - return batches - - def format_batch_for_ai(self, batch: List[Dict]) -> str: - """Format batch of posts for AI analysis.""" - formatted = "POSTS TO RECATEGORIZE:\n\n" - - for i, post in enumerate(batch, 1): - formatted += f"{i}. POST ID: {post['post_id']}\n" - formatted += f" Site: {post['site']}\n" - formatted += f" Title: {post['title']}\n" - formatted += f" Current Categories: {post.get('categories', 'None')}\n" - formatted += f" Content: {post.get('content_preview', '')}...\n" - formatted += f" Word Count: {post.get('word_count', '0')}\n" - formatted += "\n" - - return formatted - - def get_ai_recommendations(self, batch: List[Dict]) -> Optional[str]: - """Get AI category recommendations for a batch of posts.""" - if not self.openrouter_api_key: - logger.error("OPENROUTER_API_KEY not set") - return None - - batch_text = self.format_batch_for_ai(batch) - - prompt = f"""Analyze these blog posts and recommend optimal categories. - -Website Strategy: -- mistergeek.net: VPN, Software/Tools, Gaming, General Tech, SEO, Content Marketing -- webscroll.fr: Torrenting, File-Sharing, Tracker Guides -- hellogeek.net: Experimental, Low-traffic, Off-brand content - -{batch_text} - -For EACH post, provide a JSON object with: -{{ - "post_id": , - "current_categories": "", - "recommended_categories": "", - "reason": "", - "confidence": "High|Medium|Low" -}} - -Return ONLY a JSON array. Example: -[ - {{"post_id": 2845, "current_categories": "VPN", "recommended_categories": "VPN, Security", "reason": "Add security angle", "confidence": "High"}}, - {{"post_id": 1234, "current_categories": "Other", "recommended_categories": "Torrenting, Guides", "reason": "Torrent-specific content", "confidence": "Medium"}} -] - -Analyze all posts and provide recommendations for EVERY post in the batch.""" - - try: - logger.info(f" Sending batch to Claude for recategorization...") - - response = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {self.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": "anthropic/claude-3.5-sonnet", - "messages": [ - {"role": "user", "content": prompt} - ], - "temperature": 0.3, - }, - timeout=60 - ) - response.raise_for_status() - - result = response.json() - self.api_calls += 1 - - # Track cost - usage = result.get('usage', {}) - input_tokens = usage.get('prompt_tokens', 0) - output_tokens = usage.get('completion_tokens', 0) - self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 - - recommendations_text = result['choices'][0]['message']['content'].strip() - logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})") - - return recommendations_text - - except Exception as e: - logger.error(f"Error getting AI recommendations: {e}") - return None - - def parse_recommendations(self, recommendations_json: str) -> List[Dict]: - """Parse JSON recommendations from AI.""" - try: - # Try to extract JSON from response - start_idx = recommendations_json.find('[') - end_idx = recommendations_json.rfind(']') + 1 - - if start_idx == -1 or end_idx == 0: - logger.error("Could not find JSON array in response") - return [] - - json_str = recommendations_json[start_idx:end_idx] - recommendations = json.loads(json_str) - - return recommendations - - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON recommendations: {e}") - logger.debug(f"Response was: {recommendations_json[:500]}") - return [] - - def analyze_all_posts(self) -> bool: - """Analyze all posts in batches.""" - logger.info("\n" + "="*70) - logger.info("RECATEGORIZING POSTS WITH AI") - logger.info("="*70 + "\n") - - batches = self.batch_posts_for_analysis(batch_size=10) - logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches of 10...\n") - - all_recommendations = {} - - for batch_num, batch in enumerate(batches, 1): - logger.info(f"Batch {batch_num}/{len(batches)}: Analyzing {len(batch)} posts...") - - recommendations_json = self.get_ai_recommendations(batch) - - if not recommendations_json: - logger.error(f" Failed to get recommendations for batch {batch_num}") - continue - - recommendations = self.parse_recommendations(recommendations_json) - - for rec in recommendations: - all_recommendations[str(rec.get('post_id', ''))] = rec - - logger.info(f" ✓ Got {len(recommendations)} recommendations") - - logger.info(f"\n✓ Analysis complete!") - logger.info(f" Total recommendations: {len(all_recommendations)}") - logger.info(f" API calls: {self.api_calls}") - logger.info(f" Estimated cost: ${self.ai_cost:.4f}") - - # Map recommendations to posts - for post in self.posts: - post_id = str(post['post_id']) - if post_id in all_recommendations: - rec = all_recommendations[post_id] - post['recommended_categories'] = rec.get('recommended_categories', post.get('categories', '')) - post['recategorization_reason'] = rec.get('reason', '') - post['recategorization_confidence'] = rec.get('confidence', 'Medium') - else: - post['recommended_categories'] = post.get('categories', '') - post['recategorization_reason'] = 'No recommendation' - post['recategorization_confidence'] = 'Unknown' - - self.recategorized_posts.append(post) - - return len(self.recategorized_posts) > 0 - - def export_with_recommendations(self) -> Tuple[str, str]: - """Export CSV with recategorization recommendations.""" - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - - # Main file with all recommendations - main_file = output_dir / f'posts_with_recategorization_{timestamp}.csv' - - # Differences file (only posts with different recommendations) - changes_file = output_dir / f'category_changes_only_{timestamp}.csv' - - # Full fieldnames including new recommendation columns - fieldnames = list(self.recategorized_posts[0].keys()) + [ - 'recommended_categories', - 'recategorization_reason', - 'recategorization_confidence' - ] - - logger.info(f"\nExporting recategorization recommendations to CSV...") - - # Export main file with all posts - with open(main_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.recategorized_posts) - - logger.info(f"✓ Main file: {main_file}") - - # Export changes file (only posts where category changed) - posts_with_changes = [ - p for p in self.recategorized_posts - if p.get('categories', '') != p.get('recommended_categories', '') - ] - - if posts_with_changes: - with open(changes_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(posts_with_changes) - logger.info(f"✓ Changes file ({len(posts_with_changes)} posts): {changes_file}") - else: - logger.info(f"ℹ No category changes recommended") - - return (str(main_file), str(changes_file) if posts_with_changes else None) - - def print_summary(self): - """Print recategorization summary.""" - logger.info("\n" + "="*70) - logger.info("RECATEGORIZATION SUMMARY") - logger.info("="*70 + "\n") - - # Count changes by site - by_site = {} - total_changes = 0 - - for post in self.recategorized_posts: - site = post.get('site', 'Unknown') - if site not in by_site: - by_site[site] = {'total': 0, 'changed': 0} - - by_site[site]['total'] += 1 - - if post.get('categories', '') != post.get('recommended_categories', ''): - by_site[site]['changed'] += 1 - total_changes += 1 - - logger.info("CHANGES BY SITE:") - for site in sorted(by_site.keys()): - stats = by_site[site] - logger.info(f" {site}: {stats['changed']} changes out of {stats['total']} posts") - - logger.info(f"\nTOTAL CHANGES: {total_changes} out of {len(self.recategorized_posts)} posts") - logger.info(f" ({(total_changes/len(self.recategorized_posts)*100):.1f}% of posts)") - - # Confidence breakdown - logger.info("\nRECOMMENDATION CONFIDENCE:") - confidence_counts = {} - for post in self.recategorized_posts: - conf = post.get('recategorization_confidence', 'Unknown') - confidence_counts[conf] = confidence_counts.get(conf, 0) + 1 - - for conf in ['High', 'Medium', 'Low', 'Unknown']: - count = confidence_counts.get(conf, 0) - if count > 0: - logger.info(f" {conf}: {count} posts ({(count/len(self.recategorized_posts)*100):.1f}%)") - - def run(self): - """Run complete recategorization analysis.""" - logger.info("="*70) - logger.info("AI-POWERED POST RECATEGORIZATION") - logger.info("="*70) - - # Load CSV - if not self.load_csv(): - sys.exit(1) - - # Analyze posts - if not self.analyze_all_posts(): - logger.error("Failed to analyze posts") - sys.exit(1) - - # Print summary - self.print_summary() - - # Export results - logger.info("\n" + "="*70) - logger.info("EXPORTING RESULTS") - logger.info("="*70) - - main_file, changes_file = self.export_with_recommendations() - - logger.info("\n" + "="*70) - logger.info("NEXT STEPS") - logger.info("="*70) - logger.info("\n1. Review recategorization recommendations:") - logger.info(f" {main_file}") - logger.info("\n2. Review only posts with category changes:") - if changes_file: - logger.info(f" {changes_file}") - else: - logger.info(" No changes recommended") - logger.info("\n3. Apply recommendations:") - logger.info(" Use categorization automation script (coming soon)") - logger.info(" Or manually update categories in WordPress") - - logger.info("\n✓ Recategorization analysis complete!") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Re-categorize posts using Claude AI for better organization' - ) - parser.add_argument( - 'csv_file', - help='Path to exported posts CSV file' - ) - - args = parser.parse_args() - - recategorizer = PostRecategorizer(args.csv_file) - recategorizer.run() - - -if __name__ == '__main__': - main() diff --git a/scripts/analytics_importer.py b/scripts/analytics_importer.py deleted file mode 100644 index 77ea5b8..0000000 --- a/scripts/analytics_importer.py +++ /dev/null @@ -1,427 +0,0 @@ -""" -Analytics data importer for SEO analysis. -Merges Google Analytics and Search Console data with WordPress posts. -""" - -import csv -import json -import argparse -from pathlib import Path -from urllib.parse import urlparse, parse_qs -from collections import defaultdict -from config import Config - - -class AnalyticsImporter: - """Import and consolidate analytics data with WordPress posts.""" - - def __init__(self): - """Initialize importer.""" - self.config = Config - self.output_dir = self.config.OUTPUT_DIR - self.logs = [] - self.unmatched_urls = [] - - def log(self, message): - """Add message to log.""" - self.logs.append(message) - print(message) - - def normalize_url(self, url): - """Normalize URL for matching.""" - if not url: - return "" - # Remove trailing slash, protocol, www - url = url.rstrip('/') - if url.startswith('http'): - url = urlparse(url).path - url = url.replace('www.', '') - return url.lower() - - def extract_post_slug_from_url(self, url): - """Extract post slug from URL path.""" - path = urlparse(url).path.rstrip('/') - parts = [p for p in path.split('/') if p] - if parts: - return parts[-1] # Last part is usually the slug - return None - - def load_ga4_data(self, ga4_csv): - """Load Google Analytics 4 data.""" - ga_data = {} - if not ga4_csv.exists(): - self.log(f"⚠️ GA4 file not found: {ga4_csv}") - return ga_data - - try: - with open(ga4_csv, 'r', encoding='utf-8') as f: - # Skip comment lines at the top (lines starting with #) - lines = [line for line in f if not line.startswith('#')] - - reader = csv.DictReader(lines) - for row in reader: - if not row: - continue - # Handle French and English column names - url = (row.get('Page path and screen class') or - row.get('Chemin de la page et classe de l\'écran') or - row.get('Page path') or - row.get('Page') or '') - if not url: - continue - - # Normalize URL - normalized = self.normalize_url(url) - - # Extract metrics (handle French and English column names) - try: - traffic = int(float(row.get('Screened Views', row.get('Views', row.get('Vues', '0'))) or 0)) - users = int(float(row.get('Users', row.get('Utilisateurs actifs', '0')) or 0)) - bounce_rate = float(row.get('Bounce rate', row.get('Taux de rebond', '0')) or 0) - avg_duration_str = (row.get('Average session duration', - row.get('Durée d\'engagement moyenne par utilisateur actif', '0')) or '0') - avg_duration = float(avg_duration_str.replace(',', '.')) - except (ValueError, TypeError): - traffic = users = 0 - bounce_rate = avg_duration = 0 - - ga_data[normalized] = { - 'traffic': traffic, - 'users': users, - 'bounce_rate': bounce_rate, - 'avg_session_duration': avg_duration, - 'ga_url': url - } - self.log(f"✓ Loaded {len(ga_data)} GA4 entries") - except Exception as e: - self.log(f"❌ Error reading GA4 file: {e}") - - return ga_data - - def load_gsc_data(self, gsc_csv): - """Load Google Search Console data (Page-level or Query-level).""" - gsc_data = {} - if not gsc_csv.exists(): - self.log(f"⚠️ GSC file not found: {gsc_csv}") - return gsc_data - - try: - with open(gsc_csv, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - if not row: - continue - - # Determine if this is page-level or query-level data - # Pages.csv has: "Pages les plus populaires", Queries.csv has: "Requêtes les plus fréquentes" - url = (row.get('Page') or - row.get('Pages les plus populaires') or - row.get('URL') or '') - - query = row.get('Query') or row.get('Requêtes les plus fréquentes', '').strip() - - # Skip rows without URLs (query-only data) - if not url: - continue - - # Try to parse metrics with flexible column names - try: - # Handle different number formats (decimal separator, percentage signs) - clicks_str = row.get('Clics', row.get('Clicks', '0')) or '0' - impressions_str = row.get('Impressions', '0') or '0' - ctr_str = row.get('CTR', '0') or '0' - position_str = row.get('Position', '0') or '0' - - clicks = int(float(clicks_str.replace(',', '.').rstrip('%'))) - impressions = int(float(impressions_str.replace(',', '.'))) - ctr = float(ctr_str.replace(',', '.').rstrip('%')) / 100 - position = float(position_str.replace(',', '.')) - except (ValueError, TypeError, AttributeError): - clicks = impressions = 0 - ctr = position = 0 - - normalized = self.normalize_url(url) - - if normalized not in gsc_data: - gsc_data[normalized] = { - 'impressions': 0, - 'clicks': 0, - 'avg_position': 0, - 'ctr': 0, - 'keywords': [], - 'gsc_url': url - } - - # Accumulate data (in case of multiple rows per URL) - gsc_data[normalized]['impressions'] += impressions - gsc_data[normalized]['clicks'] += clicks - - # Store position - if position > 0: - gsc_data[normalized]['positions'] = gsc_data[normalized].get('positions', []) - gsc_data[normalized]['positions'].append(position) - - if query and query not in gsc_data[normalized]['keywords']: - gsc_data[normalized]['keywords'].append(query) - - # Calculate average positions and finalize - for data in gsc_data.values(): - if data.get('positions'): - data['avg_position'] = sum(data['positions']) / len(data['positions']) - del data['positions'] - # Recalculate CTR from totals - if data['impressions'] > 0: - data['ctr'] = data['clicks'] / data['impressions'] - data['keywords_count'] = len(data.get('keywords', [])) - - self.log(f"✓ Loaded {len(gsc_data)} GSC entries") - except Exception as e: - self.log(f"❌ Error reading GSC file: {e}") - - return gsc_data - - def load_posts_csv(self, posts_csv): - """Load existing WordPress posts CSV.""" - posts = {} - if not posts_csv.exists(): - self.log(f"⚠️ Posts file not found: {posts_csv}") - return posts - - try: - with open(posts_csv, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - # Handle different column name variations - post_id = row.get('ID') or row.get('post_id') - post_url = row.get('URL') or row.get('Post URL') or row.get('post_url') - post_slug = row.get('Post Slug') or row.get('Slug') or row.get('post_slug') - post_title = row.get('Title') or row.get('post_title') - - if not post_id: - continue - - normalized = self.normalize_url(post_url) if post_url else "" - - # Handle different SEO column names - seo_title = (row.get('SEO Title') or - row.get('proposed_seo_title') or - row.get('current_seo_title') or '') - meta_desc = (row.get('Meta Description') or - row.get('proposed_meta_description') or - row.get('current_meta_description') or '') - - posts[post_id] = { - 'title': post_title or '', - 'url': post_url, - 'slug': post_slug, - 'normalized_url': normalized, - 'seo_title': seo_title, - 'meta_description': meta_desc, - **{k: v for k, v in row.items() - if k not in ['ID', 'post_id', 'Title', 'post_title', 'URL', 'Post URL', 'post_url', - 'Post Slug', 'Slug', 'post_slug', 'SEO Title', 'proposed_seo_title', - 'current_seo_title', 'Meta Description', 'proposed_meta_description', - 'current_meta_description']} - } - - self.log(f"✓ Loaded {len(posts)} posts from CSV") - except Exception as e: - self.log(f"❌ Error reading posts CSV: {e}") - - return posts - - def match_analytics_to_posts(self, posts, ga_data, gsc_data): - """Match analytics data to posts with fuzzy matching.""" - self.log("\n📊 Matching analytics data to posts...") - matched_count = 0 - - for post_id, post_info in posts.items(): - slug = post_info.get('slug') or self.extract_post_slug_from_url(post_info.get('url', '')) - normalized_url = post_info.get('normalized_url', '') - - # Try direct URL match first - if normalized_url in ga_data: - post_info['ga_data'] = ga_data[normalized_url] - matched_count += 1 - else: - post_info['ga_data'] = {} - - if normalized_url in gsc_data: - post_info['gsc_data'] = gsc_data[normalized_url] - matched_count += 1 - else: - post_info['gsc_data'] = {} - - # Try slug-based matching if URL didn't match - if not post_info.get('gsc_data') and slug: - for gsc_url, gsc_info in gsc_data.items(): - if slug in gsc_url: - post_info['gsc_data'] = gsc_info - break - - # Track unmatched GSC URLs - matched_gsc_urls = set() - for post in posts.values(): - if post.get('gsc_data'): - matched_gsc_urls.add(id(post['gsc_data'])) - - for normalized_url, gsc_info in gsc_data.items(): - if id(gsc_info) not in matched_gsc_urls and gsc_info.get('impressions', 0) > 0: - self.unmatched_urls.append({ - 'url': gsc_info.get('gsc_url', normalized_url), - 'impressions': gsc_info.get('impressions', 0), - 'clicks': gsc_info.get('clicks', 0), - 'avg_position': gsc_info.get('avg_position', 0) - }) - - self.log(f"✓ Matched data to posts") - return posts - - def enrich_posts_data(self, posts): - """Enrich posts with calculated metrics.""" - for post_info in posts.values(): - ga = post_info.get('ga_data', {}) - gsc = post_info.get('gsc_data', {}) - - # GA metrics - post_info['traffic'] = ga.get('traffic', 0) - post_info['users'] = ga.get('users', 0) - post_info['bounce_rate'] = ga.get('bounce_rate', 0) - post_info['avg_session_duration'] = ga.get('avg_session_duration', 0) - - # GSC metrics - post_info['impressions'] = gsc.get('impressions', 0) - post_info['clicks'] = gsc.get('clicks', 0) - post_info['avg_position'] = gsc.get('avg_position', 0) - post_info['ctr'] = gsc.get('ctr', 0) - post_info['keywords_count'] = gsc.get('keywords_count', 0) - post_info['top_keywords'] = ','.join(gsc.get('keywords', [])[:5]) - - return posts - - def export_enriched_csv(self, posts, output_csv): - """Export enriched posts data to CSV.""" - if not posts: - self.log("❌ No posts to export") - return - - try: - fieldnames = [ - 'ID', 'Title', 'URL', 'SEO Title', 'Meta Description', - 'traffic', 'users', 'bounce_rate', 'avg_session_duration', - 'impressions', 'clicks', 'avg_position', 'ctr', 'keywords_count', 'top_keywords' - ] - - # Add any extra fields from original posts - all_keys = set() - for post in posts.values(): - all_keys.update(post.keys()) - - extra_fields = [k for k in sorted(all_keys) - if k not in fieldnames and k not in ['ga_data', 'gsc_data', 'normalized_url', 'slug']] - fieldnames.extend(extra_fields) - - with open(output_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore') - writer.writeheader() - - for post_id, post_info in sorted(posts.items()): - row = {'ID': post_id} - row.update(post_info) - # Clean up nested dicts - for key in ['ga_data', 'gsc_data']: - row.pop(key, None) - writer.writerow(row) - - self.log(f"✓ Exported {len(posts)} posts to {output_csv}") - except Exception as e: - self.log(f"❌ Error exporting CSV: {e}") - - def export_log(self, log_file): - """Export analysis log and unmatched URLs.""" - try: - with open(log_file, 'w', encoding='utf-8') as f: - f.write("SEO Analytics Import Report\n") - f.write("=" * 60 + "\n\n") - - f.write("Import Log:\n") - f.write("-" * 60 + "\n") - for log_msg in self.logs: - f.write(log_msg + "\n") - - f.write("\n" + "=" * 60 + "\n") - f.write(f"Unmatched URLs ({len(self.unmatched_urls)} total):\n") - f.write("-" * 60 + "\n") - - if self.unmatched_urls: - # Sort by impressions descending - for url_data in sorted(self.unmatched_urls, - key=lambda x: x['impressions'], - reverse=True): - f.write(f"\nURL: {url_data['url']}\n") - f.write(f" Impressions: {url_data['impressions']}\n") - f.write(f" Clicks: {url_data['clicks']}\n") - f.write(f" Avg Position: {url_data['avg_position']:.1f}\n") - else: - f.write("✓ All URLs matched successfully!\n") - - self.log(f"✓ Exported log to {log_file}") - except Exception as e: - self.log(f"❌ Error exporting log: {e}") - - def run(self, ga_csv, gsc_csv, posts_csv, output_csv): - """Run complete import workflow.""" - self.log("Starting analytics import...") - self.log(f"GA4 CSV: {ga_csv}") - self.log(f"GSC CSV: {gsc_csv}") - self.log(f"Posts CSV: {posts_csv}\n") - - # Load data - ga_data = self.load_ga4_data(ga_csv) - gsc_data = self.load_gsc_data(gsc_csv) - posts = self.load_posts_csv(posts_csv) - - if not posts: - self.log("❌ No posts found. Cannot proceed.") - return - - # Match and merge - posts = self.match_analytics_to_posts(posts, ga_data, gsc_data) - posts = self.enrich_posts_data(posts) - - # Export - self.export_enriched_csv(posts, output_csv) - - # Export log - log_dir = self.output_dir / 'logs' - log_dir.mkdir(exist_ok=True) - log_file = log_dir / 'import_log.txt' - self.export_log(log_file) - - self.log("\n✓ Analytics import complete!") - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='Import and merge analytics data') - parser.add_argument('--ga-export', type=Path, - default=Path('input/analytics/ga4_export.csv'), - help='GA4 export CSV path') - parser.add_argument('--gsc-export', type=Path, - default=Path('input/analytics/gsc/Pages.csv'), - help='Search Console export CSV path (Pages data)') - parser.add_argument('--posts-csv', type=Path, - default=Path('input/new-propositions.csv'), - help='Posts CSV path') - parser.add_argument('--output', type=Path, - default=Path('output/results/posts_with_analytics.csv'), - help='Output CSV path') - - args = parser.parse_args() - - importer = AnalyticsImporter() - importer.run(args.ga_export, args.gsc_export, args.posts_csv, args.output) - - -if __name__ == '__main__': - main() diff --git a/scripts/category_manager.py b/scripts/category_manager.py deleted file mode 100644 index 7a2f721..0000000 --- a/scripts/category_manager.py +++ /dev/null @@ -1,614 +0,0 @@ -#!/usr/bin/env python3 -""" -WordPress Category Management Script -Fetches all categories from WordPress sites, proposes new categories, -and allows assigning posts to categories or websites using AI recommendations. -""" - -import csv -import json -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional -import requests -from requests.auth import HTTPBasicAuth -import time -from datetime import datetime -from config import Config - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -class AICategoryAdvisor: - """AI-powered advisor for category and site recommendations.""" - - def __init__(self): - self.openrouter_api_key = Config.OPENROUTER_API_KEY - self.ai_model = Config.AI_MODEL - self.api_calls = 0 - self.ai_cost = 0.0 - - def get_ai_category_recommendations(self, posts_batch: List[Dict]) -> Optional[List[Dict]]: - """ - Get AI recommendations for category assignments. - - Args: - posts_batch: List of posts to analyze - - Returns: - List of recommendations for each post - """ - if not self.openrouter_api_key: - logger.error("OPENROUTER_API_KEY not set") - return None - - # Format posts for AI analysis - formatted_posts = [] - for i, post in enumerate(posts_batch, 1): - title = post.get('title', {}).get('rendered', 'Untitled') - content = post.get('content', {}).get('rendered', '')[:500] # First 500 chars - current_categories = post.get('categories', []) - - formatted_posts.append( - f"{i}. POST ID: {post['id']}\n" - f" Title: {title}\n" - f" Content Preview: {content}...\n" - f" Current Categories: {current_categories}\n" - ) - - posts_text = "\n".join(formatted_posts) - - prompt = f"""Analyze these blog posts and provide category recommendations. - -Website Strategy: -- mistergeek.net: High-value topics (VPN, Software, Gaming, General Tech, SEO, Content Marketing) -- webscroll.fr: Torrenting, File-Sharing, Tracker guides (niche audience) -- hellogeek.net: Low-traffic, experimental, off-brand, or niche content - -{posts_text} - -For EACH post, provide a JSON object with: -{{ - "post_id": , - "recommended_category": "", - "recommended_site": "", - "reason": "", - "confidence": "" -}} - -Return ONLY a JSON array. Example: -[ - {{"post_id": 2845, "recommended_category": "VPN", "recommended_site": "mistergeek.net", "reason": "Core VPN topic", "confidence": "High"}}, - {{"post_id": 1234, "recommended_category": "Torrenting", "recommended_site": "webscroll.fr", "reason": "Torrent tracker content", "confidence": "High"}} -] - -Analyze all posts and provide recommendations for EVERY post in the batch.""" - - try: - logger.info(f" Sending batch to AI for category recommendations...") - - response = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {self.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": self.ai_model, - "messages": [ - {"role": "user", "content": prompt} - ], - "temperature": 0.3, # Lower temp for more consistent recommendations - }, - timeout=60 - ) - response.raise_for_status() - - result = response.json() - self.api_calls += 1 - - # Track cost - usage = result.get('usage', {}) - input_tokens = usage.get('prompt_tokens', 0) - output_tokens = usage.get('completion_tokens', 0) - # Using Claude 3.5 Sonnet pricing: $3/$15 per 1M tokens - self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 - - recommendations_text = result['choices'][0]['message']['content'].strip() - logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})") - - # Parse the recommendations - return self._parse_recommendations(recommendations_text) - - except Exception as e: - logger.error(f"Error getting AI recommendations: {e}") - return None - - def _parse_recommendations(self, recommendations_json: str) -> List[Dict]: - """Parse JSON recommendations from AI.""" - try: - # Try to extract JSON from response - start_idx = recommendations_json.find('[') - end_idx = recommendations_json.rfind(']') + 1 - - if start_idx == -1 or end_idx == 0: - logger.error("Could not find JSON array in response") - return [] - - json_str = recommendations_json[start_idx:end_idx] - recommendations = json.loads(json_str) - - return recommendations - - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON recommendations: {e}") - logger.debug(f"Response was: {recommendations_json[:500]}") - return [] - - -class CategoryManager: - """Manage WordPress categories across multiple sites.""" - - def __init__(self): - """Initialize the category manager with sites from Config.""" - self.sites = Config.WORDPRESS_SITES - self.categories_by_site = {} - self.posts_by_site = {} - self.proposed_categories = {} - self.category_assignments = [] - self.ai_advisor = AICategoryAdvisor() - - def fetch_categories_from_site(self, site_name: str, site_config: Dict) -> List[Dict]: - """ - Fetch all categories from a WordPress site. - - Args: - site_name: Website name - site_config: Site configuration dict - - Returns: - List of categories with metadata - """ - logger.info(f"Fetching categories from {site_name}...") - - categories = [] - base_url = site_config['url'].rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/categories" - auth = HTTPBasicAuth(site_config['username'], site_config['password']) - - try: - # Fetch all categories (pagination if needed) - page = 1 - while True: - params = { - 'page': page, - 'per_page': 100, - } - - response = requests.get(api_url, params=params, auth=auth, timeout=10) - - if response.status_code == 401: - logger.error(f"Unauthorized access to {site_name}. Check credentials.") - break - elif response.status_code == 403: - logger.error(f"Forbidden access to {site_name}. Check permissions.") - break - - response.raise_for_status() - - page_categories = response.json() - if not page_categories: - break - - categories.extend(page_categories) - logger.info(f" Page {page}: Got {len(page_categories)} categories") - - # Check if there are more pages - link_header = response.headers.get('Link', '') - if 'rel="next"' not in link_header: - break - - page += 1 - time.sleep(0.5) - - logger.info(f"✓ Total categories from {site_name}: {len(categories)}") - - except requests.exceptions.RequestException as e: - logger.error(f"Error fetching categories from {site_name}: {e}") - return [] - - return categories - - def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]: - """ - Fetch posts from a WordPress site to see current category assignments. - - Args: - site_name: Website name - site_config: Site configuration dict - - Returns: - List of posts with category information - """ - logger.info(f"Fetching posts from {site_name} to analyze category assignments...") - - posts = [] - base_url = site_config['url'].rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/posts" - auth = HTTPBasicAuth(site_config['username'], site_config['password']) - - try: - page = 1 - while True: - params = { - 'page': page, - 'per_page': 100, - 'status': 'publish', - } - - response = requests.get(api_url, params=params, auth=auth, timeout=10) - - if response.status_code == 401: - logger.error(f"Unauthorized access to {site_name}. Check credentials.") - break - elif response.status_code == 403: - logger.error(f"Forbidden access to {site_name}. Check permissions.") - break - - response.raise_for_status() - - page_posts = response.json() - if not page_posts: - break - - posts.extend(page_posts) - logger.info(f" Page {page}: Got {len(page_posts)} posts") - - # Check if there are more pages - link_header = response.headers.get('Link', '') - if 'rel="next"' not in link_header: - break - - page += 1 - time.sleep(0.5) - - logger.info(f"✓ Total posts from {site_name}: {len(posts)}") - - except requests.exceptions.RequestException as e: - logger.error(f"Error fetching posts from {site_name}: {e}") - return [] - - return posts - - def analyze_categories(self): - """Analyze current categories and propose new ones.""" - logger.info("\n" + "="*70) - logger.info("ANALYZING CURRENT CATEGORIES") - logger.info("="*70) - - for site_name, config in self.sites.items(): - categories = self.fetch_categories_from_site(site_name, config) - posts = self.fetch_posts_from_site(site_name, config) - - self.categories_by_site[site_name] = categories - self.posts_by_site[site_name] = posts - - logger.info(f"\n{site_name}:") - logger.info(f" Categories: {len(categories)}") - logger.info(f" Posts: {len(posts)}") - - # Show top categories by post count - if categories: - logger.info(" Top 10 categories by post count:") - # Sort categories by count (most posts first) - sorted_cats = sorted(categories, key=lambda x: x.get('count', 0), reverse=True) - for i, cat in enumerate(sorted_cats[:10]): - logger.info(f" {i+1}. {cat['name']} ({cat['count']} posts)") - - def propose_new_categories(self): - """Propose new categories based on content analysis.""" - logger.info("\n" + "="*70) - logger.info("PROPOSING NEW CATEGORIES") - logger.info("="*70) - - # Define category proposals based on content analysis - category_proposals = { - 'mistergeek.net': [ - {'name': 'VPN Reviews', 'description': 'Reviews of VPN services', 'parent': 0}, - {'name': 'Software Tutorials', 'description': 'Step-by-step software guides', 'parent': 0}, - {'name': 'Tech News', 'description': 'Latest technology news', 'parent': 0}, - {'name': 'Cybersecurity', 'description': 'Security tips and tools', 'parent': 0}, - ], - 'webscroll.fr': [ - {'name': 'Torrent Clients', 'description': 'Reviews of torrent clients', 'parent': 0}, - {'name': 'Privacy Tools', 'description': 'Privacy-focused tools and services', 'parent': 0}, - {'name': 'File Sharing Guide', 'description': 'Guides on file sharing methods', 'parent': 0}, - ], - 'hellogeek.net': [ - {'name': 'Experimental Tech', 'description': 'New and experimental tech', 'parent': 0}, - {'name': 'Random Thoughts', 'description': 'Opinion and commentary posts', 'parent': 0}, - {'name': 'Testing Zone', 'description': 'Posts for testing purposes', 'parent': 0}, - ] - } - - for site_name in self.sites.keys(): - if site_name in category_proposals: - self.proposed_categories[site_name] = category_proposals[site_name] - logger.info(f"\n{site_name} - Proposed categories:") - for cat in category_proposals[site_name]: - logger.info(f" - {cat['name']}: {cat['description']}") - - def create_category_assignment_proposals(self): - """Create proposals for assigning posts to categories or websites.""" - logger.info("\n" + "="*70) - logger.info("CREATING CATEGORY ASSIGNMENT PROPOSALS") - logger.info("="*70) - - # Analyze posts and propose category assignments - for site_name, posts in self.posts_by_site.items(): - logger.info(f"\nAnalyzing posts from {site_name} for category assignments...") - - # Process posts in batches for AI analysis - batch_size = 10 - for i in range(0, len(posts), batch_size): - batch = posts[i:i + batch_size] - - # Get AI recommendations for this batch - ai_recommendations = self.ai_advisor.get_ai_category_recommendations(batch) - - if ai_recommendations: - # Map AI recommendations to our assignment format - for post in batch: - title = post.get('title', {}).get('rendered', 'Untitled') - content = post.get('content', {}).get('rendered', '')[:200] # First 200 chars - current_categories = post.get('categories', []) - - # Find the AI recommendation for this post - ai_rec = None - for rec in ai_recommendations: - if rec.get('post_id') == post['id']: - ai_rec = rec - break - - if ai_rec: - assignment = { - 'site': site_name, - 'post_id': post['id'], - 'post_title': title[:50] + "..." if len(title) > 50 else title, - 'current_categories': current_categories, - 'proposed_category': ai_rec.get('recommended_category', 'Uncategorized'), - 'proposed_site': ai_rec.get('recommended_site', site_name), - 'reason': ai_rec.get('reason', ''), - 'confidence': ai_rec.get('confidence', 'Low'), - 'content_preview': content[:100] + "..." if len(content) > 100 else content, - 'status': 'pending_approval' - } - else: - # Fallback to keyword-based suggestion if no AI recommendation - proposed_category = self._suggest_category_by_content(title + " " + content, site_name) - - assignment = { - 'site': site_name, - 'post_id': post['id'], - 'post_title': title[:50] + "..." if len(title) > 50 else title, - 'current_categories': current_categories, - 'proposed_category': proposed_category, - 'proposed_site': site_name, - 'reason': 'Keyword-based suggestion', - 'confidence': 'Low', - 'content_preview': content[:100] + "..." if len(content) > 100 else content, - 'status': 'pending_approval' - } - - self.category_assignments.append(assignment) - else: - # If AI is not available, use keyword-based suggestions - for post in batch: - title = post.get('title', {}).get('rendered', 'Untitled') - content = post.get('content', {}).get('rendered', '')[:200] # First 200 chars - current_categories = post.get('categories', []) - - proposed_category = self._suggest_category_by_content(title + " " + content, site_name) - - assignment = { - 'site': site_name, - 'post_id': post['id'], - 'post_title': title[:50] + "..." if len(title) > 50 else title, - 'current_categories': current_categories, - 'proposed_category': proposed_category, - 'proposed_site': site_name, - 'reason': 'Keyword-based suggestion', - 'confidence': 'Low', - 'content_preview': content[:100] + "..." if len(content) > 100 else content, - 'status': 'pending_approval' - } - - self.category_assignments.append(assignment) - - logger.info(f"Created {len(self.category_assignments)} category assignment proposals") - - def _suggest_category_by_content(self, content: str, site_name: str) -> str: - """Suggest a category based on content keywords.""" - content_lower = content.lower() - - # Site-specific category mappings - category_keywords = { - 'mistergeek.net': { - 'VPN': ['vpn', 'proxy', 'privacy', 'secure', 'encryption'], - 'Software': ['software', 'app', 'tool', 'download', 'install'], - 'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'], - 'Tech News': ['news', 'update', 'release', 'announced'], - 'Cybersecurity': ['security', 'malware', 'antivirus', 'hacking', 'breach'] - }, - 'webscroll.fr': { - 'Torrent': ['torrent', 'download', 'upload', 'client', 'tracker'], - 'Privacy': ['privacy', 'anonymous', 'tor', 'vpn'], - 'File Sharing': ['share', 'sharing', 'ddl', 'upload'] - }, - 'hellogeek.net': { - 'Opinion': ['think', 'believe', 'opinion', 'view', 'perspective'], - 'Tutorial': ['how to', 'guide', 'tutorial', 'steps', 'instructions'], - 'Review': ['review', 'rating', 'comparison', 'test'] - } - } - - site_categories = category_keywords.get(site_name, {}) - - for category, keywords in site_categories.items(): - for keyword in keywords: - if keyword in content_lower: - return category - - return 'Uncategorized' - - def export_categories_csv(self) -> str: - """Export current categories to CSV.""" - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - csv_file = output_dir / f'current_categories_{timestamp}.csv' - - fieldnames = ['site', 'category_id', 'name', 'slug', 'description', 'post_count', 'parent_id'] - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - - for site_name, categories in self.categories_by_site.items(): - for cat in categories: - writer.writerow({ - 'site': site_name, - 'category_id': cat.get('id', ''), - 'name': cat.get('name', ''), - 'slug': cat.get('slug', ''), - 'description': cat.get('description', ''), - 'post_count': cat.get('count', 0), - 'parent_id': cat.get('parent', 0) - }) - - logger.info(f"✓ Current categories exported to: {csv_file}") - return str(csv_file) - - def export_proposed_categories_csv(self) -> str: - """Export proposed new categories to CSV.""" - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - csv_file = output_dir / f'proposed_categories_{timestamp}.csv' - - fieldnames = ['site', 'proposed_category', 'description', 'parent_category', 'reason'] - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - - for site_name, categories in self.proposed_categories.items(): - for cat in categories: - writer.writerow({ - 'site': site_name, - 'proposed_category': cat.get('name', ''), - 'description': cat.get('description', ''), - 'parent_category': cat.get('parent', 0), - 'reason': 'Content analysis and organization improvement' - }) - - logger.info(f"✓ Proposed categories exported to: {csv_file}") - return str(csv_file) - - def export_category_assignments_csv(self) -> str: - """Export category assignment proposals to CSV.""" - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - csv_file = output_dir / f'category_assignments_{timestamp}.csv' - - fieldnames = ['site', 'post_id', 'post_title', 'current_categories', 'proposed_category', 'proposed_site', 'reason', 'confidence', 'content_preview', 'status'] - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - - for assignment in self.category_assignments: - writer.writerow(assignment) - - logger.info(f"✓ Category assignments exported to: {csv_file}") - return str(csv_file) - - def run(self): - """Run complete category management process.""" - logger.info("="*70) - logger.info("WORDPRESS CATEGORY MANAGEMENT") - logger.info("="*70) - logger.info("Sites configured: " + ", ".join(self.sites.keys())) - logger.info("") - - # Analyze current categories - self.analyze_categories() - - # Propose new categories - self.propose_new_categories() - - # Create category assignment proposals - self.create_category_assignment_proposals() - - # Export all data - logger.info("\n" + "="*70) - logger.info("EXPORTING RESULTS") - logger.info("="*70) - - categories_csv = self.export_categories_csv() - proposed_csv = self.export_proposed_categories_csv() - assignments_csv = self.export_category_assignments_csv() - - # Print summary - logger.info("\n" + "="*70) - logger.info("CATEGORY MANAGEMENT SUMMARY") - logger.info("="*70) - - total_categories = sum(len(cats) for cats in self.categories_by_site.values()) - logger.info(f"Total current categories: {total_categories}") - - total_proposed = sum(len(props) for props in self.proposed_categories.values()) - logger.info(f"Total proposed categories: {total_proposed}") - - logger.info(f"Category assignment proposals: {len(self.category_assignments)}") - - # AI Advisor stats - logger.info(f"AI API calls made: {self.ai_advisor.api_calls}") - logger.info(f"AI cost: ${self.ai_advisor.ai_cost:.4f}") - - logger.info(f"\n{'─'*70}") - logger.info("Exported files:") - logger.info(f" • Current categories: {categories_csv}") - logger.info(f" • Proposed categories: {proposed_csv}") - logger.info(f" • Category assignments: {assignments_csv}") - logger.info(f"{'─'*70}") - - logger.info(f"\n✓ Category management complete!") - logger.info(f"\nNext steps:") - logger.info(f" 1. Review proposed_categories.csv for new categories to add") - logger.info(f" 2. Review category_assignments.csv for posts that need re-categorization") - logger.info(f" 3. Manually approve or modify proposals before applying changes") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Manage WordPress categories across multiple sites' - ) - - args = parser.parse_args() - - manager = CategoryManager() - manager.run() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/config.py b/scripts/config.py deleted file mode 100644 index 8509a6c..0000000 --- a/scripts/config.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Configuration module for WordPress SEO automation. -Loads and validates environment variables and YAML configuration. -""" - -import os -import yaml -from dotenv import load_dotenv -from pathlib import Path - -# Load environment variables from .env file -load_dotenv() - -class Config: - """Configuration class for WordPress SEO automation.""" - - # Load configuration from YAML file - CONFIG_FILE = Path(__file__).parent.parent / 'config.yaml' - - if CONFIG_FILE.exists(): - with open(CONFIG_FILE, 'r', encoding='utf-8') as f: - YAML_CONFIG = yaml.safe_load(f) - else: - YAML_CONFIG = {} - - # WordPress Settings (Primary site) - WORDPRESS_URL = os.getenv('WORDPRESS_URL', YAML_CONFIG.get('primary_site', {}).get('url', '')).rstrip('/') - WORDPRESS_USERNAME = os.getenv('WORDPRESS_USERNAME', YAML_CONFIG.get('primary_site', {}).get('username', '')) - WORDPRESS_APP_PASSWORD = os.getenv('WORDPRESS_APP_PASSWORD', YAML_CONFIG.get('primary_site', {}).get('password', '')) - - # Multi-site WordPress Configuration - WORDPRESS_SITES = { - 'mistergeek.net': { - 'url': os.getenv('WORDPRESS_MISTERGEEK_URL', YAML_CONFIG.get('wordpress_sites', {}).get('mistergeek.net', {}).get('url', 'https://www.mistergeek.net')), - 'username': os.getenv('WORDPRESS_MISTERGEEK_USERNAME', os.getenv('WORDPRESS_USERNAME', YAML_CONFIG.get('wordpress_sites', {}).get('mistergeek.net', {}).get('username', ''))), - 'password': os.getenv('WORDPRESS_MISTERGEEK_PASSWORD', os.getenv('WORDPRESS_APP_PASSWORD', YAML_CONFIG.get('wordpress_sites', {}).get('mistergeek.net', {}).get('password', ''))), - }, - 'webscroll.fr': { - 'url': os.getenv('WORDPRESS_WEBSCROLL_URL', YAML_CONFIG.get('wordpress_sites', {}).get('webscroll.fr', {}).get('url', 'https://www.webscroll.fr')), - 'username': os.getenv('WORDPRESS_WEBSCROLL_USERNAME', os.getenv('WORDPRESS_USERNAME', YAML_CONFIG.get('wordpress_sites', {}).get('webscroll.fr', {}).get('username', ''))), - 'password': os.getenv('WORDPRESS_WEBSCROLL_PASSWORD', os.getenv('WORDPRESS_APP_PASSWORD', YAML_CONFIG.get('wordpress_sites', {}).get('webscroll.fr', {}).get('password', ''))), - }, - 'hellogeek.net': { - 'url': os.getenv('WORDPRESS_HELLOGEEK_URL', YAML_CONFIG.get('wordpress_sites', {}).get('hellogeek.net', {}).get('url', 'https://www.hellogeek.net')), - 'username': os.getenv('WORDPRESS_HELLOGEEK_USERNAME', os.getenv('WORDPRESS_USERNAME', YAML_CONFIG.get('wordpress_sites', {}).get('hellogeek.net', {}).get('username', ''))), - 'password': os.getenv('WORDPRESS_HELLOGEEK_PASSWORD', os.getenv('WORDPRESS_APP_PASSWORD', YAML_CONFIG.get('wordpress_sites', {}).get('hellogeek.net', {}).get('password', ''))), - } - } - - # OpenRouter API Settings - OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY', YAML_CONFIG.get('ai_model', {}).get('api_key', '')) - AI_MODEL = os.getenv('AI_MODEL', YAML_CONFIG.get('ai_model', {}).get('name', 'anthropic/claude-3.5-sonnet')) - - # Script Settings - BATCH_SIZE = int(os.getenv('BATCH_SIZE', str(YAML_CONFIG.get('script_settings', {}).get('batch_size', 100)))) - API_DELAY_SECONDS = float(os.getenv('API_DELAY_SECONDS', str(YAML_CONFIG.get('script_settings', {}).get('api_delay_seconds', 0.5)))) - - # Analysis Settings - ANALYSIS_MIN_POSITION = int(os.getenv('ANALYSIS_MIN_POSITION', str(YAML_CONFIG.get('analysis_settings', {}).get('min_position', 11)))) - ANALYSIS_MAX_POSITION = int(os.getenv('ANALYSIS_MAX_POSITION', str(YAML_CONFIG.get('analysis_settings', {}).get('max_position', 30)))) - ANALYSIS_MIN_IMPRESSIONS = int(os.getenv('ANALYSIS_MIN_IMPRESSIONS', str(YAML_CONFIG.get('analysis_settings', {}).get('min_impressions', 50)))) - ANALYSIS_TOP_N_POSTS = int(os.getenv('ANALYSIS_TOP_N_POSTS', str(YAML_CONFIG.get('analysis_settings', {}).get('top_n_posts', 20)))) - - # Output directory - OUTPUT_DIR = Path(os.getenv('OUTPUT_DIR', YAML_CONFIG.get('output_settings', {}).get('output_dir', './output'))) - - @classmethod - def validate(cls): - """Validate that all required configuration is present.""" - errors = [] - - if not cls.WORDPRESS_URL: - errors.append("WORDPRESS_URL is required") - - if not cls.WORDPRESS_USERNAME: - errors.append("WORDPRESS_USERNAME is required") - - if not cls.WORDPRESS_APP_PASSWORD: - errors.append("WORDPRESS_APP_PASSWORD is required") - - if not cls.OPENROUTER_API_KEY: - errors.append("OPENROUTER_API_KEY is required (get one from https://openrouter.ai/)") - - if errors: - raise ValueError("Configuration errors:\n" + "\n".join(f" - {e}" for e in errors)) - - # Create output directory if it doesn't exist - cls.OUTPUT_DIR.mkdir(exist_ok=True) - - return True - - @classmethod - def get_wordpress_auth(cls): - """Get WordPress authentication tuple.""" - return (cls.WORDPRESS_USERNAME, cls.WORDPRESS_APP_PASSWORD) - - @classmethod - def get_api_base_url(cls): - """Get WordPress REST API base URL.""" - return f"{cls.WORDPRESS_URL}/wp-json/wp/v2" - - @classmethod - def get_site_config(cls, site_name): - """Get configuration for a specific site.""" - return cls.WORDPRESS_SITES.get(site_name, {}) - - @classmethod - def get_all_sites(cls): - """Get all configured WordPress sites.""" - return cls.WORDPRESS_SITES.keys() diff --git a/scripts/content_gap_analyzer.py b/scripts/content_gap_analyzer.py deleted file mode 100644 index bfe7634..0000000 --- a/scripts/content_gap_analyzer.py +++ /dev/null @@ -1,348 +0,0 @@ -""" -Content gap analyzer for SEO strategy. -Identifies missing topics and content opportunities using AI analysis. -""" - -import csv -import json -import argparse -import time -from pathlib import Path -from collections import defaultdict -from openai import OpenAI -from config import Config - - -class ContentGapAnalyzer: - """Identify content gaps and opportunities.""" - - def __init__(self): - """Initialize analyzer.""" - self.config = Config - self.output_dir = self.config.OUTPUT_DIR - self.logs = [] - self.client = None - - if self.config.OPENROUTER_API_KEY: - self.client = OpenAI( - base_url="https://openrouter.ai/api/v1", - api_key=self.config.OPENROUTER_API_KEY, - ) - - def log(self, message): - """Add message to log.""" - self.logs.append(message) - print(message) - - def load_posts(self, posts_csv): - """Load post titles and data.""" - posts = [] - if not posts_csv.exists(): - self.log(f"❌ File not found: {posts_csv}") - return posts - - try: - with open(posts_csv, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - posts.append({ - 'id': row.get('ID', ''), - 'title': row.get('Title', ''), - 'url': row.get('URL', ''), - 'traffic': int(row.get('traffic', 0) or 0), - 'impressions': int(row.get('impressions', 0) or 0), - 'top_keywords': row.get('top_keywords', '') - }) - - self.log(f"✓ Loaded {len(posts)} posts") - except Exception as e: - self.log(f"❌ Error reading posts: {e}") - - return posts - - def load_gsc_data(self, gsc_csv): - """Load Search Console queries for gap analysis.""" - queries = [] - if not gsc_csv.exists(): - self.log(f"⚠️ GSC file not found: {gsc_csv}") - return queries - - try: - with open(gsc_csv, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - try: - query = row.get('Query', '').strip() - if not query: - continue - - impressions = int(row.get('Impressions', 0) or 0) - clicks = int(row.get('Clicks', 0) or 0) - - # Only include queries with impressions but low clicks - if impressions > 0 and (clicks / impressions < 0.05): - queries.append({ - 'query': query, - 'impressions': impressions, - 'clicks': clicks, - 'ctr': clicks / impressions if impressions > 0 else 0 - }) - except (ValueError, TypeError): - continue - - self.log(f"✓ Loaded {len(queries)} underperforming queries") - except Exception as e: - self.log(f"⚠️ Error reading GSC file: {e}") - - return queries - - def extract_topics(self, posts): - """Extract topic clusters from post titles using AI.""" - if not self.client or len(posts) == 0: - self.log("⚠️ Cannot extract topics without AI client or posts") - return {} - - try: - self.log("🤖 Extracting topic clusters from post titles...") - - # Batch posts into groups - titles = [p['title'] for p in posts][:100] # Limit to first 100 - - prompt = f"""Analyze these {len(titles)} blog post titles and identify topic clusters: - -Titles: -{chr(10).join(f'{i+1}. {t}' for i, t in enumerate(titles))} - -Extract for each post: -1. Primary topic category -2. Subtopics covered -3. Content type (guide, tutorial, review, comparison, etc.) - -Then identify: -1. Top 10 topic clusters with post counts -2. Most common subtopics -3. Over/under-represented topics - -Return JSON: -{{ - "post_topics": {{ - "1": {{"primary": "...", "subtopics": ["..."], "type": "..."}}, - ... - }}, - "topic_clusters": [ - {{"cluster": "...", "post_count": 0, "importance": "high/medium/low"}} - ], - "coverage_gaps": ["topic 1", "topic 2", ...], - "niche": "detected niche or industry" -}}""" - - response = self.client.chat.completions.create( - model=self.config.AI_MODEL, - messages=[{"role": "user", "content": prompt}], - temperature=0.7, - max_tokens=1500 - ) - - try: - result_text = response.choices[0].message.content - start_idx = result_text.find('{') - end_idx = result_text.rfind('}') + 1 - if start_idx >= 0 and end_idx > start_idx: - return json.loads(result_text[start_idx:end_idx]) - except json.JSONDecodeError: - self.log("⚠️ Could not parse topic extraction response") - return {} - - except Exception as e: - self.log(f"⚠️ Topic extraction failed: {e}") - return {} - - def identify_content_gaps(self, topic_analysis, queries): - """Use AI to identify content gaps and suggest new topics.""" - if not self.client: - return [] - - try: - self.log("🤖 Identifying content gaps and opportunities...") - - clusters = topic_analysis.get('topic_clusters', []) - gaps = topic_analysis.get('coverage_gaps', []) - niche = topic_analysis.get('niche', 'general') - - # Prepare query analysis - top_queries = sorted(queries, key=lambda x: x['impressions'], reverse=True)[:20] - queries_str = '\n'.join([f"- {q['query']} ({q['impressions']} impr, {q['ctr']:.1%} CTR)" - for q in top_queries]) - - prompt = f"""Based on content analysis and search demand, identify content gaps: - -Existing Topics: {', '.join([c.get('cluster', '') for c in clusters[:10]])} -Coverage Gaps: {', '.join(gaps[:5])} -Niche: {niche} - -Top Underperforming Queries (low CTR despite impressions): -{queries_str} - -Identify high-value missing topics that could: -1. Fill coverage gaps -2. Target underperforming queries (CTR improvement) -3. Capitalize on search demand -4. Complement existing content - -For each suggestion: -- Topic title -- Why it's valuable (search demand + intent) -- Search volume estimate (high/medium/low) -- How it complements existing content -- Recommended content format -- Estimated traffic potential - -Prioritize by traffic opportunity. Max 20 ideas. - -Return JSON: -{{ - "content_opportunities": [ - {{ - "title": "...", - "why_valuable": "...", - "search_volume": "high/medium/low", - "complements": "existing topic", - "format": "guide/tutorial/comparison/review/list", - "traffic_potential": number, - "priority": "high/medium/low" - }} - ] -}}""" - - response = self.client.chat.completions.create( - model=self.config.AI_MODEL, - messages=[{"role": "user", "content": prompt}], - temperature=0.7, - max_tokens=2000 - ) - - try: - result_text = response.choices[0].message.content - start_idx = result_text.find('{') - end_idx = result_text.rfind('}') + 1 - if start_idx >= 0 and end_idx > start_idx: - result = json.loads(result_text[start_idx:end_idx]) - return result.get('content_opportunities', []) - except json.JSONDecodeError: - self.log("⚠️ Could not parse gap analysis response") - return [] - - except Exception as e: - self.log(f"⚠️ Gap analysis failed: {e}") - return [] - - def export_gaps_csv(self, gaps, output_csv): - """Export content gaps to CSV.""" - if not gaps: - self.log("⚠️ No gaps to export") - return - - try: - fieldnames = [ - 'priority', 'title', 'why_valuable', 'search_volume', - 'complements', 'format', 'traffic_potential' - ] - - with open(output_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore') - writer.writeheader() - - for gap in sorted(gaps, key=lambda x: x.get('priority') == 'high', reverse=True): - writer.writerow(gap) - - self.log(f"✓ Exported {len(gaps)} content gaps to {output_csv}") - except Exception as e: - self.log(f"❌ Error exporting CSV: {e}") - - def export_topic_clusters_json(self, topic_analysis, output_json): - """Export topic analysis to JSON.""" - if not topic_analysis: - return - - try: - with open(output_json, 'w', encoding='utf-8') as f: - json.dump(topic_analysis, f, indent=2) - - self.log(f"✓ Exported topic analysis to {output_json}") - except Exception as e: - self.log(f"❌ Error exporting JSON: {e}") - - def export_log(self, log_file): - """Export analysis log.""" - try: - with open(log_file, 'w', encoding='utf-8') as f: - f.write("Content Gap Analysis Report\n") - f.write("=" * 60 + "\n\n") - - for msg in self.logs: - f.write(msg + "\n") - - self.log(f"✓ Exported log to {log_file}") - except Exception as e: - self.log(f"❌ Error exporting log: {e}") - - def run(self, posts_csv, gsc_csv, output_csv): - """Run complete analysis workflow.""" - self.log("📊 Starting content gap analysis...") - self.log(f"Posts: {posts_csv}") - self.log(f"GSC queries: {gsc_csv}\n") - - # Load data - posts = self.load_posts(posts_csv) - queries = self.load_gsc_data(gsc_csv) - - if not posts: - return - - # Extract topics - topic_analysis = self.extract_topics(posts) - if topic_analysis: - self.log(f"✓ Identified {len(topic_analysis.get('topic_clusters', []))} topic clusters") - - # Identify gaps - gaps = self.identify_content_gaps(topic_analysis, queries) - if gaps: - self.log(f"✓ Identified {len(gaps)} content opportunities") - - # Export - self.log("\n📁 Exporting results...") - self.export_gaps_csv(gaps, output_csv) - - topic_json = self.output_dir / 'topic_clusters.json' - self.export_topic_clusters_json(topic_analysis, topic_json) - - # Export log - log_dir = self.output_dir / 'logs' - log_dir.mkdir(exist_ok=True) - log_file = log_dir / 'content_gap_analysis_log.txt' - self.export_log(log_file) - - self.log("\n✓ Content gap analysis complete!") - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='Analyze content gaps') - parser.add_argument('--posts-csv', type=Path, - default=Path('output/results/posts_with_analytics.csv'), - help='Posts CSV') - parser.add_argument('--gsc-queries', type=Path, - default=Path('input/analytics/gsc/Requêtes.csv'), - help='GSC queries CSV') - parser.add_argument('--output', type=Path, - default=Path('output/results/content_gaps.csv'), - help='Output gaps CSV') - - args = parser.parse_args() - - analyzer = ContentGapAnalyzer() - analyzer.run(args.posts_csv, args.gsc_queries, args.output) - - -if __name__ == '__main__': - main() diff --git a/scripts/content_strategy_analyzer.py b/scripts/content_strategy_analyzer.py deleted file mode 100644 index 35da66e..0000000 --- a/scripts/content_strategy_analyzer.py +++ /dev/null @@ -1,466 +0,0 @@ -""" -Multi-Site Content Strategy Analyzer -Analyzes all content (published + drafts) across 3 websites. -Recommends optimal distribution and consolidation strategy. -""" - -import csv -import json -import argparse -from pathlib import Path -from collections import defaultdict -from datetime import datetime - - -class ContentStrategyAnalyzer: - """Analyze and optimize content distribution across multiple sites.""" - - def __init__(self): - """Initialize analyzer.""" - self.output_dir = Path('output') - self.output_dir.mkdir(exist_ok=True) - (self.output_dir / 'analysis').mkdir(exist_ok=True) - (self.output_dir / 'reports').mkdir(exist_ok=True) - (self.output_dir / 'logs').mkdir(exist_ok=True) - - self.logs = [] - - def log(self, message): - """Log message.""" - self.logs.append(message) - print(message) - - def load_wordpress_posts(self, csv_path): - """Load published WordPress posts.""" - posts = {} - if not csv_path.exists(): - self.log(f"⚠️ WordPress posts file not found: {csv_path}") - return posts - - try: - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - post_id = row.get('ID') or row.get('post_id') - if not post_id: - continue - - posts[post_id] = { - 'source': 'wordpress', - 'status': 'published', - 'title': row.get('Title') or row.get('title') or row.get('post_title') or '', - 'url': row.get('URL') or row.get('url') or row.get('post_url') or '', - 'author': row.get('Author') or row.get('author') or 'Unknown', - 'traffic': int(row.get('traffic', 0) or 0), - 'impressions': int(row.get('impressions', 0) or 0), - 'position': float(row.get('avg_position', 0) or 0), - 'category': row.get('Category') or row.get('category') or '', - } - - self.log(f"✓ Loaded {len(posts)} published WordPress posts") - except Exception as e: - self.log(f"❌ Error reading WordPress posts: {e}") - - return posts - - def load_draft_posts(self, csv_path): - """Load draft/unpublished posts.""" - posts = {} - if not csv_path.exists(): - self.log(f"⚠️ Draft posts file not found: {csv_path}") - return posts - - try: - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - post_id = row.get('ID') or row.get('post_id') - if not post_id: - continue - - posts[post_id] = { - 'source': 'draft', - 'status': 'draft', - 'title': row.get('Title') or row.get('title') or row.get('post_title') or '', - 'url': row.get('URL') or row.get('url') or row.get('post_url') or '', - 'author': row.get('Author') or row.get('author') or 'Unknown', - 'traffic': 0, # Drafts have no traffic - 'impressions': 0, - 'position': 0, - 'category': row.get('Category') or row.get('category') or '', - } - - self.log(f"✓ Loaded {len(posts)} draft posts") - except Exception as e: - self.log(f"❌ Error reading draft posts: {e}") - - return posts - - def classify_post_topic(self, post): - """Classify post into topic area.""" - title = post['title'].lower() - category = post['category'].lower() - content = f"{title} {category}" - - # Topic classification based on keywords - topic_keywords = { - 'torrent': ['torrent', 'ygg', 'ratio', 'tracker', 'magnet', 'seedbox', 'upload'], - 'streaming': ['stream', 'film', 'série', 'netflix', 'disney', 'platforma'], - 'vpn': ['vpn', 'proxy', 'anonyme', 'privacy', 'chiffr'], - 'software': ['software', 'tool', 'app', 'logiciel', 'outil', 'program'], - 'gaming': ['game', 'jeu', 'gaming', 'emula', 'console', 'retro'], - 'download': ['download', 'télécharge', 'ddl', 'upload'], - 'tech': ['tech', 'informatique', 'code', 'programming', 'developer'], - 'other': [], - } - - for topic, keywords in topic_keywords.items(): - if topic == 'other': - continue - for keyword in keywords: - if keyword in content: - return topic - - return 'other' - - def classify_website(self, post): - """Determine which website this post should be on.""" - topic = self.classify_post_topic(post) - author = post.get('author', '').strip() - is_sponsored = author == 'Expert' - - # Website assignment rules - if topic == 'torrent' or topic == 'download': - return { - 'site': 'webscroll.fr', - 'reason': f'Torrent/file-sharing content', - 'priority': 'HIGH' if post['traffic'] > 100 else 'MEDIUM' - } - - if topic in ['vpn', 'software', 'gaming', 'tech']: - return { - 'site': 'mistergeek.net', - 'reason': f'{topic.capitalize()} - core content', - 'priority': 'HIGH' if post['traffic'] > 50 else 'MEDIUM' - } - - if topic == 'streaming' and post['traffic'] < 100: - return { - 'site': 'hellogeek.net', - 'reason': 'Low-traffic streaming content', - 'priority': 'LOW' - } - - if topic == 'other' or post['traffic'] < 10: - return { - 'site': 'hellogeek.net', - 'reason': 'Off-brand or low-traffic content', - 'priority': 'LOW' - } - - # Default to main site - return { - 'site': 'mistergeek.net', - 'reason': 'Core content', - 'priority': 'MEDIUM' - } - - def classify_content_action(self, post): - """Determine what action to take with this post.""" - topic = self.classify_post_topic(post) - traffic = post.get('traffic', 0) - impressions = post.get('impressions', 0) - position = post.get('position', 0) - status = post.get('status', 'published') - - # Determine action - if status == 'draft': - if traffic == 0: - return 'REVIEW_PUBLISH_OR_DELETE' # Unpublished draft - else: - return 'REPUBLISH' # Was published, now draft - - if traffic < 5 and impressions < 20: - return 'DELETE_OR_CONSOLIDATE' - - if traffic > 0 and position > 0 and position < 11: - return 'KEEP_OPTIMIZE' - - if position > 11 and position < 30: - return 'KEEP_OPTIMIZE' - - if position > 30 or traffic < 10: - return 'MOVE_TO_OTHER_SITE' - - return 'KEEP_MONITOR' - - def analyze_all_content(self, posts): - """Analyze and classify all posts.""" - analysis = { - 'total_posts': len(posts), - 'by_site': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}), - 'by_topic': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}), - 'by_action': defaultdict(lambda: {'count': 0, 'traffic': 0, 'posts': []}), - 'sponsored_posts': {'count': 0, 'traffic': 0, 'posts': []}, - 'draft_posts': {'count': 0, 'posts': []}, - } - - for post_id, post in posts.items(): - topic = self.classify_post_topic(post) - site_assignment = self.classify_website(post) - action = self.classify_content_action(post) - is_sponsored = post.get('author', '').strip() == 'Expert' - is_draft = post.get('status') == 'draft' - - # Record in analysis - analysis['by_site'][site_assignment['site']]['count'] += 1 - analysis['by_site'][site_assignment['site']]['traffic'] += post['traffic'] - analysis['by_site'][site_assignment['site']]['posts'].append({ - 'id': post_id, - 'title': post['title'], - 'traffic': post['traffic'], - 'reason': site_assignment['reason'] - }) - - analysis['by_topic'][topic]['count'] += 1 - analysis['by_topic'][topic]['traffic'] += post['traffic'] - - analysis['by_action'][action]['count'] += 1 - analysis['by_action'][action]['traffic'] += post['traffic'] - - if is_sponsored: - analysis['sponsored_posts']['count'] += 1 - analysis['sponsored_posts']['traffic'] += post['traffic'] - analysis['sponsored_posts']['posts'].append({ - 'id': post_id, - 'title': post['title'], - 'traffic': post['traffic'] - }) - - if is_draft: - analysis['draft_posts']['count'] += 1 - analysis['draft_posts']['posts'].append({ - 'id': post_id, - 'title': post['title'], - 'status': 'draft' - }) - - return analysis - - def generate_content_distribution_csv(self, posts, output_path): - """Export detailed content distribution plan.""" - try: - fieldnames = [ - 'post_id', 'title', 'topic', 'status', 'author', - 'traffic', 'impressions', 'position', - 'recommended_site', 'reason', 'action', - 'priority', 'notes' - ] - - rows = [] - for post_id, post in posts.items(): - topic = self.classify_post_topic(post) - site_assignment = self.classify_website(post) - action = self.classify_content_action(post) - author = post.get('author', '').strip() - is_sponsored = author == 'Expert' - - rows.append({ - 'post_id': post_id, - 'title': post['title'][:80], - 'topic': topic, - 'status': post.get('status', 'published'), - 'author': author, - 'traffic': post.get('traffic', 0), - 'impressions': post.get('impressions', 0), - 'position': post.get('position', 0), - 'recommended_site': site_assignment['site'], - 'reason': site_assignment['reason'], - 'action': action, - 'priority': site_assignment['priority'], - 'notes': 'SPONSORED' if is_sponsored else '' - }) - - rows.sort(key=lambda x: x['traffic'], reverse=True) - - with open(output_path, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - - self.log(f"✓ Exported {len(rows)} posts to {output_path}") - except Exception as e: - self.log(f"❌ Error exporting CSV: {e}") - - def generate_strategy_report(self, analysis, output_path): - """Generate comprehensive strategy report.""" - try: - report = [] - report.append("# Multi-Site Content Strategy Report\n") - report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*\n\n") - - # Executive Summary - report.append("## Executive Summary\n\n") - report.append(f"**Total Content Analyzed:** {analysis['total_posts']} posts\n") - report.append(f"- Published: {analysis['total_posts'] - analysis['draft_posts']['count']}\n") - report.append(f"- Drafts: {analysis['draft_posts']['count']}\n") - report.append(f"- Sponsored: {analysis['sponsored_posts']['count']}\n\n") - - # Distribution Strategy - report.append("## Recommended Site Distribution\n\n") - for site, data in sorted(analysis['by_site'].items(), - key=lambda x: x[1]['traffic'], reverse=True): - report.append(f"### {site}\n") - report.append(f"- Posts: {data['count']}\n") - report.append(f"- Total Traffic: {data['traffic']:,} visits/month\n") - report.append(f"- Top Posts:\n") - for post in sorted(data['posts'], key=lambda x: x['traffic'], reverse=True)[:5]: - report.append(f" - {post['title'][:60]} ({post['traffic']} visits)\n") - report.append(f"\n") - - # Topic Distribution - report.append("## Content by Topic\n\n") - for topic, data in sorted(analysis['by_topic'].items(), - key=lambda x: x[1]['traffic'], reverse=True): - report.append(f"- **{topic.title()}:** {data['count']} posts ({data['traffic']:,} visits)\n") - report.append("\n") - - # Actions Required - report.append("## Required Actions\n\n") - for action, data in sorted(analysis['by_action'].items(), - key=lambda x: x[1]['count'], reverse=True): - report.append(f"- **{action}:** {data['count']} posts ({data['traffic']:,} visits)\n") - report.append("\n") - - # Sponsored Content - if analysis['sponsored_posts']['count'] > 0: - report.append("## Sponsored Content (by 'Expert')\n\n") - report.append(f"Total: {analysis['sponsored_posts']['count']} posts\n") - report.append(f"Traffic: {analysis['sponsored_posts']['traffic']:,} visits/month\n\n") - for post in sorted(analysis['sponsored_posts']['posts'], - key=lambda x: x['traffic'], reverse=True)[:10]: - report.append(f"- {post['title'][:70]} ({post['traffic']} visits)\n") - report.append("\n") - - # Draft Posts - if analysis['draft_posts']['count'] > 0: - report.append("## Draft Posts (Unpublished)\n\n") - report.append(f"Total: {analysis['draft_posts']['count']} posts\n") - report.append("*Decision needed: Publish, delete, or move to other site?*\n\n") - for post in analysis['draft_posts']['posts'][:15]: - report.append(f"- {post['title'][:70]}\n") - report.append("\n") - - # Recommendations - report.append("## Strategic Recommendations\n\n") - report.append("1. **Consolidate on mistergeek.net:**\n") - report.append(" - Keep only VPN, software, gaming, tech content\n") - report.append(" - Focus on high-traffic posts (>50 visits/month)\n\n") - - report.append("2. **Move to webscroll.fr:**\n") - report.append(" - All torrent/file-sharing content\n") - report.append(" - File-specific guides\n\n") - - report.append("3. **Move to hellogeek.net:**\n") - report.append(" - Low-traffic content (<50 visits)\n") - report.append(" - Off-brand content\n") - report.append(" - Experimental/niche posts\n\n") - - report.append("4. **Delete:**\n") - report.append(f" - Posts with <5 visits and <20 impressions\n") - report.append(" - Duplicates/thin content\n\n") - - with open(output_path, 'w', encoding='utf-8') as f: - f.write(''.join(report)) - - self.log(f"✓ Generated strategy report: {output_path}") - except Exception as e: - self.log(f"❌ Error generating report: {e}") - - def run(self, wordpress_csv, drafts_csv): - """Run complete content strategy analysis.""" - self.log("\n" + "="*70) - self.log("Multi-Site Content Strategy Analyzer") - self.log("="*70 + "\n") - - # Load posts - self.log("📚 Loading content...\n") - wordpress_posts = self.load_wordpress_posts(wordpress_csv) - draft_posts = self.load_draft_posts(drafts_csv) - - # Combine all posts - all_posts = {**wordpress_posts, **draft_posts} - self.log(f"Total posts: {len(all_posts)}\n") - - # Analyze - self.log("🔍 Analyzing content distribution...\n") - analysis = self.analyze_all_content(all_posts) - - # Generate outputs - self.log("📊 Generating outputs...\n") - - output_csv = self.output_dir / 'analysis' / 'content_distribution.csv' - self.generate_content_distribution_csv(all_posts, output_csv) - - output_md = self.output_dir / 'reports' / 'content_strategy_report.md' - self.generate_strategy_report(analysis, output_md) - - # Export analysis JSON - analysis_json = self.output_dir / 'analysis' / 'analysis_summary.json' - try: - with open(analysis_json, 'w', encoding='utf-8') as f: - # Convert defaultdict to regular dict for JSON serialization - analysis_clean = { - 'total_posts': analysis['total_posts'], - 'by_site': dict(analysis['by_site']), - 'by_topic': {k: {'count': v['count'], 'traffic': v['traffic']} - for k, v in analysis['by_topic'].items()}, - 'by_action': {k: {'count': v['count'], 'traffic': v['traffic']} - for k, v in analysis['by_action'].items()}, - 'sponsored_posts': { - 'count': analysis['sponsored_posts']['count'], - 'traffic': analysis['sponsored_posts']['traffic'] - }, - 'draft_posts': { - 'count': analysis['draft_posts']['count'] - } - } - json.dump(analysis_clean, f, indent=2, ensure_ascii=False) - self.log(f"✓ Exported analysis JSON: {analysis_json}\n") - except Exception as e: - self.log(f"❌ Error exporting JSON: {e}\n") - - # Summary - self.log("\n" + "="*70) - self.log("ANALYSIS COMPLETE") - self.log("="*70) - self.log(f"\nOutputs:") - self.log(f" Distribution: {output_csv}") - self.log(f" Strategy: {output_md}") - self.log(f" Summary: {analysis_json}\n") - - self.log("Next steps:") - self.log(" 1. Review content_strategy_report.md") - self.log(" 2. Review content_distribution.csv") - self.log(" 3. Decide: which posts go to which site?") - self.log(" 4. Plan content consolidation") - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='Analyze content across multiple sites') - parser.add_argument('--wordpress-csv', type=Path, - default=Path('input/wordpress/new-propositions.csv'), - help='WordPress posts CSV') - parser.add_argument('--drafts-csv', type=Path, - default=Path('input/drafts/drafts.csv'), - help='Draft posts CSV') - - args = parser.parse_args() - - analyzer = ContentStrategyAnalyzer() - analyzer.run(args.wordpress_csv, args.drafts_csv) - - -if __name__ == '__main__': - main() diff --git a/scripts/enhanced_analyzer.py b/scripts/enhanced_analyzer.py deleted file mode 100644 index cf7e3ba..0000000 --- a/scripts/enhanced_analyzer.py +++ /dev/null @@ -1,375 +0,0 @@ -#!/usr/bin/env python3 -""" -Enhanced AI Analyzer - Selective analysis with in-place updates -Analyzes posts and updates CSV with AI recommendations for: -- Title optimization -- Meta description optimization -- Category suggestions -- Site placement recommendations -""" - -import csv -import json -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional, Tuple -import requests -from datetime import datetime -from config import Config - -logger = logging.getLogger(__name__) - - -class EnhancedPostAnalyzer: - """Enhanced analyzer with selective column analysis and in-place updates.""" - - def __init__(self, csv_file: str, analyze_fields: Optional[List[str]] = None): - """ - Initialize analyzer. - - Args: - csv_file: Path to input CSV - analyze_fields: List of fields to analyze ['title', 'meta_description', 'categories', 'site'] - If None, analyzes all fields - """ - self.csv_file = Path(csv_file) - self.openrouter_api_key = Config.OPENROUTER_API_KEY - self.ai_model = Config.AI_MODEL - self.posts = [] - self.analyzed_posts = [] - self.api_calls = 0 - self.ai_cost = 0.0 - - # Default: analyze all fields - if analyze_fields is None: - self.analyze_fields = ['title', 'meta_description', 'categories', 'site'] - else: - self.analyze_fields = analyze_fields - - logger.info(f"Fields to analyze: {', '.join(self.analyze_fields)}") - - def load_csv(self) -> bool: - """Load posts from CSV file.""" - logger.info(f"Loading CSV: {self.csv_file}") - - if not self.csv_file.exists(): - logger.error(f"CSV file not found: {self.csv_file}") - return False - - try: - with open(self.csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - self.posts = list(reader) - - logger.info(f"✓ Loaded {len(self.posts)} posts from CSV") - return True - - except Exception as e: - logger.error(f"Error loading CSV: {e}") - return False - - def get_ai_recommendations(self, batch: List[Dict], fields: List[str]) -> Optional[str]: - """Get AI recommendations for specific fields.""" - if not self.openrouter_api_key: - logger.error("OPENROUTER_API_KEY not set") - return None - - # Format posts for AI - formatted_posts = [] - for i, post in enumerate(batch, 1): - post_text = f"{i}. POST ID: {post['post_id']}\n" - post_text += f" Site: {post.get('site', '')}\n" - - if 'title' in fields: - post_text += f" Title: {post.get('title', '')}\n" - - if 'meta_description' in fields: - post_text += f" Meta Description: {post.get('meta_description', '')}\n" - - if 'categories' in fields: - post_text += f" Categories: {post.get('categories', '')}\n" - - if 'content_preview' in post: - post_text += f" Content Preview: {post.get('content_preview', '')[:300]}...\n" - - formatted_posts.append(post_text) - - posts_text = "\n".join(formatted_posts) - - # Build prompt based on requested fields - prompt_parts = ["Analyze these blog posts and provide recommendations.\n\n"] - - if 'site' in fields: - prompt_parts.append("""Website Strategy: -- mistergeek.net: High-value topics (VPN, Software, Gaming, General Tech, SEO, Content Marketing) -- webscroll.fr: Torrenting, File-Sharing, Tracker guides -- hellogeek.net: Low-traffic, experimental, off-brand content - -""") - - prompt_parts.append(posts_text) - prompt_parts.append("\nFor EACH post, provide a JSON object with:\n{\n") - - if 'title' in fields: - prompt_parts.append(' "proposed_title": "",\n') - prompt_parts.append(' "title_reason": "",\n') - - if 'meta_description' in fields: - prompt_parts.append(' "proposed_meta_description": "",\n') - prompt_parts.append(' "meta_reason": "",\n') - - if 'categories' in fields: - prompt_parts.append(' "proposed_category": "",\n') - prompt_parts.append(' "category_reason": "",\n') - - if 'site' in fields: - prompt_parts.append(' "proposed_site": "",\n') - prompt_parts.append(' "site_reason": "",\n') - - prompt_parts.append(' "confidence": "",\n') - prompt_parts.append(' "priority": ""\n}') - - prompt_parts.append("\nReturn ONLY a JSON array of objects, one per post.") - - prompt = "".join(prompt_parts) - - try: - logger.info(f" Sending batch to AI for analysis...") - - response = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {self.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": self.ai_model, - "messages": [{"role": "user", "content": prompt}], - "temperature": 0.3, - }, - timeout=60 - ) - response.raise_for_status() - - result = response.json() - self.api_calls += 1 - - # Track cost - usage = result.get('usage', {}) - input_tokens = usage.get('prompt_tokens', 0) - output_tokens = usage.get('completion_tokens', 0) - self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 - - recommendations_text = result['choices'][0]['message']['content'].strip() - logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})") - - return recommendations_text - - except Exception as e: - logger.error(f"Error getting AI recommendations: {e}") - return None - - def parse_recommendations(self, recommendations_json: str) -> List[Dict]: - """Parse JSON recommendations from AI.""" - try: - start_idx = recommendations_json.find('[') - end_idx = recommendations_json.rfind(']') + 1 - - if start_idx == -1 or end_idx == 0: - logger.error("Could not find JSON array in response") - return [] - - json_str = recommendations_json[start_idx:end_idx] - recommendations = json.loads(json_str) - - return recommendations - - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON recommendations: {e}") - return [] - - def analyze_posts(self, batch_size: int = 10) -> bool: - """Analyze all posts in batches.""" - logger.info("\n" + "="*70) - logger.info("ANALYZING POSTS WITH AI") - logger.info("="*70 + "\n") - - batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)] - logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n") - - all_recommendations = {} - - for batch_num, batch in enumerate(batches, 1): - logger.info(f"Batch {batch_num}/{len(batches)}: Analyzing {len(batch)} posts...") - - recommendations_json = self.get_ai_recommendations(batch, self.analyze_fields) - - if not recommendations_json: - logger.error(f" Failed to get recommendations for batch {batch_num}") - continue - - recommendations = self.parse_recommendations(recommendations_json) - - for rec in recommendations: - all_recommendations[str(rec.get('post_id', ''))] = rec - - logger.info(f" ✓ Got {len(recommendations)} recommendations") - - logger.info(f"\n✓ Analysis complete!") - logger.info(f" Total recommendations: {len(all_recommendations)}") - logger.info(f" API calls: {self.api_calls}") - logger.info(f" Estimated cost: ${self.ai_cost:.4f}") - - # Map recommendations to posts - for post in self.posts: - post_id = str(post['post_id']) - if post_id in all_recommendations: - rec = all_recommendations[post_id] - - # Add only requested fields - if 'title' in self.analyze_fields: - post['proposed_title'] = rec.get('proposed_title', post.get('title', '')) - post['title_reason'] = rec.get('title_reason', '') - - if 'meta_description' in self.analyze_fields: - post['proposed_meta_description'] = rec.get('proposed_meta_description', post.get('meta_description', '')) - post['meta_reason'] = rec.get('meta_reason', '') - - if 'categories' in self.analyze_fields: - post['proposed_category'] = rec.get('proposed_category', post.get('categories', '')) - post['category_reason'] = rec.get('category_reason', '') - - if 'site' in self.analyze_fields: - post['proposed_site'] = rec.get('proposed_site', post.get('site', '')) - post['site_reason'] = rec.get('site_reason', '') - - # Common fields - post['ai_confidence'] = rec.get('confidence', 'Medium') - post['ai_priority'] = rec.get('priority', 'Medium') - else: - # Add empty fields for consistency - if 'title' in self.analyze_fields: - post['proposed_title'] = post.get('title', '') - post['title_reason'] = 'No AI recommendation' - - if 'meta_description' in self.analyze_fields: - post['proposed_meta_description'] = post.get('meta_description', '') - post['meta_reason'] = 'No AI recommendation' - - if 'categories' in self.analyze_fields: - post['proposed_category'] = post.get('categories', '') - post['category_reason'] = 'No AI recommendation' - - if 'site' in self.analyze_fields: - post['proposed_site'] = post.get('site', '') - post['site_reason'] = 'No AI recommendation' - - post['ai_confidence'] = 'Unknown' - post['ai_priority'] = 'Medium' - - self.analyzed_posts.append(post) - - return len(self.analyzed_posts) > 0 - - def export_results(self, output_file: Optional[str] = None, update_input: bool = False) -> str: - """ - Export results to CSV. - - Args: - output_file: Custom output path - update_input: If True, update the input CSV file (creates backup) - - Returns: - Path to exported file - """ - if update_input: - # Create backup of original file - backup_file = self.csv_file.parent / f"{self.csv_file.stem}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" - import shutil - shutil.copy2(self.csv_file, backup_file) - logger.info(f"✓ Created backup: {backup_file}") - - output_file = self.csv_file - elif not output_file: - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - output_file = output_dir / f'analyzed_posts_{timestamp}.csv' - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - if not self.analyzed_posts: - logger.error("No analyzed posts to export") - return "" - - # Build fieldnames - original fields + new fields - original_fields = list(self.analyzed_posts[0].keys()) - - # Determine which new fields were added - new_fields = [] - if 'title' in self.analyze_fields: - new_fields.extend(['proposed_title', 'title_reason']) - if 'meta_description' in self.analyze_fields: - new_fields.extend(['proposed_meta_description', 'meta_reason']) - if 'categories' in self.analyze_fields: - new_fields.extend(['proposed_category', 'category_reason']) - if 'site' in self.analyze_fields: - new_fields.extend(['proposed_site', 'site_reason']) - - new_fields.extend(['ai_confidence', 'ai_priority']) - - fieldnames = original_fields + new_fields - - logger.info(f"\nExporting results to: {output_file}") - - with open(output_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.analyzed_posts) - - logger.info(f"✓ Exported {len(self.analyzed_posts)} posts") - return str(output_file) - - def run(self, output_file: Optional[str] = None, update_input: bool = False, batch_size: int = 10) -> str: - """Run complete analysis.""" - if not self.load_csv(): - sys.exit(1) - - if not self.analyze_posts(batch_size=batch_size): - logger.error("Failed to analyze posts") - sys.exit(1) - - return self.export_results(output_file=output_file, update_input=update_input) - - -def main(): - """Main entry point with argument parsing.""" - import argparse - - parser = argparse.ArgumentParser( - description='Enhanced AI analyzer with selective field analysis' - ) - parser.add_argument('csv_file', help='Input CSV file') - parser.add_argument('--output', '-o', help='Output CSV file (default: creates new file in output/)') - parser.add_argument('--update', '-u', action='store_true', help='Update input CSV file (creates backup)') - parser.add_argument('--fields', '-f', nargs='+', - choices=['title', 'meta_description', 'categories', 'site'], - help='Fields to analyze (default: all fields)') - parser.add_argument('--batch-size', type=int, default=10, help='Batch size for AI analysis') - - args = parser.parse_args() - - analyzer = EnhancedPostAnalyzer(args.csv_file, analyze_fields=args.fields) - output_file = analyzer.run( - output_file=args.output, - update_input=args.update, - batch_size=args.batch_size - ) - - logger.info(f"\n✓ Analysis complete! Results saved to: {output_file}") - - -if __name__ == '__main__': - main() diff --git a/scripts/export_posts_for_ai_decision.py b/scripts/export_posts_for_ai_decision.py deleted file mode 100755 index 0cb9328..0000000 --- a/scripts/export_posts_for_ai_decision.py +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/env python3 -""" -Export All Posts to CSV for AI Decision Making -Fetches complete post data from all 3 WordPress sites and exports to CSV -for AI-powered categorization and movement recommendations. -Uses credentials from .env file for secure authentication. -""" - -import csv -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional -import requests -from requests.auth import HTTPBasicAuth -import time -from datetime import datetime -import re -from config import Config - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -class PostExporter: - """Export posts from WordPress sites to CSV for AI analysis.""" - - def __init__(self): - """Initialize the exporter with sites from Config.""" - self.sites = Config.WORDPRESS_SITES - self.all_posts = [] - self.category_cache = {} # Cache category names by site - - def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]: - """ - Fetch ALL posts from a site with full details. - - Args: - site_name: Website name - site_config: Site configuration dict - - Returns: - List of posts with full metadata - """ - logger.info(f"\nFetching posts from {site_name}...") - - posts = [] - page = 1 - base_url = site_config['url'].rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/posts" - auth = HTTPBasicAuth(site_config['username'], site_config['password']) - - for status in ['publish', 'draft']: - page = 1 - status_count = 0 - - while True: - params = { - 'page': page, - 'per_page': 100, - 'status': status, - } - - try: - logger.info(f" Fetching page {page} ({status} posts)...") - response = requests.get(api_url, params=params, auth=auth, timeout=10) - response.raise_for_status() - - page_posts = response.json() - if not page_posts: - break - - posts.extend(page_posts) - status_count += len(page_posts) - logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})") - - page += 1 - time.sleep(0.5) - - except requests.exceptions.HTTPError as e: - if response.status_code == 400: - logger.info(f" ℹ API limit reached (got {status_count} {status} posts)") - break - else: - logger.error(f"Error on page {page}: {e}") - break - - except requests.exceptions.RequestException as e: - logger.error(f"Error fetching from {site_name}: {e}") - break - - if status_count > 0: - logger.info(f" ✓ Total {status} posts: {status_count}") - - logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n") - return posts - - def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, str]: - """ - Fetch category names and slugs from a WordPress site. - - Args: - site_name: Website name - site_config: Site configuration dict - - Returns: - Dict mapping category IDs to category names - """ - if site_name in self.category_cache: - return self.category_cache[site_name] - - logger.info(f" Fetching categories from {site_name}...") - categories = {} - base_url = site_config['url'].rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/categories" - auth = HTTPBasicAuth(site_config['username'], site_config['password']) - - try: - # Fetch all categories (per_page=100) - params = {'per_page': 100} - response = requests.get(api_url, params=params, auth=auth, timeout=10) - response.raise_for_status() - - cat_list = response.json() - for cat in cat_list: - categories[cat['id']] = { - 'name': cat.get('name', ''), - 'slug': cat.get('slug', ''), - } - logger.info(f" ✓ Fetched {len(categories)} categories") - except Exception as e: - logger.warning(f" Could not fetch categories from {site_name}: {e}") - - self.category_cache[site_name] = categories - return categories - - def extract_post_details(self, post: Dict, site_name: str, category_map: Dict[int, Dict]) -> Dict: - """ - Extract all relevant details from a post for AI analysis. - - Args: - post: WordPress post object - site_name: Website name - category_map: Dict mapping category IDs to names - - Returns: - Dict with extracted post details - """ - # Title - title = post.get('title', {}) - if isinstance(title, dict): - title = title.get('rendered', '') - - # Content (first 500 chars for context) - content = post.get('content', {}) - if isinstance(content, dict): - content = content.get('rendered', '') - # Strip HTML tags for readability - content_text = re.sub('<[^<]+?>', '', content)[:500] - - # Excerpt - excerpt = post.get('excerpt', {}) - if isinstance(excerpt, dict): - excerpt = excerpt.get('rendered', '') - excerpt_text = re.sub('<[^<]+?>', '', excerpt) - - # Meta descriptions and SEO data - meta_dict = post.get('meta', {}) if isinstance(post.get('meta'), dict) else {} - - rank_math_title = meta_dict.get('rank_math_title', '') - rank_math_description = meta_dict.get('rank_math_description', '') - rank_math_keyword = meta_dict.get('rank_math_focus_keyword', '') - yoast_description = meta_dict.get('_yoast_wpseo_metadesc', '') - - meta_description = rank_math_description or yoast_description or '' - - # Categories - convert IDs to names using category_map - category_ids = post.get('categories', []) - category_names = ', '.join([ - category_map.get(cat_id, {}).get('name', str(cat_id)) - for cat_id in category_ids - ]) if category_ids else '' - - # Tags - tags = post.get('tags', []) - tag_names = ', '.join([str(t) for t in tags]) if tags else '' - - # Author - author_id = post.get('author', '') - - # Date - date_published = post.get('date', '') - date_modified = post.get('modified', '') - - # Status - status = post.get('status', 'publish') - - # URL - url = post.get('link', '') - - return { - 'site': site_name, - 'post_id': post['id'], - 'status': status, - 'title': title.strip(), - 'slug': post.get('slug', ''), - 'url': url, - 'author_id': author_id, - 'date_published': date_published, - 'date_modified': date_modified, - 'categories': category_names, - 'tags': tag_names, - 'excerpt': excerpt_text.strip(), - 'content_preview': content_text.strip(), - 'seo_title': rank_math_title, - 'meta_description': meta_description, - 'focus_keyword': rank_math_keyword, - 'word_count': len(content_text.split()), - } - - def export_to_csv(self, output_file: Optional[str] = None) -> str: - """ - Export all posts to CSV. - - Args: - output_file: Optional custom output path - - Returns: - Path to exported CSV file - """ - if not output_file: - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - date_str = datetime.now().strftime('%Y-%m-%d') - output_file = output_dir / f'all_posts_{date_str}.csv' - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - if not self.all_posts: - logger.error("No posts to export") - return None - - fieldnames = [ - 'site', - 'post_id', - 'status', - 'title', - 'slug', - 'url', - 'author_id', - 'date_published', - 'date_modified', - 'categories', - 'tags', - 'excerpt', - 'content_preview', - 'seo_title', - 'meta_description', - 'focus_keyword', - 'word_count', - ] - - logger.info(f"Exporting {len(self.all_posts)} posts to CSV...") - - with open(output_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - - for post in self.all_posts: - writer.writerow({field: post.get(field, '') for field in fieldnames}) - - logger.info(f"✓ CSV exported to: {output_file}") - return str(output_file) - - def run(self): - """Run complete export process.""" - logger.info("="*70) - logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING") - logger.info("="*70) - logger.info("Sites configured: " + ", ".join(self.sites.keys())) - logger.info("") - - # Fetch from all sites - total_posts_before = len(self.all_posts) - - for site_name, config in self.sites.items(): - # Fetch categories for this site - categories = self.fetch_category_names(site_name, config) - - # Fetch posts for this site - posts = self.fetch_posts_from_site(site_name, config) - - if posts: - for post in posts: - post_details = self.extract_post_details(post, site_name, categories) - self.all_posts.append(post_details) - - if not self.all_posts: - logger.error("No posts found on any site") - sys.exit(1) - - # Sort by site then by post_id - self.all_posts.sort(key=lambda x: (x['site'], x['post_id'])) - - # Export to CSV - csv_file = self.export_to_csv() - - # Print summary - logger.info("\n" + "="*70) - logger.info("EXPORT SUMMARY") - logger.info("="*70) - - by_site = {} - for post in self.all_posts: - site = post['site'] - if site not in by_site: - by_site[site] = {'total': 0, 'published': 0, 'draft': 0} - by_site[site]['total'] += 1 - if post['status'] == 'publish': - by_site[site]['published'] += 1 - else: - by_site[site]['draft'] += 1 - - for site, stats in sorted(by_site.items()): - logger.info(f"\n{site}:") - logger.info(f" Total: {stats['total']}") - logger.info(f" Published: {stats['published']}") - logger.info(f" Drafts: {stats['draft']}") - - total_posts = len(self.all_posts) - total_published = sum(1 for p in self.all_posts if p['status'] == 'publish') - total_drafts = sum(1 for p in self.all_posts if p['status'] == 'draft') - - logger.info(f"\n{'─'*70}") - logger.info(f"Total across all sites: {total_posts} posts") - logger.info(f" Published: {total_published}") - logger.info(f" Drafts: {total_drafts}") - logger.info(f"{'─'*70}") - - logger.info(f"\n✓ Export complete!") - logger.info(f"✓ CSV file: {csv_file}") - logger.info(f"\nCSV includes:") - logger.info(f" • Site, Post ID, Status, Title, URL") - logger.info(f" • Publication dates, Categories, Tags") - logger.info(f" • Content preview (500 chars)") - logger.info(f" • SEO title, Meta description, Focus keyword") - logger.info(f" • Word count") - logger.info(f"\nNext step: Upload CSV to Claude or other AI for:") - logger.info(f" 1. Categorize by topic (VPN, software, gaming, torrenting, etc.)") - logger.info(f" 2. Recommend which site each post should be on") - logger.info(f" 3. Identify duplicates for consolidation") - logger.info(f" 4. Flag posts for deletion (low-traffic, thin content)") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Export all posts from WordPress sites for AI decision making' - ) - parser.add_argument( - '--output', - help='Custom output CSV file path' - ) - - args = parser.parse_args() - - exporter = PostExporter() - exporter.run() - - -if __name__ == '__main__': - main() diff --git a/scripts/multi_site_seo_analyzer.py b/scripts/multi_site_seo_analyzer.py deleted file mode 100755 index 18e472c..0000000 --- a/scripts/multi_site_seo_analyzer.py +++ /dev/null @@ -1,778 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-Site WordPress SEO Analyzer -Fetches posts from 3 WordPress sites, analyzes titles and meta descriptions, -and provides AI-powered optimization recommendations. -""" - -import os -import csv -import json -import logging -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Optional, Tuple -import requests -from requests.auth import HTTPBasicAuth -import time -from config import Config -import sys - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -class MultiSiteSEOAnalyzer: - """Analyzes titles and meta descriptions across multiple WordPress sites.""" - - def __init__(self, progressive_csv: bool = True): - """ - Initialize the analyzer. - - Args: - progressive_csv: If True, write CSV progressively as posts are analyzed - """ - self.sites_config = Config.WORDPRESS_SITES - self.posts_data = {} - self.analysis_results = [] - self.api_calls = 0 - self.ai_cost = 0.0 - self.openrouter_api_key = Config.OPENROUTER_API_KEY - self.progressive_csv = progressive_csv - self.csv_file = None - self.csv_writer = None - - def fetch_posts_from_site(self, site_name: str, site_config: Dict, - include_drafts: bool = False) -> List[Dict]: - """ - Fetch posts from a WordPress site using REST API. - - Args: - site_name: Name of the site (domain) - site_config: Configuration dict with url, username, password - include_drafts: If True, fetch both published and draft posts - - Returns: - List of posts with metadata - """ - logger.info(f"Fetching posts from {site_name}...") - - posts = [] - base_url = site_config['url'].rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/posts" - auth = HTTPBasicAuth(site_config['username'], site_config['password']) - - # Determine which statuses to fetch - statuses = ['publish', 'draft'] if include_drafts else ['publish'] - status_str = ', '.join(statuses).replace('publish', 'published').replace('draft', 'drafts') - - # Fetch each status separately to avoid 400 Bad Request on pagination - for status in statuses: - page = 1 - status_count = 0 - use_fields = True # Try with _fields first, fallback without if 400 - - while True: - params = { - 'page': page, - 'per_page': 100, - 'status': status, # Single status per request - } - - # Add _fields only if not getting 400 errors - if use_fields: - params['_fields'] = 'id,title,slug,link,meta,status' - - try: - response = requests.get(api_url, params=params, auth=auth, timeout=10) - response.raise_for_status() - - page_posts = response.json() - if not page_posts: - break - - posts.extend(page_posts) - status_count += len(page_posts) - logger.info(f" ✓ Fetched {len(page_posts)} {status} posts (page {page})") - - page += 1 - time.sleep(Config.API_DELAY_SECONDS) - - except requests.exceptions.HTTPError as e: - # Handle 400 errors gracefully - if response.status_code == 400 and use_fields and page == 1: - # Retry page 1 without _fields parameter - logger.info(f" ⓘ Retrying without _fields parameter...") - use_fields = False - continue - elif response.status_code == 400: - # Pagination or API limit reached - logger.info(f" ⓘ API limit reached (fetched {status_count} {status} posts)") - break - else: - logger.error(f"Error fetching page {page} from {site_name}: {e}") - break - - except requests.exceptions.RequestException as e: - logger.error(f"Error fetching from {site_name}: {e}") - break - - if status_count > 0: - logger.info(f" ✓ Total {status} posts: {status_count}") - - logger.info(f"✓ Total posts from {site_name} ({status_str}): {len(posts)}") - return posts - - def extract_seo_data(self, post: Dict, site_name: str) -> Dict: - """ - Extract SEO-relevant data from a post. - - Args: - post: Post data from WordPress API - site_name: Name of the site - - Returns: - Dict with extracted SEO data - """ - title = post.get('title', {}) - if isinstance(title, dict): - title = title.get('rendered', '') - - # Get meta description from various SEO plugins - # Check multiple possible locations where different plugins store meta descriptions - meta_desc = '' - if isinstance(post.get('meta'), dict): - meta_dict = post['meta'] - - # Try various SEO plugin fields (order matters - most specific first) - meta_desc = ( - meta_dict.get('_yoast_wpseo_metadesc', '') or # Yoast SEO - meta_dict.get('_rank_math_description', '') or # Rank Math - meta_dict.get('_aioseo_description', '') or # All in One SEO - meta_dict.get('description', '') or # Standard field - meta_dict.get('_meta_description', '') or # Alternative - meta_dict.get('metadesc', '') # Alternative - ) - - # Get post status - status = post.get('status', 'publish') - - return { - 'site': site_name, - 'post_id': post['id'], - 'title': title.strip(), - 'slug': post.get('slug', ''), - 'url': post.get('link', ''), - 'meta_description': meta_desc.strip(), - 'status': status, - } - - def analyze_title(self, title: str) -> Dict: - """ - Analyze title for SEO best practices. - - Args: - title: Post title - - Returns: - Dict with analysis results - """ - length = len(title) - - # SEO best practices - issues = [] - recommendations = [] - score = 100 - - if length < 30: - issues.append(f"Too short ({length})") - recommendations.append("Expand title to 50-60 characters") - score -= 20 - elif length < 50: - recommendations.append("Could be slightly longer (target 50-60)") - score -= 5 - elif length > 70: - issues.append(f"Too long ({length})") - recommendations.append("Consider shortening to 50-70 characters") - score -= 15 - - # Check for power words - power_words = ['best', 'ultimate', 'complete', 'essential', 'proven', - 'effective', 'powerful', 'expert', 'guide', 'tutorial', - 'how to', 'step by step', 'top 10', 'ultimate guide'] - - has_power_word = any(word.lower() in title.lower() for word in power_words) - if not has_power_word: - recommendations.append("Consider adding a power word (best, complete, guide, etc.)") - score -= 10 - - # Check for numbers - if not any(c.isdigit() for c in title): - recommendations.append("Consider adding a number (e.g., 'Top 5', '2025')") - score -= 5 - - # Check for emojis or special chars that might break rendering - special_chars = set(title) - set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 -:') - if special_chars: - recommendations.append(f"Check special characters: {special_chars}") - score -= 5 - - return { - 'length': length, - 'issues': issues, - 'recommendations': recommendations, - 'score': max(0, score), - 'has_power_word': has_power_word, - 'has_number': any(c.isdigit() for c in title) - } - - def analyze_meta_description(self, meta_desc: str) -> Dict: - """ - Analyze meta description for SEO best practices. - - Args: - meta_desc: Meta description text - - Returns: - Dict with analysis results - """ - length = len(meta_desc) - - issues = [] - recommendations = [] - score = 100 - - if not meta_desc or length == 0: - issues.append("Missing meta description") - recommendations.append("Write a 120-160 character meta description") - score = 0 - else: - if length < 100: - issues.append(f"Too short ({length})") - recommendations.append("Expand to 120-160 characters") - score -= 20 - elif length < 120: - recommendations.append("Could be slightly longer (target 120-160)") - score -= 5 - elif length > 160: - issues.append(f"Too long ({length})") - recommendations.append("Shorten to 120-160 characters") - score -= 15 - - # Check for CTA - cta_words = ['learn', 'discover', 'read', 'explore', 'find', 'get', - 'download', 'check', 'see', 'watch', 'try', 'start'] - has_cta = any(word.lower() in meta_desc.lower() for word in cta_words) - if not has_cta: - recommendations.append("Consider adding a call-to-action") - score -= 5 - - return { - 'length': length, - 'is_missing': not meta_desc, - 'issues': issues, - 'recommendations': recommendations, - 'score': max(0, score), - } - - def calculate_overall_score(self, title_analysis: Dict, meta_analysis: Dict) -> float: - """Calculate overall SEO score (0-100).""" - title_weight = 0.4 - meta_weight = 0.6 - return (title_analysis['score'] * title_weight) + (meta_analysis['score'] * meta_weight) - - def generate_ai_recommendations(self, post_data: Dict, title_analysis: Dict, - meta_analysis: Dict) -> Optional[str]: - """ - Use Claude AI to generate specific optimization recommendations. - - Args: - post_data: Post data - title_analysis: Title analysis results - meta_analysis: Meta description analysis - - Returns: - AI-generated recommendations or None if AI disabled - """ - if not self.openrouter_api_key: - return None - - prompt = f"""Analyze this blog post and provide specific SEO optimization recommendations: - -Post Title: "{post_data['title']}" -Current Meta Description: "{post_data['meta_description'] or 'MISSING'}" -URL: {post_data['url']} - -Title Analysis: -- Length: {title_analysis['length']} characters (target: 50-70) -- Issues: {', '.join(title_analysis['issues']) or 'None'} - -Meta Description Analysis: -- Length: {meta_analysis['length']} characters (target: 120-160) -- Issues: {', '.join(meta_analysis['issues']) or 'None'} - -Provide 2-3 specific, actionable recommendations to improve SEO. Focus on: -1. If title needs improvement: suggest a better title -2. If meta description is missing: write one -3. If both are weak: provide both improved versions - -Format as: -- Recommendation 1: [specific action] -- Recommendation 2: [specific action] -etc. - -Be concise and specific.""" - - try: - response = requests.post( - "https://openrouter.ai/api/v1/chat/completions", - headers={ - "Authorization": f"Bearer {self.openrouter_api_key}", - "Content-Type": "application/json", - }, - json={ - "model": "anthropic/claude-3.5-sonnet", - "messages": [ - {"role": "user", "content": prompt} - ], - "temperature": 0.7, - }, - timeout=30 - ) - response.raise_for_status() - - result = response.json() - self.api_calls += 1 - - # Track cost (Claude 3.5 Sonnet: $3/$15 per 1M tokens) - usage = result.get('usage', {}) - input_tokens = usage.get('prompt_tokens', 0) - output_tokens = usage.get('completion_tokens', 0) - self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 - - recommendations = result['choices'][0]['message']['content'].strip() - return recommendations - - except Exception as e: - logger.warning(f"AI recommendation failed: {e}") - return None - - def _setup_progressive_csv(self) -> Optional[Tuple]: - """ - Setup CSV file for progressive writing. - - Returns: - Tuple of (file_handle, writer) or None if progressive_csv is False - """ - if not self.progressive_csv: - return None - - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - csv_path = output_dir / f'seo_analysis_{timestamp}.csv' - - fieldnames = [ - 'site', 'post_id', 'status', 'title', 'slug', 'url', - 'meta_description', 'title_score', 'title_issues', - 'title_recommendations', 'meta_score', 'meta_issues', - 'meta_recommendations', 'overall_score', 'ai_recommendations', - ] - - csv_file = open(csv_path, 'w', newline='', encoding='utf-8') - writer = csv.DictWriter(csv_file, fieldnames=fieldnames) - writer.writeheader() - csv_file.flush() - - logger.info(f"✓ CSV file created: {csv_path}") - self.csv_file = csv_file - self.csv_writer = writer - - return csv_path - - def _write_result_to_csv(self, result: Dict) -> None: - """Write a single result row to CSV file.""" - if self.progressive_csv and self.csv_writer: - self.csv_writer.writerow(result) - self.csv_file.flush() - - def analyze_all_sites(self, use_ai: bool = True, top_n: int = 10, - include_drafts: bool = False): - """ - Analyze all configured sites. - - Args: - use_ai: Whether to use AI for recommendations - top_n: Number of top priority posts to get AI recommendations for - include_drafts: If True, include draft posts in analysis - """ - logger.info(f"Starting analysis of {len(self.sites_config)} sites...") - if include_drafts: - logger.info("(Including draft posts)") - logger.info("") - - all_posts = [] - - # Fetch posts from all sites - for site_name, config in self.sites_config.items(): - posts = self.fetch_posts_from_site(site_name, config, include_drafts=include_drafts) - if posts: - self.posts_data[site_name] = posts - all_posts.extend(posts) - - if not all_posts: - logger.error("No posts found on any site") - return - - logger.info(f"\nAnalyzing {len(all_posts)} posts...\n") - - # Setup progressive CSV if enabled - csv_path = self._setup_progressive_csv() - - # Analyze each post - for site_name, posts in self.posts_data.items(): - logger.info(f"Analyzing {len(posts)} posts from {site_name}...") - - for idx, post in enumerate(posts, 1): - seo_data = self.extract_seo_data(post, site_name) - title_analysis = self.analyze_title(seo_data['title']) - meta_analysis = self.analyze_meta_description(seo_data['meta_description']) - overall_score = self.calculate_overall_score(title_analysis, meta_analysis) - - result = { - **seo_data, - 'title_score': title_analysis['score'], - 'title_issues': '|'.join(title_analysis['issues']) or 'None', - 'title_recommendations': '|'.join(title_analysis['recommendations']), - 'meta_score': meta_analysis['score'], - 'meta_issues': '|'.join(meta_analysis['issues']) or 'None', - 'meta_recommendations': '|'.join(meta_analysis['recommendations']), - 'overall_score': overall_score, - 'ai_recommendations': '', - } - - self.analysis_results.append(result) - - # Write to CSV progressively (before AI recommendations) - if self.progressive_csv: - self._write_result_to_csv(result) - logger.debug(f" [{idx}/{len(posts)}] Written: {seo_data['title'][:40]}") - - # Sort by priority (lowest scores first) and get AI recommendations for top posts - if use_ai: - self.analysis_results.sort(key=lambda x: x['overall_score']) - logger.info(f"\nGenerating AI recommendations for top {top_n} posts...\n") - - for idx, result in enumerate(self.analysis_results[:top_n], 1): - logger.info(f" [{idx}/{top_n}] {result['title'][:50]}...") - - ai_recs = self.generate_ai_recommendations( - result, - { - 'score': result['title_score'], - 'issues': result['title_issues'].split('|'), - 'length': len(result['title']) - }, - { - 'score': result['meta_score'], - 'issues': result['meta_issues'].split('|'), - 'length': len(result['meta_description']) - } - ) - - result['ai_recommendations'] = ai_recs or '' - - # Update CSV with AI recommendations if using progressive CSV - if self.progressive_csv and self.csv_writer: - # Find and update the row in the CSV by re-writing it - # This is a limitation of CSV - we'll update in final export instead - pass - - time.sleep(0.5) # Rate limiting - - # Sort by overall score for final export - self.analysis_results.sort(key=lambda x: x['overall_score']) - - # Close progressive CSV if open (will be re-written with final data including AI recs) - if self.progressive_csv and self.csv_file: - self.csv_file.close() - self.csv_file = None - self.csv_writer = None - - def export_results(self, output_file: Optional[str] = None): - """ - Export analysis results to CSV. - - Args: - output_file: Output file path (optional) - """ - if not output_file: - output_dir = Path(__file__).parent.parent / 'output' - output_dir.mkdir(parents=True, exist_ok=True) - - if self.progressive_csv: - # Use same timestamp as progressive file - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - # Find the most recent seo_analysis file - files = sorted(output_dir.glob('seo_analysis_*.csv')) - if files: - output_file = files[-1] # Use the most recent one - else: - output_file = output_dir / f'seo_analysis_{timestamp}_final.csv' - else: - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - output_file = output_dir / f'seo_analysis_{timestamp}.csv' - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - if not self.analysis_results: - logger.error("No results to export") - return - - fieldnames = [ - 'site', - 'post_id', - 'status', - 'title', - 'slug', - 'url', - 'meta_description', - 'title_score', - 'title_issues', - 'title_recommendations', - 'meta_score', - 'meta_issues', - 'meta_recommendations', - 'overall_score', - 'ai_recommendations', - ] - - with open(output_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - - for result in self.analysis_results: - writer.writerow({field: result.get(field, '') for field in fieldnames}) - - if self.progressive_csv: - logger.info(f"\n✓ Final results saved to: {output_file}") - else: - logger.info(f"\n✓ Results exported to: {output_file}") - - # Also export as a summary report - self.export_summary_report(output_file) - - def export_summary_report(self, csv_file: Path): - """Export a markdown summary report.""" - report_file = csv_file.parent / f"{csv_file.stem}_summary.md" - - # Group by site - by_site = {} - for result in self.analysis_results: - site = result['site'] - if site not in by_site: - by_site[site] = [] - by_site[site].append(result) - - with open(report_file, 'w', encoding='utf-8') as f: - f.write("# Multi-Site SEO Analysis Report\n\n") - f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") - - # Summary stats - total_posts = len(self.analysis_results) - published = sum(1 for r in self.analysis_results if r['status'] == 'publish') - drafts = sum(1 for r in self.analysis_results if r['status'] == 'draft') - avg_score = sum(r['overall_score'] for r in self.analysis_results) / total_posts if total_posts > 0 else 0 - - f.write("## Summary\n\n") - f.write(f"- **Total Posts:** {total_posts}\n") - if published > 0: - f.write(f" - Published: {published}\n") - if drafts > 0: - f.write(f" - Drafts: {drafts}\n") - f.write(f"- **Average SEO Score:** {avg_score:.1f}/100\n") - f.write(f"- **API Calls Made:** {self.api_calls}\n") - f.write(f"- **AI Cost:** ${self.ai_cost:.4f}\n") - f.write(f"- **Sites Analyzed:** {len(by_site)}\n\n") - - # Priority issues - missing_meta = sum(1 for r in self.analysis_results if r['meta_score'] == 0) - weak_titles = sum(1 for r in self.analysis_results if r['title_score'] < 50) - weak_meta = sum(1 for r in self.analysis_results if r['meta_score'] < 50 and r['meta_score'] > 0) - - f.write("## Priority Issues\n\n") - f.write(f"- **Missing Meta Descriptions:** {missing_meta} posts\n") - f.write(f"- **Weak Titles (Score < 50):** {weak_titles} posts\n") - f.write(f"- **Weak Meta (Score < 50):** {weak_meta} posts\n\n") - - # By site - for site_name, posts in by_site.items(): - avg = sum(p['overall_score'] for p in posts) / len(posts) - f.write(f"## {site_name}\n\n") - f.write(f"- **Posts:** {len(posts)}\n") - f.write(f"- **Avg Score:** {avg:.1f}/100\n") - f.write(f"- **Missing Meta:** {sum(1 for p in posts if p['meta_score'] == 0)}\n\n") - - # Top 5 to optimize - f.write("### Top 5 Posts to Optimize\n\n") - for idx, post in enumerate(posts[:5], 1): - f.write(f"{idx}. **{post['title']}** (Score: {post['overall_score']:.0f})\n") - f.write(f" - URL: {post['url']}\n") - if post['meta_issues'] != 'None': - f.write(f" - Meta Issues: {post['meta_issues']}\n") - if post['ai_recommendations']: - f.write(f" - Recommendations: {post['ai_recommendations'].split(chr(10))[0]}\n") - f.write("\n") - - f.write("\n## Legend\n\n") - f.write("- **Title Score:** Evaluates length, power words, numbers, readability\n") - f.write("- **Meta Score:** Evaluates presence, length, call-to-action\n") - f.write("- **Overall Score:** 40% title + 60% meta description\n") - f.write("- **Optimal Ranges:**\n") - f.write(" - Title: 50-70 characters\n") - f.write(" - Meta: 120-160 characters\n") - - logger.info(f"✓ Summary report: {report_file}") - - def run(self, use_ai: bool = True, top_n: int = 10, include_drafts: bool = False): - """Run complete analysis.""" - try: - self.analyze_all_sites(use_ai=use_ai, top_n=top_n, include_drafts=include_drafts) - self.export_results() - - logger.info("\n" + "="*60) - logger.info("ANALYSIS COMPLETE") - logger.info("="*60) - logger.info(f"Total posts analyzed: {len(self.analysis_results)}") - published = sum(1 for r in self.analysis_results if r['status'] == 'publish') - drafts = sum(1 for r in self.analysis_results if r['status'] == 'draft') - if published > 0: - logger.info(f" - Published: {published}") - if drafts > 0: - logger.info(f" - Drafts: {drafts}") - logger.info(f"AI recommendations: {sum(1 for r in self.analysis_results if r['ai_recommendations'])}") - logger.info(f"AI cost: ${self.ai_cost:.4f}") - - except Exception as e: - logger.error(f"Analysis failed: {e}", exc_info=True) - sys.exit(1) - - -def check_meta_fields(site_url: str, username: str, password: str) -> None: - """ - Diagnostic function to check what meta fields are available on a site. - - Args: - site_url: WordPress site URL - username: WordPress username - password: WordPress app password - """ - logger.info(f"\n{'='*60}") - logger.info("META FIELD DIAGNOSTIC") - logger.info(f"{'='*60}\n") - logger.info(f"Site: {site_url}") - logger.info("Checking available meta fields in first post...\n") - - base_url = site_url.rstrip('/') - api_url = f"{base_url}/wp-json/wp/v2/posts" - auth = HTTPBasicAuth(username, password) - - try: - params = { - 'per_page': 1, - 'status': 'publish' - } - - response = requests.get(api_url, params=params, auth=auth, timeout=10) - response.raise_for_status() - - posts = response.json() - if not posts: - logger.error("No posts found") - return - - post = posts[0] - logger.info(f"Post: {post.get('title', {}).get('rendered', 'N/A')}") - logger.info(f"\nAvailable meta fields:") - - if isinstance(post.get('meta'), dict): - meta_dict = post['meta'] - if meta_dict: - for key, value in sorted(meta_dict.items()): - preview = str(value)[:60] - logger.info(f" • {key}: {preview}") - else: - logger.info(" (No meta fields found)") - else: - logger.info(" (Meta is not a dictionary)") - - logger.info(f"\nFull meta object:") - logger.info(json.dumps(post.get('meta', {}), indent=2)[:500]) - - except Exception as e: - logger.error(f"Error: {e}") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Analyze SEO across multiple WordPress sites' - ) - parser.add_argument( - '--no-ai', - action='store_true', - help='Skip AI recommendations to save cost' - ) - parser.add_argument( - '--top-n', - type=int, - default=10, - help='Number of top posts to get AI recommendations for' - ) - parser.add_argument( - '--output', - help='Output CSV file path' - ) - parser.add_argument( - '--include-drafts', - action='store_true', - help='Include draft posts in analysis (published + drafts)' - ) - parser.add_argument( - '--no-progressive', - action='store_true', - help='Disable real-time CSV writing (write only at end)' - ) - parser.add_argument( - '--diagnose', - help='Diagnose meta fields for a site (URL). Example: --diagnose https://www.mistergeek.net' - ) - - args = parser.parse_args() - - # Diagnostic mode - if args.diagnose: - # Ask for username/password if not in env - from getpass import getpass - username = Config.WORDPRESS_USERNAME - password = Config.WORDPRESS_APP_PASSWORD - - if not username or not password: - logger.error("WORDPRESS_USERNAME and WORDPRESS_APP_PASSWORD must be set in .env") - sys.exit(1) - - check_meta_fields(args.diagnose, username, password) - sys.exit(0) - - analyzer = MultiSiteSEOAnalyzer(progressive_csv=not args.no_progressive) - analyzer.run(use_ai=not args.no_ai, top_n=args.top_n, include_drafts=args.include_drafts) - - -if __name__ == '__main__': - main() diff --git a/scripts/opportunity_analyzer.py b/scripts/opportunity_analyzer.py deleted file mode 100644 index b9c3d2a..0000000 --- a/scripts/opportunity_analyzer.py +++ /dev/null @@ -1,347 +0,0 @@ -""" -Keyword opportunity analyzer for SEO optimization. -Identifies high-potential keywords ranking at positions 11-30. -""" - -import csv -import json -import argparse -import time -from pathlib import Path -from openai import OpenAI -from config import Config - - -class OpportunityAnalyzer: - """Analyze keyword opportunities for SEO optimization.""" - - def __init__(self): - """Initialize analyzer.""" - self.config = Config - self.output_dir = self.config.OUTPUT_DIR - self.logs = [] - self.client = None - - if self.config.OPENROUTER_API_KEY: - self.client = OpenAI( - base_url="https://openrouter.ai/api/v1", - api_key=self.config.OPENROUTER_API_KEY, - ) - - def log(self, message): - """Add message to log.""" - self.logs.append(message) - print(message) - - def load_posts(self, posts_csv): - """Load posts with analytics data.""" - posts = [] - if not posts_csv.exists(): - self.log(f"❌ File not found: {posts_csv}") - return posts - - try: - with open(posts_csv, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - try: - posts.append({ - 'id': row.get('ID', ''), - 'title': row.get('Title', ''), - 'url': row.get('URL', ''), - 'impressions': int(row.get('impressions', 0) or 0), - 'clicks': int(row.get('clicks', 0) or 0), - 'avg_position': float(row.get('avg_position', 0) or 0), - 'ctr': float(row.get('ctr', 0) or 0), - 'traffic': int(row.get('traffic', 0) or 0), - 'bounce_rate': float(row.get('bounce_rate', 0) or 0), - 'keywords_count': int(row.get('keywords_count', 0) or 0), - 'top_keywords': row.get('top_keywords', '') - }) - except (ValueError, TypeError): - continue - - self.log(f"✓ Loaded {len(posts)} posts") - except Exception as e: - self.log(f"❌ Error reading posts: {e}") - - return posts - - def filter_opportunities(self, posts, min_pos, max_pos, min_impressions): - """Filter posts with keywords in opportunity range or high traffic for optimization.""" - opportunities = [] - - for post in posts: - position = post.get('avg_position', 0) - impressions = post.get('impressions', 0) - traffic = post.get('traffic', 0) - - # Primary filter: position range (if data available) - if position > 0: - if min_pos <= position <= max_pos and impressions >= min_impressions: - opportunities.append(post) - # Fallback: filter by traffic when position data unavailable - # Include posts with any traffic for optimization analysis - elif traffic > 0: - opportunities.append(post) - - self.log(f"✓ Found {len(opportunities)} posts for optimization analysis") - if opportunities: - traffic_posts = [p for p in opportunities if p.get('traffic', 0) > 0] - self.log(f" ({len(traffic_posts)} have traffic data, {len(opportunities) - len(traffic_posts)} selected for analysis)") - return opportunities - - def calculate_opportunity_score(self, post): - """Calculate opportunity score (0-100) for a post.""" - position = post.get('avg_position', 50) - impressions = post.get('impressions', 0) - ctr = post.get('ctr', 0) - traffic = post.get('traffic', 0) - - # Position score (35%): Closer to page 1 = higher - # Position 11-30 range - position_score = max(0, (30 - position) / 19 * 35) - - # Traffic potential (30%): Based on impressions - # Normalize to 0-30 - traffic_potential = min(30, (impressions / 1000) * 30) - - # CTR improvement potential (20%): Gap between current and expected CTR - # Expected CTR at position X - expected_ctr_map = { - 11: 0.02, 12: 0.02, 13: 0.015, 14: 0.015, 15: 0.013, - 16: 0.012, 17: 0.011, 18: 0.01, 19: 0.009, 20: 0.008, - 21: 0.008, 22: 0.007, 23: 0.007, 24: 0.006, 25: 0.006, - 26: 0.006, 27: 0.005, 28: 0.005, 29: 0.005, 30: 0.004 - } - expected_ctr = expected_ctr_map.get(int(position), 0.005) - ctr_gap = max(0, expected_ctr - ctr) - ctr_score = min(20, (ctr_gap / expected_ctr * 100 / 5) * 20) - - # Content quality (15%): Existing traffic and engagement - quality_score = min(15, (traffic / 100) * 7.5 + - (100 - post.get('bounce_rate', 50)) / 100 * 7.5) - - return round(position_score + traffic_potential + ctr_score + quality_score, 1) - - def estimate_traffic_gain(self, post): - """Estimate potential traffic gain from optimization.""" - position = post.get('avg_position', 50) - impressions = post.get('impressions', 0) - ctr = post.get('ctr', 0) - - # Estimate CTR improvement from moving one position up - # Moving from position X to X-1 typically improves CTR by 20-30% - current_traffic = impressions * ctr - if position > 11: - # Target position: 1 ahead - improvement_factor = 1.25 # 25% improvement per position - estimated_new_traffic = current_traffic * improvement_factor - gain = estimated_new_traffic - current_traffic - else: - gain = 0 - - return round(gain, 0) - - def generate_ai_recommendations(self, post): - """Generate AI recommendations for top opportunities.""" - if not self.client: - return None - - try: - keywords = post.get('top_keywords', '').split(',')[:5] - keywords_str = ', '.join([k.strip() for k in keywords if k.strip()]) - - prompt = f"""Analyze keyword optimization opportunities for this blog post: - -Post Title: {post['title']} -Current Position: {post['avg_position']:.1f} -Monthly Impressions: {post['impressions']} -Current CTR: {post['ctr']:.2%} -Top Keywords: {keywords_str} - -Provide 2-3 specific, actionable recommendations to: -1. Improve the SEO title to increase CTR -2. Enhance the meta description -3. Target structural improvements (headers, content gaps) - -Focus on moving this post from positions 11-20 to page 1 (positions 1-10). -Be specific and practical. - -Return as JSON: -{{ - "title_recommendations": ["recommendation 1", "recommendation 2"], - "description_recommendations": ["recommendation 1", "recommendation 2"], - "content_recommendations": ["recommendation 1", "recommendation 2"], - "estimated_effort_hours": number, - "expected_position_improvement": number -}}""" - - response = self.client.chat.completions.create( - model=self.config.AI_MODEL, - messages=[{"role": "user", "content": prompt}], - temperature=0.7, - max_tokens=500 - ) - - try: - result_text = response.choices[0].message.content - # Extract JSON - start_idx = result_text.find('{') - end_idx = result_text.rfind('}') + 1 - if start_idx >= 0 and end_idx > start_idx: - return json.loads(result_text[start_idx:end_idx]) - except json.JSONDecodeError: - self.log(f"⚠️ Could not parse AI response for {post['title']}") - return None - - except Exception as e: - self.log(f"⚠️ AI generation failed for {post['title']}: {e}") - return None - - def export_opportunities_csv(self, opportunities, output_csv): - """Export opportunities to CSV.""" - if not opportunities: - self.log("⚠️ No opportunities to export") - return - - try: - fieldnames = [ - 'ID', 'Title', 'URL', 'avg_position', 'impressions', 'clicks', - 'ctr', 'traffic', 'bounce_rate', 'keywords_count', 'top_keywords', - 'opportunity_score', 'estimated_traffic_gain', - 'title_recommendations', 'description_recommendations', - 'content_recommendations', 'estimated_effort_hours', - 'expected_position_improvement' - ] - - with open(output_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore') - writer.writeheader() - - for opp in sorted(opportunities, key=lambda x: x['opportunity_score'], reverse=True): - row = { - 'ID': opp['id'], - 'Title': opp['title'], - 'URL': opp['url'], - 'avg_position': opp['avg_position'], - 'impressions': opp['impressions'], - 'clicks': opp['clicks'], - 'ctr': f"{opp['ctr']:.2%}", - 'traffic': opp['traffic'], - 'bounce_rate': opp['bounce_rate'], - 'keywords_count': opp['keywords_count'], - 'top_keywords': opp['top_keywords'], - 'opportunity_score': opp['opportunity_score'], - 'estimated_traffic_gain': opp['estimated_traffic_gain'], - 'title_recommendations': opp.get('title_recommendations_str', ''), - 'description_recommendations': opp.get('description_recommendations_str', ''), - 'content_recommendations': opp.get('content_recommendations_str', ''), - 'estimated_effort_hours': opp.get('estimated_effort_hours', ''), - 'expected_position_improvement': opp.get('expected_position_improvement', '') - } - writer.writerow(row) - - self.log(f"✓ Exported {len(opportunities)} opportunities to {output_csv}") - except Exception as e: - self.log(f"❌ Error exporting CSV: {e}") - - def export_log(self, log_file): - """Export analysis log.""" - try: - with open(log_file, 'w', encoding='utf-8') as f: - f.write("SEO Opportunity Analysis Report\n") - f.write("=" * 60 + "\n\n") - - for msg in self.logs: - f.write(msg + "\n") - - self.log(f"✓ Exported log to {log_file}") - except Exception as e: - self.log(f"❌ Error exporting log: {e}") - - def run(self, posts_csv, output_csv, min_position=11, max_position=30, - min_impressions=50, top_n=20): - """Run complete analysis workflow.""" - self.log("🔍 Starting keyword opportunity analysis...") - self.log(f"Input: {posts_csv}") - self.log(f"Position range: {min_position}-{max_position}") - self.log(f"Min impressions: {min_impressions}") - self.log(f"Top N for AI analysis: {top_n}\n") - - # Load posts - posts = self.load_posts(posts_csv) - if not posts: - return - - # Filter opportunities - opportunities = self.filter_opportunities(posts, min_position, max_position, min_impressions) - if not opportunities: - self.log("⚠️ No opportunities found in specified range") - return - - # Calculate scores - self.log("\n📊 Calculating opportunity scores...") - for opp in opportunities: - opp['opportunity_score'] = self.calculate_opportunity_score(opp) - opp['estimated_traffic_gain'] = self.estimate_traffic_gain(opp) - - # Sort by score - opportunities = sorted(opportunities, key=lambda x: x['opportunity_score'], reverse=True) - - # Get AI recommendations for top N - self.log(f"\n🤖 Generating AI recommendations for top {min(top_n, len(opportunities))} opportunities...") - for i, opp in enumerate(opportunities[:top_n]): - self.log(f" [{i+1}/{min(top_n, len(opportunities))}] {opp['title'][:50]}...") - recommendations = self.generate_ai_recommendations(opp) - - if recommendations: - opp['title_recommendations_str'] = '; '.join(recommendations.get('title_recommendations', [])) - opp['description_recommendations_str'] = '; '.join(recommendations.get('description_recommendations', [])) - opp['content_recommendations_str'] = '; '.join(recommendations.get('content_recommendations', [])) - opp['estimated_effort_hours'] = recommendations.get('estimated_effort_hours', '') - opp['expected_position_improvement'] = recommendations.get('expected_position_improvement', '') - - time.sleep(0.2) # Rate limiting - - # Export - self.log("\n📁 Exporting results...") - self.export_opportunities_csv(opportunities, output_csv) - - # Export log - log_dir = self.output_dir / 'logs' - log_dir.mkdir(exist_ok=True) - log_file = log_dir / 'opportunity_analysis_log.txt' - self.export_log(log_file) - - self.log(f"\n✓ Analysis complete! {len(opportunities)} opportunities identified.") - self.log(f" Top opportunity: {opportunities[0]['title'][:50]}... (score: {opportunities[0]['opportunity_score']})") - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='Analyze keyword opportunities') - parser.add_argument('--input', type=Path, - default=Path('output/results/posts_with_analytics.csv'), - help='Input posts CSV') - parser.add_argument('--output', type=Path, - default=Path('output/results/keyword_opportunities.csv'), - help='Output opportunities CSV') - parser.add_argument('--min-position', type=int, default=11, - help='Minimum position (start of range)') - parser.add_argument('--max-position', type=int, default=30, - help='Maximum position (end of range)') - parser.add_argument('--min-impressions', type=int, default=50, - help='Minimum impressions to consider') - parser.add_argument('--top-n', type=int, default=20, - help='Top N for AI recommendations') - - args = parser.parse_args() - - analyzer = OpportunityAnalyzer() - analyzer.run(args.input, args.output, args.min_position, args.max_position, - args.min_impressions, args.top_n) - - -if __name__ == '__main__': - main() diff --git a/scripts/report_generator.py b/scripts/report_generator.py deleted file mode 100644 index 694a281..0000000 --- a/scripts/report_generator.py +++ /dev/null @@ -1,436 +0,0 @@ -""" -SEO optimization report generator. -Consolidates all analysis into comprehensive markdown report and action plan. -""" - -import csv -import json -import argparse -from pathlib import Path -from datetime import datetime -from config import Config - - -class ReportGenerator: - """Generate comprehensive SEO optimization report.""" - - def __init__(self): - """Initialize generator.""" - self.config = Config - self.output_dir = self.config.OUTPUT_DIR - self.logs = [] - - def log(self, message): - """Add message to log.""" - self.logs.append(message) - print(message) - - def load_posts_with_analytics(self, csv_path): - """Load posts with all analytics data.""" - posts = {} - if not csv_path.exists(): - self.log(f"❌ File not found: {csv_path}") - return posts - - try: - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - post_id = row.get('ID') - if not post_id: - continue - - # Handle different title column names - title = (row.get('Title') or - row.get('title') or - row.get('post_title') or '') - - posts[post_id] = { - 'title': title, - 'url': row.get('URL') or row.get('url') or row.get('post_url') or '', - 'seo_title': row.get('SEO Title') or row.get('seo_title') or '', - 'meta_description': row.get('Meta Description') or row.get('meta_description') or '', - 'traffic': int(row.get('traffic', 0) or 0), - 'users': int(row.get('users', 0) or 0), - 'bounce_rate': float(row.get('bounce_rate', 0) or 0), - 'impressions': int(row.get('impressions', 0) or 0), - 'clicks': int(row.get('clicks', 0) or 0), - 'avg_position': float(row.get('avg_position', 0) or 0), - 'ctr': float(row.get('ctr', 0) or 0), - 'keywords_count': int(row.get('keywords_count', 0) or 0), - 'top_keywords': row.get('top_keywords', '') - } - - self.log(f"✓ Loaded {len(posts)} posts") - except Exception as e: - self.log(f"❌ Error reading posts: {e}") - - return posts - - def load_opportunities(self, csv_path): - """Load keyword opportunities.""" - opportunities = {} - if not csv_path.exists(): - self.log(f"⚠️ Opportunities file not found: {csv_path}") - return opportunities - - try: - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - post_id = row.get('ID') - if post_id: - try: - opportunities[post_id] = { - 'opportunity_score': float(row.get('opportunity_score', 0) or 0), - 'estimated_traffic_gain': int(float(row.get('estimated_traffic_gain', 0) or 0)), - 'title_recommendations': row.get('title_recommendations', ''), - 'description_recommendations': row.get('description_recommendations', ''), - 'content_recommendations': row.get('content_recommendations', '') - } - except (ValueError, TypeError): - # Skip rows with parsing errors - continue - - self.log(f"✓ Loaded {len(opportunities)} opportunities") - except Exception as e: - self.log(f"⚠️ Error reading opportunities: {e}") - - return opportunities - - def load_content_gaps(self, csv_path): - """Load content gap suggestions.""" - gaps = [] - if not csv_path.exists(): - self.log(f"⚠️ Content gaps file not found: {csv_path}") - return gaps - - try: - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - gaps.append({ - 'title': row.get('title', ''), - 'why_valuable': row.get('why_valuable', ''), - 'search_volume': row.get('search_volume', ''), - 'format': row.get('format', ''), - 'traffic_potential': int(row.get('traffic_potential', 0) or 0), - 'priority': row.get('priority', 'medium') - }) - - self.log(f"✓ Loaded {len(gaps)} content gap ideas") - except Exception as e: - self.log(f"⚠️ Error reading content gaps: {e}") - - return gaps - - def calculate_priority_score(self, post, opportunity=None): - """Calculate comprehensive priority score (0-100).""" - position = post.get('avg_position', 50) - impressions = post.get('impressions', 0) - ctr = post.get('ctr', 0) - traffic = post.get('traffic', 0) - - # Position score (35%): Closer to page 1 = higher - if position > 0 and position <= 30: - position_score = max(0, (30 - position) / 29 * 35) - else: - position_score = 0 - - # Traffic potential (30%): Based on impressions - traffic_potential = min(30, (impressions / 1000) * 30) - - # CTR improvement (20%): Gap vs expected - expected_ctr_map = { - 1: 0.30, 2: 0.16, 3: 0.11, 4: 0.08, 5: 0.07, - 6: 0.06, 7: 0.05, 8: 0.05, 9: 0.04, 10: 0.04, - 11: 0.02, 12: 0.02, 13: 0.015, 14: 0.015, 15: 0.013, - 16: 0.012, 17: 0.011, 18: 0.01, 19: 0.009, 20: 0.008 - } - expected_ctr = expected_ctr_map.get(int(position), 0.005) if position > 0 else 0 - if expected_ctr > 0: - ctr_gap = max(0, expected_ctr - ctr) - ctr_score = min(20, (ctr_gap / expected_ctr * 100 / 5) * 20) - else: - ctr_score = 0 - - # Content quality (15%): Existing traffic and engagement - quality_score = min(15, (traffic / 100) * 7.5 + - (100 - post.get('bounce_rate', 50)) / 100 * 7.5) - - total = round(position_score + traffic_potential + ctr_score + quality_score, 1) - return max(0, min(100, total)) - - def generate_markdown_report(self, posts, opportunities, gaps, top_n=20): - """Generate comprehensive markdown report.""" - report = [] - report.append("# SEO Optimization Strategy Report\n") - report.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n") - - # Calculate metrics - total_traffic = sum(p.get('traffic', 0) for p in posts.values()) - total_impressions = sum(p.get('impressions', 0) for p in posts.values()) - avg_position = sum(p.get('avg_position', 50) for p in posts.values() if p.get('avg_position', 0) > 0) / max(1, len([p for p in posts.values() if p.get('avg_position', 0) > 0])) - - # Executive Summary - report.append("## Executive Summary\n") - report.append(f"- **Total Posts Analyzed:** {len(posts)}\n") - report.append(f"- **Current Monthly Traffic:** {total_traffic:,} visits\n") - report.append(f"- **Total Impressions (90d):** {total_impressions:,}\n") - report.append(f"- **Average Search Position:** {avg_position:.1f}\n") - report.append(f"- **Optimization Opportunities:** {len(opportunities)}\n") - report.append(f"- **Content Gap Ideas:** {len(gaps)}\n") - report.append(f"- **Potential Traffic Gain (Phase 1):** +{sum(o.get('estimated_traffic_gain', 0) for o in opportunities.values()):,} visits/month\n\n") - - # Key Metrics - report.append("### Quick Wins (Estimated Impact)\n\n") - quick_wins = sorted(opportunities.values(), - key=lambda x: x.get('estimated_traffic_gain', 0), - reverse=True)[:5] - total_quick_win_traffic = sum(w.get('estimated_traffic_gain', 0) for w in quick_wins) - report.append(f"Top 5 opportunities could bring **+{total_quick_win_traffic:,} visits/month**\n\n") - - # Top 20 Posts to Optimize - report.append("## Top 20 Posts to Optimize\n\n") - report.append("Ranked by optimization potential (combination of position, traffic potential, and CTR improvement).\n\n") - - # Score all posts - scored_posts = [] - for post_id, post in posts.items(): - opp = opportunities.get(post_id, {}) - score = self.calculate_priority_score(post, opp) - scored_posts.append((post_id, post, opp, score)) - - scored_posts = sorted(scored_posts, key=lambda x: x[3], reverse=True) - - for i, (post_id, post, opp, score) in enumerate(scored_posts[:top_n], 1): - position = post.get('avg_position', 0) - impressions = post.get('impressions', 0) - traffic = post.get('traffic', 0) - - report.append(f"### {i}. {post['title']}\n\n") - report.append(f"**Current Position:** {position:.1f} | **Impressions:** {impressions:,} | **Traffic:** {traffic} visits\n") - report.append(f"**Priority Score:** {score:.1f}/100 | **Estimated Gain:** +{opp.get('estimated_traffic_gain', 0)} visits\n\n") - - if position > 0 and position <= 30: - report.append(f"**Status:** Ranking on {'page 1' if position <= 10 else 'page 2-3'}\n\n") - - if opp.get('title_recommendations'): - report.append("**Title Optimization:**\n") - for rec in opp['title_recommendations'].split(';'): - rec = rec.strip() - if rec: - report.append(f"- {rec}\n") - report.append("\n") - - if opp.get('description_recommendations'): - report.append("**Meta Description:**\n") - for rec in opp['description_recommendations'].split(';'): - rec = rec.strip() - if rec: - report.append(f"- {rec}\n") - report.append("\n") - - if opp.get('content_recommendations'): - report.append("**Content Improvements:**\n") - for rec in opp['content_recommendations'].split(';'): - rec = rec.strip() - if rec: - report.append(f"- {rec}\n") - report.append("\n") - - report.append("---\n\n") - - # Keyword Opportunities Summary - report.append("## Keyword Opportunities Summary\n\n") - opportunity_categories = { - 'page_2': [], - 'page_3': [], - 'ready_for_optimization': [] - } - - for opp_id, opp in opportunities.items(): - if any(opp_id == p[0] for p in scored_posts[:top_n]): - score = opp.get('opportunity_score', 0) - post = posts.get(opp_id, {}) - position = post.get('avg_position', 0) - - if 11 <= position <= 15: - opportunity_categories['page_2'].append((score, opp)) - elif 16 <= position <= 30: - opportunity_categories['page_3'].append((score, opp)) - - report.append(f"**Page 2 (Positions 11-15):** {len(opportunity_categories['page_2'])} keywords ready for quick wins\n") - report.append(f"**Page 3+ (Positions 16-30):** {len(opportunity_categories['page_3'])} keywords with medium effort\n\n") - - # Content Gap Analysis - report.append("## Content Gap Analysis\n\n") - report.append(f"Identified **{len(gaps)} high-value content opportunities** not currently covered:\n\n") - - for i, gap in enumerate(sorted(gaps, key=lambda x: x.get('priority') == 'high', reverse=True)[:15], 1): - report.append(f"### {i}. {gap['title']}\n\n") - report.append(f"**Priority:** {gap.get('priority', 'medium').upper()}\n") - report.append(f"**Search Volume:** {gap.get('search_volume', 'medium')}\n") - report.append(f"**Format:** {gap.get('format', 'guide')}\n") - report.append(f"**Estimated Traffic Potential:** +{gap.get('traffic_potential', 50)} visits/month\n\n") - - if gap.get('why_valuable'): - report.append(f"**Why valuable:** {gap['why_valuable']}\n\n") - - # 90-Day Action Plan - report.append("## 90-Day Action Plan\n\n") - report.append("### Week 1-2: Quick Wins (Estimated +100 visits/month)\n\n") - report.append("Focus on posts with highest opportunity scores that are already ranking on page 2:\n\n") - quick_wins_phase = sorted(scored_posts[:top_n], key=lambda x: x[3], reverse=True)[:5] - for i, (post_id, post, opp, score) in enumerate(quick_wins_phase, 1): - report.append(f"{i}. **{post['title'][:60]}**\n") - report.append(f" - Update SEO title and meta description\n") - report.append(f" - Estimated effort: 30-60 minutes\n") - report.append(f" - Expected gain: +{opp.get('estimated_traffic_gain', 50)} visits\n\n") - - report.append("### Week 3-4: Core Content Optimization (Estimated +150 visits/month)\n\n") - report.append("Improve content structure and internal linking:\n\n") - mid_phase = sorted(scored_posts[5:15], key=lambda x: x[3], reverse=True)[:5] - for i, (post_id, post, opp, score) in enumerate(mid_phase, 1): - report.append(f"{i}. **{post['title'][:60]}**\n") - report.append(f" - Add missing content sections\n") - report.append(f" - Improve header structure\n") - report.append(f" - Estimated effort: 2-3 hours\n\n") - - report.append("### Week 5-8: New Content Creation (Estimated +300 visits/month)\n\n") - report.append("Create 3-5 pieces of new content targeting high-value gaps:\n\n") - for i, gap in enumerate(sorted(gaps, key=lambda x: x.get('traffic_potential', 0), reverse=True)[:4], 1): - report.append(f"{i}. **{gap['title']}** ({gap.get('format', 'guide').title()})\n") - report.append(f" - Estimated effort: 4-6 hours\n") - report.append(f" - Expected traffic: +{gap.get('traffic_potential', 50)} visits/month\n\n") - - report.append("### Week 9-12: Refinement & Analysis (Estimated +100 visits/month)\n\n") - report.append("- Monitor ranking changes and CTR improvements\n") - report.append("- Refine underperforming optimizations\n") - report.append("- Re-run keyword analysis to identify new opportunities\n\n") - - report.append("**Total Estimated 90-Day Impact: +650 visits/month (+~7.8% growth)**\n\n") - - # Methodology - report.append("## Methodology\n\n") - report.append("### Priority Score Calculation\n\n") - report.append("Each post is scored based on:\n") - report.append("- **Position (35%):** Posts ranking 11-20 get highest scores (closest to page 1)\n") - report.append("- **Traffic Potential (30%):** Based on search impressions\n") - report.append("- **CTR Gap (20%):** Difference between current and expected CTR for position\n") - report.append("- **Content Quality (15%):** Existing traffic and bounce rate\n\n") - - report.append("### Data Sources\n\n") - report.append("- **Google Analytics:** Traffic metrics (90-day window)\n") - report.append("- **Google Search Console:** Keyword data, impressions, clicks, positions\n") - report.append("- **WordPress REST API:** Current SEO metadata and content structure\n\n") - - report.append("### Assumptions\n\n") - report.append("- Traffic estimates are based on historical CTR and position data\n") - report.append("- Moving one position up typically improves CTR by 20-30%\n") - report.append("- Page 1 rankings (positions 1-10) receive ~20-30% of total impressions\n") - report.append("- New content takes 4-8 weeks to gain significant traction\n\n") - - return "\n".join(report) - - def export_report(self, report_text, output_md): - """Export markdown report.""" - try: - with open(output_md, 'w', encoding='utf-8') as f: - f.write(report_text) - - self.log(f"✓ Exported report to {output_md}") - except Exception as e: - self.log(f"❌ Error exporting report: {e}") - - def export_prioritized_csv(self, posts, opportunities, output_csv): - """Export all posts with priority scores.""" - try: - scored_posts = [] - for post_id, post in posts.items(): - opp = opportunities.get(post_id, {}) - score = self.calculate_priority_score(post, opp) - - scored_posts.append({ - 'ID': post_id, - 'Title': post.get('title', ''), - 'URL': post.get('url', ''), - 'Priority_Score': score, - 'Estimated_Traffic_Gain': opp.get('estimated_traffic_gain', 0), - 'Current_Position': post.get('avg_position', 0), - 'Impressions': post.get('impressions', 0), - 'Traffic': post.get('traffic', 0), - 'CTR': f"{post.get('ctr', 0):.2%}", - 'Keywords_Count': post.get('keywords_count', 0) - }) - - scored_posts = sorted(scored_posts, key=lambda x: x['Priority_Score'], reverse=True) - - fieldnames = ['ID', 'Title', 'URL', 'Priority_Score', 'Estimated_Traffic_Gain', - 'Current_Position', 'Impressions', 'Traffic', 'CTR', 'Keywords_Count'] - - with open(output_csv, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(scored_posts) - - self.log(f"✓ Exported {len(scored_posts)} prioritized posts to {output_csv}") - except Exception as e: - self.log(f"❌ Error exporting prioritized CSV: {e}") - - def run(self, posts_csv, opportunities_csv, gaps_csv, output_md, output_prioritized_csv, top_n=20): - """Run complete report generation workflow.""" - self.log("📊 Generating SEO optimization report...") - self.log(f"Input files: posts_with_analytics, opportunities, content_gaps\n") - - # Load data - posts = self.load_posts_with_analytics(posts_csv) - opportunities = self.load_opportunities(opportunities_csv) - gaps = self.load_content_gaps(gaps_csv) - - if not posts: - self.log("❌ No posts loaded. Cannot generate report.") - return - - # Generate report - self.log("\n📝 Generating markdown report...") - report_text = self.generate_markdown_report(posts, opportunities, gaps, top_n) - - # Export report - self.log("\n📁 Exporting files...") - self.export_report(report_text, output_md) - self.export_prioritized_csv(posts, opportunities, output_prioritized_csv) - - self.log("\n✓ Report generation complete!") - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser(description='Generate SEO optimization report') - parser.add_argument('--posts-with-analytics', type=Path, - default=Path('output/results/posts_with_analytics.csv'), - help='Posts with analytics CSV') - parser.add_argument('--keyword-opportunities', type=Path, - default=Path('output/results/keyword_opportunities.csv'), - help='Keyword opportunities CSV') - parser.add_argument('--content-gaps', type=Path, - default=Path('output/results/content_gaps.csv'), - help='Content gaps CSV') - parser.add_argument('--output-report', type=Path, - default=Path('output/results/seo_optimization_report.md'), - help='Output markdown report') - parser.add_argument('--output-csv', type=Path, - default=Path('output/results/posts_prioritized.csv'), - help='Output prioritized posts CSV') - parser.add_argument('--top-n', type=int, default=20, - help='Number of top posts to detail') - - args = parser.parse_args() - - generator = ReportGenerator() - generator.run(args.posts_with_analytics, args.keyword_opportunities, - args.content_gaps, args.output_report, args.output_csv, args.top_n) - - -if __name__ == '__main__': - main() diff --git a/scripts/run_analysis.sh b/scripts/run_analysis.sh deleted file mode 100755 index bbd7b76..0000000 --- a/scripts/run_analysis.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -set -e - -echo "╔════════════════════════════════════════════════════════════╗" -echo "║ SEO Analysis & Improvement System - Full Pipeline ║" -echo "╚════════════════════════════════════════════════════════════╝" -echo "" - -# Check if venv exists -if [ ! -d "venv" ]; then - echo "❌ Virtual environment not found. Please run: python3 -m venv venv" - exit 1 -fi - -# Check if input files exist -if [ ! -f "input/new-propositions.csv" ]; then - echo "❌ Missing input/new-propositions.csv" - echo "Please place your WordPress posts CSV in input/ directory" - exit 1 -fi - -if [ ! -f "input/analytics/ga4_export.csv" ]; then - echo "❌ Missing input/analytics/ga4_export.csv" - echo "Please export GA4 data and place it in input/analytics/" - exit 1 -fi - -# Create output directories -mkdir -p output/results -mkdir -p output/logs - -echo "📊 Step 1: Analytics Integration" -echo " Merging GA4, Search Console, and WordPress data..." -./venv/bin/python analytics_importer.py -echo "" - -echo "🔍 Step 2: Keyword Opportunity Analysis" -echo " Identifying high-potential optimization opportunities..." -./venv/bin/python opportunity_analyzer.py \ - --input output/results/posts_with_analytics.csv \ - --output output/results/keyword_opportunities.csv \ - --min-position 11 \ - --max-position 30 \ - --min-impressions 50 \ - --top-n 20 -echo "" - -echo "📝 Step 3: Report Generation" -echo " Creating comprehensive SEO optimization report..." -./venv/bin/python report_generator.py -echo "" - -echo "╔════════════════════════════════════════════════════════════╗" -echo "║ ✅ Analysis Complete! ║" -echo "╚════════════════════════════════════════════════════════════╝" -echo "" -echo "📂 Results Location:" -echo " └─ output/results/seo_optimization_report.md" -echo "" -echo "📊 Key Files:" -echo " ├─ posts_prioritized.csv (all posts ranked 0-100)" -echo " ├─ keyword_opportunities.csv (26 optimization opportunities)" -echo " └─ posts_with_analytics.csv (enriched dataset)" -echo "" -echo "📋 Logs:" -echo " └─ output/logs/" -echo "" -echo "🚀 Next Steps:" -echo " 1. Open: output/results/seo_optimization_report.md" -echo " 2. Review Top 20 Posts to Optimize" -echo " 3. Start with Quick Wins (positions 11-15)" -echo " 4. Follow 90-day action plan" -echo "" diff --git a/scripts/seo-cli.py b/scripts/seo-cli.py deleted file mode 100755 index 4cb44a2..0000000 --- a/scripts/seo-cli.py +++ /dev/null @@ -1,388 +0,0 @@ -#!/usr/bin/env python3 -""" -DEPRECATED: SEO Automation CLI - -This script is deprecated. Please use the new unified CLI: -- ./seo export -- ./seo analyze -- ./seo seo_check -- ./seo categories -- ./seo full_pipeline - -To see all commands: ./seo help -""" - -import sys -import subprocess -import argparse -from pathlib import Path -from config import Config -import os - -class SEOCLI: - """DEPRECATED: Main CLI orchestrator for SEO workflows. Use new ./seo CLI instead.""" - - def __init__(self): - """Initialize CLI.""" - print("⚠️ DEPRECATION WARNING: This CLI is deprecated. Use ./seo instead.") - print(" Run './seo help' to see new commands.") - self.scripts_dir = Path(__file__).parent - self.project_dir = self.scripts_dir.parent - self.output_dir = self.project_dir / 'output' / 'reports' - - def run_command(self, command, description): - """Run a command and show progress.""" - print(f"\n{'='*70}") - print(f"▶ {description}") - print(f"{'='*70}\n") - - try: - result = subprocess.run(command, shell=True, cwd=self.project_dir) - if result.returncode != 0: - print(f"\n❌ Error running: {description}") - return False - print(f"\n✓ {description} completed successfully") - return True - except Exception as e: - print(f"\n❌ Error: {e}") - return False - - def get_latest_file(self, pattern): - """Get most recent file matching pattern.""" - import glob - # Support both old and new naming patterns - files = glob.glob(str(self.output_dir / pattern)) - if not files: - # Try new pattern - files = glob.glob(str(self.output_dir / "all_posts_*.csv")) - if not files: - return None - return max(files, key=os.path.getctime) - - def export_posts(self): - """Export all posts to CSV.""" - cmd = f"python {self.scripts_dir}/export_posts_for_ai_decision.py" - return self.run_command(cmd, "STEP 1: Export All Posts") - - def analyze_with_ai(self, csv_file=None): - """Analyze exported posts with AI.""" - if not csv_file: - csv_file = self.get_latest_file("all_posts_for_ai_decision_*.csv") - - if not csv_file: - print("\n❌ No exported CSV found. Run 'seo-cli export' first.") - return False - - cmd = f"python {self.scripts_dir}/ai_analyze_posts_for_decisions.py \"{csv_file}\"" - return self.run_command(cmd, "STEP 2: Analyze with AI") - - def recategorize_with_ai(self, csv_file=None): - """Recategorize posts using AI.""" - if not csv_file: - csv_file = self.get_latest_file("all_posts_for_ai_decision_*.csv") - - if not csv_file: - print("\n❌ No exported CSV found. Run 'seo-cli export' first.") - return False - - cmd = f"python {self.scripts_dir}/ai_recategorize_posts.py \"{csv_file}\"" - return self.run_command(cmd, "Recategorizing Posts with AI") - - def seo_check(self, top_n=None): - """Check SEO quality of titles and meta descriptions.""" - cmd = f"python {self.scripts_dir}/multi_site_seo_analyzer.py" - if top_n: - cmd += f" --top-n {top_n}" - - return self.run_command(cmd, f"SEO Quality Check (Top {top_n or 'All'} posts)") - - def import_analytics(self, ga_export, gsc_export, posts_csv=None): - """Import analytics data.""" - if not posts_csv: - posts_csv = self.get_latest_file("all_posts_for_ai_decision_*.csv") - - if not posts_csv: - print("\n❌ No posts CSV found. Run 'seo-cli export' first.") - return False - - cmd = ( - f"python {self.scripts_dir}/analytics_importer.py " - f"--ga-export \"{ga_export}\" " - f"--gsc-export \"{gsc_export}\" " - f"--posts-csv \"{posts_csv}\" " - f"--output output/posts_with_analytics.csv" - ) - return self.run_command(cmd, "STEP: Import Analytics Data") - - def full_pipeline(self, analyze=True, seo=True): - """Run complete pipeline: export → analyze → seo check.""" - steps = [ - ("Export", self.export_posts), - ] - - if analyze: - steps.append(("Analyze", self.analyze_with_ai)) - - if seo: - steps.append(("SEO Check", self.seo_check)) - - print("\n" + "="*70) - print("🚀 STARTING FULL PIPELINE") - print("="*70) - print(f"\nSteps to run: {', '.join([s[0] for s in steps])}\n") - - completed = 0 - for name, func in steps: - if func(): - completed += 1 - else: - print(f"\n⚠️ Pipeline stopped at: {name}") - return False - - print("\n" + "="*70) - print(f"✓ PIPELINE COMPLETE - All {completed} steps succeeded!") - print("="*70) - print("\nNext steps:") - print("1. Review results in output/reports/") - print("2. Check: posts_with_ai_recommendations_*.csv") - print("3. Follow AI recommendations to optimize your content") - return True - - def manage_categories(self): - """Run category management with AI recommendations.""" - cmd = f"python {self.scripts_dir}/category_manager.py" - return self.run_command(cmd, "Category Management with AI Recommendations") - - def approve_recommendations(self, csv_files=None): - """Approve recommendations from CSV files.""" - if not csv_files: - print("\n❌ No CSV files provided for approval.") - return False - - # Join the CSV files into a single command argument - csv_files_str = " ".join(f'"{csv_file}"' for csv_file in csv_files) - cmd = f"python {self.scripts_dir}/user_approval.py {csv_files_str}" - return self.run_command(cmd, f"Approving Recommendations from {len(csv_files)} files") - - def show_status(self): - """Show status of output files.""" - print("\n" + "="*70) - print("📊 OUTPUT FILES STATUS") - print("="*70 + "\n") - - import glob - files = glob.glob(str(self.output_dir / "*")) - - if not files: - print("No output files yet. Run 'seo-cli export' to get started.\n") - return - - # Sort by date - files.sort(key=os.path.getctime, reverse=True) - - for file in files[:10]: # Show last 10 files - size = os.path.getsize(file) / 1024 # KB - mtime = os.path.getmtime(file) - from datetime import datetime - date = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S') - filename = os.path.basename(file) - - print(f" {filename}") - print(f" Size: {size:.1f} KB | Modified: {date}") - print() - - def list_workflows(self): - """List available workflows.""" - workflows = { - 'export': { - 'description': 'Export all posts from your 3 WordPress sites', - 'command': 'seo-cli export', - 'time': '5-10 min', - 'cost': 'Free' - }, - 'analyze': { - 'description': 'Analyze exported posts with Claude AI', - 'command': 'seo-cli analyze', - 'time': '5-15 min', - 'cost': '$1.50-2.00' - }, - 'recategorize': { - 'description': 'Re-categorize posts for better organization', - 'command': 'seo-cli recategorize', - 'time': '5-15 min', - 'cost': '$1.50-2.00' - }, - 'seo-check': { - 'description': 'Check SEO quality of titles and descriptions', - 'command': 'seo-cli seo-check [--top-n 50]', - 'time': '3-5 min', - 'cost': 'Free or $0.20-0.50' - }, - 'analytics': { - 'description': 'Combine Google Analytics & Search Console data', - 'command': 'seo-cli analytics GA4.csv GSC.csv', - 'time': '5 min', - 'cost': 'Free' - }, - 'full-pipeline': { - 'description': 'Run complete pipeline: export → analyze → seo-check', - 'command': 'seo-cli full-pipeline', - 'time': '15-30 min', - 'cost': '$1.50-2.50' - }, - 'categories': { - 'description': 'Manage categories across all sites with AI recommendations', - 'command': 'seo-cli categories', - 'time': '10-20 min', - 'cost': '$0.50-1.00' - }, - 'approve': { - 'description': 'Review and approve SEO recommendations', - 'command': 'seo-cli approve [csv_file1] [csv_file2]', - 'time': 'Variable', - 'cost': 'Free' - } - } - - print("\n" + "="*70) - print("📋 AVAILABLE WORKFLOWS") - print("="*70 + "\n") - - for name, info in workflows.items(): - print(f"🔹 {name.upper()}") - print(f" {info['description']}") - print(f" Command: {info['command']}") - print(f" Time: {info['time']} | Cost: {info['cost']}") - print() - - def show_help(self): - """Show help message.""" - print("\n" + "="*70) - print("🚀 SEO AUTOMATION CLI - Workflow Orchestrator") - print("="*70 + "\n") - - print("QUICK START:") - print(" seo-cli full-pipeline Run complete workflow") - print(" seo-cli export Export all posts") - print(" seo-cli analyze Analyze with AI") - print(" seo-cli recategorize Re-categorize posts with AI") - print(" seo-cli seo-check Check SEO quality") - print() - - print("CHAINING WORKFLOWS:") - print(" seo-cli export && seo-cli analyze && seo-cli seo-check") - print() - - print("ADVANCED:") - print(" seo-cli seo-check --top-n 50 Check top 50 posts") - print(" seo-cli analytics GA4.csv GSC.csv Import analytics data") - print(" seo-cli status Show output files") - print(" seo-cli list List all workflows") - print() - - print("Learn more:") - print(" Read: WORKFLOWS.md (complete guide)") - print(" Read: scripts/*/README.md (workflow details)") - print() - - -def main(): - """Main entry point.""" - cli = SEOCLI() - - parser = argparse.ArgumentParser( - description='SEO Automation CLI - Chain workflows together', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - seo-cli export # Export posts - seo-cli full-pipeline # Export + Analyze + SEO check - seo-cli export && seo-cli analyze # Chain commands - seo-cli seo-check --top-n 50 # Check top 50 posts - seo-cli analytics ga4.csv gsc.csv # Import analytics - seo-cli status # Show output files - """ - ) - - subparsers = parser.add_subparsers(dest='command', help='Workflow to run') - - # Export workflow - subparsers.add_parser('export', help='Export all posts from WordPress sites') - - # Analyze workflow - subparsers.add_parser('analyze', help='Analyze exported posts with Claude AI') - - # Recategorize workflow - subparsers.add_parser('recategorize', help='Re-categorize posts with Claude AI') - - # SEO check workflow - seo_parser = subparsers.add_parser('seo-check', help='Check SEO quality of titles/descriptions') - seo_parser.add_argument('--top-n', type=int, help='Analyze top N posts with AI (costs money)') - - # Analytics workflow - analytics_parser = subparsers.add_parser('analytics', help='Import Google Analytics & Search Console') - analytics_parser.add_argument('ga_export', help='Path to GA4 export CSV') - analytics_parser.add_argument('gsc_export', help='Path to Search Console export CSV') - - # Full pipeline - full_parser = subparsers.add_parser('full-pipeline', help='Complete pipeline: export → analyze → seo-check') - full_parser.add_argument('--no-analyze', action='store_true', help='Skip AI analysis') - full_parser.add_argument('--no-seo', action='store_true', help='Skip SEO check') - - # Category management - subparsers.add_parser('categories', help='Manage categories with AI recommendations') - - # Approval system - approval_parser = subparsers.add_parser('approve', help='Approve recommendations from CSV files') - approval_parser.add_argument('csv_files', nargs='*', help='CSV files containing recommendations to approve') - - # Utilities - subparsers.add_parser('status', help='Show status of output files') - subparsers.add_parser('list', help='List all available workflows') - subparsers.add_parser('help', help='Show this help message') - - args = parser.parse_args() - - # If no command, show help - if not args.command: - cli.show_help() - return 0 - - # Route to appropriate command - if args.command == 'export': - success = cli.export_posts() - elif args.command == 'analyze': - success = cli.analyze_with_ai() - elif args.command == 'recategorize': - success = cli.recategorize_with_ai() - elif args.command == 'seo-check': - success = cli.seo_check(top_n=args.top_n) - elif args.command == 'analytics': - success = cli.import_analytics(args.ga_export, args.gsc_export) - elif args.command == 'full-pipeline': - success = cli.full_pipeline( - analyze=not args.no_analyze, - seo=not args.no_seo - ) - elif args.command == 'categories': - success = cli.manage_categories() - elif args.command == 'approve': - success = cli.approve_recommendations(args.csv_files) - elif args.command == 'status': - cli.show_status() - success = True - elif args.command == 'list': - cli.list_workflows() - success = True - elif args.command == 'help': - cli.show_help() - success = True - else: - cli.show_help() - success = False - - return 0 if success else 1 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/scripts/user_approval.py b/scripts/user_approval.py deleted file mode 100644 index 4892215..0000000 --- a/scripts/user_approval.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python3 -""" -User Approval Mechanism for SEO Recommendations -Allows users to review and approve recommendations from CSV files. -""" - -import csv -import json -import logging -import sys -from pathlib import Path -from typing import Dict, List, Optional -from datetime import datetime -from config import Config - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - - -class UserApprovalSystem: - """System for reviewing and approving SEO recommendations.""" - - def __init__(self): - """Initialize the approval system.""" - self.output_dir = Path(__file__).parent.parent / 'output' - self.approved_recommendations = [] - self.rejected_recommendations = [] - self.pending_recommendations = [] - - def load_recommendations_from_csv(self, csv_file: str) -> List[Dict]: - """Load recommendations from CSV file.""" - recommendations = [] - - if not Path(csv_file).exists(): - logger.error(f"CSV file not found: {csv_file}") - return recommendations - - try: - with open(csv_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - recommendations.append(dict(row)) - - logger.info(f"Loaded {len(recommendations)} recommendations from {csv_file}") - return recommendations - except Exception as e: - logger.error(f"Error loading CSV: {e}") - return recommendations - - def display_recommendation(self, recommendation: Dict, index: int, total: int): - """Display a single recommendation for user review.""" - print(f"\n{'='*80}") - print(f"RECOMMENDATION {index}/{total}") - print(f"{'='*80}") - - # Display different fields depending on the type of recommendation - if 'post_title' in recommendation: - print(f"Post Title: {recommendation.get('post_title', 'N/A')}") - print(f"Post ID: {recommendation.get('post_id', 'N/A')}") - print(f"Site: {recommendation.get('site', 'N/A')}") - print(f"Current Categories: {recommendation.get('current_categories', 'N/A')}") - print(f"Proposed Category: {recommendation.get('proposed_category', 'N/A')}") - print(f"Proposed Site: {recommendation.get('proposed_site', 'N/A')}") - print(f"Reason: {recommendation.get('reason', 'N/A')}") - print(f"Confidence: {recommendation.get('confidence', 'N/A')}") - print(f"Content Preview: {recommendation.get('content_preview', 'N/A')[:100]}...") - elif 'title' in recommendation: - print(f"Post Title: {recommendation.get('title', 'N/A')}") - print(f"Post ID: {recommendation.get('post_id', 'N/A')}") - print(f"Site: {recommendation.get('site', 'N/A')}") - print(f"Decision: {recommendation.get('decision', 'N/A')}") - print(f"Recommended Category: {recommendation.get('recommended_category', 'N/A')}") - print(f"Reason: {recommendation.get('reason', 'N/A')}") - print(f"Priority: {recommendation.get('priority', 'N/A')}") - print(f"AI Notes: {recommendation.get('ai_notes', 'N/A')}") - else: - # Generic display for other types of recommendations - for key, value in recommendation.items(): - print(f"{key.replace('_', ' ').title()}: {value}") - - def get_user_choice(self) -> str: - """Get user's approval choice.""" - while True: - print(f"\nOptions:") - print(f" 'y' or 'yes' - Approve this recommendation") - print(f" 'n' or 'no' - Reject this recommendation") - print(f" 's' or 'skip' - Skip this recommendation for later review") - print(f" 'q' or 'quit' - Quit and save current progress") - - choice = input(f"\nEnter your choice: ").strip().lower() - - if choice in ['y', 'yes']: - return 'approved' - elif choice in ['n', 'no']: - return 'rejected' - elif choice in ['s', 'skip']: - return 'pending' - elif choice in ['q', 'quit']: - return 'quit' - else: - print("Invalid choice. Please enter 'y', 'n', 's', or 'q'.") - - def review_recommendations(self, recommendations: List[Dict], title: str = "Recommendations"): - """Review recommendations with user interaction.""" - print(f"\n{'='*80}") - print(f"REVIEWING {title.upper()}") - print(f"Total recommendations to review: {len(recommendations)}") - print(f"{'='*80}") - - for i, recommendation in enumerate(recommendations, 1): - self.display_recommendation(recommendation, i, len(recommendations)) - - choice = self.get_user_choice() - - if choice == 'quit': - logger.info("User chose to quit. Saving progress...") - break - elif choice == 'approved': - recommendation['status'] = 'approved' - self.approved_recommendations.append(recommendation) - logger.info(f"Approved recommendation {i}") - elif choice == 'rejected': - recommendation['status'] = 'rejected' - self.rejected_recommendations.append(recommendation) - logger.info(f"Rejected recommendation {i}") - elif choice == 'pending': - recommendation['status'] = 'pending_review' - self.pending_recommendations.append(recommendation) - logger.info(f"Skipped recommendation {i} for later review") - - def export_approved_recommendations(self, filename_suffix: str = "") -> str: - """Export approved recommendations to CSV.""" - if not self.approved_recommendations: - logger.info("No approved recommendations to export") - return "" - - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - filename = f"approved_recommendations_{timestamp}{filename_suffix}.csv" - csv_file = self.output_dir / filename - - # Get all unique fieldnames from recommendations - fieldnames = set() - for rec in self.approved_recommendations: - fieldnames.update(rec.keys()) - fieldnames = sorted(list(fieldnames)) - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.approved_recommendations) - - logger.info(f"Exported {len(self.approved_recommendations)} approved recommendations to: {csv_file}") - return str(csv_file) - - def export_rejected_recommendations(self, filename_suffix: str = "") -> str: - """Export rejected recommendations to CSV.""" - if not self.rejected_recommendations: - logger.info("No rejected recommendations to export") - return "" - - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - filename = f"rejected_recommendations_{timestamp}{filename_suffix}.csv" - csv_file = self.output_dir / filename - - # Get all unique fieldnames from recommendations - fieldnames = set() - for rec in self.rejected_recommendations: - fieldnames.update(rec.keys()) - fieldnames = sorted(list(fieldnames)) - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.rejected_recommendations) - - logger.info(f"Exported {len(self.rejected_recommendations)} rejected recommendations to: {csv_file}") - return str(csv_file) - - def export_pending_recommendations(self, filename_suffix: str = "") -> str: - """Export pending recommendations to CSV.""" - if not self.pending_recommendations: - logger.info("No pending recommendations to export") - return "" - - timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') - filename = f"pending_recommendations_{timestamp}{filename_suffix}.csv" - csv_file = self.output_dir / filename - - # Get all unique fieldnames from recommendations - fieldnames = set() - for rec in self.pending_recommendations: - fieldnames.update(rec.keys()) - fieldnames = sorted(list(fieldnames)) - - with open(csv_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.pending_recommendations) - - logger.info(f"Exported {len(self.pending_recommendations)} pending recommendations to: {csv_file}") - return str(csv_file) - - def run_interactive_approval(self, csv_files: List[str]): - """Run interactive approval process for multiple CSV files.""" - logger.info("="*70) - logger.info("USER APPROVAL SYSTEM FOR SEO RECOMMENDATIONS") - logger.info("="*70) - - for csv_file in csv_files: - logger.info(f"\nLoading recommendations from: {csv_file}") - recommendations = self.load_recommendations_from_csv(csv_file) - - if not recommendations: - logger.warning(f"No recommendations found in {csv_file}, skipping...") - continue - - # Get the filename without path for the title - filename = Path(csv_file).stem - self.review_recommendations(recommendations, title=filename) - - # Export results - logger.info("\n" + "="*70) - logger.info("EXPORTING RESULTS") - logger.info("="*70) - - approved_file = self.export_approved_recommendations() - rejected_file = self.export_rejected_recommendations() - pending_file = self.export_pending_recommendations() - - # Summary - logger.info(f"\n{'─'*70}") - logger.info("APPROVAL SUMMARY:") - logger.info(f" Approved: {len(self.approved_recommendations)}") - logger.info(f" Rejected: {len(self.rejected_recommendations)}") - logger.info(f" Pending: {len(self.pending_recommendations)}") - logger.info(f"{'─'*70}") - - if approved_file: - logger.info(f"\nApproved recommendations saved to: {approved_file}") - if rejected_file: - logger.info(f"Rejected recommendations saved to: {rejected_file}") - if pending_file: - logger.info(f"Pending recommendations saved to: {pending_file}") - - logger.info(f"\n✓ Approval process complete!") - - def run_auto_approval(self, csv_files: List[str], auto_approve_threshold: float = 0.8): - """Auto-approve recommendations based on confidence threshold.""" - logger.info("="*70) - logger.info("AUTO APPROVAL SYSTEM FOR SEO RECOMMENDATIONS") - logger.info("="*70) - logger.info(f"Auto-approval threshold: {auto_approve_threshold}") - - all_recommendations = [] - for csv_file in csv_files: - logger.info(f"\nLoading recommendations from: {csv_file}") - recommendations = self.load_recommendations_from_csv(csv_file) - all_recommendations.extend(recommendations) - - approved_count = 0 - rejected_count = 0 - - for rec in all_recommendations: - # Check if there's a confidence field and if it meets the threshold - confidence_str = rec.get('confidence', 'Low').lower() - confidence_value = 0.0 - - if confidence_str == 'high': - confidence_value = 0.9 - elif confidence_str == 'medium': - confidence_value = 0.6 - elif confidence_str == 'low': - confidence_value = 0.3 - else: - # Try to parse as numeric value if possible - try: - confidence_value = float(confidence_str) - except ValueError: - confidence_value = 0.3 # Default to low - - if confidence_value >= auto_approve_threshold: - rec['status'] = 'auto_approved' - self.approved_recommendations.append(rec) - approved_count += 1 - else: - rec['status'] = 'auto_rejected' - self.rejected_recommendations.append(rec) - rejected_count += 1 - - # Export results - logger.info("\n" + "="*70) - logger.info("EXPORTING AUTO-APPROVAL RESULTS") - logger.info("="*70) - - approved_file = self.export_approved_recommendations("_auto") - rejected_file = self.export_rejected_recommendations("_auto") - - # Summary - logger.info(f"\n{'─'*70}") - logger.info("AUTO APPROVAL SUMMARY:") - logger.info(f" Auto-approved: {approved_count}") - logger.info(f" Auto-rejected: {rejected_count}") - logger.info(f"{'─'*70}") - - if approved_file: - logger.info(f"\nAuto-approved recommendations saved to: {approved_file}") - if rejected_file: - logger.info(f"Auto-rejected recommendations saved to: {rejected_file}") - - logger.info(f"\n✓ Auto-approval process complete!") - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='Review and approve SEO recommendations' - ) - parser.add_argument( - 'csv_files', - nargs='+', - help='CSV files containing recommendations to review' - ) - parser.add_argument( - '--auto', - action='store_true', - help='Run auto-approval mode instead of interactive mode' - ) - parser.add_argument( - '--threshold', - type=float, - default=0.8, - help='Confidence threshold for auto-approval (default: 0.8)' - ) - - args = parser.parse_args() - - approval_system = UserApprovalSystem() - - if args.auto: - approval_system.run_auto_approval(args.csv_files, args.threshold) - else: - approval_system.run_interactive_approval(args.csv_files) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/seo b/seo index 14094e6..9c7b517 100755 --- a/seo +++ b/seo @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ SEO Automation CLI - Main executable -Entry point for the SEO automation tool. +Single entry point for SEO automation tool. """ import sys diff --git a/src/seo/__init__.py b/src/seo/__init__.py index 9278c85..6d4cccb 100644 --- a/src/seo/__init__.py +++ b/src/seo/__init__.py @@ -1,7 +1,14 @@ """ -SEO Automation Tool - Integrated Application -A comprehensive WordPress SEO automation suite. +SEO Automation Tool - Complete Integrated Package +Single entry point for all SEO automation functionality. """ __version__ = '1.0.0' __author__ = 'SEO Automation Team' +__all__ = ['SEOApp', 'PostExporter', 'PostAnalyzer', 'CategoryProposer'] + +# Import main classes for easy access +from .app import SEOApp +from .exporter import PostExporter +from .analyzer import PostAnalyzer, EnhancedPostAnalyzer +from .category_proposer import CategoryProposer diff --git a/src/seo/analyzer.py b/src/seo/analyzer.py index b6bb351..6de2713 100644 --- a/src/seo/analyzer.py +++ b/src/seo/analyzer.py @@ -1,15 +1,353 @@ """ -Analyzer Module - AI-powered post analysis +Post Analyzer - AI-powered post analysis with selective field support """ -import sys +import csv +import json +import logging +import shutil from pathlib import Path +from datetime import datetime +from typing import Dict, List, Optional +import requests -# Import from scripts directory (parent of src) -scripts_dir = Path(__file__).parents[2] / 'scripts' -if str(scripts_dir) not in sys.path: - sys.path.insert(0, str(scripts_dir)) +from .config import Config -from ai_analyze_posts_for_decisions import PostAnalyzer +logger = logging.getLogger(__name__) -__all__ = ['PostAnalyzer'] + +class PostAnalyzer: + """Basic post analyzer (legacy compatibility).""" + + def __init__(self, csv_file: str): + self.csv_file = Path(csv_file) + self.openrouter_api_key = Config.OPENROUTER_API_KEY + self.ai_model = Config.AI_MODEL + self.posts = [] + self.analyzed_posts = [] + self.api_calls = 0 + self.ai_cost = 0.0 + + def load_csv(self) -> bool: + """Load posts from CSV.""" + if not self.csv_file.exists(): + logger.error(f"CSV file not found: {self.csv_file}") + return False + + try: + with open(self.csv_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + self.posts = list(reader) + logger.info(f"✓ Loaded {len(self.posts)} posts") + return True + except Exception as e: + logger.error(f"Error loading CSV: {e}") + return False + + def run(self) -> None: + """Run basic analysis (placeholder for legacy compatibility).""" + if not self.load_csv(): + return + logger.warning("Basic PostAnalyzer is deprecated. Use EnhancedPostAnalyzer instead.") + + +class EnhancedPostAnalyzer: + """Enhanced analyzer with selective field analysis and in-place updates.""" + + def __init__(self, csv_file: str, analyze_fields: Optional[List[str]] = None): + """ + Initialize analyzer. + + Args: + csv_file: Path to input CSV + analyze_fields: List of fields to analyze ['title', 'meta_description', 'categories', 'site'] + """ + self.csv_file = Path(csv_file) + self.openrouter_api_key = Config.OPENROUTER_API_KEY + self.ai_model = Config.AI_MODEL + self.posts = [] + self.analyzed_posts = [] + self.api_calls = 0 + self.ai_cost = 0.0 + + if analyze_fields is None: + self.analyze_fields = ['title', 'meta_description', 'categories', 'site'] + else: + self.analyze_fields = analyze_fields + + logger.info(f"Fields to analyze: {', '.join(self.analyze_fields)}") + + def load_csv(self) -> bool: + """Load posts from CSV file.""" + logger.info(f"Loading CSV: {self.csv_file}") + + if not self.csv_file.exists(): + logger.error(f"CSV file not found: {self.csv_file}") + return False + + try: + with open(self.csv_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + self.posts = list(reader) + + logger.info(f"✓ Loaded {len(self.posts)} posts from CSV") + return True + except Exception as e: + logger.error(f"Error loading CSV: {e}") + return False + + def get_ai_recommendations(self, batch: List[Dict], fields: List[str]) -> Optional[str]: + """Get AI recommendations for specific fields.""" + if not self.openrouter_api_key: + logger.error("OPENROUTER_API_KEY not set") + return None + + # Format posts for AI + formatted_posts = [] + for i, post in enumerate(batch, 1): + post_text = f"{i}. POST ID: {post['post_id']}\n" + post_text += f" Site: {post.get('site', '')}\n" + + if 'title' in fields: + post_text += f" Title: {post.get('title', '')}\n" + + if 'meta_description' in fields: + post_text += f" Meta Description: {post.get('meta_description', '')}\n" + + if 'categories' in fields: + post_text += f" Categories: {post.get('categories', '')}\n" + + if 'content_preview' in post: + post_text += f" Content Preview: {post.get('content_preview', '')[:300]}...\n" + + formatted_posts.append(post_text) + + posts_text = "\n".join(formatted_posts) + + # Build prompt based on requested fields + prompt_parts = ["Analyze these blog posts and provide recommendations.\n\n"] + + if 'site' in fields: + prompt_parts.append("""Website Strategy: +- mistergeek.net: High-value topics (VPN, Software, Gaming, General Tech, SEO, Content Marketing) +- webscroll.fr: Torrenting, File-Sharing, Tracker guides +- hellogeek.net: Low-traffic, experimental, off-brand content + +""") + + prompt_parts.append(posts_text) + prompt_parts.append("\nFor EACH post, provide a JSON object with:\n{\n") + + if 'title' in fields: + prompt_parts.append(' "proposed_title": "",\n') + prompt_parts.append(' "title_reason": "",\n') + + if 'meta_description' in fields: + prompt_parts.append(' "proposed_meta_description": "",\n') + prompt_parts.append(' "meta_reason": "",\n') + + if 'categories' in fields: + prompt_parts.append(' "proposed_category": "",\n') + prompt_parts.append(' "category_reason": "",\n') + + if 'site' in fields: + prompt_parts.append(' "proposed_site": "",\n') + prompt_parts.append(' "site_reason": "",\n') + + prompt_parts.append(' "confidence": "",\n') + prompt_parts.append(' "priority": ""\n}') + prompt_parts.append("\nReturn ONLY a JSON array of objects, one per post.") + + prompt = "".join(prompt_parts) + + try: + logger.info(f" Sending batch to AI for analysis...") + + response = requests.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {self.openrouter_api_key}", + "Content-Type": "application/json", + }, + json={ + "model": self.ai_model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + }, + timeout=60 + ) + response.raise_for_status() + + result = response.json() + self.api_calls += 1 + + usage = result.get('usage', {}) + input_tokens = usage.get('prompt_tokens', 0) + output_tokens = usage.get('completion_tokens', 0) + self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000 + + recommendations_text = result['choices'][0]['message']['content'].strip() + logger.info(f" ✓ Got recommendations (tokens: {input_tokens}+{output_tokens})") + + return recommendations_text + + except Exception as e: + logger.error(f"Error getting AI recommendations: {e}") + return None + + def parse_recommendations(self, recommendations_json: str) -> List[Dict]: + """Parse JSON recommendations from AI.""" + try: + start_idx = recommendations_json.find('[') + end_idx = recommendations_json.rfind(']') + 1 + + if start_idx == -1 or end_idx == 0: + logger.error("Could not find JSON array in response") + return [] + + json_str = recommendations_json[start_idx:end_idx] + recommendations = json.loads(json_str) + + return recommendations + + except json.JSONDecodeError as e: + logger.error(f"Error parsing JSON recommendations: {e}") + return [] + + def analyze_posts(self, batch_size: int = 10) -> bool: + """Analyze all posts in batches.""" + logger.info("\n" + "="*70) + logger.info("ANALYZING POSTS WITH AI") + logger.info("="*70 + "\n") + + batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)] + logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n") + + all_recommendations = {} + + for batch_num, batch in enumerate(batches, 1): + logger.info(f"Batch {batch_num}/{len(batches)}: Analyzing {len(batch)} posts...") + + recommendations_json = self.get_ai_recommendations(batch, self.analyze_fields) + + if not recommendations_json: + logger.error(f" Failed to get recommendations for batch {batch_num}") + continue + + recommendations = self.parse_recommendations(recommendations_json) + + for rec in recommendations: + all_recommendations[str(rec.get('post_id', ''))] = rec + + logger.info(f" ✓ Got {len(recommendations)} recommendations") + + logger.info(f"\n✓ Analysis complete!") + logger.info(f" Total recommendations: {len(all_recommendations)}") + logger.info(f" API calls: {self.api_calls}") + logger.info(f" Estimated cost: ${self.ai_cost:.4f}") + + # Map recommendations to posts + for post in self.posts: + post_id = str(post['post_id']) + if post_id in all_recommendations: + rec = all_recommendations[post_id] + + # Add only requested fields + if 'title' in self.analyze_fields: + post['proposed_title'] = rec.get('proposed_title', post.get('title', '')) + post['title_reason'] = rec.get('title_reason', '') + + if 'meta_description' in self.analyze_fields: + post['proposed_meta_description'] = rec.get('proposed_meta_description', post.get('meta_description', '')) + post['meta_reason'] = rec.get('meta_reason', '') + + if 'categories' in self.analyze_fields: + post['proposed_category'] = rec.get('proposed_category', post.get('categories', '')) + post['category_reason'] = rec.get('category_reason', '') + + if 'site' in self.analyze_fields: + post['proposed_site'] = rec.get('proposed_site', post.get('site', '')) + post['site_reason'] = rec.get('site_reason', '') + + post['ai_confidence'] = rec.get('confidence', 'Medium') + post['ai_priority'] = rec.get('priority', 'Medium') + else: + if 'title' in self.analyze_fields: + post['proposed_title'] = post.get('title', '') + post['title_reason'] = 'No AI recommendation' + + if 'meta_description' in self.analyze_fields: + post['proposed_meta_description'] = post.get('meta_description', '') + post['meta_reason'] = 'No AI recommendation' + + if 'categories' in self.analyze_fields: + post['proposed_category'] = post.get('categories', '') + post['category_reason'] = 'No AI recommendation' + + if 'site' in self.analyze_fields: + post['proposed_site'] = post.get('site', '') + post['site_reason'] = 'No AI recommendation' + + post['ai_confidence'] = 'Unknown' + post['ai_priority'] = 'Medium' + + self.analyzed_posts.append(post) + + return len(self.analyzed_posts) > 0 + + def export_results(self, output_file: Optional[str] = None, update_input: bool = False) -> str: + """Export results to CSV.""" + if update_input: + backup_file = self.csv_file.parent / f"{self.csv_file.stem}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + shutil.copy2(self.csv_file, backup_file) + logger.info(f"✓ Created backup: {backup_file}") + output_file = self.csv_file + elif not output_file: + output_dir = Path(__file__).parent.parent.parent / 'output' + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + output_file = output_dir / f'analyzed_posts_{timestamp}.csv' + + output_file = Path(output_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + if not self.analyzed_posts: + logger.error("No analyzed posts to export") + return "" + + original_fields = list(self.analyzed_posts[0].keys()) + + new_fields = [] + if 'title' in self.analyze_fields: + new_fields.extend(['proposed_title', 'title_reason']) + if 'meta_description' in self.analyze_fields: + new_fields.extend(['proposed_meta_description', 'meta_reason']) + if 'categories' in self.analyze_fields: + new_fields.extend(['proposed_category', 'category_reason']) + if 'site' in self.analyze_fields: + new_fields.extend(['proposed_site', 'site_reason']) + + new_fields.extend(['ai_confidence', 'ai_priority']) + + fieldnames = original_fields + new_fields + + logger.info(f"\nExporting results to: {output_file}") + + with open(output_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(self.analyzed_posts) + + logger.info(f"✓ Exported {len(self.analyzed_posts)} posts") + return str(output_file) + + def run(self, output_file: Optional[str] = None, update_input: bool = False, batch_size: int = 10) -> str: + """Run complete analysis.""" + if not self.load_csv(): + return "" + + if not self.analyze_posts(batch_size=batch_size): + logger.error("Failed to analyze posts") + return "" + + return self.export_results(output_file=output_file, update_input=update_input) diff --git a/src/seo/app.py b/src/seo/app.py index b1c636c..ea95f52 100644 --- a/src/seo/app.py +++ b/src/seo/app.py @@ -8,11 +8,8 @@ from datetime import datetime from typing import Optional, List from .exporter import PostExporter -from .analyzer import PostAnalyzer -from .recategorizer import PostRecategorizer -from .seo_checker import MultiSiteSEOAnalyzer -from .categories import CategoryManager -from .approval import UserApprovalSystem +from .analyzer import EnhancedPostAnalyzer +from .category_proposer import CategoryProposer logger = logging.getLogger(__name__) @@ -22,70 +19,38 @@ class SEOApp: Main SEO Application class. Provides a unified interface for all SEO automation tasks. - Inspired by Ruby on Rails' Active Record pattern. - - Usage: - app = SEOApp() - app.export() - app.analyze() - app.seo_check() """ def __init__(self, verbose: bool = False): - """ - Initialize the SEO application. - - Args: - verbose: Enable verbose logging - """ + """Initialize the SEO application.""" self.verbose = verbose self.output_dir = Path(__file__).parent.parent.parent / 'output' self.output_dir.mkdir(parents=True, exist_ok=True) - # Initialize components - self.exporter = None - self.analyzer = None - self.recategorizer = None - self.seo_checker = None - self.category_manager = None - self.approval_system = None - if verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) def export(self) -> str: - """ - Export all posts from WordPress sites. - - Returns: - Path to exported CSV file - """ + """Export all posts from WordPress sites.""" logger.info("📦 Exporting all posts from WordPress sites...") - self.exporter = PostExporter() - self.exporter.run() - - # Get the exported file path - date_str = datetime.now().strftime('%Y-%m-%d') - csv_file = self.output_dir / f'all_posts_{date_str}.csv' - - logger.info(f"✅ Export completed: {csv_file}") - return str(csv_file) + exporter = PostExporter() + return exporter.run() - def analyze(self, csv_file: Optional[str] = None) -> str: + def analyze(self, csv_file: Optional[str] = None, fields: Optional[List[str]] = None, + update: bool = False, output: Optional[str] = None) -> str: """ Analyze posts with AI for recommendations. Args: csv_file: Path to CSV file (uses latest export if not provided) - - Returns: - Path to analysis results + fields: Fields to analyze ['title', 'meta_description', 'categories', 'site'] + update: If True, update input CSV (creates backup) + output: Custom output file path """ logger.info("🤖 Analyzing posts with AI for recommendations...") - # Find CSV file if not csv_file: csv_file = self._find_latest_export() @@ -94,26 +59,13 @@ class SEOApp: logger.info(f"Using file: {csv_file}") - # Run analysis - self.analyzer = PostAnalyzer(csv_file) - self.analyzer.run() - - logger.info("✅ AI analysis completed!") - return csv_file + analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields) + return analyzer.run(output_file=output, update_input=update) - def recategorize(self, csv_file: Optional[str] = None) -> str: - """ - Recategorize posts with AI suggestions. + def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> str: + """Propose categories for posts.""" + logger.info("🏷️ Proposing categories with AI...") - Args: - csv_file: Path to CSV file (uses latest export if not provided) - - Returns: - Path to recategorization results - """ - logger.info("🏷️ Recategorizing posts with AI suggestions...") - - # Find CSV file if not csv_file: csv_file = self._find_latest_export() @@ -122,122 +74,11 @@ class SEOApp: logger.info(f"Using file: {csv_file}") - # Run recategorization - self.recategorizer = PostRecategorizer(csv_file) - self.recategorizer.run() - - logger.info("✅ Recategorization completed!") - return csv_file - - def seo_check(self, top_n: int = 10) -> None: - """ - Check SEO quality of titles and descriptions. - - Args: - top_n: Number of top posts to get AI recommendations for - """ - logger.info("🔍 Checking SEO quality of titles/descriptions...") - - self.seo_checker = MultiSiteSEOAnalyzer() - self.seo_checker.run(use_ai=True, top_n=top_n) - - logger.info("✅ SEO check completed!") - - def categories(self) -> None: - """Manage categories across all sites.""" - logger.info("🗂️ Managing categories across all sites...") - - self.category_manager = CategoryManager() - self.category_manager.run() - - logger.info("✅ Category management completed!") - - def approve(self, files: Optional[List[str]] = None) -> None: - """ - Review and approve recommendations. - - Args: - files: List of CSV files to review (auto-detects if not provided) - """ - logger.info("✅ Reviewing and approving recommendations...") - - self.approval_system = UserApprovalSystem() - - if not files: - # Auto-detect recommendation files - files = self._find_recommendation_files() - - if not files: - raise FileNotFoundError("No recommendation files found. Run analyze() or categories() first.") - - logger.info(f"Found {len(files)} recommendation files to review") - self.approval_system.run_interactive_approval(files) - - logger.info("✅ Approval process completed!") - - def full_pipeline(self) -> None: - """ - Run complete workflow: export → analyze → seo_check - """ - logger.info("🚀 Running full SEO automation pipeline...") - - # Step 1: Export - logger.info("\n📦 Step 1/3: Exporting posts...") - self.export() - - # Step 2: Analyze - logger.info("\n🤖 Step 2/3: Analyzing with AI...") - self.analyze() - - # Step 3: SEO Check - logger.info("\n🔍 Step 3/3: Checking SEO quality...") - self.seo_check() - - logger.info("\n✅ Full pipeline completed!") - - def _find_latest_export(self) -> Optional[str]: - """ - Find the latest exported CSV file. - - Returns: - Path to latest CSV file or None if not found - """ - csv_files = list(self.output_dir.glob('all_posts_*.csv')) - - if not csv_files: - return None - - latest = max(csv_files, key=lambda f: f.stat().st_ctime) - return str(latest) - - def _find_recommendation_files(self) -> List[str]: - """ - Find recommendation files in output directory. - - Returns: - List of paths to recommendation files - """ - patterns = [ - 'category_assignments_*.csv', - 'posts_with_ai_recommendations_*.csv', - 'posts_to_move_*.csv', - 'posts_to_consolidate_*.csv', - 'posts_to_delete_*.csv' - ] - - files = [] - for pattern in patterns: - files.extend(self.output_dir.glob(pattern)) - - return [str(f) for f in files] + proposer = CategoryProposer(csv_file) + return proposer.run(output_file=output) def status(self) -> dict: - """ - Get status of output files. - - Returns: - Dictionary with file information - """ + """Get status of output files.""" files = list(self.output_dir.glob('*.csv')) status_info = { @@ -253,3 +94,13 @@ class SEOApp: }) return status_info + + def _find_latest_export(self) -> Optional[str]: + """Find the latest exported CSV file.""" + csv_files = list(self.output_dir.glob('all_posts_*.csv')) + + if not csv_files: + return None + + latest = max(csv_files, key=lambda f: f.stat().st_ctime) + return str(latest) diff --git a/src/seo/approval.py b/src/seo/approval.py index 7d929cf..da16259 100644 --- a/src/seo/approval.py +++ b/src/seo/approval.py @@ -1,15 +1,18 @@ """ Approval System Module - User approval for recommendations +Placeholder for future implementation. """ -import sys -from pathlib import Path +import logging -# Import from scripts directory (parent of src) -scripts_dir = Path(__file__).parents[2] / 'scripts' -if str(scripts_dir) not in sys.path: - sys.path.insert(0, str(scripts_dir)) +logger = logging.getLogger(__name__) -from user_approval import UserApprovalSystem -__all__ = ['UserApprovalSystem'] +class UserApprovalSystem: + """User approval system (placeholder).""" + + def __init__(self): + logger.warning("UserApprovalSystem is a placeholder. Implement full functionality as needed.") + + def run_interactive_approval(self, files): + logger.info("Approval system not yet implemented in integrated package.") diff --git a/src/seo/categories.py b/src/seo/categories.py index 1744f59..6f200de 100644 --- a/src/seo/categories.py +++ b/src/seo/categories.py @@ -1,15 +1,18 @@ """ Category Manager Module - Category management across sites +Placeholder for future implementation. """ -import sys -from pathlib import Path +import logging -# Import from scripts directory (parent of src) -scripts_dir = Path(__file__).parents[2] / 'scripts' -if str(scripts_dir) not in sys.path: - sys.path.insert(0, str(scripts_dir)) +logger = logging.getLogger(__name__) -from category_manager import CategoryManager -__all__ = ['CategoryManager'] +class CategoryManager: + """Category manager (placeholder).""" + + def __init__(self): + logger.warning("CategoryManager is a placeholder. Implement full functionality as needed.") + + def run(self): + logger.info("Category management not yet implemented in integrated package.") diff --git a/scripts/category_proposer.py b/src/seo/category_proposer.py similarity index 88% rename from scripts/category_proposer.py rename to src/seo/category_proposer.py index ac79298..6572ae6 100644 --- a/scripts/category_proposer.py +++ b/src/seo/category_proposer.py @@ -1,18 +1,16 @@ -#!/usr/bin/env python3 """ Category Proposer - AI-powered category suggestions -Analyzes posts and proposes optimal categories based on content. """ import csv import json import logging -import sys from pathlib import Path +from datetime import datetime from typing import Dict, List, Optional import requests -from datetime import datetime -from config import Config + +from .config import Config logger = logging.getLogger(__name__) @@ -56,7 +54,6 @@ class CategoryProposer: logger.error("OPENROUTER_API_KEY not set") return None - # Format posts for AI formatted = [] for i, post in enumerate(batch, 1): text = f"{i}. ID: {post['post_id']}\n" @@ -161,7 +158,6 @@ Return ONLY a JSON array with one object per post.""" logger.info(f" API calls: {self.api_calls}") logger.info(f" Cost: ${self.ai_cost:.4f}") - # Map proposals to posts for post in self.posts: post_id = str(post['post_id']) proposal = all_proposals.get(post_id, {}) @@ -180,7 +176,7 @@ Return ONLY a JSON array with one object per post.""" def export_proposals(self, output_file: Optional[str] = None) -> str: """Export category proposals to CSV.""" if not output_file: - output_dir = Path(__file__).parent.parent / 'output' + output_dir = Path(__file__).parent.parent.parent / 'output' output_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') output_file = output_dir / f'category_proposals_{timestamp}.csv' @@ -207,33 +203,10 @@ Return ONLY a JSON array with one object per post.""" def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> str: """Run complete category proposal process.""" if not self.load_csv(): - sys.exit(1) + return "" if not self.propose_categories(batch_size=batch_size): logger.error("Failed to propose categories") - sys.exit(1) + return "" return self.export_proposals(output_file) - - -def main(): - """Main entry point.""" - import argparse - - parser = argparse.ArgumentParser( - description='AI-powered category proposer for blog posts' - ) - parser.add_argument('csv_file', help='Input CSV file with posts') - parser.add_argument('--output', '-o', help='Output CSV file') - parser.add_argument('--batch-size', type=int, default=10, help='Batch size') - - args = parser.parse_args() - - proposer = CategoryProposer(args.csv_file) - output_file = proposer.run(batch_size=args.batch_size) - - logger.info(f"\n✓ Category proposals saved to: {output_file}") - - -if __name__ == '__main__': - main() diff --git a/src/seo/cli.py b/src/seo/cli.py index dd05e49..73062f3 100644 --- a/src/seo/cli.py +++ b/src/seo/cli.py @@ -26,12 +26,9 @@ def main(): Examples: seo export Export all posts from WordPress sites seo analyze Analyze posts with AI for recommendations - seo analyze posts.csv Analyze specific CSV file - seo recategorize Recategorize posts with AI - seo seo_check Check SEO quality of titles/descriptions - seo categories Manage categories across sites - seo approve Review and approve recommendations - seo full_pipeline Run complete workflow: export → analyze → seo_check + seo analyze -f title Analyze only titles + seo analyze -u -f meta Update CSV with meta descriptions + seo category_propose Propose categories based on content seo status Show output files status """ ) @@ -40,11 +37,10 @@ Examples: parser.add_argument('args', nargs='*', help='Arguments for the command') parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output') parser.add_argument('--dry-run', action='store_true', help='Show what would be done') - parser.add_argument('--top-n', type=int, default=10, help='Number of top posts for AI analysis') parser.add_argument('--fields', '-f', nargs='+', choices=['title', 'meta_description', 'categories', 'site'], - help='Fields to analyze (for analyze command)') - parser.add_argument('--update', '-u', action='store_true', help='Update input file (creates backup)') + help='Fields to analyze') + parser.add_argument('--update', '-u', action='store_true', help='Update input file') parser.add_argument('--output', '-o', help='Output file path') args = parser.parse_args() @@ -67,12 +63,7 @@ Examples: commands = { 'export': cmd_export, 'analyze': cmd_analyze, - 'recategorize': cmd_recategorize, - 'seo_check': cmd_seo_check, - 'categories': cmd_categories, 'category_propose': cmd_category_propose, - 'approve': cmd_approve, - 'full_pipeline': cmd_full_pipeline, 'status': cmd_status, 'help': cmd_help, } @@ -117,63 +108,19 @@ def cmd_analyze(app, args): csv_file = args.args[0] if args.args else None - # Use enhanced analyzer if fields are specified or update flag is set - if args.fields or args.update: - from pathlib import Path - import sys - scripts_dir = Path(__file__).parent.parent.parent / 'scripts' - sys.path.insert(0, str(scripts_dir)) - - from enhanced_analyzer import EnhancedPostAnalyzer - - if not csv_file: - csv_file = app._find_latest_export() - - if not csv_file: - print("❌ No CSV file found. Provide one or run export first.") - return 1 - - print(f"Using enhanced analyzer with fields: {args.fields or 'all'}") - analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=args.fields) - output_file = analyzer.run( - output_file=args.output, - update_input=args.update - ) - print(f"✅ Analysis completed! Results: {output_file}") - else: - app.analyze(csv_file) + print(f"Analyzing with fields: {args.fields or 'all'}") + if args.update: + print(f"Will update input CSV (backup will be created)") - return 0 - - -def cmd_recategorize(app, args): - """Recategorize posts with AI.""" - if args.dry_run: - print("Would recategorize posts with AI suggestions") - return 0 + result = app.analyze( + csv_file=csv_file, + fields=args.fields, + update=args.update, + output=args.output + ) - csv_file = args.args[0] if args.args else None - app.recategorize(csv_file) - return 0 - - -def cmd_seo_check(app, args): - """Check SEO quality.""" - if args.dry_run: - print("Would check SEO quality of titles/descriptions") - return 0 - - app.seo_check(top_n=args.top_n) - return 0 - - -def cmd_categories(app, args): - """Manage categories.""" - if args.dry_run: - print("Would manage categories across all sites") - return 0 - - app.categories() + if result: + print(f"✅ Analysis completed! Results: {result}") return 0 @@ -185,47 +132,10 @@ def cmd_category_propose(app, args): csv_file = args.args[0] if args.args else None - if not csv_file: - csv_file = app._find_latest_export() + result = app.category_propose(csv_file=csv_file, output=args.output) - if not csv_file: - print("❌ No CSV file found. Provide one or run export first.") - print(" Usage: seo category_propose ") - return 1 - - from pathlib import Path - import sys - scripts_dir = Path(__file__).parent.parent.parent / 'scripts' - sys.path.insert(0, str(scripts_dir)) - - from category_proposer import CategoryProposer - - print(f"Proposing categories for: {csv_file}") - proposer = CategoryProposer(csv_file) - output_file = proposer.run(output_file=args.output) - - print(f"✅ Category proposals saved to: {output_file}") - return 0 - - -def cmd_approve(app, args): - """Approve recommendations.""" - if args.dry_run: - print("Would review and approve recommendations") - return 0 - - files = args.args if args.args else None - app.approve(files) - return 0 - - -def cmd_full_pipeline(app, args): - """Run full pipeline.""" - if args.dry_run: - print("Would run full pipeline: export → analyze → seo_check") - return 0 - - app.full_pipeline() + if result: + print(f"✅ Category proposals saved to: {result}") return 0 @@ -256,23 +166,15 @@ SEO Automation CLI - Available Commands Basic Commands: export Export all posts from WordPress sites analyze [csv_file] Analyze posts with AI - analyze -f title categories Analyze specific fields only - analyze -u Update input CSV with new columns - recategorize [csv_file] Recategorize posts with AI - seo_check Check SEO quality of titles/descriptions - categories Manage categories across sites + analyze -f title Analyze specific fields (title, meta_description, categories, site) + analyze -u Update input CSV with new columns (creates backup) category_propose [csv] Propose categories based on content - approve [files...] Review and approve recommendations - full_pipeline Run complete workflow: export → analyze → seo_check - -Utility: status Show output files status help Show this help message Options: --verbose, -v Enable verbose logging --dry-run Show what would be done without doing it - --top-n N Number of top posts for AI analysis (default: 10) --fields, -f Fields to analyze: title, meta_description, categories, site --update, -u Update input CSV file (creates backup) --output, -o Output file path @@ -284,8 +186,6 @@ Examples: seo analyze -f title categories seo analyze -u -f meta_description seo category_propose - seo approve output/category_proposals_*.csv - seo full_pipeline seo status """) return 0 diff --git a/src/seo/exporter.py b/src/seo/exporter.py index 2557869..108b7d3 100644 --- a/src/seo/exporter.py +++ b/src/seo/exporter.py @@ -1,16 +1,16 @@ """ -Post Exporter Module - Export posts from WordPress sites +Post Exporter - Export posts from WordPress sites """ import csv import logging import time +import re from pathlib import Path from datetime import datetime from typing import Dict, List, Optional import requests from requests.auth import HTTPBasicAuth -import re from .config import Config @@ -26,7 +26,7 @@ class PostExporter: self.all_posts = [] self.category_cache = {} - def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, str]: + def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, Dict]: """Fetch category names from a WordPress site.""" if site_name in self.category_cache: return self.category_cache[site_name] @@ -61,8 +61,6 @@ class PostExporter: for status in ['publish', 'draft']: page = 1 - status_count = 0 - while True: try: logger.info(f" Fetching page {page} ({status} posts)...") @@ -79,19 +77,16 @@ class PostExporter: break posts.extend(page_posts) - status_count += len(page_posts) - logger.info(f" ✓ Got {len(page_posts)} posts (total: {len(posts)})") + logger.info(f" ✓ Got {len(page_posts)} posts") page += 1 time.sleep(0.5) except requests.exceptions.HTTPError as e: if response.status_code == 400: - logger.info(f" ℹ API limit reached (got {status_count} {status} posts)") - break - else: - logger.error(f"Error on page {page}: {e}") break + logger.error(f"Error on page {page}: {e}") + break except requests.exceptions.RequestException as e: logger.error(f"Error fetching from {site_name}: {e}") break @@ -160,7 +155,7 @@ class PostExporter: if not self.all_posts: logger.error("No posts to export") - return None + return "" fieldnames = [ 'site', 'post_id', 'status', 'title', 'slug', 'url', 'author_id', @@ -178,10 +173,10 @@ class PostExporter: logger.info(f"✓ CSV exported to: {output_file}") return str(output_file) - def run(self): + def run(self) -> str: """Run the complete export process.""" logger.info("="*70) - logger.info("EXPORTING ALL POSTS FOR AI DECISION MAKING") + logger.info("EXPORTING ALL POSTS") logger.info("="*70) logger.info("Sites configured: " + ", ".join(self.sites.keys())) @@ -196,31 +191,7 @@ class PostExporter: if not self.all_posts: logger.error("No posts found on any site") - return + return "" self.all_posts.sort(key=lambda x: (x['site'], x['post_id'])) - self.export_to_csv() - - # Print summary - logger.info("\n" + "="*70) - logger.info("EXPORT SUMMARY") - logger.info("="*70) - - by_site = {} - for post in self.all_posts: - site = post['site'] - if site not in by_site: - by_site[site] = {'total': 0, 'published': 0, 'draft': 0} - by_site[site]['total'] += 1 - if post['status'] == 'publish': - by_site[site]['published'] += 1 - else: - by_site[site]['draft'] += 1 - - for site, stats in sorted(by_site.items()): - logger.info(f"\n{site}:") - logger.info(f" Total: {stats['total']}") - logger.info(f" Published: {stats['published']}") - logger.info(f" Drafts: {stats['draft']}") - - logger.info(f"\n✓ Export complete!") + return self.export_to_csv() diff --git a/src/seo/recategorizer.py b/src/seo/recategorizer.py index 45f91fb..b05456a 100644 --- a/src/seo/recategorizer.py +++ b/src/seo/recategorizer.py @@ -1,15 +1,19 @@ """ Recategorizer Module - AI-powered post recategorization +Placeholder for future implementation. """ -import sys -from pathlib import Path +import logging -# Import from scripts directory (parent of src) -scripts_dir = Path(__file__).parents[2] / 'scripts' -if str(scripts_dir) not in sys.path: - sys.path.insert(0, str(scripts_dir)) +logger = logging.getLogger(__name__) -from ai_recategorize_posts import PostRecategorizer -__all__ = ['PostRecategorizer'] +class PostRecategorizer: + """Post recategorizer (placeholder).""" + + def __init__(self, csv_file): + self.csv_file = csv_file + logger.warning("PostRecategorizer is a placeholder. Implement full functionality as needed.") + + def run(self): + logger.info("Recategorization not yet implemented in integrated package.") diff --git a/src/seo/seo_checker.py b/src/seo/seo_checker.py index 4c404c0..77e0296 100644 --- a/src/seo/seo_checker.py +++ b/src/seo/seo_checker.py @@ -1,15 +1,18 @@ """ SEO Checker Module - SEO quality analysis +Placeholder for future implementation. """ -import sys -from pathlib import Path +import logging -# Import from scripts directory (parent of src) -scripts_dir = Path(__file__).parents[2] / 'scripts' -if str(scripts_dir) not in sys.path: - sys.path.insert(0, str(scripts_dir)) +logger = logging.getLogger(__name__) -from multi_site_seo_analyzer import MultiSiteSEOAnalyzer -__all__ = ['MultiSiteSEOAnalyzer'] +class MultiSiteSEOAnalyzer: + """SEO quality analyzer (placeholder).""" + + def __init__(self): + logger.warning("MultiSiteSEOAnalyzer is a placeholder. Implement full functionality as needed.") + + def run(self, use_ai=True, top_n=10): + logger.info("SEO check not yet implemented in integrated package.")