Add enhanced analysis with selective field analysis and category proposer
New Features:
- Selective field analysis: choose which fields to analyze (title, meta_description, categories, site)
- In-place CSV updates: update the input CSV with new columns (an automatic backup is created)
- Category proposer: dedicated command for AI-powered category suggestions

New Commands:
- `seo analyze -f title categories`: analyze specific fields only
- `seo analyze -u`: update input CSV with recommendations
- `seo category_propose`: propose categories based on content

New Scripts:
- enhanced_analyzer.py: enhanced AI analyzer with selective analysis
- category_proposer.py: dedicated category proposal tool

CLI Options:
- `--fields, -f`: specify fields to analyze
- `--update, -u`: update input CSV (creates backup)
- `--output, -o`: custom output file path

Output Columns:
- proposed_title, title_reason (for title analysis)
- proposed_meta_description, meta_reason (for meta analysis)
- proposed_category, category_reason (for category analysis)
- proposed_site, site_reason (for site analysis)
- ai_confidence, ai_priority (common to all)

Documentation:
- ENHANCED_ANALYSIS_GUIDE.md: complete guide with examples

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
239
scripts/category_proposer.py
Normal file
239
scripts/category_proposer.py
Normal file
@@ -0,0 +1,239 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Category Proposer - AI-powered category suggestions
|
||||
Analyzes posts and proposes optimal categories based on content.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from config import Config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CategoryProposer:
    """Propose categories for posts using AI.

    Reads posts from a CSV file, sends them in batches to the OpenRouter
    chat-completions API for category suggestions, and exports the proposals
    (with alternatives, reasons, and confidence) to an output CSV.
    """

    def __init__(self, csv_file: str):
        """Initialize proposer with CSV file.

        Args:
            csv_file: Path to the input CSV. Rows are expected to carry a
                ``post_id`` column; ``title``, ``categories``, ``site`` and
                ``content_preview`` are used when present.
        """
        self.csv_file = Path(csv_file)
        self.openrouter_api_key = Config.OPENROUTER_API_KEY
        self.ai_model = Config.AI_MODEL
        self.posts: List[Dict] = []                # rows loaded from the input CSV
        self.proposed_categories: List[Dict] = []  # rows enriched with AI proposals
        self.api_calls = 0                         # number of OpenRouter requests made
        self.ai_cost = 0.0                         # estimated spend in USD

    def load_csv(self) -> bool:
        """Load posts from CSV.

        Returns:
            True if the file exists and was parsed, False otherwise.
        """
        logger.info("Loading CSV: %s", self.csv_file)

        if not self.csv_file.exists():
            logger.error("CSV file not found: %s", self.csv_file)
            return False

        try:
            with open(self.csv_file, 'r', encoding='utf-8') as f:
                self.posts = list(csv.DictReader(f))
        except Exception as e:
            logger.error("Error loading CSV: %s", e)
            return False

        logger.info("✓ Loaded %d posts", len(self.posts))
        return True

    def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
        """Get AI category proposals for a batch of posts.

        Args:
            batch: Post dicts to analyze in a single API call.

        Returns:
            The raw model response text (expected to contain a JSON array),
            or None if the API key is missing or the request failed.
        """
        if not self.openrouter_api_key:
            logger.error("OPENROUTER_API_KEY not set")
            return None

        # Format posts for the AI prompt. Use .get() throughout so a row
        # missing a column degrades to an empty string instead of KeyError
        # (the original indexed post['post_id'] directly).
        formatted = []
        for i, post in enumerate(batch, 1):
            text = f"{i}. ID: {post.get('post_id', '')}\n"
            text += f" Title: {post.get('title', '')}\n"
            text += f" Current Categories: {post.get('categories', '')}\n"
            if 'content_preview' in post:
                text += f" Content: {post['content_preview'][:300]}...\n"
            formatted.append(text)

        posts_text = "\n".join(formatted)

        prompt = f"""Analyze these blog posts and propose optimal categories.

{posts_text}

For EACH post, provide:
{{
"post_id": <id>,
"current_categories": "<current>",
"proposed_category": "<best category>",
"alternative_categories": ["<alt1>", "<alt2>"],
"reason": "<brief explanation>",
"confidence": "<High|Medium|Low>"
}}

Return ONLY a JSON array with one object per post."""

        try:
            logger.info("  Getting category proposals...")

            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.openrouter_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.ai_model,
                    "messages": [{"role": "user", "content": prompt}],
                    # Low temperature for more consistent JSON output.
                    "temperature": 0.3,
                },
                timeout=60
            )
            response.raise_for_status()

            result = response.json()
            self.api_calls += 1

            # Track estimated cost. Rates assume $3 / $15 per 1M
            # input/output tokens — NOTE(review): confirm this matches the
            # configured AI_MODEL's actual pricing.
            usage = result.get('usage', {})
            input_tokens = usage.get('prompt_tokens', 0)
            output_tokens = usage.get('completion_tokens', 0)
            self.ai_cost += (input_tokens * 3 + output_tokens * 15) / 1_000_000

            logger.info("  ✓ Got proposals (tokens: %s+%s)", input_tokens, output_tokens)
            return result['choices'][0]['message']['content'].strip()

        except Exception as e:
            logger.error("Error getting proposals: %s", e)
            return None

    def parse_proposals(self, proposals_json: str) -> List[Dict]:
        """Parse JSON proposals.

        Extracts the first ``[...]`` span from the model output (models often
        wrap JSON in prose or markdown fences) and decodes it.

        Args:
            proposals_json: Raw model response text.

        Returns:
            The decoded list of proposal dicts, or [] when no valid JSON
            array could be found.
        """
        # Robustness: tolerate None/empty input instead of raising.
        if not proposals_json:
            return []

        start_idx = proposals_json.find('[')
        end_idx = proposals_json.rfind(']') + 1

        if start_idx == -1 or end_idx == 0:
            return []

        try:
            parsed = json.loads(proposals_json[start_idx:end_idx])
        except json.JSONDecodeError:
            return []

        # Defend against a well-formed but non-array payload.
        return parsed if isinstance(parsed, list) else []

    def propose_categories(self, batch_size: int = 10) -> bool:
        """Propose categories for all posts.

        Args:
            batch_size: Number of posts per API call.

        Returns:
            True once processing finishes. Failed batches are skipped;
            their posts fall back to their current categories.
        """
        logger.info("\n" + "=" * 70)
        logger.info("PROPOSING CATEGORIES WITH AI")
        logger.info("=" * 70 + "\n")

        batches = [self.posts[i:i + batch_size]
                   for i in range(0, len(self.posts), batch_size)]
        logger.info("Processing %d posts in %d batches...\n",
                    len(self.posts), len(batches))

        all_proposals: Dict[str, Dict] = {}

        for batch_num, batch in enumerate(batches, 1):
            logger.info("Batch %d/%d...", batch_num, len(batches))

            proposals_json = self.get_category_proposals(batch)
            if not proposals_json:
                # API failure for this batch: skip it rather than abort the run.
                continue

            proposals = self.parse_proposals(proposals_json)

            # Key by post_id (stringified) so proposals can be re-joined
            # to the original rows below.
            for prop in proposals:
                all_proposals[str(prop.get('post_id', ''))] = prop

            logger.info("  ✓ Got %d proposals", len(proposals))

        logger.info("\n✓ Proposals complete!")
        logger.info("  Total: %d", len(all_proposals))
        logger.info("  API calls: %d", self.api_calls)
        logger.info("  Cost: $%.4f", self.ai_cost)

        # Map proposals back onto posts; a post without a proposal keeps its
        # current categories. .get() on post_id avoids KeyError on odd rows.
        for post in self.posts:
            post_id = str(post.get('post_id', ''))
            proposal = all_proposals.get(post_id, {})

            self.proposed_categories.append({
                **post,
                'proposed_category': proposal.get('proposed_category', post.get('categories', '')),
                'alternative_categories': ', '.join(proposal.get('alternative_categories', [])),
                'category_reason': proposal.get('reason', ''),
                'category_confidence': proposal.get('confidence', 'Medium'),
                'current_categories': post.get('categories', '')
            })

        return True

    def export_proposals(self, output_file: Optional[str] = None) -> str:
        """Export category proposals to CSV.

        Args:
            output_file: Destination path; when omitted, a timestamped file
                is created under ``<repo>/output/``.

        Returns:
            The path of the written CSV as a string.
        """
        if not output_file:
            output_dir = Path(__file__).parent.parent / 'output'
            output_dir.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            output_file = output_dir / f'category_proposals_{timestamp}.csv'

        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        fieldnames = [
            'post_id', 'title', 'site', 'current_categories',
            'proposed_category', 'alternative_categories',
            'category_reason', 'category_confidence'
        ]

        logger.info("\nExporting to: %s", output_file)

        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            # extrasaction='ignore' silently drops original CSV columns that
            # are not part of the report schema above.
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.proposed_categories)

        logger.info("✓ Exported %d proposals", len(self.proposed_categories))
        return str(output_file)

    def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> str:
        """Run complete category proposal process.

        NOTE: exits the process via sys.exit(1) on load/proposal failure —
        this class is CLI-oriented; don't call run() from library code that
        must survive failures.

        Args:
            output_file: Optional destination CSV path.
            batch_size: Number of posts per API call.

        Returns:
            The path of the exported CSV.
        """
        if not self.load_csv():
            sys.exit(1)

        if not self.propose_categories(batch_size=batch_size):
            logger.error("Failed to propose categories")
            sys.exit(1)

        return self.export_proposals(output_file)
|
||||
|
||||
|
||||
def main():
    """Main entry point: parse CLI arguments and run the proposer."""
    import argparse

    parser = argparse.ArgumentParser(
        description='AI-powered category proposer for blog posts'
    )
    parser.add_argument('csv_file', help='Input CSV file with posts')
    parser.add_argument('--output', '-o', help='Output CSV file')
    parser.add_argument('--batch-size', type=int, default=10, help='Batch size')

    args = parser.parse_args()

    proposer = CategoryProposer(args.csv_file)
    # Bug fix: --output was parsed but never forwarded to run(), so the
    # option was silently ignored and a timestamped default was always used.
    output_file = proposer.run(output_file=args.output, batch_size=args.batch_size)

    logger.info("\n✓ Category proposals saved to: %s", output_file)
|
||||
|
||||
|
||||
# Script entry point: only run the CLI when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user