Refactor category_propose to align with editorial strategy
Major refactoring of the category proposal system.

### Changes:
- Integrated editorial strategy into category proposals
- Added site migration recommendations to category proposals
- AI now considers editorial lines when suggesting categories
- Automatic detection of posts that should migrate between sites

### New Features:
- Editorial line definitions for each site
- Topic-based site matching algorithm
- Migration recommendations alongside category proposals
- Dual output: category_proposals_*.csv + migration_recommendations_*.csv

### Editorial Lines:
- mistergeek.net: VPN, Software, Gaming, SEO, Tech (high-value)
- webscroll.fr: Torrenting, File-Sharing, Tracker Guides (niche)
- hellogeek.net: Experimental, Low-Traffic, Off-Brand (catch-all)

### Output Files:
1. category_proposals_*.csv - Categories + site recommendations
2. migration_recommendations_*.csv - Posts to migrate between sites

### CSV Columns Added:
- recommended_site - Best site for the post
- should_migrate - Yes/No flag
- migration_reason - Why migration is recommended
- current_site - Original site for comparison

### Benefits:
- Categories aligned with site strategy
- Automatic migration detection
- Smarter AI prompts with editorial context
- Unified category + migration workflow

Usage:
    ./seo category_propose  # Generates both category and migration files

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
@@ -5,7 +5,7 @@ SEO Application Core - Integrated SEO automation functionality
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Tuple
|
||||
|
||||
from .exporter import PostExporter
|
||||
from .analyzer import EnhancedPostAnalyzer
|
||||
@@ -64,9 +64,18 @@ class SEOApp:
|
||||
analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
|
||||
return analyzer.run(output_file=output, update_input=update)
|
||||
|
||||
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> str:
|
||||
"""Propose categories for posts."""
|
||||
logger.info("🏷️ Proposing categories with AI...")
|
||||
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
|
||||
"""
|
||||
Propose categories for posts with editorial strategy alignment.
|
||||
|
||||
Args:
|
||||
csv_file: Path to CSV file (uses latest export if not provided)
|
||||
output: Custom output file path
|
||||
|
||||
Returns:
|
||||
Tuple of (proposals_file, migrations_file)
|
||||
"""
|
||||
logger.info("🏷️ Proposing categories with AI (editorial strategy aligned)...")
|
||||
|
||||
if not csv_file:
|
||||
csv_file = self._find_latest_export()
|
||||
@@ -76,7 +85,7 @@ class SEOApp:
|
||||
|
||||
logger.info(f"Using file: {csv_file}")
|
||||
|
||||
proposer = CategoryProposer(csv_file)
|
||||
proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
|
||||
return proposer.run(output_file=output)
|
||||
|
||||
def category_apply(self, proposals_csv: str, site_name: str,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""
|
||||
Category Proposer - AI-powered category suggestions
|
||||
Category Proposer - AI-powered category suggestions with editorial strategy alignment
|
||||
Proposes categories based on content AND site editorial lines
|
||||
"""
|
||||
|
||||
import csv
|
||||
@@ -7,26 +8,74 @@ import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import requests
|
||||
|
||||
from .config import Config
|
||||
from .editorial_strategy import EditorialStrategyAnalyzer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CategoryProposer:
|
||||
"""Propose categories for posts using AI."""
|
||||
# Editorial line definitions for each site.
# Each entry carries a short 'focus' description, the 'ideal_categories'
# for the site, and 'topic_keywords' mapping a topic name to the lowercase
# keywords that are searched for in a post's text.
EDITORIAL_LINES = {
    'mistergeek.net': {
        'focus': 'High-value tech content',
        'ideal_categories': [
            'VPN',
            'Software/Tools',
            'Gaming',
            'SEO',
            'Content Marketing',
            'Tech Reviews',
            'Tutorials',
        ],
        'topic_keywords': {
            'VPN': ['vpn', 'proxy', 'privacy', 'security', 'encryption'],
            'Software': ['software', 'app', 'tool', 'download', 'install'],
            'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'],
            'SEO': ['seo', 'ranking', 'google', 'search', 'optimization'],
            'Tech': ['tech', 'technology', 'review', 'device', 'hardware'],
        },
    },
    'webscroll.fr': {
        'focus': 'Torrenting and file-sharing niche',
        'ideal_categories': [
            'Torrenting',
            'File-Sharing',
            'Tracker Guides',
            'VPN for Torrenting',
            'Seedbox',
        ],
        'topic_keywords': {
            'Torrenting': ['torrent', 'download', 'upload', 'tracker', 'seed'],
            'File-Sharing': ['file-sharing', 'ddl', 'hosting', 'upload'],
            'Tracker Guides': ['tracker', 'ratio', 'invite', 'private'],
        },
    },
    'hellogeek.net': {
        'focus': 'Low-traffic, experimental, off-brand content',
        'ideal_categories': [
            'Experimental',
            'Low-Traffic',
            'Off-Brand',
            'Testing',
        ],
        # No keywords: this site is the catch-all for everything else.
        'topic_keywords': {},
    },
}
|
||||
|
||||
def __init__(self, csv_file: str):
|
||||
"""Initialize proposer with CSV file."""
|
||||
|
||||
class CategoryProposer:
|
||||
"""Propose categories for posts using AI with editorial strategy alignment."""
|
||||
|
||||
def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
    """
    Set up a proposer for one exported posts CSV.

    Args:
        csv_file: Path to CSV file
        use_editorial_strategy: If True, align proposals with editorial lines
    """
    # Input file and strategy switch
    self.csv_file = Path(csv_file)
    self.use_editorial_strategy = use_editorial_strategy

    # AI backend settings, read from the shared Config
    self.openrouter_api_key = Config.OPENROUTER_API_KEY
    self.ai_model = Config.AI_MODEL

    # Working data, filled in as the run progresses
    self.posts = []
    self.proposed_categories = []
    self.migration_recommendations = []
    self.site_analysis = {}

    # Usage accounting for the AI calls
    self.api_calls = 0
    self.ai_cost = 0.0
|
||||
|
||||
def load_csv(self) -> bool:
|
||||
"""Load posts from CSV."""
|
||||
@@ -48,15 +97,98 @@ class CategoryProposer:
|
||||
logger.error(f"Error loading CSV: {e}")
|
||||
return False
|
||||
|
||||
def analyze_editorial_strategy(self) -> Dict:
    """
    Run the editorial strategy analysis over this proposer's CSV.

    Returns:
        The result of ``EditorialStrategyAnalyzer.analyze_site_content()``,
        which is also stored on ``self.site_analysis``; an empty dict when
        editorial strategy is disabled.
    """
    if self.use_editorial_strategy:
        logger.info("\n📊 Analyzing editorial strategy to inform category proposals...")

        strategy_analyzer = EditorialStrategyAnalyzer()
        strategy_analyzer.load_csv(str(self.csv_file))
        self.site_analysis = strategy_analyzer.analyze_site_content()

        logger.info("✓ Editorial strategy analysis complete")
        return self.site_analysis

    return {}
|
||||
|
||||
def determine_best_site_for_post(self, post: Dict) -> Tuple[str, str, float]:
    """
    Determine the best site for a post based on its content.

    The post's title and content preview are lower-cased and scanned for the
    keyword lists in ``EDITORIAL_LINES``. Every topic with at least one
    keyword hit adds one point to that site's score, and the post's current
    site receives a 0.5 bonus so that ties resolve in favour of not moving.

    Note: keywords are matched as plain substrings, so short keywords can
    also hit inside longer words.

    Args:
        post: Post row. The 'title', 'content_preview' and 'site' keys are
            read; missing or None values are treated as empty strings.

    Returns:
        Tuple of (site_name, reason, confidence). When a site scores at
        least 1, confidence is score / 3 capped at 1.0. Otherwise the post
        stays on mistergeek.net or webscroll.fr (confidence 0.5) or falls
        back to hellogeek.net (confidence 0.4).
    """
    # Values can be None as well as missing (csv.DictReader, for example,
    # pads short rows with None), so coerce before building the search text.
    title_text = post.get('title') or ''
    preview_text = post.get('content_preview') or ''
    searchable = f"{title_text} {preview_text}".lower()
    current_site = post.get('site') or ''

    # Score every site by the number of topics with a keyword hit
    site_scores = {}
    for site_name, editorial in EDITORIAL_LINES.items():
        matched_topics = [
            topic
            for topic, keywords in editorial['topic_keywords'].items()
            if any(kw in searchable for kw in keywords)
        ]
        score = len(matched_topics)

        # Bonus for staying on current site (avoid unnecessary moves)
        if site_name == current_site:
            score += 0.5

        site_scores[site_name] = {'score': score, 'topics': matched_topics}

    # max() keeps the first site in definition order when scores are equal
    best_name, best = max(site_scores.items(), key=lambda item: item[1]['score'])

    if best['score'] >= 1:
        return (
            best_name,
            f"Content matches {best_name} editorial line ({', '.join(best['topics'])})",
            min(1.0, best['score'] / 3.0),  # Normalize confidence
        )

    # No strong match: keep on the current site, or move to the catch-all
    if current_site in ('mistergeek.net', 'webscroll.fr'):
        return (current_site, "Keep on current site (no better match)", 0.5)
    return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
|
||||
|
||||
def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
|
||||
"""Get AI category proposals for a batch of posts."""
|
||||
"""Get AI category proposals with editorial strategy context."""
|
||||
if not self.openrouter_api_key:
|
||||
logger.error("OPENROUTER_API_KEY not set")
|
||||
return None
|
||||
|
||||
# Build editorial context
|
||||
editorial_context = ""
|
||||
if self.use_editorial_strategy:
|
||||
editorial_context = """
|
||||
EDITORIAL STRATEGY GUIDELINES:
|
||||
|
||||
mistergeek.net (High-value tech):
|
||||
- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
|
||||
- Focus: Professional, high-traffic tech content
|
||||
|
||||
webscroll.fr (Torrenting niche):
|
||||
- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
|
||||
- Focus: Torrenting and file-sharing only
|
||||
|
||||
hellogeek.net (Catch-all):
|
||||
- Categories: Experimental, Low-Traffic, Off-Brand, Testing
|
||||
- Focus: Everything else, low-traffic content
|
||||
|
||||
"""
|
||||
|
||||
# Format posts for AI
|
||||
formatted = []
|
||||
for i, post in enumerate(batch, 1):
|
||||
text = f"{i}. ID: {post['post_id']}\n"
|
||||
text += f" Site: {post.get('site', '')}\n"
|
||||
text += f" Title: {post.get('title', '')}\n"
|
||||
text += f" Current Categories: {post.get('categories', '')}\n"
|
||||
if 'content_preview' in post:
|
||||
@@ -67,22 +199,28 @@ class CategoryProposer:
|
||||
|
||||
prompt = f"""Analyze these blog posts and propose optimal categories.
|
||||
|
||||
{editorial_context}
|
||||
POSTS TO ANALYZE:
|
||||
|
||||
{posts_text}
|
||||
|
||||
For EACH post, provide:
|
||||
{{
|
||||
"post_id": <id>,
|
||||
"current_categories": "<current>",
|
||||
"proposed_category": "<best category>",
|
||||
"proposed_category": "<best category from editorial lines above>",
|
||||
"alternative_categories": ["<alt1>", "<alt2>"],
|
||||
"reason": "<brief explanation>",
|
||||
"confidence": "<High|Medium|Low>"
|
||||
"recommended_site": "<best site for this post>",
|
||||
"reason": "<brief explanation referencing editorial strategy>",
|
||||
"confidence": "<High|Medium|Low>",
|
||||
"should_migrate": <true/false>,
|
||||
"migration_reason": "<reason if should_migrate is true>"
|
||||
}}
|
||||
|
||||
Return ONLY a JSON array with one object per post."""
|
||||
|
||||
try:
|
||||
logger.info(f" Getting category proposals...")
|
||||
logger.info(f" Getting category proposals with editorial alignment...")
|
||||
|
||||
response = requests.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
@@ -129,11 +267,15 @@ Return ONLY a JSON array with one object per post."""
|
||||
return []
|
||||
|
||||
def propose_categories(self, batch_size: int = 10) -> bool:
|
||||
"""Propose categories for all posts."""
|
||||
"""Propose categories with editorial strategy alignment."""
|
||||
logger.info("\n" + "="*70)
|
||||
logger.info("PROPOSING CATEGORIES WITH AI")
|
||||
logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
|
||||
logger.info("="*70 + "\n")
|
||||
|
||||
# Analyze editorial strategy first
|
||||
if self.use_editorial_strategy:
|
||||
self.analyze_editorial_strategy()
|
||||
|
||||
batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)]
|
||||
logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n")
|
||||
|
||||
@@ -158,23 +300,68 @@ Return ONLY a JSON array with one object per post."""
|
||||
logger.info(f" API calls: {self.api_calls}")
|
||||
logger.info(f" Cost: ${self.ai_cost:.4f}")
|
||||
|
||||
# Process proposals with editorial alignment
|
||||
for post in self.posts:
|
||||
post_id = str(post['post_id'])
|
||||
proposal = all_proposals.get(post_id, {})
|
||||
|
||||
self.proposed_categories.append({
|
||||
|
||||
current_site = post.get('site', '')
|
||||
|
||||
# Get AI recommendation or use editorial strategy
|
||||
proposed_category = proposal.get('proposed_category', post.get('categories', ''))
|
||||
recommended_site = proposal.get('recommended_site', current_site)
|
||||
should_migrate = proposal.get('should_migrate', False)
|
||||
|
||||
# If AI didn't specify, use editorial strategy
|
||||
if not recommended_site or recommended_site == current_site:
|
||||
recommended_site, migration_reason, confidence = self.determine_best_site_for_post(post)
|
||||
should_migrate = (recommended_site != current_site)
|
||||
else:
|
||||
migration_reason = proposal.get('migration_reason', '')
|
||||
confidence = proposal.get('confidence', 'Medium')
|
||||
|
||||
# Build proposal record
|
||||
proposal_record = {
|
||||
**post,
|
||||
'proposed_category': proposal.get('proposed_category', post.get('categories', '')),
|
||||
'proposed_category': proposed_category,
|
||||
'alternative_categories': ', '.join(proposal.get('alternative_categories', [])),
|
||||
'category_reason': proposal.get('reason', ''),
|
||||
'category_confidence': proposal.get('confidence', 'Medium'),
|
||||
'current_categories': post.get('categories', '')
|
||||
})
|
||||
'current_categories': post.get('categories', ''),
|
||||
'recommended_site': recommended_site,
|
||||
'should_migrate': 'Yes' if should_migrate else 'No',
|
||||
'migration_reason': migration_reason,
|
||||
'current_site': current_site
|
||||
}
|
||||
|
||||
self.proposed_categories.append(proposal_record)
|
||||
|
||||
# Track migration recommendations
|
||||
if should_migrate:
|
||||
self.migration_recommendations.append({
|
||||
'post_id': post_id,
|
||||
'title': post.get('title', '')[:80],
|
||||
'from_site': current_site,
|
||||
'to_site': recommended_site,
|
||||
'reason': migration_reason,
|
||||
'category': proposed_category
|
||||
})
|
||||
|
||||
# Summary
|
||||
migration_count = len(self.migration_recommendations)
|
||||
logger.info(f"\n📊 Migration Recommendations: {migration_count} posts")
|
||||
if migration_count > 0:
|
||||
by_site = {}
|
||||
for mig in self.migration_recommendations:
|
||||
site = mig['to_site']
|
||||
by_site[site] = by_site.get(site, 0) + 1
|
||||
for site, count in by_site.items():
|
||||
logger.info(f" To {site}: {count} posts")
|
||||
|
||||
return True
|
||||
|
||||
def export_proposals(self, output_file: Optional[str] = None) -> str:
|
||||
"""Export category proposals to CSV."""
|
||||
"""Export category proposals with migration data."""
|
||||
if not output_file:
|
||||
output_dir = Path(__file__).parent.parent.parent / 'output'
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -185,9 +372,10 @@ Return ONLY a JSON array with one object per post."""
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
fieldnames = [
|
||||
'post_id', 'title', 'site', 'current_categories',
|
||||
'post_id', 'title', 'current_site', 'current_categories',
|
||||
'proposed_category', 'alternative_categories',
|
||||
'category_reason', 'category_confidence'
|
||||
'category_reason', 'category_confidence',
|
||||
'recommended_site', 'should_migrate', 'migration_reason'
|
||||
]
|
||||
|
||||
logger.info(f"\nExporting to: {output_file}")
|
||||
@@ -200,13 +388,45 @@ Return ONLY a JSON array with one object per post."""
|
||||
logger.info(f"✓ Exported {len(self.proposed_categories)} proposals")
|
||||
return str(output_file)
|
||||
|
||||
def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> str:
|
||||
"""Run complete category proposal process."""
|
||||
if not self.load_csv():
|
||||
def export_migrations(self, output_file: Optional[str] = None) -> str:
    """
    Write the collected migration recommendations to their own CSV file.

    Args:
        output_file: Destination path. When omitted, a timestamped
            ``migration_recommendations_<YYYYmmdd_HHMMSS>.csv`` is created
            in ``Path(__file__).parent.parent.parent / 'output'``.

    Returns:
        Path of the written file as a string, or "" when there are no
        migration recommendations to export.
    """
    recommendations = self.migration_recommendations
    if not recommendations:
        logger.info("No migration recommendations to export")
        return ""

    if output_file:
        destination = Path(output_file)
    else:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        default_dir = Path(__file__).parent.parent.parent / 'output'
        destination = default_dir / f'migration_recommendations_{stamp}.csv'

    # Covers both the default output directory and any custom parent folder
    destination.parent.mkdir(parents=True, exist_ok=True)

    columns = ['post_id', 'title', 'from_site', 'to_site', 'reason', 'category']

    logger.info(f"\nExporting migrations to: {destination}")

    with open(destination, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        for row in recommendations:
            csv_writer.writerow(row)

    logger.info(f"✓ Exported {len(recommendations)} migration recommendations")
    return str(destination)
|
||||
|
||||
def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> Tuple[str, str]:
    """
    Run the complete category proposal process with editorial strategy.

    Loads the CSV, proposes categories, then exports the proposals and the
    migration recommendations.

    Args:
        output_file: Custom path for the proposals CSV. The migrations CSV
            is always exported to its default location.
        batch_size: Number of posts per proposal batch.

    Returns:
        Tuple of (proposals_file, migrations_file). Both are "" when loading
        or proposing fails; migrations_file is whatever
        ``export_migrations()`` returns.
    """
    if not self.load_csv():
        return "", ""

    if not self.propose_categories(batch_size=batch_size):
        logger.error("Failed to propose categories")
        # Always return a pair so callers can unpack the result.
        return "", ""

    proposals_file = self.export_proposals(output_file)
    migrations_file = self.export_migrations()

    return proposals_file, migrations_file
|
||||
|
||||
@@ -140,10 +140,17 @@ def cmd_category_propose(app, args):
|
||||
|
||||
csv_file = args.args[0] if args.args else None
|
||||
|
||||
result = app.category_propose(csv_file=csv_file, output=args.output)
|
||||
proposals_file, migrations_file = app.category_propose(csv_file=csv_file, output=args.output)
|
||||
|
||||
if result:
|
||||
print(f"✅ Category proposals saved to: {result}")
|
||||
if proposals_file:
|
||||
print(f"\n✅ Category proposals complete!")
|
||||
print(f" Proposals: {proposals_file}")
|
||||
if migrations_file:
|
||||
print(f" Migrations: {migrations_file}")
|
||||
print(f"\nReview the files to see:")
|
||||
print(f" 1. Proposed categories for each post")
|
||||
print(f" 2. Site migration recommendations")
|
||||
print(f" 3. Editorial strategy alignment")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user