Refactor category_propose to align with editorial strategy

Major refactoring of the category proposal system:

### Changes:
- Integrated editorial strategy into category proposals
- Added site migration recommendations to category proposals
- AI now considers editorial lines when suggesting categories
- Automatic detection of posts that should migrate between sites

### New Features:
- Editorial line definitions for each site
- Topic-based site matching algorithm
- Migration recommendations alongside category proposals
- Dual output: category_proposals_*.csv + migration_recommendations_*.csv

### Editorial Lines:
mistergeek.net: VPN, Software, Gaming, SEO, Tech (high-value)
webscroll.fr: Torrenting, File-Sharing, Tracker Guides (niche)
hellogeek.net: Experimental, Low-Traffic, Off-Brand (catch-all)

### Output Files:
1. category_proposals_*.csv - Categories + site recommendations
2. migration_recommendations_*.csv - Posts to migrate between sites

### CSV Columns Added:
- recommended_site - Best site for the post
- should_migrate - Yes/No flag
- migration_reason - Why migration is recommended
- current_site - Original site for comparison

### Benefits:
- Categories aligned with site strategy
- Automatic migration detection
- Smarter AI prompts with editorial context
- Unified category + migration workflow

Usage:
./seo category_propose
# Generates both category and migration files

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
Kevin Bataille
2026-02-16 15:43:26 +01:00
parent c980edf047
commit b5c586e7ad
3 changed files with 270 additions and 34 deletions

View File

@@ -5,7 +5,7 @@ SEO Application Core - Integrated SEO automation functionality
import logging
from pathlib import Path
from datetime import datetime
from typing import Optional, List
from typing import Optional, List, Tuple
from .exporter import PostExporter
from .analyzer import EnhancedPostAnalyzer
@@ -64,9 +64,18 @@ class SEOApp:
analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
return analyzer.run(output_file=output, update_input=update)
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> str:
"""Propose categories for posts."""
logger.info("🏷️ Proposing categories with AI...")
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
"""
Propose categories for posts with editorial strategy alignment.
Args:
csv_file: Path to CSV file (uses latest export if not provided)
output: Custom output file path
Returns:
Tuple of (proposals_file, migrations_file)
"""
logger.info("🏷️ Proposing categories with AI (editorial strategy aligned)...")
if not csv_file:
csv_file = self._find_latest_export()
@@ -76,7 +85,7 @@ class SEOApp:
logger.info(f"Using file: {csv_file}")
proposer = CategoryProposer(csv_file)
proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
return proposer.run(output_file=output)
def category_apply(self, proposals_csv: str, site_name: str,

View File

@@ -1,5 +1,6 @@
"""
Category Proposer - AI-powered category suggestions
Category Proposer - AI-powered category suggestions with editorial strategy alignment
Proposes categories based on content AND site editorial lines
"""
import csv
@@ -7,26 +8,74 @@ import json
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple
import requests
from .config import Config
from .editorial_strategy import EditorialStrategyAnalyzer
logger = logging.getLogger(__name__)
class CategoryProposer:
"""Propose categories for posts using AI."""
# Editorial line definitions for each site.
#
# Maps a site domain to a dict with three keys:
#   'focus'            - one-line description of the site's editorial focus
#   'ideal_categories' - category names considered on-brand for the site
#   'topic_keywords'   - topic name -> lowercase keywords; this is the part that
#                        determine_best_site_for_post scores against the
#                        lowercased title + content preview of a post
#
# NOTE: entry order matters. determine_best_site_for_post iterates this dict
# and picks the winner with max(), which keeps the first of several tied
# entries, so insertion order acts as the tie-breaker between sites.
#
# NOTE(review): some keywords are listed under more than one topic/site
# ('download' for both mistergeek.net Software and webscroll.fr Torrenting;
# 'upload' and 'tracker' under two webscroll.fr topics) - confirm this overlap
# is intended.
EDITORIAL_LINES = {
    # High-value tech site
    'mistergeek.net': {
        'focus': 'High-value tech content',
        'ideal_categories': [
            'VPN', 'Software/Tools', 'Gaming', 'SEO',
            'Content Marketing', 'Tech Reviews', 'Tutorials'
        ],
        'topic_keywords': {
            'VPN': ['vpn', 'proxy', 'privacy', 'security', 'encryption'],
            'Software': ['software', 'app', 'tool', 'download', 'install'],
            'Gaming': ['game', 'gaming', 'console', 'steam', 'playstation'],
            'SEO': ['seo', 'ranking', 'google', 'search', 'optimization'],
            'Tech': ['tech', 'technology', 'review', 'device', 'hardware'],
        }
    },
    # Torrenting / file-sharing niche site
    'webscroll.fr': {
        'focus': 'Torrenting and file-sharing niche',
        'ideal_categories': [
            'Torrenting', 'File-Sharing', 'Tracker Guides',
            'VPN for Torrenting', 'Seedbox'
        ],
        'topic_keywords': {
            'Torrenting': ['torrent', 'download', 'upload', 'tracker', 'seed'],
            'File-Sharing': ['file-sharing', 'ddl', 'hosting', 'upload'],
            'Tracker Guides': ['tracker', 'ratio', 'invite', 'private'],
        }
    },
    # Catch-all site: no keywords, so it never scores a topic match
    'hellogeek.net': {
        'focus': 'Low-traffic, experimental, off-brand content',
        'ideal_categories': [
            'Experimental', 'Low-Traffic', 'Off-Brand', 'Testing'
        ],
        'topic_keywords': {}  # Catch-all for everything else
    }
}
def __init__(self, csv_file: str):
"""Initialize proposer with CSV file."""
class CategoryProposer:
"""Propose categories for posts using AI with editorial strategy alignment."""
def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
    """
    Set up a proposer for one exported posts CSV.

    Args:
        csv_file: Path to the CSV file holding the posts to categorise.
        use_editorial_strategy: When True, proposals are aligned with the
            per-site editorial lines.
    """
    # Inputs
    self.csv_file = Path(csv_file)
    self.use_editorial_strategy = use_editorial_strategy

    # AI backend settings, read from the project configuration
    self.openrouter_api_key = Config.OPENROUTER_API_KEY
    self.ai_model = Config.AI_MODEL

    # Working data, filled in as the run progresses
    self.posts: List[Dict] = []
    self.proposed_categories: List[Dict] = []
    self.migration_recommendations: List[Dict] = []
    self.site_analysis: Dict = {}

    # Usage accounting for the AI calls
    self.api_calls = 0
    self.ai_cost = 0.0
def load_csv(self) -> bool:
"""Load posts from CSV."""
@@ -48,15 +97,98 @@ class CategoryProposer:
logger.error(f"Error loading CSV: {e}")
return False
def analyze_editorial_strategy(self) -> Dict:
    """
    Run the editorial strategy analysis over this proposer's CSV file.

    Returns:
        The site analysis produced by EditorialStrategyAnalyzer (also stored
        on ``self.site_analysis``), or an empty dict when editorial strategy
        alignment is switched off.
    """
    if self.use_editorial_strategy:
        logger.info("\n📊 Analyzing editorial strategy to inform category proposals...")

        strategy_analyzer = EditorialStrategyAnalyzer()
        strategy_analyzer.load_csv(str(self.csv_file))
        self.site_analysis = strategy_analyzer.analyze_site_content()

        logger.info("✓ Editorial strategy analysis complete")
        return self.site_analysis

    # Strategy alignment disabled: nothing to analyse, stored state untouched.
    return {}
def determine_best_site_for_post(self, post: Dict) -> Tuple[str, str, float]:
    """
    Determine the best site for a post based on its title and content preview.

    Every site in EDITORIAL_LINES gets one point per topic for which at least
    one keyword occurs in the post text, plus a 0.5 bonus if it is the post's
    current site (to avoid unnecessary moves). The highest-scoring site wins
    when its score is at least 1; otherwise the post stays where it is
    (mistergeek.net / webscroll.fr) or falls back to hellogeek.net.

    Keywords are matched at the start of a word, so 'app' matches "apps" and
    "application" but not "happy", and 'ratio' no longer matches "operation".

    Args:
        post: Post row; reads the 'title', 'content_preview' and 'site' keys.
            Missing or None values are treated as empty strings.

    Returns:
        Tuple of (site_name, reason, confidence), confidence in [0.0, 1.0].
    """
    import re  # local import keeps this method self-contained

    # csv.DictReader yields None for cells missing from short rows; coerce
    # before concatenating so such rows do not raise TypeError.
    text = f"{post.get('title') or ''} {post.get('content_preview') or ''}".lower()
    current_site = post.get('site') or ''

    def mentions(keyword: str) -> bool:
        # Anchor at a word start to avoid mid-word false positives while
        # still accepting plurals and derived forms ("torrent" -> "torrents").
        return re.search(r'(?<!\w)' + re.escape(keyword), text) is not None

    # Score each site by how many of its topics the post touches.
    site_scores = {}
    for site_name, editorial in EDITORIAL_LINES.items():
        matched_topics = [
            topic
            for topic, keywords in editorial['topic_keywords'].items()
            if any(mentions(kw) for kw in keywords)
        ]
        score = float(len(matched_topics))

        # Bonus for staying on current site (avoid unnecessary moves)
        if site_name == current_site:
            score += 0.5

        site_scores[site_name] = {'score': score, 'topics': matched_topics}

    # max() keeps the first of tied entries, so EDITORIAL_LINES order breaks ties.
    best_name, best = max(site_scores.items(), key=lambda item: item[1]['score'])

    if best['score'] >= 1:
        # The bonus alone is 0.5, so reaching 1 implies at least one topic hit.
        return (
            best_name,
            f"Content matches {best_name} editorial line ({', '.join(best['topics'])})",
            min(1.0, best['score'] / 3.0)  # Normalize confidence
        )

    # No strong match: keep on a main site, otherwise use the catch-all site.
    if current_site in ('mistergeek.net', 'webscroll.fr'):
        return (current_site, "Keep on current site (no better match)", 0.5)
    return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
"""Get AI category proposals for a batch of posts."""
"""Get AI category proposals with editorial strategy context."""
if not self.openrouter_api_key:
logger.error("OPENROUTER_API_KEY not set")
return None
# Build editorial context
editorial_context = ""
if self.use_editorial_strategy:
editorial_context = """
EDITORIAL STRATEGY GUIDELINES:
mistergeek.net (High-value tech):
- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
- Focus: Professional, high-traffic tech content
webscroll.fr (Torrenting niche):
- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
- Focus: Torrenting and file-sharing only
hellogeek.net (Catch-all):
- Categories: Experimental, Low-Traffic, Off-Brand, Testing
- Focus: Everything else, low-traffic content
"""
# Format posts for AI
formatted = []
for i, post in enumerate(batch, 1):
text = f"{i}. ID: {post['post_id']}\n"
text += f" Site: {post.get('site', '')}\n"
text += f" Title: {post.get('title', '')}\n"
text += f" Current Categories: {post.get('categories', '')}\n"
if 'content_preview' in post:
@@ -67,22 +199,28 @@ class CategoryProposer:
prompt = f"""Analyze these blog posts and propose optimal categories.
{editorial_context}
POSTS TO ANALYZE:
{posts_text}
For EACH post, provide:
{{
"post_id": <id>,
"current_categories": "<current>",
"proposed_category": "<best category>",
"proposed_category": "<best category from editorial lines above>",
"alternative_categories": ["<alt1>", "<alt2>"],
"reason": "<brief explanation>",
"confidence": "<High|Medium|Low>"
"recommended_site": "<best site for this post>",
"reason": "<brief explanation referencing editorial strategy>",
"confidence": "<High|Medium|Low>",
"should_migrate": <true/false>,
"migration_reason": "<reason if should_migrate is true>"
}}
Return ONLY a JSON array with one object per post."""
try:
logger.info(f" Getting category proposals...")
logger.info(f" Getting category proposals with editorial alignment...")
response = requests.post(
"https://openrouter.ai/api/v1/chat/completions",
@@ -129,11 +267,15 @@ Return ONLY a JSON array with one object per post."""
return []
def propose_categories(self, batch_size: int = 10) -> bool:
"""Propose categories for all posts."""
"""Propose categories with editorial strategy alignment."""
logger.info("\n" + "="*70)
logger.info("PROPOSING CATEGORIES WITH AI")
logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
logger.info("="*70 + "\n")
# Analyze editorial strategy first
if self.use_editorial_strategy:
self.analyze_editorial_strategy()
batches = [self.posts[i:i + batch_size] for i in range(0, len(self.posts), batch_size)]
logger.info(f"Processing {len(self.posts)} posts in {len(batches)} batches...\n")
@@ -158,23 +300,68 @@ Return ONLY a JSON array with one object per post."""
logger.info(f" API calls: {self.api_calls}")
logger.info(f" Cost: ${self.ai_cost:.4f}")
# Process proposals with editorial alignment
for post in self.posts:
post_id = str(post['post_id'])
proposal = all_proposals.get(post_id, {})
self.proposed_categories.append({
current_site = post.get('site', '')
# Get AI recommendation or use editorial strategy
proposed_category = proposal.get('proposed_category', post.get('categories', ''))
recommended_site = proposal.get('recommended_site', current_site)
should_migrate = proposal.get('should_migrate', False)
# If AI didn't specify, use editorial strategy
if not recommended_site or recommended_site == current_site:
recommended_site, migration_reason, confidence = self.determine_best_site_for_post(post)
should_migrate = (recommended_site != current_site)
else:
migration_reason = proposal.get('migration_reason', '')
confidence = proposal.get('confidence', 'Medium')
# Build proposal record
proposal_record = {
**post,
'proposed_category': proposal.get('proposed_category', post.get('categories', '')),
'proposed_category': proposed_category,
'alternative_categories': ', '.join(proposal.get('alternative_categories', [])),
'category_reason': proposal.get('reason', ''),
'category_confidence': proposal.get('confidence', 'Medium'),
'current_categories': post.get('categories', '')
})
'current_categories': post.get('categories', ''),
'recommended_site': recommended_site,
'should_migrate': 'Yes' if should_migrate else 'No',
'migration_reason': migration_reason,
'current_site': current_site
}
self.proposed_categories.append(proposal_record)
# Track migration recommendations
if should_migrate:
self.migration_recommendations.append({
'post_id': post_id,
'title': post.get('title', '')[:80],
'from_site': current_site,
'to_site': recommended_site,
'reason': migration_reason,
'category': proposed_category
})
# Summary
migration_count = len(self.migration_recommendations)
logger.info(f"\n📊 Migration Recommendations: {migration_count} posts")
if migration_count > 0:
by_site = {}
for mig in self.migration_recommendations:
site = mig['to_site']
by_site[site] = by_site.get(site, 0) + 1
for site, count in by_site.items():
logger.info(f" To {site}: {count} posts")
return True
def export_proposals(self, output_file: Optional[str] = None) -> str:
"""Export category proposals to CSV."""
"""Export category proposals with migration data."""
if not output_file:
output_dir = Path(__file__).parent.parent.parent / 'output'
output_dir.mkdir(parents=True, exist_ok=True)
@@ -185,9 +372,10 @@ Return ONLY a JSON array with one object per post."""
output_file.parent.mkdir(parents=True, exist_ok=True)
fieldnames = [
'post_id', 'title', 'site', 'current_categories',
'post_id', 'title', 'current_site', 'current_categories',
'proposed_category', 'alternative_categories',
'category_reason', 'category_confidence'
'category_reason', 'category_confidence',
'recommended_site', 'should_migrate', 'migration_reason'
]
logger.info(f"\nExporting to: {output_file}")
@@ -200,13 +388,45 @@ Return ONLY a JSON array with one object per post."""
logger.info(f"✓ Exported {len(self.proposed_categories)} proposals")
return str(output_file)
def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> str:
"""Run complete category proposal process."""
if not self.load_csv():
def export_migrations(self, output_file: Optional[str] = None) -> str:
    """
    Write the collected migration recommendations to their own CSV file.

    Args:
        output_file: Destination path. When omitted, a timestamped
            ``migration_recommendations_*.csv`` is created in the ``output``
            directory three levels above this module.

    Returns:
        The path of the written file as a string, or an empty string when
        there are no migration recommendations to export.
    """
    recommendations = self.migration_recommendations
    if not recommendations:
        logger.info("No migration recommendations to export")
        return ""

    if output_file:
        target = Path(output_file)
    else:
        default_dir = Path(__file__).parent.parent.parent / 'output'
        default_dir.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        target = default_dir / f'migration_recommendations_{stamp}.csv'

    # Also covers caller-supplied paths whose directory does not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    columns = ['post_id', 'title', 'from_site', 'to_site', 'reason', 'category']

    logger.info(f"\nExporting migrations to: {target}")
    with open(target, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(recommendations)

    logger.info(f"✓ Exported {len(recommendations)} migration recommendations")
    return str(target)
def run(self, output_file: Optional[str] = None, batch_size: int = 10) -> Tuple[str, str]:
"""Run complete category proposal process with editorial strategy."""
if not self.load_csv():
return "", ""
if not self.propose_categories(batch_size=batch_size):
logger.error("Failed to propose categories")
return ""
return "", ""
return self.export_proposals(output_file)
proposals_file = self.export_proposals(output_file)
migrations_file = self.export_migrations()
return proposals_file, migrations_file

View File

@@ -140,10 +140,17 @@ def cmd_category_propose(app, args):
csv_file = args.args[0] if args.args else None
result = app.category_propose(csv_file=csv_file, output=args.output)
proposals_file, migrations_file = app.category_propose(csv_file=csv_file, output=args.output)
if result:
print(f"✅ Category proposals saved to: {result}")
if proposals_file:
print(f"\n✅ Category proposals complete!")
print(f" Proposals: {proposals_file}")
if migrations_file:
print(f" Migrations: {migrations_file}")
print(f"\nReview the files to see:")
print(f" 1. Proposed categories for each post")
print(f" 2. Site migration recommendations")
print(f" 3. Editorial strategy alignment")
return 0