Add existing category fetching to category_propose
New Feature:
- Fetch existing categories from WordPress sites before AI proposals
- AI now prefers existing categories to avoid duplicates
- Shows existing categories in AI prompt for better suggestions
- Tracks whether proposed categories are existing or new

### Changes:
- fetch_existing_categories() method - Gets categories from all sites
- Updated AI prompt includes existing categories list
- New CSV column: is_existing_category (Yes/No)
- Statistics showing existing vs new categories

### Benefits:
- Reduces category duplication
- Maintains consistency across posts
- AI makes smarter category suggestions
- Users can see which are existing vs new categories

### AI Prompt Enhancement:
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):
mistergeek.net:
  - VPN
  - Software
  - Gaming
  ...
webscroll.fr:
  - Torrenting
  - File-Sharing
  ...
IMPORTANT: Use existing categories when possible...

### Output:
Category Statistics:
  Using existing categories: 145 posts
  Proposing new categories: 12 posts

Usage:
./seo category_propose  # Now fetches existing categories automatically

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
@@ -64,13 +64,15 @@ class SEOApp:
|
||||
analyzer = EnhancedPostAnalyzer(csv_file, analyze_fields=fields)
|
||||
return analyzer.run(output_file=output, update_input=update)
|
||||
|
||||
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None) -> Tuple[str, str]:
|
||||
def category_propose(self, csv_file: Optional[str] = None, output: Optional[str] = None,
|
||||
fetch_existing: bool = True) -> Tuple[str, str]:
|
||||
"""
|
||||
Propose categories for posts with editorial strategy alignment.
|
||||
|
||||
Args:
|
||||
csv_file: Path to CSV file (uses latest export if not provided)
|
||||
output: Custom output file path
|
||||
fetch_existing: If True, fetch existing categories from WordPress
|
||||
|
||||
Returns:
|
||||
Tuple of (proposals_file, migrations_file)
|
||||
@@ -85,7 +87,8 @@ class SEOApp:
|
||||
|
||||
logger.info(f"Using file: {csv_file}")
|
||||
|
||||
proposer = CategoryProposer(csv_file, use_editorial_strategy=True)
|
||||
proposer = CategoryProposer(csv_file, use_editorial_strategy=True,
|
||||
fetch_existing_categories=fetch_existing)
|
||||
return proposer.run(output_file=output)
|
||||
|
||||
def category_apply(self, proposals_csv: str, site_name: str,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
Category Proposer - AI-powered category suggestions with editorial strategy alignment
|
||||
Proposes categories based on content AND site editorial lines
|
||||
Fetches existing categories from WordPress sites and uses them in proposals
|
||||
"""
|
||||
|
||||
import csv
|
||||
@@ -10,6 +10,7 @@ from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
from .config import Config
|
||||
from .editorial_strategy import EditorialStrategyAnalyzer
|
||||
@@ -58,13 +59,15 @@ EDITORIAL_LINES = {
|
||||
class CategoryProposer:
|
||||
"""Propose categories for posts using AI with editorial strategy alignment."""
|
||||
|
||||
def __init__(self, csv_file: str, use_editorial_strategy: bool = True):
|
||||
def __init__(self, csv_file: str, use_editorial_strategy: bool = True,
|
||||
fetch_existing_categories: bool = True):
|
||||
"""
|
||||
Initialize proposer.
|
||||
|
||||
Args:
|
||||
csv_file: Path to CSV file
|
||||
use_editorial_strategy: If True, align proposals with editorial lines
|
||||
fetch_existing_categories: If True, fetch existing categories from WordPress
|
||||
"""
|
||||
self.csv_file = Path(csv_file)
|
||||
self.openrouter_api_key = Config.OPENROUTER_API_KEY
|
||||
@@ -75,7 +78,63 @@ class CategoryProposer:
|
||||
self.api_calls = 0
|
||||
self.ai_cost = 0.0
|
||||
self.use_editorial_strategy = use_editorial_strategy
|
||||
self.fetch_existing_categories = fetch_existing_categories
|
||||
self.site_analysis = {}
|
||||
self.existing_categories_by_site = {} # Fetched from WordPress
|
||||
self.sites = Config.WORDPRESS_SITES # Add sites config
|
||||
|
||||
def fetch_existing_categories(self) -> Dict[str, List[str]]:
    """
    Fetch existing categories from all WordPress sites.

    Pages through each configured site's ``/wp-json/wp/v2/categories``
    endpoint and collects the category names. The result is returned and
    also stored on ``self.existing_categories_by_site``.

    Failure handling is best-effort per site:
      * if anything raises while processing a site, the error is logged
        and that site maps to an empty list;
      * a non-200 response stops paging for that site and keeps whatever
        names were collected from earlier pages.

    NOTE(review): ``__init__`` assigns its boolean ``fetch_existing_categories``
    argument to an instance attribute with this same name. That attribute
    shadows this method on instances, so ``self.fetch_existing_categories()``
    would try to call a bool. The flag attribute should be renamed in
    ``__init__`` and at its call site -- confirm and fix there.

    Returns:
        Dict mapping site name to list of category names
    """
    # Local import: only this method needs it, and it keeps the block
    # self-contained without touching the module's import section.
    import html

    # Single source of truth for the page size: it is sent to the API and
    # also used to detect the last (short) page.
    per_page = 100

    logger.info("\n📁 Fetching existing categories from WordPress sites...")

    categories_by_site = {}

    for site_name, site_config in self.sites.items():
        try:
            base_url = site_config['url'].rstrip('/')
            auth = HTTPBasicAuth(site_config['username'], site_config['password'])

            logger.info(f" Fetching from {site_name}...")

            all_categories = []
            page = 1

            while True:
                response = requests.get(
                    f"{base_url}/wp-json/wp/v2/categories",
                    params={'per_page': per_page, 'page': page},
                    auth=auth,
                    timeout=10
                )

                if response.status_code != 200:
                    logger.warning(f" Could not fetch categories: {response.status_code}")
                    break

                page_categories = response.json()
                if not page_categories:
                    break

                # The REST API returns term names HTML-entity encoded
                # (e.g. "Software &amp; Tools"); decode them so the names
                # shown to the AI match what editors see in WordPress.
                all_categories.extend(
                    html.unescape(cat['name']) for cat in page_categories
                )

                # A short page means there is nothing left to fetch.
                if len(page_categories) < per_page:
                    break
                page += 1

            categories_by_site[site_name] = all_categories
            logger.info(f" ✓ Found {len(all_categories)} categories")

        except Exception as e:
            # Deliberate best-effort boundary: one unreachable or
            # misconfigured site must not abort the whole run.
            logger.warning(f" Error fetching from {site_name}: {e}")
            categories_by_site[site_name] = []

    self.existing_categories_by_site = categories_by_site
    return categories_by_site
|
||||
|
||||
def load_csv(self) -> bool:
|
||||
"""Load posts from CSV."""
|
||||
@@ -159,29 +218,53 @@ class CategoryProposer:
|
||||
return ('hellogeek.net', "Low-traffic/off-brand content", 0.4)
|
||||
|
||||
def get_category_proposals(self, batch: List[Dict]) -> Optional[str]:
|
||||
"""Get AI category proposals with editorial strategy context."""
|
||||
"""Get AI category proposals with editorial strategy context and existing categories."""
|
||||
if not self.openrouter_api_key:
|
||||
logger.error("OPENROUTER_API_KEY not set")
|
||||
return None
|
||||
|
||||
# Build editorial context
|
||||
# Build editorial context with existing categories
|
||||
editorial_context = ""
|
||||
if self.use_editorial_strategy:
|
||||
editorial_context = """
|
||||
EDITORIAL STRATEGY GUIDELINES:
|
||||
|
||||
mistergeek.net (High-value tech):
|
||||
- Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
|
||||
- Ideal Categories: VPN, Software/Tools, Gaming, SEO, Content Marketing, Tech Reviews
|
||||
- Focus: Professional, high-traffic tech content
|
||||
|
||||
webscroll.fr (Torrenting niche):
|
||||
- Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
|
||||
- Ideal Categories: Torrenting, File-Sharing, Tracker Guides, VPN for Torrenting, Seedbox
|
||||
- Focus: Torrenting and file-sharing only
|
||||
|
||||
hellogeek.net (Catch-all):
|
||||
- Categories: Experimental, Low-Traffic, Off-Brand, Testing
|
||||
- Ideal Categories: Experimental, Low-Traffic, Off-Brand, Testing
|
||||
- Focus: Everything else, low-traffic content
|
||||
|
||||
"""
|
||||
|
||||
# Add existing categories from WordPress
|
||||
existing_cats_context = ""
|
||||
if self.existing_categories_by_site:
|
||||
existing_cats_context = """
|
||||
EXISTING CATEGORIES (PREFER THESE TO AVOID DUPLICATES):
|
||||
|
||||
"""
|
||||
for site_name, categories in self.existing_categories_by_site.items():
|
||||
if categories:
|
||||
existing_cats_context += f"{site_name}:\n"
|
||||
for cat in categories[:20]: # Show first 20
|
||||
existing_cats_context += f" - {cat}\n"
|
||||
if len(categories) > 20:
|
||||
existing_cats_context += f" ... and {len(categories) - 20} more\n"
|
||||
existing_cats_context += "\n"
|
||||
|
||||
existing_cats_context += """
|
||||
IMPORTANT: Use existing categories when possible. Only propose new categories if:
|
||||
1. The post doesn't fit any existing category
|
||||
2. The new category would have multiple posts (not one-off)
|
||||
3. It aligns with the site's editorial strategy
|
||||
|
||||
"""
|
||||
|
||||
# Format posts for AI
|
||||
@@ -200,6 +283,7 @@ hellogeek.net (Catch-all):
|
||||
prompt = f"""Analyze these blog posts and propose optimal categories.
|
||||
|
||||
{editorial_context}
|
||||
{existing_cats_context}
|
||||
POSTS TO ANALYZE:
|
||||
|
||||
{posts_text}
|
||||
@@ -208,19 +292,20 @@ For EACH post, provide:
|
||||
{{
|
||||
"post_id": <id>,
|
||||
"current_categories": "<current>",
|
||||
"proposed_category": "<best category from editorial lines above>",
|
||||
"proposed_category": "<best category - PREFER EXISTING ONES FROM LIST ABOVE>",
|
||||
"alternative_categories": ["<alt1>", "<alt2>"],
|
||||
"recommended_site": "<best site for this post>",
|
||||
"reason": "<brief explanation referencing editorial strategy>",
|
||||
"confidence": "<High|Medium|Low>",
|
||||
"should_migrate": <true/false>,
|
||||
"migration_reason": "<reason if should_migrate is true>"
|
||||
"migration_reason": "<reason if should_migrate is true>",
|
||||
"is_existing_category": <true/false - true if proposed_category exists on the site>
|
||||
}}
|
||||
|
||||
Return ONLY a JSON array with one object per post."""
|
||||
|
||||
try:
|
||||
logger.info(f" Getting category proposals with editorial alignment...")
|
||||
logger.info(f" Getting category proposals with existing categories...")
|
||||
|
||||
response = requests.post(
|
||||
"https://openrouter.ai/api/v1/chat/completions",
|
||||
@@ -267,11 +352,15 @@ Return ONLY a JSON array with one object per post."""
|
||||
return []
|
||||
|
||||
def propose_categories(self, batch_size: int = 10) -> bool:
|
||||
"""Propose categories with editorial strategy alignment."""
|
||||
"""Propose categories with editorial strategy alignment and existing categories."""
|
||||
logger.info("\n" + "="*70)
|
||||
logger.info("PROPOSING CATEGORIES WITH EDITORIAL STRATEGY")
|
||||
logger.info("="*70 + "\n")
|
||||
|
||||
# Fetch existing categories from WordPress
|
||||
if self.fetch_existing_categories:
|
||||
self.fetch_existing_categories()
|
||||
|
||||
# Analyze editorial strategy first
|
||||
if self.use_editorial_strategy:
|
||||
self.analyze_editorial_strategy()
|
||||
@@ -311,6 +400,7 @@ Return ONLY a JSON array with one object per post."""
|
||||
proposed_category = proposal.get('proposed_category', post.get('categories', ''))
|
||||
recommended_site = proposal.get('recommended_site', current_site)
|
||||
should_migrate = proposal.get('should_migrate', False)
|
||||
is_existing = proposal.get('is_existing_category', True)
|
||||
|
||||
# If AI didn't specify, use editorial strategy
|
||||
if not recommended_site or recommended_site == current_site:
|
||||
@@ -331,7 +421,8 @@ Return ONLY a JSON array with one object per post."""
|
||||
'recommended_site': recommended_site,
|
||||
'should_migrate': 'Yes' if should_migrate else 'No',
|
||||
'migration_reason': migration_reason,
|
||||
'current_site': current_site
|
||||
'current_site': current_site,
|
||||
'is_existing_category': 'Yes' if is_existing else 'No'
|
||||
}
|
||||
|
||||
self.proposed_categories.append(proposal_record)
|
||||
@@ -357,6 +448,13 @@ Return ONLY a JSON array with one object per post."""
|
||||
by_site[site] = by_site.get(site, 0) + 1
|
||||
for site, count in by_site.items():
|
||||
logger.info(f" To {site}: {count} posts")
|
||||
|
||||
# Summary of existing vs new categories
|
||||
existing_count = sum(1 for p in self.proposed_categories if p.get('is_existing_category') == 'Yes')
|
||||
new_count = len(self.proposed_categories) - existing_count
|
||||
logger.info(f"\n📊 Category Statistics:")
|
||||
logger.info(f" Using existing categories: {existing_count} posts")
|
||||
logger.info(f" Proposing new categories: {new_count} posts")
|
||||
|
||||
return True
|
||||
|
||||
@@ -375,7 +473,8 @@ Return ONLY a JSON array with one object per post."""
|
||||
'post_id', 'title', 'current_site', 'current_categories',
|
||||
'proposed_category', 'alternative_categories',
|
||||
'category_reason', 'category_confidence',
|
||||
'recommended_site', 'should_migrate', 'migration_reason'
|
||||
'recommended_site', 'should_migrate', 'migration_reason',
|
||||
'is_existing_category'
|
||||
]
|
||||
|
||||
logger.info(f"\nExporting to: {output_file}")
|
||||
|
||||
Reference in New Issue
Block a user