Add post migration and author filter features
- Add migrate command to transfer posts between websites
- Support CSV-based and filtered migration modes
- Preserve original post dates (with --ignore-original-date option)
- Auto-create categories and tags on destination site
- Add author filtering to export (--author and --author-id flags)
- Include author_name column in exported CSV
- Add comprehensive documentation (MIGRATION_GUIDE.md, AUTHOR_FILTER_GUIDE.md)

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
118
src/seo/app.py
118
src/seo/app.py
@@ -12,6 +12,7 @@ from .analyzer import EnhancedPostAnalyzer
|
||||
from .category_proposer import CategoryProposer
|
||||
from .category_manager import WordPressCategoryManager, CategoryAssignmentProcessor
|
||||
from .editorial_strategy import EditorialStrategyAnalyzer
|
||||
from .post_migrator import WordPressPostMigrator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -34,11 +35,23 @@ class SEOApp:
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
def export(self, author_filter: Optional[List[str]] = None,
           author_ids: Optional[List[int]] = None,
           site_filter: Optional[str] = None) -> str:
    """
    Export all posts from WordPress sites.

    Args:
        author_filter: List of author names to filter by
        author_ids: List of author IDs to filter by
        site_filter: Export from specific site only

    Returns:
        Path to exported CSV file
    """
    logger.info("📦 Exporting all posts from WordPress sites...")
    # Author filters are applied by the exporter itself; the site filter is
    # applied per-site inside run(). The pre-filter code path
    # (PostExporter() / exporter.run() with no arguments) was removed — it
    # returned before the filtered call could ever execute.
    exporter = PostExporter(author_filter=author_filter, author_ids=author_ids)
    return exporter.run(site_filter=site_filter)
|
||||
|
||||
def analyze(self, csv_file: Optional[str] = None, fields: Optional[List[str]] = None,
|
||||
update: bool = False, output: Optional[str] = None) -> str:
|
||||
@@ -146,23 +159,110 @@ class SEOApp:
|
||||
def editorial_strategy(self, csv_file: Optional[str] = None) -> dict:
    """
    Analyze editorial strategy and recommend migrations.

    Args:
        csv_file: Path to posts CSV (uses latest export if not provided)

    Returns:
        Analysis results dict
    """
    logger.info("📊 Analyzing editorial strategy...")

    # Fall back to the most recent export when no CSV path was supplied.
    source_csv = csv_file or self._find_latest_export()
    if not source_csv:
        raise FileNotFoundError("No exported posts found. Run export() first.")

    return EditorialStrategyAnalyzer().run(source_csv)
|
||||
|
||||
def migrate(self, csv_file: str, destination_site: str,
            create_categories: bool = True, create_tags: bool = True,
            delete_after: bool = False, status: str = 'draft',
            output_file: Optional[str] = None,
            ignore_original_date: bool = False) -> str:
    """
    Migrate posts from CSV file to destination site.

    Args:
        csv_file: Path to CSV file with posts to migrate (must have 'site' and 'post_id' columns)
        destination_site: Destination site name (mistergeek.net, webscroll.fr, hellogeek.net)
        create_categories: If True, create categories if they don't exist
        create_tags: If True, create tags if they don't exist
        delete_after: If True, delete posts from source after migration
        status: Status for new posts ('draft', 'publish', 'pending')
        output_file: Custom output file path for migration report
        ignore_original_date: If True, use current date instead of original post date

    Returns:
        Path to migration report CSV
    """
    logger.info(f"🚀 Migrating posts to {destination_site}...")

    # Thin delegator: gather every option and hand the whole job to the migrator.
    options = dict(
        csv_file=csv_file,
        destination_site=destination_site,
        create_categories=create_categories,
        create_tags=create_tags,
        delete_after=delete_after,
        status=status,
        output_file=output_file,
        ignore_original_date=ignore_original_date,
    )
    return WordPressPostMigrator().migrate_posts_from_csv(**options)
|
||||
|
||||
def migrate_by_filter(self, source_site: str, destination_site: str,
                      category_filter: Optional[List[str]] = None,
                      tag_filter: Optional[List[str]] = None,
                      date_after: Optional[str] = None,
                      date_before: Optional[str] = None,
                      status_filter: Optional[List[str]] = None,
                      create_categories: bool = True,
                      create_tags: bool = True,
                      delete_after: bool = False,
                      status: str = 'draft',
                      limit: Optional[int] = None,
                      ignore_original_date: bool = False) -> str:
    """
    Migrate posts based on filters.

    Args:
        source_site: Source site name
        destination_site: Destination site name
        category_filter: List of category names to filter by
        tag_filter: List of tag names to filter by
        date_after: Only migrate posts after this date (YYYY-MM-DD)
        date_before: Only migrate posts before this date (YYYY-MM-DD)
        status_filter: List of statuses to filter by (e.g., ['publish', 'draft'])
        create_categories: If True, create categories if they don't exist
        create_tags: If True, create tags if they don't exist
        delete_after: If True, delete posts from source after migration
        status: Status for new posts
        limit: Maximum number of posts to migrate
        ignore_original_date: If True, use current date instead of original post date

    Returns:
        Path to migration report CSV
    """
    logger.info(f"🚀 Migrating posts from {source_site} to {destination_site}...")

    # Pure pass-through: every filter/option is forwarded unchanged.
    options = dict(
        source_site=source_site,
        destination_site=destination_site,
        category_filter=category_filter,
        tag_filter=tag_filter,
        date_after=date_after,
        date_before=date_before,
        status_filter=status_filter,
        create_categories=create_categories,
        create_tags=create_tags,
        delete_after=delete_after,
        status=status,
        limit=limit,
        ignore_original_date=ignore_original_date,
    )
    return WordPressPostMigrator().migrate_posts_by_filter(**options)
|
||||
|
||||
def status(self) -> dict:
|
||||
"""Get status of output files."""
|
||||
|
||||
166
src/seo/cli.py
166
src/seo/cli.py
@@ -37,17 +37,38 @@ Examples:
|
||||
parser.add_argument('args', nargs='*', help='Arguments for the command')
|
||||
parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Show what would be done')
|
||||
parser.add_argument('--fields', '-f', nargs='+',
|
||||
parser.add_argument('--fields', '-f', nargs='+',
|
||||
choices=['title', 'meta_description', 'categories', 'site'],
|
||||
help='Fields to analyze')
|
||||
parser.add_argument('--update', '-u', action='store_true', help='Update input file')
|
||||
parser.add_argument('--output', '-o', help='Output file path')
|
||||
parser.add_argument('--confidence', '-c', choices=['High', 'Medium', 'Low'],
|
||||
parser.add_argument('--confidence', '-c', choices=['High', 'Medium', 'Low'],
|
||||
default='Medium', help='Confidence threshold for category apply')
|
||||
parser.add_argument('--site', '-s', choices=['mistergeek.net', 'webscroll.fr', 'hellogeek.net'],
|
||||
help='WordPress site for category operations')
|
||||
parser.add_argument('--description', '-d', help='Category description')
|
||||
parser.add_argument('--strict', action='store_true', help='Strict confidence matching (exact match only)')
|
||||
|
||||
# Export arguments
|
||||
parser.add_argument('--author', nargs='+', help='Filter by author name(s) for export')
|
||||
parser.add_argument('--author-id', type=int, nargs='+', help='Filter by author ID(s) for export')
|
||||
|
||||
# Migration arguments
|
||||
parser.add_argument('--destination', '--to', choices=['mistergeek.net', 'webscroll.fr', 'hellogeek.net'],
|
||||
help='Destination site for migration')
|
||||
parser.add_argument('--source', '--from', choices=['mistergeek.net', 'webscroll.fr', 'hellogeek.net'],
|
||||
help='Source site for filtered migration')
|
||||
parser.add_argument('--keep-source', action='store_true', help='Keep posts on source site (default: delete after migration)')
|
||||
parser.add_argument('--post-status', choices=['draft', 'publish', 'pending'], default='draft',
|
||||
help='Status for migrated posts (default: draft)')
|
||||
parser.add_argument('--no-categories', action='store_true', help='Do not create categories automatically')
|
||||
parser.add_argument('--no-tags', action='store_true', help='Do not create tags automatically')
|
||||
parser.add_argument('--category-filter', nargs='+', help='Filter by category names (for filtered migration)')
|
||||
parser.add_argument('--tag-filter', nargs='+', help='Filter by tag names (for filtered migration)')
|
||||
parser.add_argument('--date-after', help='Migrate posts after this date (YYYY-MM-DD)')
|
||||
parser.add_argument('--date-before', help='Migrate posts before this date (YYYY-MM-DD)')
|
||||
parser.add_argument('--limit', type=int, help='Limit number of posts to migrate')
|
||||
parser.add_argument('--ignore-original-date', action='store_true', help='Use current date instead of original post date')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -73,6 +94,7 @@ Examples:
|
||||
'category_apply': cmd_category_apply,
|
||||
'category_create': cmd_category_create,
|
||||
'editorial_strategy': cmd_editorial_strategy,
|
||||
'migrate': cmd_migrate,
|
||||
'status': cmd_status,
|
||||
'help': cmd_help,
|
||||
}
|
||||
def cmd_export(app, args):
    """Export all posts.

    Args:
        app: SEOApp instance providing export().
        args: Parsed CLI namespace (dry_run, author, author_id, site).

    Returns:
        0 on success (process exit code convention).
    """
    if args.dry_run:
        print("Would export all posts from WordPress sites")
        if args.author:
            print(f" Author filter: {args.author}")
        if args.author_id:
            print(f" Author ID filter: {args.author_id}")
        return 0

    # The pre-change bare `app.export()` call was removed: it ran an
    # unfiltered export and the filtered call below it was dead code.
    result = app.export(
        author_filter=args.author,
        author_ids=args.author_id,
        site_filter=args.site
    )
    if result:
        print(f"✅ Export completed! Output: {result}")
    return 0
|
||||
|
||||
@@ -241,12 +274,12 @@ def cmd_editorial_strategy(app, args):
|
||||
if args.dry_run:
|
||||
print("Would analyze editorial strategy and recommend migrations")
|
||||
return 0
|
||||
|
||||
|
||||
csv_file = args.args[0] if args.args else None
|
||||
|
||||
|
||||
print("Analyzing editorial strategy...")
|
||||
results = app.editorial_strategy(csv_file=csv_file)
|
||||
|
||||
|
||||
if results and results.get('report_file'):
|
||||
print(f"\n✅ Editorial strategy analysis complete!")
|
||||
print(f" Report: {results['report_file']}")
|
||||
@@ -259,6 +292,94 @@ def cmd_editorial_strategy(app, args):
|
||||
return 0
|
||||
|
||||
|
||||
def cmd_migrate(app, args):
    """Migrate posts between websites.

    Dispatches to filtered migration (--source given) or CSV-based
    migration (positional CSV path). Returns a process exit code.
    """
    if args.dry_run:
        print("Would migrate posts between websites")
        if args.destination:
            print(f" Destination: {args.destination}")
        if args.source:
            print(f" Source: {args.source}")
        return 0

    # A destination site is mandatory for both migration modes.
    if not args.destination:
        print("❌ Destination site required. Use --destination mistergeek.net|webscroll.fr|hellogeek.net")
        return 1

    # CLI flags are negations of the underlying options.
    remove_from_source = not args.keep_source
    make_categories = not args.no_categories
    make_tags = not args.no_tags

    if args.source:
        # Filtered migration: pull posts matching the CLI filters.
        print(f"Migrating posts from {args.source} to {args.destination}")
        print(f"Post status: {args.post_status}")
        print(f"Delete after migration: {remove_from_source}")
        for label, value in (
            ("Category filter", args.category_filter),
            ("Tag filter", args.tag_filter),
            ("Date after", args.date_after),
            ("Date before", args.date_before),
            ("Limit", args.limit),
        ):
            if value:
                print(f"{label}: {value}")

        report = app.migrate_by_filter(
            source_site=args.source,
            destination_site=args.destination,
            category_filter=args.category_filter,
            tag_filter=args.tag_filter,
            date_after=args.date_after,
            date_before=args.date_before,
            status_filter=None,
            create_categories=make_categories,
            create_tags=make_tags,
            delete_after=remove_from_source,
            status=args.post_status,
            limit=args.limit,
            ignore_original_date=args.ignore_original_date
        )
    else:
        # CSV-based migration: first positional argument is the CSV path.
        csv_path = args.args[0] if args.args else None
        if not csv_path:
            print("❌ CSV file required. Provide path to CSV with 'site' and 'post_id' columns")
            print(" Usage: seo migrate <csv_file> --destination <site>")
            print(" Or use filtered migration: seo migrate --source <site> --destination <site>")
            return 1

        print(f"Migrating posts from CSV: {csv_path}")
        print(f"Destination: {args.destination}")
        print(f"Post status: {args.post_status}")
        print(f"Delete after migration: {remove_from_source}")

        report = app.migrate(
            csv_file=csv_path,
            destination_site=args.destination,
            create_categories=make_categories,
            create_tags=make_tags,
            delete_after=remove_from_source,
            status=args.post_status,
            output_file=args.output,
            ignore_original_date=args.ignore_original_date
        )

    if report:
        print(f"\n✅ Migration completed!")
        print(f" Report: {report}")
    return 0
|
||||
|
||||
|
||||
def cmd_status(app, args):
|
||||
"""Show status."""
|
||||
if args.dry_run:
|
||||
@@ -285,6 +406,9 @@ SEO Automation CLI - Available Commands
|
||||
|
||||
Export & Analysis:
|
||||
export Export all posts from WordPress sites
|
||||
export --author "John Doe" Export posts by specific author
|
||||
export --author-id 1 2 Export posts by author IDs
|
||||
export -s mistergeek.net Export from specific site only
|
||||
analyze [csv_file] Analyze posts with AI
|
||||
analyze -f title Analyze specific fields (title, meta_description, categories, site)
|
||||
analyze -u Update input CSV with new columns (creates backup)
|
||||
@@ -299,11 +423,35 @@ Category Management:
|
||||
Strategy & Migration:
|
||||
editorial_strategy [csv] Analyze editorial lines and recommend migrations
|
||||
editorial_strategy Get migration recommendations between sites
|
||||
migrate <csv> --destination <site> Migrate posts from CSV to destination site
|
||||
migrate --source <site> --destination <site> Migrate posts with filters
|
||||
migrate --source A --to B --category-filter "VPN" Migrate specific categories
|
||||
migrate --source A --to B --date-after 2024-01-01 --limit 10
|
||||
|
||||
Utility:
|
||||
status Show output files status
|
||||
help Show this help message
|
||||
|
||||
Export Options:
|
||||
--author Filter by author name(s) (case-insensitive, partial match)
|
||||
--author-id Filter by author ID(s)
|
||||
--site, -s Export from specific site only
|
||||
|
||||
Migration Options:
|
||||
--destination, --to Destination site: mistergeek.net, webscroll.fr, hellogeek.net
|
||||
--source, --from Source site for filtered migration
|
||||
--keep-source Keep posts on source site (default: delete after migration)
|
||||
--post-status Status for migrated posts: draft, publish, pending (default: draft)
|
||||
--no-categories Do not create categories automatically
|
||||
--no-tags Do not create tags automatically
|
||||
--category-filter Filter by category names (for filtered migration)
|
||||
--tag-filter Filter by tag names (for filtered migration)
|
||||
--date-after Migrate posts after this date (YYYY-MM-DD)
|
||||
--date-before Migrate posts before this date (YYYY-MM-DD)
|
||||
--limit Limit number of posts to migrate
|
||||
--ignore-original-date Use current date instead of original post date
|
||||
--output, -o Custom output file path for migration report
|
||||
|
||||
Options:
|
||||
--verbose, -v Enable verbose logging
|
||||
--dry-run Show what would be done without doing it
|
||||
@@ -317,11 +465,17 @@ Options:
|
||||
|
||||
Examples:
|
||||
seo export
|
||||
seo export --author "John Doe"
|
||||
seo export --author-id 1 2
|
||||
seo export -s mistergeek.net --author "admin"
|
||||
seo analyze -f title categories
|
||||
seo category_propose
|
||||
seo category_apply -s mistergeek.net -c Medium
|
||||
seo category_create -s webscroll.fr "Torrent Clients"
|
||||
seo editorial_strategy
|
||||
seo migrate posts_to_migrate.csv --destination mistergeek.net
|
||||
seo migrate --source webscroll.fr --destination mistergeek.net --category-filter VPN
|
||||
seo migrate --source A --to B --date-after 2024-01-01 --limit 10 --keep-source
|
||||
seo status
|
||||
""")
|
||||
return 0
|
||||
|
||||
@@ -20,11 +20,21 @@ logger = logging.getLogger(__name__)
|
||||
class PostExporter:
|
||||
"""Export posts from WordPress sites to CSV."""
|
||||
|
||||
def __init__(self, author_filter: Optional[List[str]] = None,
             author_ids: Optional[List[int]] = None):
    """
    Initialize the exporter.

    Args:
        author_filter: List of author names to filter by (case-insensitive)
        author_ids: List of author IDs to filter by

    The residual pre-change `def __init__(self):` header left above this
    definition by the diff was removed.
    """
    self.sites = Config.WORDPRESS_SITES
    self.all_posts = []          # accumulated post rows across all sites
    self.category_cache = {}     # per-site category id -> data
    self.author_filter = author_filter
    self.author_ids = author_ids
    self.author_cache = {}       # Cache author info by site
|
||||
|
||||
def fetch_category_names(self, site_name: str, site_config: Dict) -> Dict[int, Dict]:
|
||||
"""Fetch category names from a WordPress site."""
|
||||
@@ -50,8 +60,55 @@ class PostExporter:
|
||||
self.category_cache[site_name] = categories
|
||||
return categories
|
||||
|
||||
def fetch_posts_from_site(self, site_name: str, site_config: Dict) -> List[Dict]:
|
||||
"""Fetch all posts from a WordPress site."""
|
||||
def fetch_authors(self, site_name: str, site_config: Dict) -> Dict[int, Dict]:
    """
    Fetch all authors/users from a WordPress site.

    Returns:
        Dict mapping author ID to author data (name, slug)
    """
    # Serve from the per-site cache when available.
    if site_name in self.author_cache:
        return self.author_cache[site_name]

    logger.info(f" Fetching authors from {site_name}...")
    authors: Dict[int, Dict] = {}
    endpoint = site_config['url'].rstrip('/') + "/wp-json/wp/v2/users"
    credentials = HTTPBasicAuth(site_config['username'], site_config['password'])

    try:
        resp = requests.get(endpoint, params={'per_page': 100}, auth=credentials, timeout=10)
        resp.raise_for_status()
        for user in resp.json():
            authors[user['id']] = {
                'id': user['id'],
                'name': user.get('name', ''),
                'slug': user.get('slug', ''),
                'description': user.get('description', ''),
            }
        logger.info(f" ✓ Fetched {len(authors)} authors")
    except Exception as e:
        logger.warning(f" Could not fetch authors from {site_name}: {e}")
        # Best effort: leave the map empty so author IDs still export,
        # just without resolved names.

    self.author_cache[site_name] = authors
    return authors
|
||||
|
||||
def fetch_posts_from_site(self, site_name: str, site_config: Dict,
|
||||
authors_map: Optional[Dict[int, Dict]] = None) -> List[Dict]:
|
||||
"""
|
||||
Fetch all posts from a WordPress site.
|
||||
|
||||
Args:
|
||||
site_name: Site name
|
||||
site_config: Site configuration
|
||||
authors_map: Optional authors mapping for filtering
|
||||
|
||||
Returns:
|
||||
List of post data
|
||||
"""
|
||||
logger.info(f"\nFetching posts from {site_name}...")
|
||||
|
||||
posts = []
|
||||
@@ -59,14 +116,23 @@ class PostExporter:
|
||||
api_url = f"{base_url}/wp-json/wp/v2/posts"
|
||||
auth = HTTPBasicAuth(site_config['username'], site_config['password'])
|
||||
|
||||
# Build base params
|
||||
base_params = {'page': 1, 'per_page': 100, '_embed': True}
|
||||
|
||||
# Add author filter if specified
|
||||
if self.author_ids:
|
||||
base_params['author'] = ','.join(map(str, self.author_ids))
|
||||
logger.info(f" Filtering by author IDs: {self.author_ids}")
|
||||
|
||||
for status in ['publish', 'draft']:
|
||||
page = 1
|
||||
while True:
|
||||
try:
|
||||
params = {**base_params, 'page': page, 'status': status}
|
||||
logger.info(f" Fetching page {page} ({status} posts)...")
|
||||
response = requests.get(
|
||||
api_url,
|
||||
params={'page': page, 'per_page': 100, 'status': status},
|
||||
params=params,
|
||||
auth=auth,
|
||||
timeout=10
|
||||
)
|
||||
@@ -76,8 +142,29 @@ class PostExporter:
|
||||
if not page_posts:
|
||||
break
|
||||
|
||||
# Filter by author name if specified
|
||||
if self.author_filter and authors_map:
|
||||
filtered_posts = []
|
||||
for post in page_posts:
|
||||
author_id = post.get('author')
|
||||
if author_id and author_id in authors_map:
|
||||
author_name = authors_map[author_id]['name'].lower()
|
||||
author_slug = authors_map[author_id]['slug'].lower()
|
||||
|
||||
# Check if author matches filter
|
||||
for filter_name in self.author_filter:
|
||||
filter_lower = filter_name.lower()
|
||||
if (filter_lower in author_name or
|
||||
filter_lower == author_slug):
|
||||
filtered_posts.append(post)
|
||||
break
|
||||
|
||||
page_posts = filtered_posts
|
||||
logger.info(f" ✓ Got {len(page_posts)} posts after author filter")
|
||||
|
||||
posts.extend(page_posts)
|
||||
logger.info(f" ✓ Got {len(page_posts)} posts")
|
||||
if page_posts:
|
||||
logger.info(f" ✓ Got {len(page_posts)} posts")
|
||||
|
||||
page += 1
|
||||
time.sleep(0.5)
|
||||
@@ -94,7 +181,8 @@ class PostExporter:
|
||||
logger.info(f"✓ Total posts from {site_name}: {len(posts)}\n")
|
||||
return posts
|
||||
|
||||
def extract_post_details(self, post: Dict, site_name: str, category_map: Dict) -> Dict:
|
||||
def extract_post_details(self, post: Dict, site_name: str, category_map: Dict,
|
||||
author_map: Optional[Dict[int, Dict]] = None) -> Dict:
|
||||
"""Extract post details for CSV export."""
|
||||
title = post.get('title', {})
|
||||
if isinstance(title, dict):
|
||||
@@ -122,6 +210,13 @@ class PostExporter:
|
||||
for cat_id in category_ids
|
||||
]) if category_ids else ''
|
||||
|
||||
# Get author name from author map
|
||||
author_id = post.get('author', '')
|
||||
author_name = ''
|
||||
if author_map and author_id:
|
||||
author_data = author_map.get(author_id, {})
|
||||
author_name = author_data.get('name', '')
|
||||
|
||||
return {
|
||||
'site': site_name,
|
||||
'post_id': post['id'],
|
||||
@@ -129,7 +224,8 @@ class PostExporter:
|
||||
'title': title.strip(),
|
||||
'slug': post.get('slug', ''),
|
||||
'url': post.get('link', ''),
|
||||
'author_id': post.get('author', ''),
|
||||
'author_id': author_id,
|
||||
'author_name': author_name,
|
||||
'date_published': post.get('date', ''),
|
||||
'date_modified': post.get('modified', ''),
|
||||
'categories': category_names,
|
||||
@@ -158,7 +254,7 @@ class PostExporter:
|
||||
return ""
|
||||
|
||||
fieldnames = [
|
||||
'site', 'post_id', 'status', 'title', 'slug', 'url', 'author_id',
|
||||
'site', 'post_id', 'status', 'title', 'slug', 'url', 'author_id', 'author_name',
|
||||
'date_published', 'date_modified', 'categories', 'tags', 'excerpt',
|
||||
'content_preview', 'seo_title', 'meta_description', 'focus_keyword', 'word_count',
|
||||
]
|
||||
@@ -173,24 +269,46 @@ class PostExporter:
|
||||
logger.info(f"✓ CSV exported to: {output_file}")
|
||||
return str(output_file)
|
||||
|
||||
def run(self) -> str:
|
||||
"""Run the complete export process."""
|
||||
def run(self, site_filter: Optional[str] = None) -> str:
|
||||
"""
|
||||
Run the complete export process.
|
||||
|
||||
Args:
|
||||
site_filter: Optional site name to export from (default: all sites)
|
||||
|
||||
Returns:
|
||||
Path to exported CSV file
|
||||
"""
|
||||
logger.info("="*70)
|
||||
logger.info("EXPORTING ALL POSTS")
|
||||
logger.info("="*70)
|
||||
|
||||
if self.author_filter:
|
||||
logger.info(f"Author filter: {self.author_filter}")
|
||||
if self.author_ids:
|
||||
logger.info(f"Author IDs: {self.author_ids}")
|
||||
if site_filter:
|
||||
logger.info(f"Site filter: {site_filter}")
|
||||
|
||||
logger.info("Sites configured: " + ", ".join(self.sites.keys()))
|
||||
|
||||
for site_name, config in self.sites.items():
|
||||
# Skip sites if filter is specified
|
||||
if site_filter and site_name != site_filter:
|
||||
logger.info(f"Skipping {site_name} (not in filter)")
|
||||
continue
|
||||
|
||||
categories = self.fetch_category_names(site_name, config)
|
||||
posts = self.fetch_posts_from_site(site_name, config)
|
||||
authors = self.fetch_authors(site_name, config)
|
||||
posts = self.fetch_posts_from_site(site_name, config, authors)
|
||||
|
||||
if posts:
|
||||
for post in posts:
|
||||
post_details = self.extract_post_details(post, site_name, categories)
|
||||
post_details = self.extract_post_details(post, site_name, categories, authors)
|
||||
self.all_posts.append(post_details)
|
||||
|
||||
if not self.all_posts:
|
||||
logger.error("No posts found on any site")
|
||||
logger.warning("No posts found matching criteria")
|
||||
return ""
|
||||
|
||||
self.all_posts.sort(key=lambda x: (x['site'], x['post_id']))
|
||||
|
||||
1007
src/seo/post_migrator.py
Normal file
1007
src/seo/post_migrator.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user