# personnal-accounting/scripts/process_sncf_improved.py

import subprocess
import re
import csv
import os
import glob
from collections import defaultdict

def extract_month_from_filename(filename):
    """Return ``(year, month)`` parsed from an SNCF payslip filename.

    The month is detected by looking for a French month name anywhere in
    the filename, case-insensitively and ignoring accents (so "Février",
    "AOÛT" and "decembre" are all recognised). The year is the first
    occurrence of "20" followed by two digits.

    Args:
        filename: Base name of the PDF file (no directory needed).

    Returns:
        A ``(year, month)`` tuple of ints, with a four-digit year and a
        month in 1..12. The year falls back to 2025 when none is present;
        when no month name is found at all the result is ``(2025, 1)``.
    """
    import unicodedata

    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }

    # Decompose accented letters and drop the combining marks so that both
    # precomposed ("É") and decomposed ("E" + U+0301, common in macOS
    # filenames) spellings compare equal to the unaccented table above.
    decomposed = unicodedata.normalize('NFKD', filename)
    filename_upper = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).upper()

    for month, num in months.items():
        if month in filename_upper:
            # Take the whole match so the year is four digits (2024, not 24),
            # consistent with the 2025 fallback.
            year_match = re.search(r'20\d{2}', filename)
            year = int(year_match.group(0)) if year_match else 2025
            return year, num

    return 2025, 1  # Default when no month name is recognised

# English month names indexed by month number (index 0 is unused).
_MONTH_NAMES = (
    '', 'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)

# Amount following the explicit "SALAIRE BRUT MENSUEL" label.
_LABELLED_AMOUNT_RE = re.compile(r'SALAIRE BRUT MENSUEL\s+([\d\s.,]+)')
# Amount immediately preceding a euro sign.
_EURO_AMOUNT_RE = re.compile(r'([\d\s.,]+)\s*€')
# Base-salary figures that mark the salary table row in known payslips.
_KNOWN_BASE_SALARY_PREFIXES = ('2974,64', '3123,36')


def _parse_amount(raw):
    """Convert a French-formatted amount string to a float, or return None."""
    # Remove every kind of whitespace: the regexes capture with ``\s``, which
    # also matches the (narrow) no-break spaces used as thousands separators.
    compact = re.sub(r'\s+', '', raw)
    # "1.234,56": dots that precede the decimal comma are thousands separators.
    if '.' in compact and compact.rfind(',') > compact.rfind('.'):
        compact = compact.replace('.', '')
    try:
        return float(compact.replace(',', '.'))
    except ValueError:
        return None


def _extract_salary_amount(lines):
    """Return the salary found in payslip text lines, or 0.0 if none is found."""
    amount = 0.0

    # Pass 1: the explicit "SALAIRE BRUT MENSUEL" label; first parsable hit wins.
    for line in lines:
        if 'SALAIRE BRUT MENSUEL' not in line:
            continue
        match = _LABELLED_AMOUNT_RE.search(line)
        if match:
            parsed = _parse_amount(match.group(1))
            if parsed is not None:
                amount = parsed
                break

    # Pass 2: any line mentioning both SALAIRE and BRUT with a "<amount> €".
    if amount == 0.0:
        for line in lines:
            if 'SALAIRE' in line and 'BRUT' in line:
                match = _EURO_AMOUNT_RE.search(line)
                if match:
                    parsed = _parse_amount(match.group(1))
                    if parsed is not None:
                        amount = parsed
                        break

    # Pass 3: salary table rows starting with a known base salary.
    # NOTE(review): behaviour kept exactly as before. Only tokens containing a
    # dot and no comma are accepted, and the last matching row wins because
    # the inner break does not leave the outer loop. That looks unintended for
    # comma-decimal payslips -- confirm against real pdftotext output.
    if amount == 0.0:
        for line in lines:
            if line.strip().startswith(_KNOWN_BASE_SALARY_PREFIXES):
                for part in line.split():
                    if '.' in part and ',' not in part and len(part) > 3:
                        try:
                            amount = float(part)
                            break
                        except ValueError:
                            continue

    return amount


def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Extract one salary transaction per SNCF payslip PDF in a directory.

    Each ``*.pdf`` file in ``directory`` is converted to text with the
    external ``pdftotext -layout`` command, the salary amount is searched
    for in that text, and the period is derived from the file name via
    ``extract_month_from_filename``. Files are processed in sorted order so
    the result (and the CSV) is deterministic.

    Args:
        directory: Directory containing the payslip PDF files.
        output_csv: When true and at least one transaction was produced,
            write ``sncf_all_transactions.csv`` into ``output_dir``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        A list of dicts with the keys ``Date`` ("01/<English month>/<year>"),
        ``Description``, ``Category`` (always "Salary"), ``Amount`` (float,
        0.0 when no amount could be extracted) and ``Source`` (PDF base name).

    A file for which ``pdftotext`` fails or is not installed is reported on
    stdout and skipped. A summary is always printed to stdout.
    """
    # Sorted: glob returns files in arbitrary, filesystem-dependent order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # pdftotext emits UTF-8 by default; decode explicitly instead of
            # relying on the locale so "€" and accents survive everywhere.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, encoding='utf-8', errors='replace',
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        source_name = os.path.basename(pdf_file)
        year, month = extract_month_from_filename(source_name)
        month_name = _MONTH_NAMES[month]

        salary_amount = _extract_salary_amount(result.stdout.split('\n'))
        if salary_amount == 0.0:
            # Keep the record for backward compatibility, but do not let a
            # failed extraction pass unnoticed.
            print(f"Warning: no salary amount found in {pdf_file}")

        all_transactions.append({
            'Date': f"01/{month_name}/{year}",
            'Description': f"Salaire {month_name} {year}",
            'Category': 'Salary',
            'Amount': salary_amount,
            'Source': source_name
        })

    # Output CSV if requested
    if output_csv and all_transactions:
        os.makedirs(output_dir, exist_ok=True)
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")
    total_salary = sum(t['Amount'] for t in all_transactions)
    if total_salary > 0:
        print(f"Total Salary Extracted: €{total_salary:,.2f}")

    return all_transactions

if __name__ == "__main__":
    import argparse

    # Command-line entry point: declare the options in one table, register
    # them, then hand the parsed values to the processing function.
    cli = argparse.ArgumentParser(description='Process SNCF salary statements')
    option_table = (
        ('--pdf-dir', {'default': '../data/pdf/sncf',
                       'help': 'Directory containing SNCF PDF files'}),
        ('--output-dir', {'default': '../../output/csv',
                          'help': 'Directory to save CSV output files'}),
        ('--csv', {'action': 'store_true',
                   'help': 'Output transaction data to CSV files'}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    # Run the extraction over every PDF found in the chosen directory.
    process_sncf_pdf_files(options.pdf_dir, options.csv, options.output_dir)