# personnal-accounting/scripts/process_sncf.py

#!/usr/bin/env python3

"""
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts
"""

import subprocess
import re
import csv
import os
import glob
import argparse
from collections import defaultdict

# One French-formatted amount: either thousands-grouped ("3 456,78") or plain
# ("3456,78"), with an optional comma-separated decimal part. Requiring groups
# of exactly three digits after a separator is what lets the parser tell a
# thousands separator apart from the whitespace between two columns.
_SNCF_AMOUNT = r'(\d{1,3}(?:\s\d{3})+(?:,\d+)?|\d+(?:,\d+)?)'

# MENSUEL <brut> <net_imposable> <prelevement> <net_paye> EUR
_SNCF_MENSUEL_RE = re.compile(
    r'MENSUEL\s+' + r'\s+'.join([_SNCF_AMOUNT] * 4) + r'\s+EUR'
)


def _parse_french_amount(text):
    """Convert a French-formatted amount such as "3 456,78" to a float."""
    # str.split() with no argument drops every Unicode whitespace character,
    # including the non-breaking spaces often used as thousands separators.
    return float(''.join(text.split()).replace(',', '.'))


def extract_sncf_salary_data(content, filename):
    """Extract salary data from SNCF payslip text, focused on NET PAYÉ EN EUROS.

    The pay period is read from a header such as
    "BULLETIN DE PAIE DU MOIS DE Janvier 2026". The amounts are read from the
    "MENSUEL ..." line that immediately follows the first line containing
    "NET PAYÉ EN EUROS" and that parses successfully.

    Args:
        content: Full text of the payslip (e.g. the output of ``pdftotext``).
        filename: Name of the source file. Currently unused; kept so existing
            callers keep working.

    Returns:
        A dict with the keys ``month`` (English month name), ``year``,
        ``brut_mensuel``, ``net_imposable``, ``net_paye_euros``,
        ``cumul_annuel`` (always 0.0) and ``mode_paiement``. When no salary
        line can be parsed, ``month`` and ``mode_paiement`` are empty strings
        and every amount is 0.0; ``year`` still reflects the header if one was
        found.
    """
    months = {
        'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12, 'DECEMBRE': 12
    }
    month_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    # Fallback period used when the header is missing or the month name is
    # not recognised: January 2025.
    # NOTE(review): a silent fallback can mislabel a payslip - confirm this
    # default is still wanted.
    month_num = 1
    year = 2025

    # Look for a pattern like "MOIS DE Janvier 2026" in the content
    mois_match = re.search(r'MOIS DE\s+(\w+)\s+(\d{4})', content, re.IGNORECASE)
    if mois_match:
        year = int(mois_match.group(2))
        month_num = months.get(mois_match.group(1).upper(), month_num)

    lines = content.split('\n')
    for i, line in enumerate(lines):
        if 'NET PAYÉ EN EUROS' not in line:
            continue

        # The line right after the header is expected to be the MENSUEL line
        next_line = lines[i + 1] if i + 1 < len(lines) else ''
        mensuel_match = _SNCF_MENSUEL_RE.search(next_line)
        if not mensuel_match:
            # Not a parsable salary line; keep scanning for a later header
            continue

        # The third column (prelevement) must be present for the line to
        # match, but its value is not part of the result.
        brut_mensuel, net_imposable, _, net_paye_euros = (
            _parse_french_amount(raw) for raw in mensuel_match.groups()
        )
        return {
            'month': month_names[month_num],
            'year': year,
            'brut_mensuel': brut_mensuel,
            'net_imposable': net_imposable,
            'net_paye_euros': net_paye_euros,
            'cumul_annuel': 0.0,
            'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
        }

    # No salary line found: report an empty record for the detected year
    return {
        'month': '',
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }

def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
    """Process SNCF salary PDF files and build one transaction per payslip.

    Every ``*.pdf`` file directly inside ``directory`` is converted to text
    with the external ``pdftotext`` tool and passed to
    ``extract_sncf_salary_data``. Files whose conversion fails are reported on
    stdout and skipped. A summary is printed once all files are handled.

    Args:
        directory: Directory containing the payslip PDF files.
        output_csv: When true (and at least one transaction exists), write
            the transactions to ``sncf_all_transactions.csv`` in
            ``output_dir``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        A list of transaction dicts with the keys ``Date``, ``Description``,
        ``Category``, ``Amount``, ``Source``, ``Brut Mensuel``,
        ``Net Imposable`` and ``Cumul Annuel``. Payslips for which no month or
        no positive net pay was extracted additionally carry ``Mode Paiement``.
    """
    # Sorted so the transaction order (and the CSV) is reproducible; glob
    # itself returns files in arbitrary order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source = os.path.basename(pdf_file)
        try:
            # Decode explicitly as UTF-8 (pdftotext's default output encoding)
            # rather than the locale encoding, so accented markers such as
            # "NET PAYÉ" survive on every platform.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, text=True, encoding='utf-8', check=True,
            )
        except (subprocess.SubprocessError, OSError, UnicodeError) as e:
            # Missing tool, failed conversion or undecodable output: skip the
            # file but keep processing the others.
            print(f"Error processing {pdf_file}: {e}")
            continue

        salary_data = extract_sncf_salary_data(result.stdout, source)

        transaction = {
            'Date': f"01/{salary_data['month']}/{salary_data['year']}",
            'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
            'Category': 'Salary',
            'Amount': salary_data['net_paye_euros'],
            'Source': source,
            'Brut Mensuel': salary_data['brut_mensuel'],
            'Net Imposable': salary_data['net_imposable'],
            'Cumul Annuel': salary_data['cumul_annuel'],
        }
        if not (salary_data['month'] and salary_data['net_paye_euros'] > 0):
            # Incomplete extraction: the entry is still recorded for data
            # integrity, together with the payment mode.
            transaction['Mode Paiement'] = salary_data['mode_paiement']
        all_transactions.append(transaction)

    # Output CSV with enhanced SNCF data
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)

        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
                          'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
            # Incomplete entries carry an extra 'Mode Paiement' key that is not
            # a CSV column; without extrasaction='ignore' DictWriter raises
            # ValueError on those rows.
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_transactions)

        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    # Calculate totals
    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)

    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions

if __name__ == "__main__":
    # Command-line entry point: read the options, then process every payslip
    # found in the requested directory (argparse is imported at module level).
    cli = argparse.ArgumentParser(
        description='Process SNCF salary statements with enhanced NET PAYÉ extraction',
    )
    cli.add_argument(
        '--pdf-dir',
        default='../data/pdf/sncf',
        help='Directory containing SNCF PDF files',
    )
    cli.add_argument(
        '--output-dir',
        default='output/csv',
        help='Directory to save CSV output files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )
    options = cli.parse_args()

    process_sncf_pdf_files(options.pdf_dir, options.csv, options.output_dir)