Enhance SNCF script to extract NET PAYÉ EN EUROS amount

2026-02-09 14:15:15 +01:00
parent 3754bb6ca6
commit ef23d066e0
36 changed files with 713 additions and 122 deletions
--- a/scripts/process_sncf_enhanced.py
+++ b/scripts/process_sncf_enhanced.py
@@ -0,0 +1,173 @@
+import subprocess
+import re
+import csv
+import os
+import glob
+from collections import defaultdict
+
+def _fold_accents(text):
+    """Return *text* upper-cased with diacritics removed (PAYÉ -> PAYE)."""
+    import unicodedata  # function-scope import: the file's import block is unchanged
+    decomposed = unicodedata.normalize('NFKD', text.upper())
+    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
+
+
+def _french_amounts(line):
+    """Return every French-formatted amount ("1 234,56") in *line* as floats."""
+    # One (possibly non-breaking) space separates thousands groups and a comma
+    # marks the two decimals.  Requiring that exact shape keeps amounts in
+    # neighbouring columns (assumed to be several spaces apart) from merging.
+    pattern = r'(?<![\d,])(?:\d{1,3}(?:[ \u00a0\u202f]\d{3})+|\d+),\d{2}(?!\d)'
+    return [float(re.sub(r'\s', '', raw).replace(',', '.'))
+            for raw in re.findall(pattern, line)]
+
+
+def extract_sncf_salary_data(content, filename):
+    """
+    Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.
+
+    Args:
+        content: Text of one payslip (e.g. the output of `pdftotext -layout`).
+        filename: Name of the PDF; the French month name and a 20xx year are
+            read from it (accents and case are ignored).
+
+    Returns:
+        dict with 'month' (English name, 'Unknown' when no month is found),
+        'year' (2025 when no year is found), the floats 'brut_mensuel',
+        'net_imposable', 'net_paye_euros', 'cumul_annuel' (0.0 when nothing
+        could be parsed) and 'mode_paiement'.
+    """
+    months = {
+        'JANVIER': 'January', 'FEVRIER': 'February', 'MARS': 'March',
+        'AVRIL': 'April', 'MAI': 'May', 'JUIN': 'June',
+        'JUILLET': 'July', 'AOUT': 'August', 'SEPTEMBRE': 'September',
+        'OCTOBRE': 'October', 'NOVEMBRE': 'November', 'DECEMBRE': 'December',
+    }
+    folded_name = _fold_accents(filename)
+    # Defaults keep files without a recognisable month/year from raising.
+    month_name = next((en for fr, en in months.items() if fr in folded_name), 'Unknown')
+    # Keep all four digits so a parsed year and the 2025 default are comparable.
+    year_match = re.search(r'20\d{2}', filename)
+    year = int(year_match.group(0)) if year_match else 2025
+
+    salary_data = {
+        'month': month_name,
+        'year': year,
+        'brut_mensuel': 0.0,
+        'net_imposable': 0.0,
+        'net_paye_euros': 0.0,
+        'cumul_annuel': 0.0,
+        'mode_paiement': '',
+    }
+    lines = content.split('\n')
+
+    # Primary format: one table row carrying both labels and, in order,
+    # brut mensuel, net imposable, cumul annuel and net payé.
+    for line in lines:
+        folded = _fold_accents(line)
+        if 'NET PAYE EN EUROS' not in folded or 'BRUT' not in folded:
+            continue
+        amounts = _french_amounts(line)
+        if len(amounts) >= 4:
+            salary_data.update(
+                brut_mensuel=amounts[0],
+                net_imposable=amounts[1],
+                cumul_annuel=amounts[2],
+                net_paye_euros=amounts[3],
+                mode_paiement='virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS',
+            )
+            return salary_data
+
+    # Fallback: only the gross amount is available, so the other figures are
+    # rough ESTIMATES (net ~ 75 % of gross, yearly total = 12 x gross).
+    for line in lines:
+        if 'BRUT MENSUEL' not in _fold_accents(line):
+            continue
+        amounts = _french_amounts(line)
+        if amounts:
+            brut_mensuel = amounts[0]
+            estimated_net = brut_mensuel * 0.75
+            salary_data.update(
+                brut_mensuel=brut_mensuel,
+                net_imposable=estimated_net,
+                net_paye_euros=estimated_net,
+                cumul_annuel=brut_mensuel * 12,
+                mode_paiement='virement SEPA',
+            )
+            break
+    return salary_data
+
+def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
+    """Process SNCF salary PDF files with proper NET PAYÉ extraction.
+
+    Args:
+        directory: Folder whose *.pdf files are converted with `pdftotext`.
+        output_csv: When true, write the rows to sncf_all_transactions.csv.
+        output_dir: Folder for the CSV file; created if it does not exist.
+
+    Returns:
+        One dict per converted PDF, keyed by the CSV column names; files
+        that `pdftotext` could not convert are reported and skipped.
+    """
+    # Sorted so the CSV row order does not depend on directory listing order.
+    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
+    all_transactions = []
+
+    for pdf_file in pdf_files:
+        source_name = os.path.basename(pdf_file)
+        try:
+            # Request and decode UTF-8 explicitly: with the locale's default
+            # codec (e.g. cp1252) labels such as "PAYÉ" would be garbled.
+            result = subprocess.run(
+                ['pdftotext', '-layout', '-enc', 'UTF-8', pdf_file, '-'],
+                capture_output=True, encoding='utf-8', errors='replace', check=True)
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            print(f"Error processing {pdf_file}: {e}")
+            continue
+
+        salary_data = extract_sncf_salary_data(result.stdout, source_name)
+        # NOTE(review): 'Date' holds the English month name, e.g. 01/January/2025.
+        all_transactions.append({
+            'Date': f"01/{salary_data['month']}/{salary_data['year']}",
+            'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
+            'Category': 'Salary',
+            'Amount': salary_data['net_paye_euros'],
+            'Source': source_name,
+            'Brut Mensuel': salary_data['brut_mensuel'],
+            'Net Imposable': salary_data['net_imposable'],
+            'Cumul Annuel': salary_data['cumul_annuel'],
+        })
+
+    if output_csv and all_transactions:
+        os.makedirs(output_dir, exist_ok=True)
+        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
+        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=list(all_transactions[0]))
+            writer.writeheader()
+            writer.writerows(all_transactions)
+        print(f"\nTransaction data saved to {csv_file}")
+
+    print("--- SNCF Salary Statements ---")
+    print(f"Found {len(pdf_files)} salary statement files")
+    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
+    total_net = sum(t['Net Imposable'] for t in all_transactions)
+    if total_brut > 0:
+        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
+        print(f"Total Net Imposable: €{total_net:,.2f}")
+    return all_transactions
+
+if __name__ == "__main__":
+    # Script entry point: declare the CLI options, parse them, run the batch.
+    import argparse
+
+    option_specs = (
+        ('--pdf-dir', dict(default='../data/pdf/sncf', help='Directory containing SNCF PDF files')),
+        ('--output-dir', dict(default='../../output/csv', help='Directory to save CSV output files')),
+        ('--csv', dict(action='store_true', help='Output transaction data to CSV files')),
+    )
+    cli = argparse.ArgumentParser(
+        description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
+    for flag, spec in option_specs:
+        cli.add_argument(flag, **spec)
+    options = cli.parse_args()
+    process_sncf_pdf_files(options.pdf_dir, options.csv, options.output_dir)