Files
personnal-accounting/scripts/process_sncf.py
Kevin Bataille eb66c7a43e Refactor SNCF processor and add Revolut aggregator
- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line
- Extract month/year from PDF content instead of filename
- Add new Revolut CSV processor to aggregate account statements
- Organize Revolut data files into data/csv/revolut/
- Clean up redundant scripts and reports
2026-02-09 16:17:48 +01:00

182 lines
7.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts
"""
import subprocess
import re
import csv
import os
import glob
import argparse
from collections import defaultdict
# French month names (upper-cased, with and without accents) -> month number.
_FRENCH_MONTHS = {
    'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
    'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
    'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12, 'DECEMBRE': 12,
}

# English month names indexed by month number (index 0 is unused).
_ENGLISH_MONTH_NAMES = [
    '', 'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Payslip period header, e.g. "BULLETIN DE PAIE DU MOIS DE Janvier 2026".
_PERIOD_RE = re.compile(r'MOIS DE\s+(\w+)\s+(\d{4})', re.IGNORECASE)

# One French-formatted amount: either thousands groups separated by a single
# (possibly non-breaking or narrow) space, e.g. "3 500,00", or a plain run of
# digits, e.g. "3500,00" / "3500". The decimal part is optional.
_AMOUNT = r'\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?:,\d+)?|\d+(?:,\d+)?'

# "MENSUEL <brut> <net_imposable> <prelevement> <net_paye> EUR"
_MENSUEL_RE = re.compile(
    r'MENSUEL' + (r'\s+(' + _AMOUNT + r')') * 4 + r'\s+EUR'
)


def _parse_french_amount(text):
    """Convert a French-formatted amount such as "3 500,00" to a float."""
    # str.split() with no argument also splits on non-breaking/narrow spaces,
    # which a plain replace(' ', '') would leave inside the number.
    return float(''.join(text.split()).replace(',', '.'))


def extract_sncf_salary_data(content, filename):
    """Extract salary figures from the text of an SNCF payslip.

    The period is read from a "MOIS DE <month> <year>" header in *content*.
    The amounts are read from the "MENSUEL ... EUR" line that follows the
    line containing "NET PAYÉ EN EUROS" (blank lines in between are skipped).

    Args:
        content: Text of the payslip (as produced by ``pdftotext -layout``).
        filename: Name of the source file. Accepted for interface
            compatibility; it is not used by the extraction.

    Returns:
        A dict with the keys ``month`` (English month name), ``year``,
        ``brut_mensuel``, ``net_imposable``, ``net_paye_euros``,
        ``cumul_annuel`` and ``mode_paiement``. When no MENSUEL line can be
        parsed, ``month`` and ``mode_paiement`` are empty strings and every
        amount is ``0.0``.
    """
    # NOTE: when the period header is missing or names an unknown month, the
    # month falls back to January and, if no year was matched, the year to 2025.
    month_num = 1
    year = 2025

    period = _PERIOD_RE.search(content)
    if period:
        year = int(period.group(2))
        month_num = _FRENCH_MONTHS.get(period.group(1).upper(), month_num)

    month_name = _ENGLISH_MONTH_NAMES[month_num]

    # Result returned when no salary line can be parsed.
    salary_data = {
        'month': '',
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }

    lines = content.split('\n')
    for idx, line in enumerate(lines):
        if 'NET PAYÉ EN EUROS' not in line:
            continue

        # The values sit on the first non-blank line after the header row;
        # layout-preserving text extraction may put empty lines in between.
        candidate = next((row for row in lines[idx + 1:] if row.strip()), '')
        mensuel = _MENSUEL_RE.search(candidate)
        if not mensuel:
            # Not the salary table; keep looking for another header line.
            continue

        # Group 3 (the withholding amount) is matched but not reported.
        return {
            'month': month_name,
            'year': year,
            'brut_mensuel': _parse_french_amount(mensuel.group(1)),
            'net_imposable': _parse_french_amount(mensuel.group(2)),
            'net_paye_euros': _parse_french_amount(mensuel.group(4)),
            'cumul_annuel': 0.0,
            'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
        }

    return salary_data
def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
    """Turn every SNCF payslip PDF in *directory* into a salary transaction.

    Each ``*.pdf`` file is converted to text with the external ``pdftotext``
    command and passed to ``extract_sncf_salary_data``. One transaction is
    recorded per successfully converted file, even when no amount could be
    extracted (the amounts are then zero). Files whose conversion or
    extraction raises are reported on stdout and skipped.

    Args:
        directory: Directory that is searched (non-recursively) for PDFs.
        output_csv: When true and at least one transaction was produced,
            write ``sncf_all_transactions.csv`` into *output_dir*.
        output_dir: Destination directory for the CSV; created if missing.

    Returns:
        The list of transaction dicts, in sorted file-name order.
    """
    # glob returns files in arbitrary order; sort for reproducible output.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)
        try:
            # "-" makes pdftotext write the text to stdout instead of a file.
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
            salary_data = extract_sncf_salary_data(result.stdout, source_name)
        except Exception as e:
            # Deliberately broad: one unreadable payslip must not abort the batch.
            print(f"Error processing {pdf_file}: {e}")
            continue

        # A single record shape for parsed and unparsed payslips alike. The
        # unparsed ones carry an empty month and zero amounts so that every
        # converted file still shows up in the output.
        all_transactions.append({
            'Date': f"01/{salary_data['month']}/{salary_data['year']}",
            'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
            'Category': 'Salary',
            'Amount': salary_data['net_paye_euros'],
            'Source': source_name,
            'Brut Mensuel': salary_data['brut_mensuel'],
            'Net Imposable': salary_data['net_imposable'],
            'Cumul Annuel': salary_data['cumul_annuel'],
            'Mode Paiement': salary_data['mode_paiement']
        })

    if output_csv and all_transactions:
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
                      'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            # 'Mode Paiement' is kept in the returned records but is not a CSV
            # column; without extrasaction='ignore' DictWriter raises
            # ValueError on any key missing from fieldnames.
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)
    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions
if __name__ == "__main__":
    # Command-line entry point: read the options, then process the payslips.
    cli = argparse.ArgumentParser(
        description='Process SNCF salary statements with enhanced NET PAYÉ extraction'
    )
    cli.add_argument(
        '--pdf-dir',
        default='../data/pdf/sncf',
        help='Directory containing SNCF PDF files',
    )
    cli.add_argument(
        '--output-dir',
        default='output/csv',
        help='Directory to save CSV output files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )
    options = cli.parse_args()

    process_sncf_pdf_files(
        directory=options.pdf_dir,
        output_csv=options.csv,
        output_dir=options.output_dir,
    )