- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line - Extract month/year from PDF content instead of filename - Add new Revolut CSV processor to aggregate account statements - Organize Revolut data files into data/csv/revolut/ - Clean up redundant scripts and reports
182 lines
7.3 KiB
Python
Executable File
182 lines
7.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts
|
|
"""
|
|
|
|
import subprocess
|
|
import re
|
|
import csv
|
|
import os
|
|
import glob
|
|
import argparse
|
|
from collections import defaultdict
|
|
|
|
# French month names (upper-cased, with and without accents) -> month number.
_FRENCH_MONTHS = {
    'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
    'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
    'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12, 'DECEMBRE': 12,
}

# English month names indexed by month number (index 0 is unused).
_ENGLISH_MONTH_NAMES = [
    '', 'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# One French-formatted amount: either 3-digit groups separated by a single
# (regular, non-breaking or narrow non-breaking) space, e.g. "3 456", or a
# plain run of digits, optionally followed by a comma and the decimals.
# Anchoring each amount on its own structure keeps "3 456,78" in one column
# instead of letting it spill over into the next one.
_SNCF_AMOUNT = r'(?:\d{1,3}(?:[ \u00a0\u202f]\d{3})+|\d+)(?:,\d+)?'

# Values row: MENSUEL <brut> <net_imposable> <prelevement> <net_paye> EUR
_SNCF_MENSUEL_RE = re.compile(
    r'MENSUEL'
    r'\s+(?P<brut>' + _SNCF_AMOUNT + r')'
    r'\s+(?P<net_imposable>' + _SNCF_AMOUNT + r')'
    r'\s+(?P<prelevement>' + _SNCF_AMOUNT + r')'
    r'\s+(?P<net_paye>' + _SNCF_AMOUNT + r')'
    r'\s+EUR'
)

# Header of the salary table; tolerant of case, repeated spaces and of the
# accent being missing or encoded as a combining character.
_SNCF_NET_PAYE_RE = re.compile(r'NET\s+PAY(?:É|E\u0301?)\s+EN\s+EUROS', re.IGNORECASE)


def _parse_french_amount(raw):
    """Convert a French-formatted amount such as '3 456,78' to a float."""
    # str.split() with no argument splits on every Unicode whitespace
    # character, so non-breaking thousands separators are removed as well.
    return float(''.join(raw.split()).replace(',', '.'))


def extract_sncf_salary_data(content, filename):
    """Extract salary data from SNCF payslip text, focusing on NET PAYÉ EN EUROS.

    The pay period is read from a "MOIS DE <month> <year>" mention in the
    text. The amounts are read from the "MENSUEL ... EUR" row that follows a
    "NET PAYÉ EN EUROS" header line.

    Args:
        content: Text of the payslip (as produced by the PDF-to-text step).
        filename: Name of the source file. Not used by the extraction; kept
            so existing callers keep working.

    Returns:
        A dict with the keys 'month' (English month name, or '' when no
        MENSUEL row could be parsed), 'year', 'brut_mensuel', 'net_imposable',
        'net_paye_euros', 'cumul_annuel' (always 0.0) and 'mode_paiement'.
        All amounts are 0.0 when no MENSUEL row was found.
    """
    # NOTE: when the period cannot be read from the text, the historical
    # fallback of January 2025 is kept for backward compatibility.
    month_num = 1
    year = 2025

    period = re.search(r'MOIS DE\s+(\w+)\s+(\d{4})', content, re.IGNORECASE)
    if period:
        year = int(period.group(2))
        # An unrecognised month name leaves the fallback month in place.
        month_num = _FRENCH_MONTHS.get(period.group(1).upper(), month_num)

    month_name = _ENGLISH_MONTH_NAMES[month_num]

    # Result returned when no usable MENSUEL row is found.
    salary_data = {
        'month': '',
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': '',
    }

    lines = content.split('\n')

    for index, line in enumerate(lines):
        if not _SNCF_NET_PAYE_RE.search(line):
            continue

        # The values row is the first non-blank line after the header; blank
        # lines between the two are skipped.
        values_line = next(
            (candidate for candidate in lines[index + 1:] if candidate.strip()),
            '',
        )
        row = _SNCF_MENSUEL_RE.search(values_line)
        if row is None:
            # Not the salary table header we want; try the next occurrence.
            continue

        salary_data = {
            'month': month_name,
            'year': year,
            'brut_mensuel': _parse_french_amount(row.group('brut')),
            'net_imposable': _parse_french_amount(row.group('net_imposable')),
            'net_paye_euros': _parse_french_amount(row.group('net_paye')),
            'cumul_annuel': 0.0,
            'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS',
        }
        break

    return salary_data
|
|
|
|
def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
    """Process SNCF salary PDF files and build one transaction per payslip.

    Every ``*.pdf`` file directly inside ``directory`` is converted to text
    with the external ``pdftotext -layout`` command and handed to
    ``extract_sncf_salary_data``. A transaction record is created for every
    file that could be converted, even when no amount could be extracted
    (the amounts are then zero), so that each payslip stays visible in the
    output.

    Args:
        directory: Directory containing the SNCF PDF files.
        output_csv: When true and at least one transaction was built, write
            the transactions to ``sncf_all_transactions.csv`` in ``output_dir``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        A list of transaction dicts with the keys 'Date', 'Description',
        'Category', 'Amount', 'Source', 'Brut Mensuel', 'Net Imposable',
        'Cumul Annuel' and 'Mode Paiement'.
    """
    # Sorted so that the row order does not depend on directory listing order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)

        try:
            # Convert the PDF to text on stdout, keeping the page layout.
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
            salary_data = extract_sncf_salary_data(result.stdout, source_name)
        except Exception as e:
            # Best effort per file: report the failure and keep going with
            # the remaining payslips.
            print(f"Error processing {pdf_file}: {e}")
            continue

        month = salary_data.get('month', '')
        year = salary_data.get('year', '2025')

        # Every record has the same shape whether or not the extraction
        # succeeded; a failed extraction simply carries zero amounts.
        all_transactions.append({
            'Date': f"01/{month}/{year}",
            'Description': f"Salaire {month} {year}",
            'Category': 'Salary',
            'Amount': salary_data.get('net_paye_euros', 0),
            'Source': source_name,
            'Brut Mensuel': salary_data.get('brut_mensuel', 0),
            'Net Imposable': salary_data.get('net_imposable', 0),
            'Cumul Annuel': salary_data.get('cumul_annuel', 0),
            'Mode Paiement': salary_data.get('mode_paiement', ''),
        })

    # Output CSV with enhanced SNCF data
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)

        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
                          'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
            # 'Mode Paiement' is deliberately not a CSV column; without
            # extrasaction='ignore' DictWriter raises ValueError on it.
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_transactions)

        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    # Calculate totals
    total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
    total_net = sum(t['Net Imposable'] for t in all_transactions)

    if total_brut > 0:
        print(f"Total Brut Mensuel: €{total_brut:,.2f}")
        print(f"Total Net Imposable: €{total_net:,.2f}")

    return all_transactions
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the options, then process every PDF
    # found in the selected directory.
    cli = argparse.ArgumentParser(
        description='Process SNCF salary statements with enhanced NET PAYÉ extraction'
    )
    cli.add_argument(
        '--pdf-dir',
        default='../data/pdf/sncf',
        help='Directory containing SNCF PDF files',
    )
    cli.add_argument(
        '--output-dir',
        default='output/csv',
        help='Directory to save CSV output files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )

    options = cli.parse_args()

    # Positional order: PDF directory, CSV flag, output directory.
    process_sncf_pdf_files(options.pdf_dir, options.csv, options.output_dir)
|
|
|
|
|