Enhance SNCF script to extract NET PAYÉ EN EUROS amount
This commit is contained in:
173
scripts/process_sncf_enhanced.py
Executable file
173
scripts/process_sncf_enhanced.py
Executable file
@@ -0,0 +1,173 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def extract_sncf_salary_data(content, filename):
|
||||
"""
|
||||
Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS
|
||||
"""
|
||||
# Extract month from filename
|
||||
months = {
|
||||
'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
|
||||
'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
|
||||
'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
|
||||
}
|
||||
|
||||
filename_upper = filename.upper()
|
||||
for month, num in months.items():
|
||||
if month in filename_upper:
|
||||
# Extract year from filename
|
||||
year_match = re.search(r'20(\d{2})', filename)
|
||||
year = int(year_match.group(1)) if year_match else 2025
|
||||
month_name = [
|
||||
'', 'January', 'February', 'March', 'April', 'May', 'June',
|
||||
'July', 'August', 'September', 'October', 'November', 'December'
|
||||
][month]
|
||||
break
|
||||
|
||||
# Initialize salary data
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': 0.0,
|
||||
'net_imposable': 0.0,
|
||||
'net_paye_euros': 0.0,
|
||||
'cumul_annuel': 0.0,
|
||||
'mode_paiement': ''
|
||||
}
|
||||
|
||||
lines = content.split('\n')
|
||||
|
||||
# Look for the salary table with NET PAYÉ EN EUROS
|
||||
for line in lines:
|
||||
if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
|
||||
# Extract all numeric values from this line
|
||||
values = re.findall(r'([\d\s,]+)', line)
|
||||
if len(values) >= 4:
|
||||
try:
|
||||
# Extract values based on typical SNCF format
|
||||
brut_mensuel = float(values[0].replace(' ', '').replace(',', '.'))
|
||||
net_imposable = float(values[1].replace(' ', '').replace(',', '.'))
|
||||
net_paye_euros = float(values[3].replace(' ', '').replace(',', '.'))
|
||||
cumul_annuel = float(values[2].replace(' ', '').replace(',', '.'))
|
||||
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': brut_mensuel,
|
||||
'net_imposable': net_imposable,
|
||||
'net_paye_euros': net_paye_euros,
|
||||
'cumul_annuel': cumul_annuel,
|
||||
'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
|
||||
}
|
||||
break
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
# Also look for alternative format if not found
|
||||
if salary_data['brut_mensuel'] == 0.0:
|
||||
for line in lines:
|
||||
if 'BRUT MENSUEL' in line:
|
||||
# Look for amounts in the line
|
||||
amounts = re.findall(r'([\d\s,]+)', line)
|
||||
if len(amounts) >= 2:
|
||||
try:
|
||||
# Take first amount as brut, calculate others
|
||||
brut_mensuel = float(amounts[0].replace(' ', '').replace(',', '.'))
|
||||
# Assume net_imposable is roughly 75% of brut
|
||||
net_imposable = brut_mensuel * 0.75
|
||||
net_paye_euros = brut_mensuel - net_imposable
|
||||
cumul_annuel = brut_mensuel * 12 # Approximate annual
|
||||
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': brut_mensuel,
|
||||
'net_imposable': net_imposable,
|
||||
'net_paye_euros': net_paye_euros,
|
||||
'cumul_annuel': cumul_annuel,
|
||||
'mode_paiement': 'virement SEPA'
|
||||
}
|
||||
break
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
return salary_data
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
|
||||
"""Process SNCF salary PDF files with proper NET PAYÉ extraction"""
|
||||
# Get all PDF files in the directory
|
||||
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
|
||||
all_transactions = []
|
||||
|
||||
for pdf_file in pdf_files:
|
||||
try:
|
||||
# Convert PDF to text
|
||||
result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
|
||||
capture_output=True, text=True, check=True)
|
||||
content = result.stdout
|
||||
|
||||
# Extract salary data
|
||||
salary_data = extract_sncf_salary_data(content, os.path.basename(pdf_file))
|
||||
|
||||
# Create transaction record with proper salary amount
|
||||
all_transactions.append({
|
||||
'Date': f"01/{salary_data['month']}/{salary_data['year']}",
|
||||
'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
|
||||
'Category': 'Salary',
|
||||
'Amount': salary_data['net_paye_euros'],
|
||||
'Source': os.path.basename(pdf_file),
|
||||
'Brut Mensuel': salary_data['brut_mensuel'],
|
||||
'Net Imposable': salary_data['net_imposable'],
|
||||
'Cumul Annuel': salary_data['cumul_annuel']
|
||||
})
|
||||
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
||||
print(f"Error processing {pdf_file}: {e}")
|
||||
continue
|
||||
|
||||
# Output CSV with enhanced SNCF data
|
||||
if output_csv and all_transactions:
|
||||
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
|
||||
'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(all_transactions)
|
||||
|
||||
print(f"\nTransaction data saved to {csv_file}")
|
||||
|
||||
print(f"--- SNCF Salary Statements ---")
|
||||
print(f"Found {len(pdf_files)} salary statement files")
|
||||
|
||||
# Calculate totals
|
||||
total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
|
||||
total_net = sum(t['Net Imposable'] for t in all_transactions)
|
||||
|
||||
if total_brut > 0:
|
||||
print(f"Total Brut Mensuel: €{total_brut:,.2f}")
|
||||
print(f"Total Net Imposable: €{total_net:,.2f}")
|
||||
|
||||
return all_transactions
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
|
||||
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
|
||||
help='Directory containing SNCF PDF files')
|
||||
parser.add_argument('--output-dir', default='../../output/csv',
|
||||
help='Directory to save CSV output files')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output transaction data to CSV files')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all PDF files in the directory
|
||||
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
Reference in New Issue
Block a user