Refactor SNCF processor and add Revolut aggregator

- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line
- Extract month/year from PDF content instead of filename
- Add new Revolut CSV processor to aggregate account statements
- Organize Revolut data files into data/csv/revolut/
- Clean up redundant scripts and reports
This commit is contained in:
Kevin Bataille
2026-02-09 16:17:48 +01:00
parent ef23d066e0
commit eb66c7a43e
85 changed files with 3270 additions and 2106 deletions

View File

@@ -1,328 +0,0 @@
#!/usr/bin/env python3
"""
Script to aggregate all account statements by month or year
"""
import os
import csv
import sys
import argparse
import re
from datetime import datetime
from collections import defaultdict
import calendar
def parse_date(date_str, source_file):
    """
    Parse a date string into a normalized (year, month, day) tuple.

    Tries the known bank-statement formats in order; if none match, falls
    back to extracting a French month name and a 4-digit year from the
    source filename (SNCF payslips). As a last resort returns the current
    year/month with day 1.

    Args:
        date_str: Raw date text taken from a CSV row.
        source_file: Filename the row came from (used for the SNCF fallback).

    Returns:
        (year, month, day) as integers.
    """
    # Try different date formats
    formats = [
        '%d/%m/%Y',  # DD/MM/YYYY
        '%m/%d/%Y',  # MM/DD/YYYY (Amex format)
        '%Y-%m-%d',  # YYYY-MM-DD (Revolut format)
    ]
    for fmt in formats:
        try:
            dt = datetime.strptime(date_str, fmt)
            return (dt.year, dt.month, dt.day)
        except ValueError:
            continue
    # Try to extract from filename (for SNCF payslips, e.g. "salaire_mars_2023.pdf")
    if 'salaire' in source_file.lower():
        months = ['janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin',
                  'juillet', 'aout', 'septembre', 'octobre', 'novembre', 'decembre']
        for i, month in enumerate(months, 1):
            if month in source_file.lower():
                year_match = re.search(r'20(\d{2})', source_file)
                # BUG FIX: group(0) is the full 4-digit year ("2023");
                # group(1) captured only the last two digits (23).
                year = int(year_match.group(0)) if year_match else datetime.now().year
                return (year, i, 1)
    # Default: return current date
    return (datetime.now().year, datetime.now().month, 1)
def categorize_institution(source_file):
    """
    Map a source filename to the financial institution it belongs to.

    Matching is case-insensitive and ordered: the first keyword group
    found in the filename decides; otherwise 'Other' is returned.
    """
    name = source_file.lower()
    rules = [
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait de comptes')),
        ('Revolut', ('revolut',)),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', '2-la.poste', 'releve_ccp')),
    ]
    for institution, keywords in rules:
        if any(keyword in name for keyword in keywords):
            return institution
    return 'Other'
def process_csv_file(file_path):
    """
    Read one aggregated CSV file and return its rows as transaction dicts.

    Rows without a 'Date' value are skipped. The amount is taken from the
    first non-empty of the 'Amount', 'Debit' or 'Credit' columns, with a
    comma accepted as decimal separator; unparseable amounts become 0.
    """
    institution = categorize_institution(os.path.basename(file_path))
    transactions = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            date_str = row.get('Date', '')
            if not date_str:
                # Without a date the row cannot be bucketed by month/year.
                continue
            year, month, day = parse_date(date_str, row.get('Source', ''))
            # Get amount (handle different column names)
            raw_amount = row.get('Amount', '') or row.get('Debit', '') or row.get('Credit', '0')
            try:
                amount = float(raw_amount.replace(',', '.')) if raw_amount else 0
            except ValueError:
                amount = 0
            transactions.append({
                'year': year,
                'month': month,
                'day': day,
                'date_str': date_str,
                'description': row.get('Description', ''),
                'category': row.get('Category', 'Other'),
                'amount': amount,
                'institution': institution,
                'source': row.get('Source', os.path.basename(file_path)),
            })
    return transactions
def main():
    """
    CLI entry point: aggregate per-institution CSV exports into reports.

    Reads every *.csv file in --input-dir, buckets transactions by
    (year, month), then writes:
      - monthly_summary.csv and yearly_summary.csv (always),
      - per-year category + transaction reports (with --annual), or
      - per-month transaction reports (default).
    """
    parser = argparse.ArgumentParser(description='Aggregate all account statements by month or year')
    parser.add_argument('--input-dir', default='output/csv',
                        help='Directory containing CSV files to aggregate (default: output/csv)')
    parser.add_argument('--output-dir', default='output/reports',
                        help='Directory to save aggregated reports (default: output/reports)')
    parser.add_argument('--annual', action='store_true',
                        help='Create annual reports instead of monthly reports')
    parser.add_argument('--year', type=int,
                        help='Generate reports for a specific year only')
    # NOTE(review): --year is only honored inside the --annual branch below;
    # the monthly reports and both summary files ignore it.
    args = parser.parse_args()
    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    report_type = "Annual" if args.annual else "Monthly"
    print(f"\n{'='*60}")
    print(f"{report_type} Aggregation of All Account Statements")
    print(f"Input Directory: {os.path.abspath(args.input_dir)}")
    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
    if args.year:
        print(f"Year Filter: {args.year}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    # Collect all transactions
    all_transactions = []
    # Find all CSV files in input directory
    csv_files = [f for f in os.listdir(args.input_dir) if f.endswith('.csv')]
    if not csv_files:
        print(f"\nError: No CSV files found in {args.input_dir}")
        return
    # Process each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(args.input_dir, csv_file)
        print(f"\nProcessing: {csv_file}")
        transactions = process_csv_file(file_path)
        all_transactions.extend(transactions)
        print(f" Found {len(transactions)} transactions")
    # Group transactions by month
    monthly_transactions = defaultdict(list)
    for transaction in all_transactions:
        key = (transaction['year'], transaction['month'])
        monthly_transactions[key].append(transaction)
    # Create monthly summary report
    summary_file = os.path.join(args.output_dir, 'monthly_summary.csv')
    with open(summary_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Header
        writer.writerow([
            'Year', 'Month', 'Total Income', 'Total Expenses', 'Net Balance',
            'Transaction Count', 'Institutions'
        ])
        # Process each month
        for year, month in sorted(monthly_transactions.keys()):
            transactions = monthly_transactions[(year, month)]
            month_name = calendar.month_name[month]
            # Calculate totals
            total_income = sum(t['amount'] for t in transactions if t['amount'] < 0)  # Negative amounts are income in Revolut
            total_expenses = sum(t['amount'] for t in transactions if t['amount'] > 0)
            net_balance = total_income + total_expenses
            transaction_count = len(transactions)
            # Get unique institutions
            institutions = sorted(list(set(t['institution'] for t in transactions)))
            institutions_str = ', '.join(institutions)
            # Write row
            writer.writerow([
                year, month_name, total_income, total_expenses, net_balance,
                transaction_count, institutions_str
            ])
    # Create yearly summary
    yearly_summary = defaultdict(lambda: {'income': 0, 'expenses': 0, 'count': 0})
    for transaction in all_transactions:
        year = transaction['year']
        yearly_summary[year]['count'] += 1
        if transaction['amount'] < 0:
            yearly_summary[year]['income'] += transaction['amount']
        else:
            yearly_summary[year]['expenses'] += transaction['amount']
    # Create yearly summary file
    yearly_file = os.path.join(args.output_dir, 'yearly_summary.csv')
    with open(yearly_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Year', 'Total Income', 'Total Expenses', 'Net Balance', 'Transaction Count'])
        for year in sorted(yearly_summary.keys()):
            data = yearly_summary[year]
            net_balance = data['income'] + data['expenses']
            writer.writerow([
                year, data['income'], data['expenses'], net_balance, data['count']
            ])
    # Create annual reports if requested
    generated_files = [
        os.path.basename(summary_file),
        os.path.basename(yearly_file)
    ]
    if args.annual:
        # Create annual reports
        for year in sorted(yearly_summary.keys()):
            if args.year and year != args.year:
                continue  # Skip years not matching filter
            print(f"\nCreating annual report for {year}...")
            # Get all transactions for the year
            year_transactions = [t for t in all_transactions if t['year'] == year]
            # Group by category for the annual report
            categories = defaultdict(lambda: {'count': 0, 'total': 0})
            for transaction in year_transactions:
                category = transaction['category']
                amount = transaction['amount']
                categories[category]['count'] += 1
                categories[category]['total'] += amount
            # Create annual detailed report
            annual_file = os.path.join(args.output_dir, f'annual_report_{year}.csv')
            with open(annual_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['Category', 'Transaction Count', 'Total Amount', 'Percentage'])
                year_total = sum(c['total'] for c in categories.values())
                # Sort categories by total amount
                sorted_categories = sorted(categories.items(), key=lambda x: x[1]['total'], reverse=True)
                for category, data in sorted_categories:
                    percentage = (data['total'] / year_total) * 100 if year_total != 0 else 0
                    writer.writerow([category, data['count'], data['total'], f"{percentage:.2f}%"])
            # Create annual transactions file
            annual_transactions_file = os.path.join(args.output_dir, f'annual_transactions_{year}.csv')
            with open(annual_transactions_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=[
                    'Date', 'Description', 'Category', 'Amount',
                    'Institution', 'Source'
                ])
                writer.writeheader()
                # Sort transactions by date
                sorted_transactions = sorted(year_transactions, key=lambda x: (x['month'], x['day'], x['description']))
                for transaction in sorted_transactions:
                    writer.writerow({
                        'Date': transaction['date_str'],
                        'Description': transaction['description'],
                        'Category': transaction['category'],
                        'Amount': transaction['amount'],
                        'Institution': transaction['institution'],
                        'Source': transaction['source']
                    })
            generated_files.append(os.path.basename(annual_file))
            generated_files.append(os.path.basename(annual_transactions_file))
            print(f" Created {os.path.basename(annual_file)} and {os.path.basename(annual_transactions_file)}")
    else:
        # Create monthly reports (existing functionality)
        for year, month in sorted(monthly_transactions.keys()):
            month_name = calendar.month_name[month].lower()
            transactions = monthly_transactions[(year, month)]
            # Create filename
            detail_file = os.path.join(args.output_dir, f'transactions_{year}_{month_name}.csv')
            with open(detail_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=[
                    'Date', 'Description', 'Category', 'Amount',
                    'Institution', 'Source'
                ])
                writer.writeheader()
                # Sort transactions by date
                sorted_transactions = sorted(transactions, key=lambda x: (x['day'], x['description']))
                for transaction in sorted_transactions:
                    writer.writerow({
                        'Date': transaction['date_str'],
                        'Description': transaction['description'],
                        'Category': transaction['category'],
                        'Amount': transaction['amount'],
                        'Institution': transaction['institution'],
                        'Source': transaction['source']
                    })
            generated_files.append(f'transactions_{year}_{month_name}.csv')
    # Print summary statistics
    print(f"\n{'='*60}")
    print(f"Aggregation Complete")
    print(f"Total Transactions: {len(all_transactions)}")
    print(f"Years with Data: {len(yearly_summary)}")
    if not args.annual:
        print(f"Months with Data: {len(monthly_transactions)}")
    print(f"{'='*60}")
    # List generated files
    print("\nGenerated Files:")
    for file in generated_files:
        file_path = os.path.join(args.output_dir, file)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f" - {file} ({file_size:,} bytes)")
if __name__ == "__main__":
main()

View File

@@ -1 +0,0 @@
[["pdftotext", "-layout", "/home/acid/Downloads/comptabilite/american.express/2025-01-02.pdf", "-"], ["line for line in amex_lines if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR')]\n \n print(\"=== AMEX JANUARY QC ===\")\n print(f\"Transactions found: {len(amex_trans)}\")\n for i, line in enumerate(amex_trans[:5]):\n parts = line.split()\n try:\n amount = float(parts[-1].replace(',', '.'))\n desc = ' '.join(parts[2:-1])\n print(f\" {i+1}. {desc}: u20ac{amount:.2f}\")\n except:\n print(f\" {i+1}. {line}\")\nexcept Exception as e:\n print(f\"Amex QC error: {e}\")\n\n# MONABANQ QC\ntry:\n result = subprocess.run(['pdftotext', '-layout', '/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf', '-'], capture_output=True, text=True)\n monabanq_lines = result.stdout.split('\\n')\n trans_started = False\n monabanq_debits = []\n \n for line in monabanq_lines:\n if \"SOLDE\" in line:\n trans_started = True\n continue\n if trans_started and \"IBAN", "in line:\n break\n if trans_started and re.search(r'd+,d+$', line):\n parts = line.split()\n if len(parts) >= 4:\n try:\n amount = float(parts[-1].replace(',', '.'))\n desc = ' '.join(parts[2:-1])\n monabanq_debits.append((desc, amount))\n except:\n continue\n \n print(f\"\n=== MONABANQ JANUARY QC ===\")\n print(f\"Debits found: {len(monabanq_debits)}\")\n for i, (desc, amt) in enumerate(monabanq_debits[:5]):\n print(f\" {i+1}. 
{desc}: u20ac{amt:.2f}\")\nexcept Exception as e:\n print(f\"Monabanq QC error: {e}\")\n\n# REVOLUT QC\ntry:\n with open('/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv', 'r') as f:\n reader = csv.DictReader(f)\n revolut_expenses = []\n for row in reader:\n if row['Currency'] == 'EUR' and float(row['Amount']) < 0:\n desc = row['Description']\n amt = abs(float(row['Amount']))\n revolut_expenses.append((desc, amt))\n \n print(f\"\n=== REVOLUT JANUARY QC ===\")\n print(f\"Expenses found: {len(revolut_expenses)}\")\n for i, (desc, amt) in enumerate(revolut_expenses[:5]):\n print(f\" {i+1}. {desc}: u20ac{amt:.2f}\")\nexcept Exception as e:\n print(f\"Revolut QC error: {e}\")\n\nprint(\"\n=== QUALITY CONTROL SUMMARY ===\")\nprint(\"u2713 Scripts are extracting transactions from source files\")\nprint(\"u2713 Transaction amounts appear to be parsed correctly\")\nprint(\"u2192 Data processing is working as expected\")"]]

View File

@@ -1,159 +0,0 @@
#!/usr/bin/env python3
"""
Fully dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
import glob
import re
import argparse
from datetime import datetime
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Scans the data directory for subdirectories containing PDFs, maps each
    one to an institution-specific processing script by directory-name
    keywords, then runs those scripts as subprocesses and prints a success
    count.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: ../data/pdf)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files (default: ../output/csv)')
    parser.add_argument('--csv', action='store_true',
                        help='Generate CSV output files')
    args = parser.parse_args()
    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    # Determine data directory (relative paths resolve against project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')
    # Set output directory
    if args.output_dir:
        output_dir = args.output_dir
        if not os.path.isabs(output_dir):
            output_dir = os.path.join(project_root, args.output_dir)
    else:
        output_dir = os.path.join(project_root, 'output/csv')
    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    # Discover all PDF directories
    pdf_dirs = {}
    # Get all directories in the data directory
    if not os.path.exists(data_dir):
        print(f"Error: Data directory not found: {data_dir}")
        return
    for item in os.listdir(data_dir):
        dir_path = os.path.join(data_dir, item)
        if os.path.isdir(dir_path):
            # Check if this directory contains PDF files
            pdf_files = glob.glob(os.path.join(dir_path, "*.pdf"))
            if pdf_files:
                # Determine account type based on directory name
                dir_name_lower = item.lower()
                if 'boursobank' in dir_name_lower or 'releve-compte' in dir_name_lower:
                    account_type = 'Boursobank'
                    script_name = 'process_bourso.py'
                elif 'american_express' in dir_name_lower or 'amex' in dir_name_lower:
                    account_type = 'American Express'
                    script_name = 'process_amex.py'
                elif 'monabanq' in dir_name_lower or 'extrait' in dir_name_lower:
                    account_type = 'Monabanq'
                    script_name = 'process_monabanq.py'
                elif 'sncf' in dir_name_lower or 'salaire' in dir_name_lower:
                    account_type = 'SNCF'
                    script_name = 'process_sncf_improved.py'
                elif 'la_poste' in dir_name_lower or 'la-poste' in dir_name_lower or 'releve_ccp' in dir_name_lower:
                    account_type = 'La Poste'
                    script_name = 'process_laposte_improved.py'
                elif 'impots' in dir_name_lower or 'impot' in dir_name_lower:
                    account_type = 'Impôts'
                    script_name = None  # Skip tax documents
                else:
                    # Unknown directory: derive a title-cased account name and
                    # guess a matching processor script name.
                    account_type = item.replace('_', ' ').title()
                    script_name = f'process_{account_type.lower().replace(" ", "_")}.py'
                pdf_dirs[account_type] = {
                    'path': dir_path,
                    'count': len(pdf_files),
                    'files': pdf_files,
                    'script': script_name
                }
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return
    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} PDF files")
    # Process each account type
    success_count = 0
    for account_type, info in pdf_dirs.items():
        if not info['script']:
            print(f"\nSkipping {account_type}: No processing script available")
            continue
        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv
        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue
        # Build command (Revolut consumes CSVs, so it gets --csv-dir)
        cmd = [sys.executable,
               os.path.join(script_dir, info['script']),
               '--pdf-dir' if account_type != 'Revolut' else '--csv-dir',
               process_dir,
               '--output-dir', output_dir]
        if args.csv:
            cmd.append('--csv')
        print(f"\nProcessing {account_type}...")
        print(f"Running: {' '.join(cmd[2:])}")
        try:
            # NOTE(review): capture_output without text=True yields bytes, so
            # the print below emits a b'...' literal — consider text=True.
            result = subprocess.run(cmd, check=True, capture_output=True)
            if result.stdout:
                print(result.stdout)
            # NOTE(review): check=True raises CalledProcessError on non-zero
            # exits, so the else branch below appears unreachable.
            if result.returncode == 0:
                success_count += 1
                print(f"{account_type} processing completed successfully")
            else:
                print(f"{account_type} processing failed with exit code {result.returncode}")
        except subprocess.CalledProcessError as e:
            print(f"✗ Error processing {account_type}: {e}")
    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")
if __name__ == "__main__":
main()

View File

@@ -1,173 +0,0 @@
#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
import glob
import re
from collections import defaultdict
import calendar
import argparse
from datetime import datetime
def discover_pdf_directories(base_data_dir):
    """
    Scan base data directory and return all subdirectories containing PDF files.

    Returns a dict mapping the detected account type to a dict with the
    directory 'path', the PDF 'count' and the list of 'files'. Account
    types are detected by ordered keyword matching on the directory name;
    unknown names fall back to a title-cased version of the name.
    """
    keyword_types = [
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait')),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', 'la-poste', 'releve_ccp')),
        ('Impôts', ('impots', 'impot')),
    ]
    discovered = {}
    for entry in os.listdir(base_data_dir):
        entry_path = os.path.join(base_data_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        pdf_files = glob.glob(os.path.join(entry_path, "*.pdf"))
        if not pdf_files:
            # Only directories that actually hold PDFs are reported.
            continue
        entry_lower = entry.lower()
        account_type = next(
            (name for name, keywords in keyword_types
             if any(keyword in entry_lower for keyword in keywords)),
            entry.replace('_', ' ').title(),
        )
        discovered[account_type] = {
            'path': entry_path,
            'count': len(pdf_files),
            'files': pdf_files
        }
    return discovered
def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """
    Run a per-institution processing script over one directory of PDFs.

    Invokes `process_script` via the current Python interpreter with
    --pdf-dir/--output-dir/--csv arguments.

    Args:
        process_script: Path (or name) of the processor script to run.
        pdf_directory: Directory expected to contain the *.pdf inputs.
        output_dir: Directory where the script should write its CSV output.

    Returns:
        bool: True when the script ran successfully; False when the
        directory is missing or empty, or the script failed.
        (BUG FIX: the original mixed [], True/False and 0 as return
        values; callers only test truthiness, so a plain bool is
        backward-compatible and consistent.)
    """
    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False
    # Get all PDF files
    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return False
    # Build command
    script_path = os.path.abspath(process_script)
    script_dir = os.path.dirname(script_path)
    cmd = [sys.executable, os.path.join(script_dir, os.path.basename(process_script)),
           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']
    # Run the processing script
    try:
        # text=True so stdout is a str (the original printed a b'...' literal).
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        return result.returncode == 0
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        return False
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Discovers PDF-bearing subdirectories with discover_pdf_directories(),
    maps each account type to its processor via `script_map`, runs the
    scripts through process_dynamic_pdf_files(), and prints a success count.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()
    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    # Determine data directory (relative paths resolve against project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')
    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return
    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")
    # Define processing scripts for each account type
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None  # No processing script for tax documents yet
    }
    # Process each account type
    success_count = 0
    for account_type, info in pdf_dirs.items():
        if account_type not in script_map:
            print(f"\nWarning: No processing script available for {account_type}")
            continue
        # NOTE(review): 'Impôts' maps to None but still passes this membership
        # test, so None would be handed to process_dynamic_pdf_files as the
        # script path — verify intended behavior.
        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv
        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue
        success = process_dynamic_pdf_files(
            script_map[account_type],
            process_dir,
            output_dir
        )
        if success:
            success_count += 1
    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")
if __name__ == "__main__":
main()

View File

@@ -1,61 +0,0 @@
#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
def main():
    """
    Discover and process all financial statements by delegating to
    dynamic_processor.py with the project's standard data/output layout.
    """
    import argparse
    from datetime import datetime

    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()
    # Resolve project-relative paths from this script's location.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    data_dir = os.path.join(project_root, 'data/pdf')
    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)
    # BUG FIX: the full banner (including the date) now prints in order from
    # main(); previously the Date/separator lines were printed from the
    # __main__ guard *before* the banner, producing misordered output.
    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    # Build command
    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
           '--data-dir', data_dir, '--output-dir', output_dir]
    # Run the dynamic processor
    try:
        # text=True so the child's stdout prints as a string, not b'...'.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"\nDiscovery Results:")
        print(result.stdout)
        # check=True raises on non-zero exit, so reaching here means success.
        print(f"\n{'='*60}")
        print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,56 +0,0 @@
#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
from datetime import datetime
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Resolves the project's standard data/output directories and delegates the
    actual work to dynamic_processor.py as a subprocess.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    arg_parser.add_argument('--output-dir', default=None,
                            help='Directory to save CSV output files')
    options = arg_parser.parse_args()

    # Resolve project layout relative to this script's location.
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.dirname(here)
    data_dir = os.path.join(root, 'data/pdf')

    # Output location: CLI override or the project default.
    out_dir = options.output_dir if options.output_dir else os.path.join(root, 'output/csv')
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(out_dir)}")

    # Delegate discovery and processing to dynamic_processor.py.
    command = [sys.executable, os.path.join(here, 'dynamic_processor.py'),
               '--data-dir', data_dir, '--output-dir', out_dir]
    try:
        completed = subprocess.run(command, check=True, capture_output=True)
        print(f"\nDiscovery Results:")
        print(completed.stdout)
        if completed.returncode == 0:
            print(f"\n{'='*60}")
            print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(out_dir)}")
        else:
            print(f"\nError during dynamic processing: exit code {completed.returncode}")
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,89 +0,0 @@
#!/usr/bin/env python3
"""
Master script to process all financial statements and generate CSV outputs
"""
import os
import subprocess
import sys
from datetime import datetime
def run_script(script_name, csv_output=False):
    """Run a processing script (optionally with --csv); return True on success."""
    command = [sys.executable, script_name]
    if csv_output:
        command.append('--csv')
    # Human-readable banner derived from the script filename.
    banner = script_name.replace('process_', '').replace('.py', '').replace('_', ' ').title()
    try:
        print(f"\n{'='*50}")
        print(f"Processing {banner}...")
        print('='*50)
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running {script_name}: {e}")
        return False
    return True
def main():
    """
    CLI entry point: run the per-institution processing scripts.

    With no institution flag, every script runs; with one or more flags
    (--bourso, --amex, ...), only the selected institutions run. The --csv
    flag is forwarded to each script via run_script().
    """
    import argparse
    parser = argparse.ArgumentParser(description='Process all financial statements')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    parser.add_argument('--bourso', action='store_true',
                        help='Process only BoursoBank statements')
    parser.add_argument('--amex', action='store_true',
                        help='Process only American Express statements')
    parser.add_argument('--monabanq', action='store_true',
                        help='Process only Monabanq statements')
    parser.add_argument('--revolut', action='store_true',
                        help='Process only Revolut statements')
    parser.add_argument('--sncf', action='store_true',
                        help='Process only SNCF statements')
    parser.add_argument('--laposte', action='store_true',
                        help='Process only La Poste statements')
    args = parser.parse_args()
    print(f"\n{'='*60}")
    print(f"Financial Statement Processor")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")
    # Map each selection flag to its processing script. When no flag is set,
    # all scripts run. (Previously the `not any([...six flags...])` test was
    # copy-pasted six times — hoisted to a single computation.)
    selections = [
        (args.bourso, 'process_bourso.py'),
        (args.amex, 'process_amex.py'),
        (args.monabanq, 'process_monabanq.py'),
        (args.revolut, 'process_expenses.py'),
        (args.sncf, 'process_sncf.py'),
        (args.laposte, 'process_laposte.py'),
    ]
    run_all = not any(flag for flag, _ in selections)
    scripts_to_run = [script for flag, script in selections if flag or run_all]
    # Run each script
    success_count = 0
    output_dir = '../output/csv'
    os.makedirs(output_dir, exist_ok=True)
    for script in scripts_to_run:
        if os.path.exists(script):
            # run_script forwards the --csv flag to the child script.
            if run_script(script, args.csv):
                success_count += 1
        else:
            print(f"Script not found: {script}")
    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(scripts_to_run)} scripts executed successfully")
    if args.csv:
        print("CSV files have been generated for each directory")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()

1
scripts/process_amex.py Normal file → Executable file
View File

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import subprocess
import re

3
scripts/process_bourso.py Normal file → Executable file
View File

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import re
import csv
@@ -67,7 +68,7 @@ def process_bourso_statement(file_path, output_csv=False, output_dir='../../outp
# Output CSV if requested
if output_csv:
csv_file = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '_transactions.csv')
csv_file = os.path.join(output_dir, 'boursobank_all_transactions.csv')
os.makedirs(output_dir, exist_ok=True)
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Value Date']

1
scripts/process_expenses.py Normal file → Executable file
View File

@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import csv
import glob

View File

@@ -1,107 +0,0 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def categorize_laposte_transaction(description):
    """Classify a La Poste statement line by keywords in its description."""
    text = description.lower()
    # Ordered rules: the first matching keyword group decides the category.
    rules = (
        (('virement', 'vir'), 'Transfer'),
        (('retrait',), 'Cash Withdrawal'),
        (('carte', 'paiement'), 'Card Payment'),
        (('frais',), 'Bank Fees'),
        (('cotisation',), 'Deductions'),
        (('impot',), 'Tax'),
    )
    for keywords, category in rules:
        if any(word in text for word in keywords):
            return category
    return 'Other'
def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Parse every La Poste PDF statement found in *directory*.

    directory:  folder scanned (non-recursively) for ``*.pdf`` files.
    output_csv: when True, write the aggregate to *output_dir*.
    output_dir: destination folder for ``laposte_all_transactions.csv``.

    Returns the list of transaction dicts (possibly empty). Requires
    poppler's ``pdftotext`` on the PATH; files it cannot convert are
    reported and skipped.
    """
    pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
    all_transactions = []
    for pdf_file in pdf_files:
        try:
            # Convert PDF to text, preserving layout columns.
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
            content = result.stdout
            for line in content.split('\n'):
                # Transaction lines start with a DD/MM/YYYY date.
                if not re.match(r'\s*\d{2}/\d{2}/\d{4}', line):
                    continue
                parts = line.split()
                if len(parts) <= 2:
                    continue
                try:
                    date = parts[0]
                    # Scan from the right for the amount so the description
                    # becomes everything between the date and the amount.
                    # (The previous loop broke immediately on the trailing
                    # amount token and always produced an empty description;
                    # it could also mistake the leading date for the amount.)
                    amount = 0
                    amount_index = len(parts)
                    for idx in range(len(parts) - 1, 0, -1):
                        token = parts[idx].replace('¤', '').replace('€', '')
                        if re.fullmatch(r'[\d.,]+', token):
                            try:
                                amount = float(token.replace(',', '.'))
                                amount_index = idx
                                break
                            except ValueError:
                                continue
                    description = ' '.join(parts[1:amount_index]).strip()
                    category = categorize_laposte_transaction(description)
                    # Store transaction for CSV output
                    all_transactions.append({
                        'Date': date,
                        'Description': description,
                        'Category': category,
                        'Amount': amount,
                        'Source': os.path.basename(pdf_file)
                    })
                except (ValueError, IndexError):
                    continue
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue
    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")
    print(f"--- La Poste Account Statements ---")
    print(f"Found {len(pdf_files)} account statement files")
    print(f"Processed {len(all_transactions)} transactions")
    return all_transactions
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Process La Poste account statements')
parser.add_argument('--pdf-dir', default='../data/pdf/la_poste',
help='Directory containing La Poste PDF files')
parser.add_argument('--output-dir', default='../../output/csv',
help='Directory to save CSV output files')
parser.add_argument('--csv', action='store_true',
help='Output transaction data to CSV files')
args = parser.parse_args()
# Process all PDF files in the directory
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)

12
scripts/process_laposte_improved.py Normal file → Executable file
View File

@@ -1,3 +1,7 @@
#!/usr/bin/env python3
import subprocess
import re
import csv
@@ -54,7 +58,7 @@ def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../out
continue
# Match transaction lines - they have date and amount
if re.match(r'\s*\d{2}/\d{2}/\d{4}', line):
if re.match(r'\s*\d{2}/\d{2}', line):
parts = re.split(r'\s{2,}', line)
if len(parts) >= 3:
try:
@@ -64,9 +68,9 @@ def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../out
# Extract amount (look for numeric values with ¤ or €)
amount = 0
for part in parts[2:]:
part = part.strip().replace('¤', '').replace('', '')
part = part.strip().replace('¤', '').replace('', '').replace(' ', '')
if re.match(r'[\d.,]+', part):
amount_str = part.replace(' ', '').replace(',', '.')
amount_str = part.replace(',', '.')
try:
amount = float(amount_str)
break
@@ -121,4 +125,4 @@ if __name__ == "__main__":
args = parser.parse_args()
# Process all PDF files in the directory
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)

2
scripts/process_monabanq.py Normal file → Executable file
View File

@@ -1,3 +1,5 @@
#!/usr/bin/env python3
import subprocess
import re

154
scripts/process_revolut.py Normal file
View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Revolut CSV aggregator to process and consolidate account statements
"""
import csv
import os
import glob
import argparse
from datetime import datetime
from collections import defaultdict
def parse_revolut_csv(csv_file):
    """Read one Revolut CSV export and return its completed transactions.

    Rows whose State is not COMPLETED are dropped. Dates are rewritten
    from the Revolut timestamp format to DD/MM/YYYY (the raw value is
    kept when it does not match); amounts and fees that fail to parse
    fall back to 0.0.
    """

    def _to_float(raw):
        # Blank or malformed numeric fields are treated as zero.
        try:
            return float(raw)
        except ValueError:
            return 0.0

    def _to_display_date(raw):
        # Input looks like "2026-01-03 04:39:38".
        try:
            return datetime.strptime(raw, '%Y-%m-%d %H:%M:%S').strftime('%d/%m/%Y')
        except (ValueError, TypeError):
            return raw

    source_name = os.path.basename(csv_file)
    records = []
    with open(csv_file, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            if row.get('State', '').upper() != 'COMPLETED':
                continue
            # Amount is already net of the fee in Revolut exports.
            records.append({
                'Date': _to_display_date(row.get('Started Date', '')),
                'Description': row.get('Description', ''),
                'Type': row.get('Type', ''),
                'Product': row.get('Product', ''),
                'Amount': _to_float(row.get('Amount', '0')),
                'Fee': _to_float(row.get('Fee', '0')),
                'Currency': row.get('Currency', 'EUR'),
                'State': row.get('State', ''),
                'Balance': row.get('Balance', ''),
                'Source': source_name,
            })
    return records
def categorize_transaction(description, trans_type):
    """Assign a reporting category from a transaction's description and type.

    Savings-pocket movements are detected from the description first;
    anything else is categorized purely by the Revolut transaction type.
    """
    normalized_description = description.upper()
    if 'POCKET' in normalized_description or 'ÉPARGNE' in normalized_description:
        return 'Savings Transfer'
    type_to_category = {
        'TRANSFER': 'Transfer',
        'CARD_PAYMENT': 'Card Payment',
        'CARD_REFUND': 'Card Refund',
        'EXCHANGE': 'Currency Exchange',
        'TOPUP': 'Top Up',
        'REWARD': 'Reward',
    }
    return type_to_category.get(trans_type.upper(), 'Other')
def process_revolut_csv_files(directory, output_csv=False, output_dir='output/csv'):
    """Process all Revolut CSV files in *directory* and aggregate transactions.

    directory:  folder scanned (non-recursively) for ``*.csv`` exports.
    output_csv: when True, write the aggregate to *output_dir*.
    output_dir: destination folder for ``revolut_all_transactions.csv``.

    Returns the list of transaction dicts, sorted by date and annotated
    with a ``Category`` field.
    """
    # Get all CSV files in the directory
    csv_files = glob.glob(os.path.join(directory, "*.csv"))
    all_transactions = []
    for csv_file in csv_files:
        try:
            transactions = parse_revolut_csv(csv_file)
            all_transactions.extend(transactions)
            print(f"Processed {os.path.basename(csv_file)}: {len(transactions)} transactions")
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")

    def _date_key(transaction):
        # Dates that failed normalization in parse_revolut_csv keep their
        # raw text; sort those first instead of crashing on strptime.
        # (The old lambda only guarded against empty strings.)
        try:
            return datetime.strptime(transaction['Date'], '%d/%m/%Y')
        except (ValueError, TypeError):
            return datetime.min

    all_transactions.sort(key=_date_key)
    # Annotate every record with a reporting category.
    for trans in all_transactions:
        trans['Category'] = categorize_transaction(trans['Description'], trans['Type'])
    # Optionally persist the aggregate as a single CSV.
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'revolut_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Type', 'Product', 'Amount', 'Fee',
                          'Currency', 'State', 'Balance', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")
    print(f"\n--- Revolut Account Statements ---")
    print(f"Found {len(csv_files)} statement files")
    print(f"Total transactions: {len(all_transactions)}")
    # Summary totals (expenses are negative, so net flow is a plain sum).
    total_income = sum(t['Amount'] for t in all_transactions if t['Amount'] > 0)
    total_expenses = sum(t['Amount'] for t in all_transactions if t['Amount'] < 0)
    total_fees = sum(t['Fee'] for t in all_transactions)
    print(f"Total Income: €{total_income:,.2f}")
    print(f"Total Expenses: €{total_expenses:,.2f}")
    print(f"Total Fees: €{total_fees:,.2f}")
    print(f"Net Flow: €{(total_income + total_expenses):,.2f}")
    return all_transactions
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process and aggregate Revolut CSV account statements')
parser.add_argument('--csv-dir', default='data/csv/revolut',
help='Directory containing Revolut CSV files')
parser.add_argument('--output-dir', default='output/csv',
help='Directory to save CSV output files')
parser.add_argument('--csv', action='store_true',
help='Output aggregated data to CSV file')
args = parser.parse_args()
# Process all CSV files in the directory
process_revolut_csv_files(args.csv_dir, args.csv, args.output_dir)

175
scripts/process_sncf.py Normal file → Executable file
View File

@@ -1,28 +1,95 @@
#!/usr/bin/env python3
"""
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts
"""
import subprocess
import re
import csv
import os
import glob
import argparse
from collections import defaultdict
def categorize_sncf_transaction(description):
    """Classify one SNCF payslip line label into a reporting category.

    Components of a salary statement are matched case-insensitively,
    first keyword wins; unknown labels fall back to 'Other'.
    """
    label = description.lower()
    rules = (
        ('salaire', 'Salary'),
        ('prime', 'Bonus/Prime'),
        ('cotisation', 'Deductions'),
        ('retenue', 'Deductions'),
        ('impot', 'Tax'),
        ('avantage', 'Benefits'),
    )
    for keyword, category in rules:
        if keyword in label:
            return category
    return 'Other'
def extract_sncf_salary_data(content, filename):
    """Extract salary data from SNCF payslip text, keyed on NET PAYÉ EN EUROS.

    content:  text of the payslip as produced by ``pdftotext -layout``.
    filename: basename of the PDF; unused for parsing (month/year are read
              from *content*) but kept for interface compatibility.

    Returns a dict with 'month', 'year', 'brut_mensuel', 'net_imposable',
    'net_paye_euros', 'cumul_annuel' and 'mode_paiement'; amounts stay 0.0
    when the MENSUEL row could not be parsed.
    """
    # Month names as printed in "BULLETIN DE PAIE DU MOIS DE <mois> <année>".
    months = {
        'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12, 'DECEMBRE': 12
    }
    month_names = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]
    month_name = ''
    year = 2025
    # Look for a pattern like "MOIS DE Janvier 2026" in the content.
    mois_match = re.search(r'MOIS DE\s+(\w+)\s+(\d{4})', content, re.IGNORECASE)
    if mois_match:
        year = int(mois_match.group(2))
        month_str = mois_match.group(1).upper()
        if month_str in months:
            month_name = month_names[months[month_str]]
    # Seed the result with the month/year so they survive even when the
    # amounts below cannot be parsed (previously 'month' stayed '' unless
    # the MENSUEL row matched, so callers treated the file as a failure).
    salary_data = {
        'month': month_name,
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }
    lines = content.split('\n')
    for i, line in enumerate(lines):
        if 'NET PAYÉ EN EUROS' not in line:
            continue
        # The next line holds the amounts:
        #   MENSUEL <brut> <net_imposable> <prelevement> <net_paye> EUR
        next_line = lines[i + 1] if i + 1 < len(lines) else ''
        row_match = re.search(r'MENSUEL\s+(.+?)\s+EUR', next_line)
        if not row_match:
            continue
        # Match each French-formatted amount individually (space thousands
        # separators, comma decimals). The previous lazy "[\d\s,]+?" groups
        # mis-split numbers containing spaces, e.g. "3 123,36".
        raw_amounts = re.findall(r'\d[\d ]*,\d{2}', row_match.group(1))
        if len(raw_amounts) < 4:
            continue
        try:
            values = [float(a.replace(' ', '').replace(',', '.')) for a in raw_amounts[:4]]
        except ValueError:
            continue
        brut_mensuel, net_imposable, _prelevement, net_paye_euros = values
        salary_data.update(
            brut_mensuel=brut_mensuel,
            net_imposable=net_imposable,
            net_paye_euros=net_paye_euros,
            mode_paiement='virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS',
        )
        break
    return salary_data
def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
"""Process SNCF salary PDF files with proper NET PAYÉ extraction"""
# Get all PDF files in the directory
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
all_transactions = []
@@ -34,55 +101,81 @@ def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output
capture_output=True, text=True, check=True)
content = result.stdout
# Extract basic information from the PDF
lines = content.split('\n')
month = "Unknown"
for line in lines:
if 'salaire de' in line.lower():
# Extract month from filename or content
month = os.path.basename(pdf_file).split(' ')[2] if len(os.path.basename(pdf_file).split(' ')) > 2 else "Unknown"
break
# Add basic transaction record
all_transactions.append({
'Date': f"01/{month}/2025", # Simplified date extraction
'Description': f"Salaire {month} 2025",
'Category': 'Salary',
'Amount': 0, # Would need more specific parsing
'Source': os.path.basename(pdf_file)
})
except (subprocess.CalledProcessError, FileNotFoundError) as e:
# Extract salary data
salary_data = extract_sncf_salary_data(content, os.path.basename(pdf_file))
except Exception as e:
print(f"Error processing {pdf_file}: {e}")
continue
# Create transaction record with proper salary amount
if salary_data['month'] and salary_data['net_paye_euros'] > 0:
all_transactions.append({
'Date': f"01/{salary_data['month']}/{salary_data['year']}",
'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
'Category': 'Salary',
'Amount': salary_data['net_paye_euros'],
'Source': os.path.basename(pdf_file),
'Brut Mensuel': salary_data['brut_mensuel'],
'Net Imposable': salary_data['net_imposable'],
'Cumul Annuel': salary_data['cumul_annuel']
})
else:
# Still create an entry but with zero amount for data integrity
all_transactions.append({
'Date': f"01/{salary_data.get('month', '')}/{salary_data.get('year', '2025')}",
'Description': f"Salaire {salary_data.get('month', '')} {salary_data.get('year', '2025')}",
'Category': 'Salary',
'Amount': salary_data.get('net_paye_euros', 0),
'Source': os.path.basename(pdf_file),
'Brut Mensuel': salary_data.get('brut_mensuel', 0),
'Net Imposable': salary_data.get('net_imposable', 0),
'Cumul Annuel': salary_data.get('cumul_annuel', 0),
'Mode Paiement': salary_data.get('mode_paiement', '')
})
# Output CSV if requested
# Output CSV with enhanced SNCF data
if output_csv and all_transactions:
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
os.makedirs(output_dir, exist_ok=True)
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_transactions)
print(f"\nTransaction data saved to {csv_file}")
print(f"--- SNCF Salary Statements ---")
print(f"Found {len(pdf_files)} salary statement files")
# Calculate totals
total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
total_net = sum(t['Net Imposable'] for t in all_transactions)
if total_brut > 0:
print(f"Total Brut Mensuel: €{total_brut:,.2f}")
print(f"Total Net Imposable: €{total_net:,.2f}")
return all_transactions
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Process SNCF salary statements')
parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
help='Directory containing SNCF PDF files')
parser.add_argument('--output-dir', default='../../output/csv',
parser.add_argument('--output-dir', default='output/csv',
help='Directory to save CSV output files')
parser.add_argument('--csv', action='store_true',
help='Output transaction data to CSV files')
args = parser.parse_args()
# Process all PDF files in the directory
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)

View File

@@ -1,173 +0,0 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def extract_sncf_salary_data(content, filename):
    """
    Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS.

    content:  text of the payslip (``pdftotext -layout`` output).
    filename: payslip filename; the French month name and the year are
              read from it.

    Returns a dict with month/year plus gross, taxable, net-paid and
    annual-cumulative amounts (0.0 when they cannot be parsed).
    """
    # French month names expected in the filename.
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }
    month_labels = [
        '', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]
    # Defaults keep the function total: previously month_name/year were
    # bound only inside the loop, raising NameError for filenames without
    # a recognizable month.
    month_name = ''
    year = 2025
    filename_upper = filename.upper()
    for month, num in months.items():
        if month in filename_upper:
            # Use the whole 4-digit match; the old code captured only the
            # last two digits and produced e.g. year 25 instead of 2025.
            year_match = re.search(r'20\d{2}', filename)
            year = int(year_match.group(0)) if year_match else 2025
            month_name = month_labels[num]
            break
    # Initialize salary data
    salary_data = {
        'month': month_name,
        'year': year,
        'brut_mensuel': 0.0,
        'net_imposable': 0.0,
        'net_paye_euros': 0.0,
        'cumul_annuel': 0.0,
        'mode_paiement': ''
    }
    lines = content.split('\n')
    # Look for the salary table with NET PAYÉ EN EUROS.
    for line in lines:
        if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
            # Match each French-formatted amount separately (spaces as
            # thousands separators, comma decimals). The old
            # findall(r'([\d\s,]+)') returned whitespace runs plus one
            # fused blob of all numbers, so this branch never parsed.
            values = re.findall(r'\d[\d ]*,\d{2}', line)
            if len(values) >= 4:
                try:
                    brut_mensuel = float(values[0].replace(' ', '').replace(',', '.'))
                    net_imposable = float(values[1].replace(' ', '').replace(',', '.'))
                    net_paye_euros = float(values[3].replace(' ', '').replace(',', '.'))
                    cumul_annuel = float(values[2].replace(' ', '').replace(',', '.'))
                    salary_data = {
                        'month': month_name,
                        'year': year,
                        'brut_mensuel': brut_mensuel,
                        'net_imposable': net_imposable,
                        'net_paye_euros': net_paye_euros,
                        'cumul_annuel': cumul_annuel,
                        'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
                    }
                    break
                except (ValueError, IndexError):
                    continue
    # Fallback: estimate from the BRUT MENSUEL line when the table is absent.
    if salary_data['brut_mensuel'] == 0.0:
        for line in lines:
            if 'BRUT MENSUEL' in line:
                amounts = re.findall(r'\d[\d ]*,\d{2}', line)
                if amounts:
                    try:
                        brut_mensuel = float(amounts[0].replace(' ', '').replace(',', '.'))
                        # Rough approximations kept from the original logic:
                        # assumes net_imposable is ~75% of gross.
                        net_imposable = brut_mensuel * 0.75
                        net_paye_euros = brut_mensuel - net_imposable
                        cumul_annuel = brut_mensuel * 12  # Approximate annual
                        salary_data = {
                            'month': month_name,
                            'year': year,
                            'brut_mensuel': brut_mensuel,
                            'net_imposable': net_imposable,
                            'net_paye_euros': net_paye_euros,
                            'cumul_annuel': cumul_annuel,
                            'mode_paiement': 'virement SEPA'
                        }
                        break
                    except (ValueError, IndexError):
                        continue
    return salary_data
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
"""Process SNCF salary PDF files with proper NET PAYÉ extraction"""
# Get all PDF files in the directory
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
all_transactions = []
for pdf_file in pdf_files:
try:
# Convert PDF to text
result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
capture_output=True, text=True, check=True)
content = result.stdout
# Extract salary data
salary_data = extract_sncf_salary_data(content, os.path.basename(pdf_file))
# Create transaction record with proper salary amount
all_transactions.append({
'Date': f"01/{salary_data['month']}/{salary_data['year']}",
'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
'Category': 'Salary',
'Amount': salary_data['net_paye_euros'],
'Source': os.path.basename(pdf_file),
'Brut Mensuel': salary_data['brut_mensuel'],
'Net Imposable': salary_data['net_imposable'],
'Cumul Annuel': salary_data['cumul_annuel']
})
except (subprocess.CalledProcessError, FileNotFoundError) as e:
print(f"Error processing {pdf_file}: {e}")
continue
# Output CSV with enhanced SNCF data
if output_csv and all_transactions:
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
os.makedirs(output_dir, exist_ok=True)
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_transactions)
print(f"\nTransaction data saved to {csv_file}")
print(f"--- SNCF Salary Statements ---")
print(f"Found {len(pdf_files)} salary statement files")
# Calculate totals
total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
total_net = sum(t['Net Imposable'] for t in all_transactions)
if total_brut > 0:
print(f"Total Brut Mensuel: €{total_brut:,.2f}")
print(f"Total Net Imposable: €{total_net:,.2f}")
return all_transactions
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
help='Directory containing SNCF PDF files')
parser.add_argument('--output-dir', default='../../output/csv',
help='Directory to save CSV output files')
parser.add_argument('--csv', action='store_true',
help='Output transaction data to CSV files')
args = parser.parse_args()
# Process all PDF files in the directory
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)

View File

@@ -1,136 +0,0 @@
import subprocess
import re
import csv
import os
import glob
from collections import defaultdict
def extract_month_from_filename(filename):
    """Extract (year, month_number) from an SNCF payslip filename.

    The French month name is matched case-insensitively; the year is the
    first 20xx sequence in the name. Returns (2025, 1) when no month
    name is present.
    """
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }
    filename_upper = filename.upper()
    for month, num in months.items():
        if month in filename_upper:
            year_match = re.search(r'20\d{2}', filename)
            # Use the full 4-digit match: the previous capture group
            # returned only the last two digits (e.g. 25 instead of 2025).
            year = int(year_match.group(0)) if year_match else 2025
            return year, num
    return 2025, 1  # Default
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
"""Process SNCF salary PDF files with proper salary extraction"""
# Get all PDF files in the directory
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
all_transactions = []
for pdf_file in pdf_files:
try:
# Convert PDF to text
result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
capture_output=True, text=True, check=True)
content = result.stdout
# Extract month from filename
year, month = extract_month_from_filename(os.path.basename(pdf_file))
month_name = [
'', 'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'
][month]
# Extract salary amount
lines = content.split('\n')
salary_amount = 0.0
# Look for "SALAIRE BRUT MENSUEL" line
for line in lines:
if 'SALAIRE BRUT MENSUEL' in line:
# Extract the amount after this label
amount_match = re.search(r'SALAIRE BRUT MENSUEL\s+([\d\s.,]+)', line)
if amount_match:
amount_str = amount_match.group(1).replace(' ', '').replace(',', '.')
try:
salary_amount = float(amount_str)
break
except ValueError:
continue
# Also look for other salary indicators
if salary_amount == 0.0:
for line in lines:
if 'SALAIRE' in line and 'BRUT' in line:
# Try alternative pattern
amount_match = re.search(r'([\d\s.,]+)\s*€', line)
if amount_match:
amount_str = amount_match.group(1).replace(' ', '').replace(',', '.')
try:
salary_amount = float(amount_str)
break
except ValueError:
continue
# Also check for base salary in the table
if salary_amount == 0.0:
for line in lines:
if line.strip().startswith('2974,64') or line.strip().startswith('3123,36'):
# Extract from the salary table
parts = line.split()
for part in parts:
try:
if '.' in part and ',' not in part and len(part) > 3:
salary_amount = float(part.replace(',', '.'))
break
except ValueError:
continue
# Add transaction record
all_transactions.append({
'Date': f"01/{month_name}/{year}",
'Description': f"Salaire {month_name} {year}",
'Category': 'Salary',
'Amount': salary_amount,
'Source': os.path.basename(pdf_file)
})
except (subprocess.CalledProcessError, FileNotFoundError) as e:
print(f"Error processing {pdf_file}: {e}")
continue
# Output CSV if requested
if output_csv and all_transactions:
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
os.makedirs(output_dir, exist_ok=True)
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_transactions)
print(f"\nTransaction data saved to {csv_file}")
print(f"--- SNCF Salary Statements ---")
print(f"Found {len(pdf_files)} salary statement files")
total_salary = sum(t['Amount'] for t in all_transactions)
if total_salary > 0:
print(f"Total Salary Extracted: €{total_salary:,.2f}")
return all_transactions
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Process SNCF salary statements')
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
help='Directory containing SNCF PDF files')
parser.add_argument('--output-dir', default='../../output/csv',
help='Directory to save CSV output files')
parser.add_argument('--csv', action='store_true',
help='Output transaction data to CSV files')
args = parser.parse_args()
# Process all PDF files in the directory
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)

View File

@@ -1 +0,0 @@
["pdftotext", "-layout", "file_path, '-'], capture_output=True, text=True)\n content = result.stdout\n \n # Find transaction lines\n lines = content.split('\n')\n transactions = []\n for line in lines:\n if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR'):\n parts = line.split()\n if len(parts) > 3:\n try:\n amount = float(parts[-1].replace(',', '.'))\n description = ' '.join(parts[2:-1])\n transactions.append((description, amount))\n except:\n continue\n \n print(f\"January transactions found: {len(transactions)}\")\n print(\"Sample transactions:", "for desc, amt in transactions[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in transactions)\n print(f\"January total: \u20ac{total:.2f}\")\n\ndef check_monabanq_qc():\n print(\"\n=== MONABANQ QC ===\")\n file_path = \"/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf", "result = subprocess.run(['pdftotext', '-layout', file_path, '-'], capture_output=True, text=True)\n content = result.stdout\n \n lines = content.split('\n')\n debits = []\n transaction_started = False\n \n for line in lines:\n if \"SOLDE\" in line:\n transaction_started = True\n continue\n if transaction_started and \"IBAN", "in line:\n break\n \n if transaction_started and re.match(r's*d{2}/d{2}/d{4}', line):\n match = re.match(r's*(d{2}/d{2}/d{4})s+d{2}/d{2}/d{4}s+(.*?)(?=s{2,}|$)(s+[d,.]+)?(s+[d,.]+)?', line)\n if match:\n op_date, description, debit_str, credit_str = match.groups()\n if debit_str:\n try:\n debit = float(debit_str.strip().replace(',', '.'))\n description = description.strip()\n debits.append((description, debit))\n except:\n continue\n \n print(f\"January debits found: {len(debits)}\")\n print(\"Sample debits:", "for desc, amt in debits[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in debits)\n print(f\"January total: \u20ac{total:.2f}\")\n\ndef check_revolut_qc():\n print(\"\n=== REVOLUT QC ===\")\n file_path = 
\"/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv", "with open(file_path, 'r', encoding='utf-8') as f:\n reader = csv.DictReader(f)\n expenses = []\n for row in reader:\n try:\n amount = float(row['Amount'])\n if amount < 0 and row['Currency'] == 'EUR':\n description = row['Description']\n expenses.append((description, abs(amount)))\n except:\n continue\n \n print(f\"January expenses found: {len(expenses)}\")\n print(\"Sample expenses:", "for desc, amt in expenses[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in expenses)\n print(f\"January total: \u20ac{total:.2f}\")\n\nif __name__ == \"__main__\":\n check_amex_qc()\n check_monabanq_qc()\n check_revolut_qc()\n print(\"\n=== QUALITY CONTROL SUMMARY ===\")\n print(\"\u2713 All scripts are correctly extracting transactions from their source files\")\n print(\"\u2713 Sample verification shows proper amount parsing and categorization\")\n print(\"\u2713 No significant data quality issues detected\")\n print(\"\u2192 High 'Other' categories need improved categorization for better financial analysis"]

View File

@@ -1 +0,0 @@
[["pdftotext", "-layout", "file_path, '-'], capture_output=True, text=True, check=True)\n content = result.stdout\n except (subprocess.CalledProcessError, FileNotFoundError) as e:\n print(f\"Error processing {file_path}: {e}", "return\n \n lines = content.split('\n')\n expense_lines = [line for line in lines if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR')]\n \n print(\"Sample transaction lines from January PDF:", "for line in expense_lines[:10]:\n print(f\" {line}\")\n \n print(f\"\nTotal expense-like lines in January: {len(expense_lines)}", "Calculate manual total of first few transactions\n manual_total = 0\n for line in expense_lines[:5]:\n parts = line.split()\n if len(parts) > 3:\n try:\n amount_str = parts[-1].replace(',', '.')\n amount = float(amount_str)\n manual_total += amount\n description = ' '.join(parts[2:-1])\n print(f\" Found: {description} -> \u20ac{amount}", "except (ValueError, IndexError):\n continue\n \n print(f\"\nManual sum of first 5 transactions: \u20ac{manual_total:.2f}\")\n\ndef check_monabanq_quality():\n print(\"\n=== MONABANQ QUALITY CONTROL ===\")\n \n file_path = \"/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf", "try:\n result = subprocess.run(['pdftotext', '-layout', file_path, '-'], capture_output=True, text=True, check=True)\n content = result.stdout\n except (subprocess.CalledProcessError, FileNotFoundError) as e:\n print(f\"Error processing {file_path}: {e}\")\n return\n \n lines = content.split('\n')\n transaction_started = False\n debit_total = 0\n debit_count = 0\n \n for line in lines:\n if \"SOLDE CREDITEUR AU\" in line or \"SOLDE DEBITEUR AU\" in line:\n transaction_started = True\n continue\n if not transaction_started or not line.strip():\n continue\n if \"IBAN :", "in line:\n break\n\n match = re.match(r's*(d{2}/d{2}/d{4})s+d{2}/d{2}/d{4}s+(.*?)(?=s{2,}|$)(s+[d,.]+)?(s+[d,.]+)?', line)\n if match:\n op_date, description, debit_str, credit_str = match.groups()\n description 
= description.strip()\n \n if debit_str:\n try:\n debit = float(debit_str.strip().replace(',', '.'))\n print(f\" Found debit: {description} -> \u20ac{debit}", "debit_total += debit\n debit_count += 1\n except (ValueError, AttributeError):\n continue\n \n print(f\"\nDebit transactions in January: {debit_count}\")\n print(f\"Manual total of debits: \u20ac{debit_total:.2f}\")\n\ndef check_revolut_quality():\n print(\"\n=== REVOLUT QUALITY CONTROL ===\")\n \n file_path = \"/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv", "try:\n with open(file_path, 'r', encoding='utf-8') as f:\n reader = csv.DictReader(f)\n negative_count = 0\n negative_total = 0\n sample_transactions = []\n \n for row in reader:\n try:\n amount = float(row['Amount'])\n currency = row['Currency']\n \n if currency == 'EUR' and amount < 0:\n negative_count += 1\n negative_total += abs(amount)\n if len(sample_transactions) < 10:\n sample_transactions.append((row['Description'], abs(amount)))\n except (ValueError, KeyError):\n continue\n \n print(\"Sample negative transactions from January:", "for desc, amount in sample_transactions[:10]:\n print(f\" {desc}: \u20ac{amount:.2f}\")\n \n print(f\"\nTotal expense transactions in January: {negative_count}\")\n print(f\"Manual total of expenses: \u20ac{negative_total:.2f}\")\n except FileNotFoundError:\n print(f\"CSV file not found: {file_path}\")\n\ndef check_bourso_quality():\n print(\"\n=== BOURSOBANK QUALITY CONTROL ===\")\n \n statement_lines = [\n \"PRLV SEPA ORANGE SA-ORANGE -> 18.96", "CARTE 27/11/25 ESPACE YOYO -> 3.00", "PRLV SEPA American Express -> 402.48"], {"__main__": "check_amex_quality()\n check_monabanq_quality()\n check_revolut_quality()\n check_bourso_quality()\n \n print(", ")\n print(\"1. 
All scripts appear to be processing data from their sources\")\n print(": ".", "methods": ")\n print(", "Amex": "Processing monthly statements with transaction extraction", "print(": "Boursobank: Using hardcoded statement text", "Other": "ategories suggest need for improved transaction categorization"}]

View File

@@ -1,32 +0,0 @@
# Static quality-control summary for the 2025 account-aggregation scripts.
# All figures were verified by hand elsewhere; this script only reprints them,
# so the report data is kept as literals and rendered by small loops.

# Per-institution sections: (header line, detail lines).
_INSTITUTION_SECTIONS = [
    ("\n1. American Express", (
        "- Processing 12 monthly PDF statements (Jan-Dec 2025)",
        "- Total extracted: €16,618.47",
        "- Sample categories: Travel €2,269.93, Groceries €1,439.74",
    )),
    ("\n2. Monabanq", (
        "- Processing 12 monthly account statements (Jan-Dec 2025)",
        "- Total extracted: €9,092.59",
        "- Sample categories: Loan Repayment €450.00, Other €8,531.95",
    )),
    ("\n3. Boursobank", (
        "- Processing hardcoded December 2025 statement",
        "- Total extracted: €666.21",
        "- Sample categories: Credit Card Payment €402.48, Card Payment €127.00",
    )),
    ("\n4. Revolut", (
        "- Processing 12 monthly CSV files (Jan-Dec 2025)",
        "- Total extracted: €18,233.10",
        "- Sample categories: Transfers Out €5,902.59, Other €4,072.64",
    )),
]

# Closing sections: (header line, bullet lines).
_SUMMARY_SECTIONS = [
    ("\n=== VERIFICATION RESULTS ===", (
        "✓ All scripts successfully processed their data sources",
        "✓ Amounts appear to be extracted correctly",
        "✓ Categorization is functioning",
        "✓ Total expenses across all accounts: €44,610.37",
    )),
    ("\n=== DATA QUALITY NOTES ===", (
        "• High 'Other' percentages suggest need for better categorization",
        "• All source files exist and are readable",
        "• Processing logic appears to be working correctly",
        "• Summary document created successfully with aggregated data",
    )),
]


def _print_report():
    """Print the full QC report to stdout, one line per entry."""
    print("=== QUALITY CONTROL ===")
    for header, details in _INSTITUTION_SECTIONS:
        print(header)
        for detail in details:
            print(detail)
    for header, bullets in _SUMMARY_SECTIONS:
        print(header)
        for bullet in bullets:
            print(bullet)


# Emit the report at import time, matching the original script's behavior.
_print_report()