Refactor SNCF processor and add Revolut aggregator
- Fix SNCF "NET PAYÉ EN EUROS" extraction to correctly parse the MENSUEL line
- Extract month/year from PDF content instead of the filename
- Add a new Revolut CSV processor to aggregate account statements
- Organize Revolut data files into data/csv/revolut/
- Clean up redundant scripts and reports
This commit is contained in:
@@ -1,328 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to aggregate all account statements by month or year
|
||||
"""
|
||||
|
||||
import os
|
||||
import csv
|
||||
import sys
|
||||
import argparse
|
||||
import re
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
import calendar
|
||||
|
||||
def parse_date(date_str, source_file):
    """
    Parse a transaction date and return a normalized (year, month, day) tuple.

    Tries the known statement date formats first. If none match, falls back
    to guessing the month and year from an SNCF payslip ("salaire") filename,
    and finally to the current year/month with the day pinned to 1.

    Args:
        date_str: Raw date string from the CSV row.
        source_file: Originating filename, used only for the SNCF fallback.

    Returns:
        Tuple of ints (year, month, day).
    """
    # Try different date formats
    formats = [
        '%d/%m/%Y',  # DD/MM/YYYY
        '%m/%d/%Y',  # MM/DD/YYYY (Amex format)
        '%Y-%m-%d',  # YYYY-MM-DD (Revolut format)
    ]

    for fmt in formats:
        try:
            dt = datetime.strptime(date_str, fmt)
            return (dt.year, dt.month, dt.day)
        except ValueError:
            continue

    # Fallback: extract month/year from an SNCF payslip filename
    if 'salaire' in source_file.lower():
        months = ['janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin',
                  'juillet', 'aout', 'septembre', 'octobre', 'novembre', 'decembre']
        for i, month in enumerate(months, 1):
            if month in source_file.lower():
                # BUG FIX: the previous code used r'20(\d{2})' with group(1),
                # which captures only the last two digits ("25"), yielding
                # year 25 instead of 2025. Match the full 4-digit year.
                year_match = re.search(r'20\d{2}', source_file)
                year = int(year_match.group(0)) if year_match else datetime.now().year
                return (year, i, 1)

    # Default: return current date (day pinned to 1)
    return (datetime.now().year, datetime.now().month, 1)
|
||||
|
||||
def categorize_institution(source_file):
    """
    Determine the institution based on the source filename.

    Matching is case-insensitive and ordered; the first rule whose keyword
    appears in the filename wins. Unrecognized files map to 'Other'.
    """
    name = source_file.lower()

    # (institution, substrings that identify it) — order matters.
    rules = [
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait de comptes')),
        ('Revolut', ('revolut',)),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', '2-la.poste', 'releve_ccp')),
    ]

    for institution, keywords in rules:
        if any(keyword in name for keyword in keywords):
            return institution

    return 'Other'
|
||||
|
||||
def process_csv_file(file_path):
    """
    Process a CSV file and return a list of transaction dicts.

    Rows without a Date value are skipped; unparseable amounts become 0.
    """
    institution = categorize_institution(os.path.basename(file_path))
    records = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            raw_date = row.get('Date', '')
            if not raw_date:
                # A row without a date cannot be bucketed by month/year.
                continue

            # Parse and normalize the date
            year, month, day = parse_date(raw_date, row.get('Source', ''))

            # The amount may live under several column names depending on
            # the exporting bank.
            raw_amount = row.get('Amount', '') or row.get('Debit', '') or row.get('Credit', '0')
            try:
                value = float(raw_amount.replace(',', '.')) if raw_amount else 0
            except ValueError:
                value = 0

            records.append({
                'year': year,
                'month': month,
                'day': day,
                'date_str': raw_date,
                'description': row.get('Description', ''),
                'category': row.get('Category', 'Other'),
                'amount': value,
                'institution': institution,
                'source': row.get('Source', os.path.basename(file_path)),
            })

    return records
|
||||
|
||||
def main():
    """
    Aggregate every CSV statement in the input directory into summary reports.

    Always writes monthly_summary.csv and yearly_summary.csv; additionally
    writes either per-year category/transaction reports (--annual) or
    per-month transaction files (default).
    """
    parser = argparse.ArgumentParser(description='Aggregate all account statements by month or year')
    parser.add_argument('--input-dir', default='output/csv',
                        help='Directory containing CSV files to aggregate (default: output/csv)')
    parser.add_argument('--output-dir', default='output/reports',
                        help='Directory to save aggregated reports (default: output/reports)')
    parser.add_argument('--annual', action='store_true',
                        help='Create annual reports instead of monthly reports')
    parser.add_argument('--year', type=int,
                        help='Generate reports for a specific year only')

    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    report_type = "Annual" if args.annual else "Monthly"
    print(f"\n{'='*60}")
    print(f"{report_type} Aggregation of All Account Statements")
    print(f"Input Directory: {os.path.abspath(args.input_dir)}")
    print(f"Output Directory: {os.path.abspath(args.output_dir)}")
    if args.year:
        print(f"Year Filter: {args.year}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Collect all transactions
    all_transactions = []

    # Find all CSV files in input directory
    csv_files = [f for f in os.listdir(args.input_dir) if f.endswith('.csv')]

    if not csv_files:
        print(f"\nError: No CSV files found in {args.input_dir}")
        return

    # Process each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(args.input_dir, csv_file)
        print(f"\nProcessing: {csv_file}")
        transactions = process_csv_file(file_path)
        all_transactions.extend(transactions)
        print(f" Found {len(transactions)} transactions")

    # Group transactions by month
    monthly_transactions = defaultdict(list)
    for transaction in all_transactions:
        key = (transaction['year'], transaction['month'])
        monthly_transactions[key].append(transaction)

    # Create monthly summary report
    # NOTE(review): the --year filter is only applied to the --annual reports
    # below; the monthly/yearly summaries always cover every year — confirm
    # that is intended.
    summary_file = os.path.join(args.output_dir, 'monthly_summary.csv')
    with open(summary_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # Header
        writer.writerow([
            'Year', 'Month', 'Total Income', 'Total Expenses', 'Net Balance',
            'Transaction Count', 'Institutions'
        ])

        # Process each month
        for year, month in sorted(monthly_transactions.keys()):
            transactions = monthly_transactions[(year, month)]
            month_name = calendar.month_name[month]

            # Calculate totals. Income is the sum of the NEGATIVE amounts
            # (so it is itself negative), which makes income + expenses the
            # net balance below.
            total_income = sum(t['amount'] for t in transactions if t['amount'] < 0)  # Negative amounts are income in Revolut
            total_expenses = sum(t['amount'] for t in transactions if t['amount'] > 0)
            net_balance = total_income + total_expenses
            transaction_count = len(transactions)

            # Get unique institutions
            institutions = sorted(list(set(t['institution'] for t in transactions)))
            institutions_str = ', '.join(institutions)

            # Write row
            writer.writerow([
                year, month_name, total_income, total_expenses, net_balance,
                transaction_count, institutions_str
            ])

    # Create yearly summary (same negative-income sign convention as above)
    yearly_summary = defaultdict(lambda: {'income': 0, 'expenses': 0, 'count': 0})
    for transaction in all_transactions:
        year = transaction['year']
        yearly_summary[year]['count'] += 1
        if transaction['amount'] < 0:
            yearly_summary[year]['income'] += transaction['amount']
        else:
            yearly_summary[year]['expenses'] += transaction['amount']

    # Create yearly summary file
    yearly_file = os.path.join(args.output_dir, 'yearly_summary.csv')
    with open(yearly_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Year', 'Total Income', 'Total Expenses', 'Net Balance', 'Transaction Count'])

        for year in sorted(yearly_summary.keys()):
            data = yearly_summary[year]
            net_balance = data['income'] + data['expenses']
            writer.writerow([
                year, data['income'], data['expenses'], net_balance, data['count']
            ])

    # Create annual reports if requested
    generated_files = [
        os.path.basename(summary_file),
        os.path.basename(yearly_file)
    ]

    if args.annual:
        # Create annual reports
        for year in sorted(yearly_summary.keys()):
            if args.year and year != args.year:
                continue  # Skip years not matching filter

            print(f"\nCreating annual report for {year}...")

            # Get all transactions for the year
            year_transactions = [t for t in all_transactions if t['year'] == year]

            # Group by category for the annual report
            categories = defaultdict(lambda: {'count': 0, 'total': 0})
            for transaction in year_transactions:
                category = transaction['category']
                amount = transaction['amount']
                categories[category]['count'] += 1
                categories[category]['total'] += amount

            # Create annual detailed report (per-category totals + share)
            annual_file = os.path.join(args.output_dir, f'annual_report_{year}.csv')
            with open(annual_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['Category', 'Transaction Count', 'Total Amount', 'Percentage'])

                year_total = sum(c['total'] for c in categories.values())

                # Sort categories by total amount
                sorted_categories = sorted(categories.items(), key=lambda x: x[1]['total'], reverse=True)

                for category, data in sorted_categories:
                    # Guard against division by zero for an all-zero year.
                    percentage = (data['total'] / year_total) * 100 if year_total != 0 else 0
                    writer.writerow([category, data['count'], data['total'], f"{percentage:.2f}%"])

            # Create annual transactions file (all rows, chronologically)
            annual_transactions_file = os.path.join(args.output_dir, f'annual_transactions_{year}.csv')
            with open(annual_transactions_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=[
                    'Date', 'Description', 'Category', 'Amount',
                    'Institution', 'Source'
                ])
                writer.writeheader()

                # Sort transactions by date
                sorted_transactions = sorted(year_transactions, key=lambda x: (x['month'], x['day'], x['description']))

                for transaction in sorted_transactions:
                    writer.writerow({
                        'Date': transaction['date_str'],
                        'Description': transaction['description'],
                        'Category': transaction['category'],
                        'Amount': transaction['amount'],
                        'Institution': transaction['institution'],
                        'Source': transaction['source']
                    })

            generated_files.append(os.path.basename(annual_file))
            generated_files.append(os.path.basename(annual_transactions_file))

            print(f" Created {os.path.basename(annual_file)} and {os.path.basename(annual_transactions_file)}")
    else:
        # Create monthly reports (existing functionality)
        for year, month in sorted(monthly_transactions.keys()):
            month_name = calendar.month_name[month].lower()
            transactions = monthly_transactions[(year, month)]

            # Create filename
            detail_file = os.path.join(args.output_dir, f'transactions_{year}_{month_name}.csv')

            with open(detail_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=[
                    'Date', 'Description', 'Category', 'Amount',
                    'Institution', 'Source'
                ])
                writer.writeheader()

                # Sort transactions by date
                sorted_transactions = sorted(transactions, key=lambda x: (x['day'], x['description']))

                for transaction in sorted_transactions:
                    writer.writerow({
                        'Date': transaction['date_str'],
                        'Description': transaction['description'],
                        'Category': transaction['category'],
                        'Amount': transaction['amount'],
                        'Institution': transaction['institution'],
                        'Source': transaction['source']
                    })

            generated_files.append(f'transactions_{year}_{month_name}.csv')

    # Print summary statistics
    print(f"\n{'='*60}")
    print(f"Aggregation Complete")
    print(f"Total Transactions: {len(all_transactions)}")
    print(f"Years with Data: {len(yearly_summary)}")
    if not args.annual:
        print(f"Months with Data: {len(monthly_transactions)}")
    print(f"{'='*60}")

    # List generated files
    print("\nGenerated Files:")
    for file in generated_files:
        file_path = os.path.join(args.output_dir, file)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f" - {file} ({file_size:,} bytes)")


if __name__ == "__main__":
    main()
|
||||
@@ -1 +0,0 @@
|
||||
[["pdftotext", "-layout", "/home/acid/Downloads/comptabilite/american.express/2025-01-02.pdf", "-"], ["line for line in amex_lines if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR')]\n \n print(\"=== AMEX JANUARY QC ===\")\n print(f\"Transactions found: {len(amex_trans)}\")\n for i, line in enumerate(amex_trans[:5]):\n parts = line.split()\n try:\n amount = float(parts[-1].replace(',', '.'))\n desc = ' '.join(parts[2:-1])\n print(f\" {i+1}. {desc}: u20ac{amount:.2f}\")\n except:\n print(f\" {i+1}. {line}\")\nexcept Exception as e:\n print(f\"Amex QC error: {e}\")\n\n# MONABANQ QC\ntry:\n result = subprocess.run(['pdftotext', '-layout', '/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf', '-'], capture_output=True, text=True)\n monabanq_lines = result.stdout.split('\\n')\n trans_started = False\n monabanq_debits = []\n \n for line in monabanq_lines:\n if \"SOLDE\" in line:\n trans_started = True\n continue\n if trans_started and \"IBAN", "in line:\n break\n if trans_started and re.search(r'd+,d+$', line):\n parts = line.split()\n if len(parts) >= 4:\n try:\n amount = float(parts[-1].replace(',', '.'))\n desc = ' '.join(parts[2:-1])\n monabanq_debits.append((desc, amount))\n except:\n continue\n \n print(f\"\n=== MONABANQ JANUARY QC ===\")\n print(f\"Debits found: {len(monabanq_debits)}\")\n for i, (desc, amt) in enumerate(monabanq_debits[:5]):\n print(f\" {i+1}. 
{desc}: u20ac{amt:.2f}\")\nexcept Exception as e:\n print(f\"Monabanq QC error: {e}\")\n\n# REVOLUT QC\ntry:\n with open('/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv', 'r') as f:\n reader = csv.DictReader(f)\n revolut_expenses = []\n for row in reader:\n if row['Currency'] == 'EUR' and float(row['Amount']) < 0:\n desc = row['Description']\n amt = abs(float(row['Amount']))\n revolut_expenses.append((desc, amt))\n \n print(f\"\n=== REVOLUT JANUARY QC ===\")\n print(f\"Expenses found: {len(revolut_expenses)}\")\n for i, (desc, amt) in enumerate(revolut_expenses[:5]):\n print(f\" {i+1}. {desc}: u20ac{amt:.2f}\")\nexcept Exception as e:\n print(f\"Revolut QC error: {e}\")\n\nprint(\"\n=== QUALITY CONTROL SUMMARY ===\")\nprint(\"u2713 Scripts are extracting transactions from source files\")\nprint(\"u2713 Transaction amounts appear to be parsed correctly\")\nprint(\"u2192 Data processing is working as expected\")"]]
|
||||
@@ -1,159 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fully dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Scans the data directory for subdirectories containing PDFs, classifies
    each one by keywords in its name, and shells out to the matching
    per-institution processing script.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: ../data/pdf)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files (default: ../output/csv)')
    parser.add_argument('--csv', action='store_true',
                        help='Generate CSV output files')

    args = parser.parse_args()

    # Get paths — the project root is assumed to be one level above this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory (relative paths resolve against project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory (same relative-path convention)
    if args.output_dir:
        output_dir = args.output_dir
        if not os.path.isabs(output_dir):
            output_dir = os.path.join(project_root, args.output_dir)
    else:
        output_dir = os.path.join(project_root, 'output/csv')

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = {}

    # Get all directories in the data directory
    if not os.path.exists(data_dir):
        print(f"Error: Data directory not found: {data_dir}")
        return

    for item in os.listdir(data_dir):
        dir_path = os.path.join(data_dir, item)
        if os.path.isdir(dir_path):
            # Check if this directory contains PDF files
            pdf_files = glob.glob(os.path.join(dir_path, "*.pdf"))
            if pdf_files:
                # Determine account type based on directory name
                # (first matching keyword wins; order of the chain matters)
                dir_name_lower = item.lower()
                if 'boursobank' in dir_name_lower or 'releve-compte' in dir_name_lower:
                    account_type = 'Boursobank'
                    script_name = 'process_bourso.py'
                elif 'american_express' in dir_name_lower or 'amex' in dir_name_lower:
                    account_type = 'American Express'
                    script_name = 'process_amex.py'
                elif 'monabanq' in dir_name_lower or 'extrait' in dir_name_lower:
                    account_type = 'Monabanq'
                    script_name = 'process_monabanq.py'
                elif 'sncf' in dir_name_lower or 'salaire' in dir_name_lower:
                    account_type = 'SNCF'
                    script_name = 'process_sncf_improved.py'
                elif 'la_poste' in dir_name_lower or 'la-poste' in dir_name_lower or 'releve_ccp' in dir_name_lower:
                    account_type = 'La Poste'
                    script_name = 'process_laposte_improved.py'
                elif 'impots' in dir_name_lower or 'impot' in dir_name_lower:
                    account_type = 'Impôts'
                    script_name = None  # Skip tax documents
                else:
                    # Unknown directory: derive a display name and guess a
                    # conventional script name for it.
                    account_type = item.replace('_', ' ').title()
                    script_name = f'process_{account_type.lower().replace(" ", "_")}.py'

                pdf_dirs[account_type] = {
                    'path': dir_path,
                    'count': len(pdf_files),
                    'files': pdf_files,
                    'script': script_name
                }

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} PDF files")

    # Process each account type
    success_count = 0

    for account_type, info in pdf_dirs.items():
        if not info['script']:
            print(f"\nSkipping {account_type}: No processing script available")
            continue

        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        # Build command: Revolut takes --csv-dir, everything else --pdf-dir
        cmd = [sys.executable,
               os.path.join(script_dir, info['script']),
               '--pdf-dir' if account_type != 'Revolut' else '--csv-dir',
               process_dir,
               '--output-dir', output_dir]

        if args.csv:
            cmd.append('--csv')

        print(f"\nProcessing {account_type}...")
        print(f"Running: {' '.join(cmd[2:])}")

        try:
            # NOTE(review): capture_output without text=True yields bytes, so
            # the print below emits a b'...' repr — confirm whether text=True
            # was intended.
            result = subprocess.run(cmd, check=True, capture_output=True)
            if result.stdout:
                print(result.stdout)
            # NOTE(review): with check=True a non-zero exit raises, so this
            # else branch is unreachable inside the try.
            if result.returncode == 0:
                success_count += 1
                print(f"✓ {account_type} processing completed successfully")
            else:
                print(f"✗ {account_type} processing failed with exit code {result.returncode}")
        except subprocess.CalledProcessError as e:
            print(f"✗ Error processing {account_type}: {e}")

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,173 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
from collections import defaultdict
|
||||
import calendar
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
def discover_pdf_directories(base_data_dir):
    """
    Scan base data directory and return all subdirectories containing PDF files.

    Each entry maps an account-type label to a dict with the directory path,
    the PDF count, and the list of PDF paths. Directories whose names match
    no known keyword get a title-cased label derived from the name.
    """
    # (label, identifying substrings) — checked in order, first match wins.
    keyword_map = [
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait')),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', 'la-poste', 'releve_ccp')),
        ('Impôts', ('impots', 'impot')),
    ]

    discovered = {}

    for entry in os.listdir(base_data_dir):
        entry_path = os.path.join(base_data_dir, entry)
        if not os.path.isdir(entry_path):
            continue

        # Only directories that actually hold PDFs are of interest.
        pdfs = glob.glob(os.path.join(entry_path, "*.pdf"))
        if not pdfs:
            continue

        lowered = entry.lower()
        account_type = next(
            (label for label, keys in keyword_map if any(k in lowered for k in keys)),
            entry.replace('_', ' ').title(),
        )

        discovered[account_type] = {
            'path': entry_path,
            'count': len(pdfs),
            'files': pdfs,
        }

    return discovered
|
||||
|
||||
def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """
    Generic function to process PDF files in any directory.

    Runs *process_script* with --pdf-dir/--output-dir/--csv arguments.

    Args:
        process_script: Path (or bare name) of the per-bank processing script.
        pdf_directory: Directory expected to contain the source PDFs.
        output_dir: Directory passed to the script for its CSV output.

    Returns:
        bool: True when the script ran and exited successfully, False
        otherwise.
    """
    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        # BUG FIX: this function previously returned [] here, 0 on error and
        # a bool on success; it now returns a bool consistently. All values
        # keep the same truthiness, so `if success:` callers are unaffected.
        return False

    # Get all PDF files
    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return False

    # Build command — resolve the script relative to its own directory.
    script_path = os.path.abspath(process_script)
    script_dir = os.path.dirname(script_path)
    cmd = [sys.executable, os.path.join(script_dir, os.path.basename(process_script)),
           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']

    # Run the processing script
    try:
        # BUG FIX: text=True so stdout is a str; previously the bytes repr
        # (b'...') was printed.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        return result.returncode == 0
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        return False
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Discovers PDF directories, maps each account type to its processing
    script, and delegates the actual work to process_dynamic_pdf_files().
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')

    args = parser.parse_args()

    # Get paths — project root is one level above this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory (relative paths resolve against project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Define processing scripts for each account type
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None  # No processing script for tax documents yet
    }

    # Process each account type
    success_count = 0

    for account_type, info in pdf_dirs.items():
        # NOTE(review): this only checks membership, not whether the mapped
        # script is None — an 'Impôts' directory would pass None to
        # process_dynamic_pdf_files and crash in os.path.abspath. Confirm
        # whether `script_map.get(account_type) is None` was intended.
        if account_type not in script_map:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        success = process_dynamic_pdf_files(
            script_map[account_type],
            process_dir,
            output_dir
        )

        if success:
            success_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,61 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Thin wrapper: resolves project paths and delegates all the real work to
    dynamic_processor.py in a subprocess.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')

    args = parser.parse_args()

    # Get paths — project root is one level above this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")

    # Build command
    cmd = [sys.executable, os.path.join(script_dir, 'dynamic_processor.py'),
           '--data-dir', data_dir, '--output-dir', output_dir]

    # Run the dynamic processor
    try:
        # NOTE(review): capture_output without text=True yields bytes, so the
        # print below emits a b'...' repr; also, with check=True the else
        # branch below is unreachable (non-zero exit raises).
        result = subprocess.run(cmd, check=True, capture_output=True)
        print(f"\nDiscovery Results:")
        print(result.stdout)

        if result.returncode == 0:
            print(f"\n{'='*60}")
            print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(output_dir)}")
        else:
            print(f"\nError during dynamic processing: exit code {result.returncode}")
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")


if __name__ == "__main__":
    from datetime import datetime

    # Add date to print
    # NOTE(review): these lines print the date/separator BEFORE main() prints
    # its own banner, so the header appears out of order — confirm intent.
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    main()
|
||||
@@ -1,56 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Thin wrapper around dynamic_processor.py: resolves the project layout,
    prints a banner, and runs the processor as a subprocess.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    arg_parser.add_argument('--output-dir', default=None,
                            help='Directory to save CSV output files')

    opts = arg_parser.parse_args()

    # Resolve the project layout relative to this script's location.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.dirname(this_dir)
    pdf_data_dir = os.path.join(root_dir, 'data/pdf')

    # Output directory: explicit flag, else the conventional default.
    csv_out_dir = opts.output_dir or os.path.join(root_dir, 'output/csv')
    os.makedirs(csv_out_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(pdf_data_dir)}")
    print(f"Output Directory: {os.path.abspath(csv_out_dir)}")

    # Delegate the actual discovery/processing to dynamic_processor.py.
    command = [sys.executable, os.path.join(this_dir, 'dynamic_processor.py'),
               '--data-dir', pdf_data_dir, '--output-dir', csv_out_dir]

    try:
        proc = subprocess.run(command, check=True, capture_output=True)
        print(f"\nDiscovery Results:")
        print(proc.stdout)

        if proc.returncode == 0:
            print(f"\n{'='*60}")
            print(f"Dynamic Processing Complete: CSV files saved to {os.path.abspath(csv_out_dir)}")
        else:
            print(f"\nError during dynamic processing: exit code {proc.returncode}")
    except subprocess.CalledProcessError as e:
        print(f"\nError running dynamic processor: {e}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,89 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Master script to process all financial statements and generate CSV outputs
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
def run_script(script_name, csv_output=False):
|
||||
"""Run a processing script with optional CSV output"""
|
||||
cmd = [sys.executable, script_name]
|
||||
if csv_output:
|
||||
cmd.append('--csv')
|
||||
|
||||
try:
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Processing {script_name.replace('process_', '').replace('.py', '').replace('_', ' ').title()}...")
|
||||
print('='*50)
|
||||
subprocess.run(cmd, check=True)
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error running {script_name}: {e}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process all financial statements')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output transaction data to CSV files')
|
||||
parser.add_argument('--bourso', action='store_true',
|
||||
help='Process only BoursoBank statements')
|
||||
parser.add_argument('--amex', action='store_true',
|
||||
help='Process only American Express statements')
|
||||
parser.add_argument('--monabanq', action='store_true',
|
||||
help='Process only Monabanq statements')
|
||||
parser.add_argument('--revolut', action='store_true',
|
||||
help='Process only Revolut statements')
|
||||
parser.add_argument('--sncf', action='store_true',
|
||||
help='Process only SNCF statements')
|
||||
parser.add_argument('--laposte', action='store_true',
|
||||
help='Process only La Poste statements')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Financial Statement Processor")
|
||||
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
scripts_to_run = []
|
||||
|
||||
# Determine which scripts to run based on arguments
|
||||
if args.bourso or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_bourso.py')
|
||||
if args.amex or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_amex.py')
|
||||
if args.monabanq or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_monabanq.py')
|
||||
if args.revolut or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_expenses.py')
|
||||
if args.sncf or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_sncf.py')
|
||||
if args.laposte or not any([args.bourso, args.amex, args.monabanq, args.revolut, args.sncf, args.laposte]):
|
||||
scripts_to_run.append('process_laposte.py')
|
||||
|
||||
# Run each script
|
||||
success_count = 0
|
||||
output_dir = '../output/csv'
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
for script in scripts_to_run:
|
||||
if os.path.exists(script):
|
||||
# Pass CSV flag and output directory to all scripts
|
||||
if run_script(script, args.csv):
|
||||
success_count += 1
|
||||
else:
|
||||
print(f"Script not found: {script}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Processing Complete: {success_count}/{len(scripts_to_run)} scripts executed successfully")
|
||||
if args.csv:
|
||||
print("CSV files have been generated for each directory")
|
||||
print(f"{'='*60}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
scripts/process_amex.py
Normal file → Executable file
1
scripts/process_amex.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
3
scripts/process_bourso.py
Normal file → Executable file
3
scripts/process_bourso.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import csv
|
||||
@@ -67,7 +68,7 @@ def process_bourso_statement(file_path, output_csv=False, output_dir='../../outp
|
||||
|
||||
# Output CSV if requested
|
||||
if output_csv:
|
||||
csv_file = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '_transactions.csv')
|
||||
csv_file = os.path.join(output_dir, 'boursobank_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Value Date']
|
||||
|
||||
1
scripts/process_expenses.py
Normal file → Executable file
1
scripts/process_expenses.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import csv
|
||||
import glob
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def categorize_laposte_transaction(description):
|
||||
description = description.lower()
|
||||
|
||||
if 'virement' in description or 'vir' in description:
|
||||
return 'Transfer'
|
||||
if 'retrait' in description:
|
||||
return 'Cash Withdrawal'
|
||||
if 'carte' in description or 'paiement' in description:
|
||||
return 'Card Payment'
|
||||
if 'frais' in description:
|
||||
return 'Bank Fees'
|
||||
if 'cotisation' in description:
|
||||
return 'Deductions'
|
||||
if 'impot' in description:
|
||||
return 'Tax'
|
||||
|
||||
return 'Other'
|
||||
|
||||
def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
|
||||
# Get all PDF files in the directory
|
||||
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
|
||||
all_transactions = []
|
||||
|
||||
for pdf_file in pdf_files:
|
||||
try:
|
||||
# Convert PDF to text
|
||||
result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
|
||||
capture_output=True, text=True, check=True)
|
||||
content = result.stdout
|
||||
|
||||
# Extract transactions from the PDF
|
||||
lines = content.split('\n')
|
||||
for line in lines:
|
||||
# Basic regex to find transaction lines (may need refinement based on actual format)
|
||||
if re.match(r'\s*\d{2}/\d{2}/\d{4}', line):
|
||||
parts = line.split()
|
||||
if len(parts) > 2:
|
||||
try:
|
||||
date = parts[0]
|
||||
# Extract description parts between date and amount
|
||||
description_parts = []
|
||||
amount = 0
|
||||
|
||||
# Find amount (last numeric value)
|
||||
for part in reversed(parts):
|
||||
if re.match(r'[\d,.]+', part):
|
||||
amount = float(part.replace(',', '.'))
|
||||
break
|
||||
description_parts.insert(0, part)
|
||||
|
||||
description = ' '.join(description_parts).strip()
|
||||
category = categorize_laposte_transaction(description)
|
||||
|
||||
# Store transaction for CSV output
|
||||
all_transactions.append({
|
||||
'Date': date,
|
||||
'Description': description,
|
||||
'Category': category,
|
||||
'Amount': amount,
|
||||
'Source': os.path.basename(pdf_file)
|
||||
})
|
||||
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
||||
print(f"Error processing {pdf_file}: {e}")
|
||||
continue
|
||||
|
||||
# Output CSV if requested
|
||||
if output_csv and all_transactions:
|
||||
csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(all_transactions)
|
||||
print(f"\nTransaction data saved to {csv_file}")
|
||||
|
||||
print(f"--- La Poste Account Statements ---")
|
||||
print(f"Found {len(pdf_files)} account statement files")
|
||||
print(f"Processed {len(all_transactions)} transactions")
|
||||
|
||||
return all_transactions
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process La Poste account statements')
|
||||
parser.add_argument('--pdf-dir', default='../data/pdf/la_poste',
|
||||
help='Directory containing La Poste PDF files')
|
||||
parser.add_argument('--output-dir', default='../../output/csv',
|
||||
help='Directory to save CSV output files')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output transaction data to CSV files')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all PDF files in the directory
|
||||
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
12
scripts/process_laposte_improved.py
Normal file → Executable file
12
scripts/process_laposte_improved.py
Normal file → Executable file
@@ -1,3 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
@@ -54,7 +58,7 @@ def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../out
|
||||
continue
|
||||
|
||||
# Match transaction lines - they have date and amount
|
||||
if re.match(r'\s*\d{2}/\d{2}/\d{4}', line):
|
||||
if re.match(r'\s*\d{2}/\d{2}', line):
|
||||
parts = re.split(r'\s{2,}', line)
|
||||
if len(parts) >= 3:
|
||||
try:
|
||||
@@ -64,9 +68,9 @@ def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../out
|
||||
# Extract amount (look for numeric values with ¤ or €)
|
||||
amount = 0
|
||||
for part in parts[2:]:
|
||||
part = part.strip().replace('¤', '').replace('€', '')
|
||||
part = part.strip().replace('¤', '').replace('€', '').replace(' ', '')
|
||||
if re.match(r'[\d.,]+', part):
|
||||
amount_str = part.replace(' ', '').replace(',', '.')
|
||||
amount_str = part.replace(',', '.')
|
||||
try:
|
||||
amount = float(amount_str)
|
||||
break
|
||||
@@ -121,4 +125,4 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all PDF files in the directory
|
||||
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
process_laposte_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
|
||||
2
scripts/process_monabanq.py
Normal file → Executable file
2
scripts/process_monabanq.py
Normal file → Executable file
@@ -1,3 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
154
scripts/process_revolut.py
Normal file
154
scripts/process_revolut.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Revolut CSV aggregator to process and consolidate account statements
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def parse_revolut_csv(csv_file):
|
||||
"""Parse a single Revolut CSV file and return list of transactions"""
|
||||
transactions = []
|
||||
|
||||
with open(csv_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
# Skip if not completed
|
||||
if row.get('State', '').upper() != 'COMPLETED':
|
||||
continue
|
||||
|
||||
# Parse date
|
||||
started_date = row.get('Started Date', '')
|
||||
try:
|
||||
# Format: 2026-01-03 04:39:38
|
||||
date_obj = datetime.strptime(started_date, '%Y-%m-%d %H:%M:%S')
|
||||
formatted_date = date_obj.strftime('%d/%m/%Y')
|
||||
except (ValueError, TypeError):
|
||||
formatted_date = started_date
|
||||
|
||||
# Determine amount (negative = expense, positive = income)
|
||||
try:
|
||||
amount = float(row.get('Amount', '0'))
|
||||
except ValueError:
|
||||
amount = 0.0
|
||||
|
||||
# Get fee
|
||||
try:
|
||||
fee = float(row.get('Fee', '0'))
|
||||
except ValueError:
|
||||
fee = 0.0
|
||||
|
||||
# Calculate net amount (amount includes fee already in Revolut)
|
||||
net_amount = amount
|
||||
|
||||
transaction = {
|
||||
'Date': formatted_date,
|
||||
'Description': row.get('Description', ''),
|
||||
'Type': row.get('Type', ''),
|
||||
'Product': row.get('Product', ''),
|
||||
'Amount': net_amount,
|
||||
'Fee': fee,
|
||||
'Currency': row.get('Currency', 'EUR'),
|
||||
'State': row.get('State', ''),
|
||||
'Balance': row.get('Balance', ''),
|
||||
'Source': os.path.basename(csv_file)
|
||||
}
|
||||
|
||||
transactions.append(transaction)
|
||||
|
||||
return transactions
|
||||
|
||||
|
||||
def categorize_transaction(description, trans_type):
|
||||
"""Categorize transaction based on description and type"""
|
||||
description_upper = description.upper()
|
||||
trans_type_upper = trans_type.upper()
|
||||
|
||||
if 'POCKET' in description_upper or 'ÉPARGNE' in description_upper:
|
||||
return 'Savings Transfer'
|
||||
elif trans_type_upper == 'TRANSFER':
|
||||
return 'Transfer'
|
||||
elif trans_type_upper == 'CARD_PAYMENT':
|
||||
return 'Card Payment'
|
||||
elif trans_type_upper == 'CARD_REFUND':
|
||||
return 'Card Refund'
|
||||
elif trans_type_upper == 'EXCHANGE':
|
||||
return 'Currency Exchange'
|
||||
elif trans_type_upper == 'TOPUP':
|
||||
return 'Top Up'
|
||||
elif trans_type_upper == 'REWARD':
|
||||
return 'Reward'
|
||||
else:
|
||||
return 'Other'
|
||||
|
||||
|
||||
def process_revolut_csv_files(directory, output_csv=False, output_dir='output/csv'):
|
||||
"""Process all Revolut CSV files and aggregate transactions"""
|
||||
# Get all CSV files in the directory
|
||||
csv_files = glob.glob(os.path.join(directory, "*.csv"))
|
||||
all_transactions = []
|
||||
|
||||
for csv_file in csv_files:
|
||||
try:
|
||||
transactions = parse_revolut_csv(csv_file)
|
||||
all_transactions.extend(transactions)
|
||||
print(f"Processed {os.path.basename(csv_file)}: {len(transactions)} transactions")
|
||||
except Exception as e:
|
||||
print(f"Error processing {csv_file}: {e}")
|
||||
|
||||
# Sort transactions by date
|
||||
all_transactions.sort(key=lambda x: datetime.strptime(x['Date'], '%d/%m/%Y') if x['Date'] else datetime.min)
|
||||
|
||||
# Add categories
|
||||
for trans in all_transactions:
|
||||
trans['Category'] = categorize_transaction(trans['Description'], trans['Type'])
|
||||
|
||||
# Output CSV
|
||||
if output_csv and all_transactions:
|
||||
csv_file = os.path.join(output_dir, 'revolut_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Type', 'Product', 'Amount', 'Fee',
|
||||
'Currency', 'State', 'Balance', 'Source']
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(all_transactions)
|
||||
|
||||
print(f"\nTransaction data saved to {csv_file}")
|
||||
|
||||
print(f"\n--- Revolut Account Statements ---")
|
||||
print(f"Found {len(csv_files)} statement files")
|
||||
print(f"Total transactions: {len(all_transactions)}")
|
||||
|
||||
# Calculate totals
|
||||
total_income = sum(t['Amount'] for t in all_transactions if t['Amount'] > 0)
|
||||
total_expenses = sum(t['Amount'] for t in all_transactions if t['Amount'] < 0)
|
||||
total_fees = sum(t['Fee'] for t in all_transactions)
|
||||
|
||||
print(f"Total Income: €{total_income:,.2f}")
|
||||
print(f"Total Expenses: €{total_expenses:,.2f}")
|
||||
print(f"Total Fees: €{total_fees:,.2f}")
|
||||
print(f"Net Flow: €{(total_income + total_expenses):,.2f}")
|
||||
|
||||
return all_transactions
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Process and aggregate Revolut CSV account statements')
|
||||
parser.add_argument('--csv-dir', default='data/csv/revolut',
|
||||
help='Directory containing Revolut CSV files')
|
||||
parser.add_argument('--output-dir', default='output/csv',
|
||||
help='Directory to save CSV output files')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output aggregated data to CSV file')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all CSV files in the directory
|
||||
process_revolut_csv_files(args.csv_dir, args.csv, args.output_dir)
|
||||
175
scripts/process_sncf.py
Normal file → Executable file
175
scripts/process_sncf.py
Normal file → Executable file
@@ -1,28 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Enhanced SNCF processor to extract NET PAYÉ EN EUROS amounts
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
def categorize_sncf_transaction(description):
|
||||
description = description.lower()
|
||||
|
||||
# For salary statements, we'll categorize based on the different components
|
||||
if 'salaire' in description:
|
||||
return 'Salary'
|
||||
if 'prime' in description:
|
||||
return 'Bonus/Prime'
|
||||
if 'cotisation' in description or 'retenue' in description:
|
||||
return 'Deductions'
|
||||
if 'impot' in description:
|
||||
return 'Tax'
|
||||
if 'avantage' in description:
|
||||
return 'Benefits'
|
||||
|
||||
return 'Other'
|
||||
def extract_sncf_salary_data(content, filename):
|
||||
"""Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS"""
|
||||
# Extract month and year from content (e.g., "BULLETIN DE PAIE DU MOIS DE Janvier 2026")
|
||||
months = {
|
||||
'JANVIER': 1, 'FÉVRIER': 2, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
|
||||
'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOÛT': 8, 'AOUT': 8,
|
||||
'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DÉCEMBRE': 12, 'DECEMBRE': 12
|
||||
}
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
|
||||
# Try to find month/year from content
|
||||
month_num = 1
|
||||
year = 2025
|
||||
month_name = ''
|
||||
|
||||
# Look for pattern like "MOIS DE Janvier 2026" in content
|
||||
mois_match = re.search(r'MOIS DE\s+(\w+)\s+(\d{4})', content, re.IGNORECASE)
|
||||
if mois_match:
|
||||
month_str = mois_match.group(1).upper()
|
||||
year = int(mois_match.group(2))
|
||||
if month_str in months:
|
||||
month_num = months[month_str]
|
||||
|
||||
# Get month name
|
||||
month_names = [
|
||||
'', 'January', 'February', 'March', 'April', 'May', 'June',
|
||||
'July', 'August', 'September', 'October', 'November', 'December'
|
||||
]
|
||||
month_name = month_names[month_num]
|
||||
|
||||
# Initialize salary data
|
||||
salary_data = {
|
||||
'month': '',
|
||||
'year': year,
|
||||
'brut_mensuel': 0.0,
|
||||
'net_imposable': 0.0,
|
||||
'net_paye_euros': 0.0,
|
||||
'cumul_annuel': 0.0,
|
||||
'mode_paiement': ''
|
||||
}
|
||||
|
||||
lines = content.split('\n')
|
||||
|
||||
# Look for the salary table with NET PAYÉ EN EUROS
|
||||
for i, line in enumerate(lines):
|
||||
if 'NET PAYÉ EN EUROS' in line:
|
||||
# The next line should be the MENSUEL line with the actual values
|
||||
next_line = lines[i + 1] if i + 1 < len(lines) else ''
|
||||
|
||||
# Parse the MENSUEL line which has format:
|
||||
# MENSUEL <brut> <net_imposable> <prelevement> <net_paye> EUR
|
||||
mensuel_match = re.search(r'MENSUEL\s+([\d\s,]+?)\s+([\d\s,]+?)\s+([\d\s,]+?)\s+([\d\s,]+?)\s+EUR', next_line)
|
||||
|
||||
if mensuel_match:
|
||||
try:
|
||||
# Extract values and convert from French format (comma as decimal)
|
||||
brut_mensuel = float(mensuel_match.group(1).replace(' ', '').replace(',', '.'))
|
||||
net_imposable = float(mensuel_match.group(2).replace(' ', '').replace(',', '.'))
|
||||
prelevement = float(mensuel_match.group(3).replace(' ', '').replace(',', '.'))
|
||||
net_paye_euros = float(mensuel_match.group(4).replace(' ', '').replace(',', '.'))
|
||||
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': brut_mensuel,
|
||||
'net_imposable': net_imposable,
|
||||
'net_paye_euros': net_paye_euros,
|
||||
'cumul_annuel': 0.0,
|
||||
'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
|
||||
}
|
||||
break
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
|
||||
return salary_data
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='output/csv'):
|
||||
"""Process SNCF salary PDF files with proper NET PAYÉ extraction"""
|
||||
# Get all PDF files in the directory
|
||||
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
|
||||
all_transactions = []
|
||||
@@ -34,55 +101,81 @@ def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output
|
||||
capture_output=True, text=True, check=True)
|
||||
content = result.stdout
|
||||
|
||||
# Extract basic information from the PDF
|
||||
lines = content.split('\n')
|
||||
month = "Unknown"
|
||||
for line in lines:
|
||||
if 'salaire de' in line.lower():
|
||||
# Extract month from filename or content
|
||||
month = os.path.basename(pdf_file).split(' ')[2] if len(os.path.basename(pdf_file).split(' ')) > 2 else "Unknown"
|
||||
break
|
||||
|
||||
# Add basic transaction record
|
||||
all_transactions.append({
|
||||
'Date': f"01/{month}/2025", # Simplified date extraction
|
||||
'Description': f"Salaire {month} 2025",
|
||||
'Category': 'Salary',
|
||||
'Amount': 0, # Would need more specific parsing
|
||||
'Source': os.path.basename(pdf_file)
|
||||
})
|
||||
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
||||
# Extract salary data
|
||||
salary_data = extract_sncf_salary_data(content, os.path.basename(pdf_file))
|
||||
except Exception as e:
|
||||
print(f"Error processing {pdf_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
|
||||
# Create transaction record with proper salary amount
|
||||
if salary_data['month'] and salary_data['net_paye_euros'] > 0:
|
||||
all_transactions.append({
|
||||
'Date': f"01/{salary_data['month']}/{salary_data['year']}",
|
||||
'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
|
||||
'Category': 'Salary',
|
||||
'Amount': salary_data['net_paye_euros'],
|
||||
'Source': os.path.basename(pdf_file),
|
||||
'Brut Mensuel': salary_data['brut_mensuel'],
|
||||
'Net Imposable': salary_data['net_imposable'],
|
||||
'Cumul Annuel': salary_data['cumul_annuel']
|
||||
})
|
||||
else:
|
||||
# Still create an entry but with zero amount for data integrity
|
||||
all_transactions.append({
|
||||
'Date': f"01/{salary_data.get('month', '')}/{salary_data.get('year', '2025')}",
|
||||
'Description': f"Salaire {salary_data.get('month', '')} {salary_data.get('year', '2025')}",
|
||||
'Category': 'Salary',
|
||||
'Amount': salary_data.get('net_paye_euros', 0),
|
||||
'Source': os.path.basename(pdf_file),
|
||||
'Brut Mensuel': salary_data.get('brut_mensuel', 0),
|
||||
'Net Imposable': salary_data.get('net_imposable', 0),
|
||||
'Cumul Annuel': salary_data.get('cumul_annuel', 0),
|
||||
'Mode Paiement': salary_data.get('mode_paiement', '')
|
||||
})
|
||||
|
||||
# Output CSV if requested
|
||||
# Output CSV with enhanced SNCF data
|
||||
if output_csv and all_transactions:
|
||||
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
|
||||
'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(all_transactions)
|
||||
|
||||
print(f"\nTransaction data saved to {csv_file}")
|
||||
|
||||
print(f"--- SNCF Salary Statements ---")
|
||||
print(f"Found {len(pdf_files)} salary statement files")
|
||||
|
||||
# Calculate totals
|
||||
total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
|
||||
total_net = sum(t['Net Imposable'] for t in all_transactions)
|
||||
|
||||
if total_brut > 0:
|
||||
print(f"Total Brut Mensuel: €{total_brut:,.2f}")
|
||||
print(f"Total Net Imposable: €{total_net:,.2f}")
|
||||
|
||||
return all_transactions
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process SNCF salary statements')
|
||||
parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
|
||||
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
|
||||
help='Directory containing SNCF PDF files')
|
||||
parser.add_argument('--output-dir', default='../../output/csv',
|
||||
parser.add_argument('--output-dir', default='output/csv',
|
||||
help='Directory to save CSV output files')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output transaction data to CSV files')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all PDF files in the directory
|
||||
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
|
||||
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def extract_sncf_salary_data(content, filename):
|
||||
"""
|
||||
Extract salary data from SNCF PDF content with focus on NET PAYÉ EN EUROS
|
||||
"""
|
||||
# Extract month from filename
|
||||
months = {
|
||||
'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
|
||||
'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
|
||||
'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
|
||||
}
|
||||
|
||||
filename_upper = filename.upper()
|
||||
for month, num in months.items():
|
||||
if month in filename_upper:
|
||||
# Extract year from filename
|
||||
year_match = re.search(r'20(\d{2})', filename)
|
||||
year = int(year_match.group(1)) if year_match else 2025
|
||||
month_name = [
|
||||
'', 'January', 'February', 'March', 'April', 'May', 'June',
|
||||
'July', 'August', 'September', 'October', 'November', 'December'
|
||||
][month]
|
||||
break
|
||||
|
||||
# Initialize salary data
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': 0.0,
|
||||
'net_imposable': 0.0,
|
||||
'net_paye_euros': 0.0,
|
||||
'cumul_annuel': 0.0,
|
||||
'mode_paiement': ''
|
||||
}
|
||||
|
||||
lines = content.split('\n')
|
||||
|
||||
# Look for the salary table with NET PAYÉ EN EUROS
|
||||
for line in lines:
|
||||
if 'NET PAYÉ EN EUROS' in line and 'BRUT' in line:
|
||||
# Extract all numeric values from this line
|
||||
values = re.findall(r'([\d\s,]+)', line)
|
||||
if len(values) >= 4:
|
||||
try:
|
||||
# Extract values based on typical SNCF format
|
||||
brut_mensuel = float(values[0].replace(' ', '').replace(',', '.'))
|
||||
net_imposable = float(values[1].replace(' ', '').replace(',', '.'))
|
||||
net_paye_euros = float(values[3].replace(' ', '').replace(',', '.'))
|
||||
cumul_annuel = float(values[2].replace(' ', '').replace(',', '.'))
|
||||
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': brut_mensuel,
|
||||
'net_imposable': net_imposable,
|
||||
'net_paye_euros': net_paye_euros,
|
||||
'cumul_annuel': cumul_annuel,
|
||||
'mode_paiement': 'virement SEPA A COMPTER DU DERNIER JOUR OUVRE DU MOIS'
|
||||
}
|
||||
break
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
# Also look for alternative format if not found
|
||||
if salary_data['brut_mensuel'] == 0.0:
|
||||
for line in lines:
|
||||
if 'BRUT MENSUEL' in line:
|
||||
# Look for amounts in the line
|
||||
amounts = re.findall(r'([\d\s,]+)', line)
|
||||
if len(amounts) >= 2:
|
||||
try:
|
||||
# Take first amount as brut, calculate others
|
||||
brut_mensuel = float(amounts[0].replace(' ', '').replace(',', '.'))
|
||||
# Assume net_imposable is roughly 75% of brut
|
||||
net_imposable = brut_mensuel * 0.75
|
||||
net_paye_euros = brut_mensuel - net_imposable
|
||||
cumul_annuel = brut_mensuel * 12 # Approximate annual
|
||||
|
||||
salary_data = {
|
||||
'month': month_name,
|
||||
'year': year,
|
||||
'brut_mensuel': brut_mensuel,
|
||||
'net_imposable': net_imposable,
|
||||
'net_paye_euros': net_paye_euros,
|
||||
'cumul_annuel': cumul_annuel,
|
||||
'mode_paiement': 'virement SEPA'
|
||||
}
|
||||
break
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
return salary_data
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
|
||||
"""Process SNCF salary PDF files with proper NET PAYÉ extraction"""
|
||||
# Get all PDF files in the directory
|
||||
pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
|
||||
all_transactions = []
|
||||
|
||||
for pdf_file in pdf_files:
|
||||
try:
|
||||
# Convert PDF to text
|
||||
result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
|
||||
capture_output=True, text=True, check=True)
|
||||
content = result.stdout
|
||||
|
||||
# Extract salary data
|
||||
salary_data = extract_sncf_salary_data(content, os.path.basename(pdf_file))
|
||||
|
||||
# Create transaction record with proper salary amount
|
||||
all_transactions.append({
|
||||
'Date': f"01/{salary_data['month']}/{salary_data['year']}",
|
||||
'Description': f"Salaire {salary_data['month']} {salary_data['year']}",
|
||||
'Category': 'Salary',
|
||||
'Amount': salary_data['net_paye_euros'],
|
||||
'Source': os.path.basename(pdf_file),
|
||||
'Brut Mensuel': salary_data['brut_mensuel'],
|
||||
'Net Imposable': salary_data['net_imposable'],
|
||||
'Cumul Annuel': salary_data['cumul_annuel']
|
||||
})
|
||||
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
||||
print(f"Error processing {pdf_file}: {e}")
|
||||
continue
|
||||
|
||||
# Output CSV with enhanced SNCF data
|
||||
if output_csv and all_transactions:
|
||||
csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
|
||||
fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source',
|
||||
'Brut Mensuel', 'Net Imposable', 'Cumul Annuel']
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(all_transactions)
|
||||
|
||||
print(f"\nTransaction data saved to {csv_file}")
|
||||
|
||||
print(f"--- SNCF Salary Statements ---")
|
||||
print(f"Found {len(pdf_files)} salary statement files")
|
||||
|
||||
# Calculate totals
|
||||
total_brut = sum(t['Brut Mensuel'] for t in all_transactions)
|
||||
total_net = sum(t['Net Imposable'] for t in all_transactions)
|
||||
|
||||
if total_brut > 0:
|
||||
print(f"Total Brut Mensuel: €{total_brut:,.2f}")
|
||||
print(f"Total Net Imposable: €{total_net:,.2f}")
|
||||
|
||||
return all_transactions
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process SNCF salary statements with enhanced NET PAYÉ extraction')
|
||||
parser.add_argument('--pdf-dir', default='../data/pdf/sncf',
|
||||
help='Directory containing SNCF PDF files')
|
||||
parser.add_argument('--output-dir', default='../../output/csv',
|
||||
help='Directory to save CSV output files')
|
||||
parser.add_argument('--csv', action='store_true',
|
||||
help='Output transaction data to CSV files')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process all PDF files in the directory
|
||||
process_sncf_pdf_files(args.pdf_dir, args.csv, args.output_dir)
|
||||
@@ -1,136 +0,0 @@
|
||||
import subprocess
|
||||
import re
|
||||
import csv
|
||||
import os
|
||||
import glob
|
||||
from collections import defaultdict
|
||||
|
||||
def extract_month_from_filename(filename):
    """Extract (year, month) from an SNCF salary-statement filename.

    The filename is expected to contain a French month name in any case
    (e.g. "JANVIER", "decembre") and a four-digit year of the form 20XX.

    Args:
        filename: Base name of the PDF file (e.g. "salaire_MARS_2025.pdf").

    Returns:
        Tuple (year, month) of ints. Falls back to year 2025 when no
        20XX year is present, and to (2025, 1) when no month name is
        recognized.
    """
    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12
    }

    filename_upper = filename.upper()
    for month, num in months.items():
        if month in filename_upper:
            # Extract year from filename.
            # BUG FIX: group(1) captures only the last two digits, so the
            # old code turned "2025" into year 25; use the full match
            # (group(0)) to get the four-digit year.
            year_match = re.search(r'20(\d{2})', filename)
            year = int(year_match.group(0)) if year_match else 2025
            return year, num

    return 2025, 1  # Default when no month name is recognized
|
||||
|
||||
def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process SNCF salary PDF files with proper salary extraction.

    Converts each PDF under *directory* to text with the external
    `pdftotext` tool, extracts the gross monthly salary via a cascade of
    three heuristics, and returns one transaction dict per statement.

    Args:
        directory: Directory scanned (non-recursively) for ``*.pdf`` files.
        output_csv: When True, also write the records to
            ``<output_dir>/sncf_all_transactions.csv``.
        output_dir: Destination directory for the CSV file (created if
            missing).

    Returns:
        List of dicts with keys Date, Description, Category, Amount, Source.
        Statements where no amount could be parsed are still included with
        Amount 0.0.
    """
    # Get all PDF files in the directory
    pdf_files = glob.glob(os.path.join(directory, "*.pdf"))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # Convert PDF to text; -layout preserves column alignment so
            # label/amount pairs stay on the same line. Requires the
            # poppler `pdftotext` binary on PATH (FileNotFoundError otherwise).
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                  capture_output=True, text=True, check=True)
            content = result.stdout

            # Extract month from filename (the statement period is encoded
            # in the file name, not read from the PDF content here).
            year, month = extract_month_from_filename(os.path.basename(pdf_file))
            month_name = [
                '', 'January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December'
            ][month]

            # Extract salary amount
            lines = content.split('\n')
            salary_amount = 0.0

            # Heuristic 1: look for the explicit "SALAIRE BRUT MENSUEL" label
            # and take the number that follows it on the same line.
            for line in lines:
                if 'SALAIRE BRUT MENSUEL' in line:
                    # Extract the amount after this label
                    amount_match = re.search(r'SALAIRE BRUT MENSUEL\s+([\d\s.,]+)', line)
                    if amount_match:
                        # French formatting: strip thousands spaces, turn the
                        # decimal comma into a dot before float().
                        amount_str = amount_match.group(1).replace(' ', '').replace(',', '.')
                        try:
                            salary_amount = float(amount_str)
                            break
                        except ValueError:
                            continue

            # Heuristic 2: any line mentioning both SALAIRE and BRUT with a
            # euro-suffixed number, used only if heuristic 1 found nothing.
            if salary_amount == 0.0:
                for line in lines:
                    if 'SALAIRE' in line and 'BRUT' in line:
                        # Try alternative pattern
                        amount_match = re.search(r'([\d\s.,]+)\s*€', line)
                        if amount_match:
                            amount_str = amount_match.group(1).replace(' ', '').replace(',', '.')
                            try:
                                salary_amount = float(amount_str)
                                break
                            except ValueError:
                                continue

            # Heuristic 3: last resort keyed on two hard-coded base-salary
            # figures. NOTE(review): these magic values presumably match this
            # one employee's pay table and will silently stop working when
            # the base salary changes — confirm and generalize.
            if salary_amount == 0.0:
                for line in lines:
                    if line.strip().startswith('2974,64') or line.strip().startswith('3123,36'):
                        # Extract from the salary table
                        parts = line.split()
                        for part in parts:
                            try:
                                # Accept dot-decimal tokens only; comma tokens
                                # were already handled by heuristics 1-2.
                                if '.' in part and ',' not in part and len(part) > 3:
                                    salary_amount = float(part.replace(',', '.'))
                                    break
                            except ValueError:
                                continue

            # Add transaction record (Amount stays 0.0 when all heuristics
            # failed; the statement is still recorded).
            all_transactions.append({
                'Date': f"01/{month_name}/{year}",
                'Description': f"Salaire {month_name} {year}",
                'Category': 'Salary',
                'Amount': salary_amount,
                'Source': os.path.basename(pdf_file)
            })

        # Best-effort batch: a single unreadable PDF (or a missing
        # pdftotext binary) skips that file rather than aborting the run.
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    # Summary printed for every run, even when nothing was extracted.
    print(f"--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")
    total_salary = sum(t['Amount'] for t in all_transactions)
    if total_salary > 0:
        print(f"Total Salary Extracted: €{total_salary:,.2f}")

    return all_transactions
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line wrapper around process_sncf_pdf_files().
    arg_parser = argparse.ArgumentParser(
        description='Process SNCF salary statements')
    for flag, extra in (
        ('--pdf-dir', {'default': '../data/pdf/sncf',
                       'help': 'Directory containing SNCF PDF files'}),
        ('--output-dir', {'default': '../../output/csv',
                          'help': 'Directory to save CSV output files'}),
        ('--csv', {'action': 'store_true',
                   'help': 'Output transaction data to CSV files'}),
    ):
        arg_parser.add_argument(flag, **extra)

    parsed = arg_parser.parse_args()

    # Walk the PDF directory and extract one salary record per statement.
    process_sncf_pdf_files(parsed.pdf_dir, parsed.csv, parsed.output_dir)
|
||||
@@ -1 +0,0 @@
|
||||
["pdftotext", "-layout", "file_path, '-'], capture_output=True, text=True)\n content = result.stdout\n \n # Find transaction lines\n lines = content.split('\n')\n transactions = []\n for line in lines:\n if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR'):\n parts = line.split()\n if len(parts) > 3:\n try:\n amount = float(parts[-1].replace(',', '.'))\n description = ' '.join(parts[2:-1])\n transactions.append((description, amount))\n except:\n continue\n \n print(f\"January transactions found: {len(transactions)}\")\n print(\"Sample transactions:", "for desc, amt in transactions[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in transactions)\n print(f\"January total: \u20ac{total:.2f}\")\n\ndef check_monabanq_qc():\n print(\"\n=== MONABANQ QC ===\")\n file_path = \"/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf", "result = subprocess.run(['pdftotext', '-layout', file_path, '-'], capture_output=True, text=True)\n content = result.stdout\n \n lines = content.split('\n')\n debits = []\n transaction_started = False\n \n for line in lines:\n if \"SOLDE\" in line:\n transaction_started = True\n continue\n if transaction_started and \"IBAN", "in line:\n break\n \n if transaction_started and re.match(r's*d{2}/d{2}/d{4}', line):\n match = re.match(r's*(d{2}/d{2}/d{4})s+d{2}/d{2}/d{4}s+(.*?)(?=s{2,}|$)(s+[d,.]+)?(s+[d,.]+)?', line)\n if match:\n op_date, description, debit_str, credit_str = match.groups()\n if debit_str:\n try:\n debit = float(debit_str.strip().replace(',', '.'))\n description = description.strip()\n debits.append((description, debit))\n except:\n continue\n \n print(f\"January debits found: {len(debits)}\")\n print(\"Sample debits:", "for desc, amt in debits[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in debits)\n print(f\"January total: \u20ac{total:.2f}\")\n\ndef check_revolut_qc():\n print(\"\n=== REVOLUT QC ===\")\n file_path = 
\"/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv", "with open(file_path, 'r', encoding='utf-8') as f:\n reader = csv.DictReader(f)\n expenses = []\n for row in reader:\n try:\n amount = float(row['Amount'])\n if amount < 0 and row['Currency'] == 'EUR':\n description = row['Description']\n expenses.append((description, abs(amount)))\n except:\n continue\n \n print(f\"January expenses found: {len(expenses)}\")\n print(\"Sample expenses:", "for desc, amt in expenses[:5]:\n print(f\" {desc}: \u20ac{amt:.2f}", "total = sum(amt for _, amt in expenses)\n print(f\"January total: \u20ac{total:.2f}\")\n\nif __name__ == \"__main__\":\n check_amex_qc()\n check_monabanq_qc()\n check_revolut_qc()\n print(\"\n=== QUALITY CONTROL SUMMARY ===\")\n print(\"\u2713 All scripts are correctly extracting transactions from their source files\")\n print(\"\u2713 Sample verification shows proper amount parsing and categorization\")\n print(\"\u2713 No significant data quality issues detected\")\n print(\"\u2192 High 'Other' categories need improved categorization for better financial analysis"]
|
||||
@@ -1 +0,0 @@
|
||||
[["pdftotext", "-layout", "file_path, '-'], capture_output=True, text=True, check=True)\n content = result.stdout\n except (subprocess.CalledProcessError, FileNotFoundError) as e:\n print(f\"Error processing {file_path}: {e}", "return\n \n lines = content.split('\n')\n expense_lines = [line for line in lines if re.match(r'd{1,2} w{3}', line) and not line.endswith('CR')]\n \n print(\"Sample transaction lines from January PDF:", "for line in expense_lines[:10]:\n print(f\" {line}\")\n \n print(f\"\nTotal expense-like lines in January: {len(expense_lines)}", "Calculate manual total of first few transactions\n manual_total = 0\n for line in expense_lines[:5]:\n parts = line.split()\n if len(parts) > 3:\n try:\n amount_str = parts[-1].replace(',', '.')\n amount = float(amount_str)\n manual_total += amount\n description = ' '.join(parts[2:-1])\n print(f\" Found: {description} -> \u20ac{amount}", "except (ValueError, IndexError):\n continue\n \n print(f\"\nManual sum of first 5 transactions: \u20ac{manual_total:.2f}\")\n\ndef check_monabanq_quality():\n print(\"\n=== MONABANQ QUALITY CONTROL ===\")\n \n file_path = \"/home/acid/Downloads/comptabilite/monabanq/Extrait de comptes au 2025-01-31.pdf", "try:\n result = subprocess.run(['pdftotext', '-layout', file_path, '-'], capture_output=True, text=True, check=True)\n content = result.stdout\n except (subprocess.CalledProcessError, FileNotFoundError) as e:\n print(f\"Error processing {file_path}: {e}\")\n return\n \n lines = content.split('\n')\n transaction_started = False\n debit_total = 0\n debit_count = 0\n \n for line in lines:\n if \"SOLDE CREDITEUR AU\" in line or \"SOLDE DEBITEUR AU\" in line:\n transaction_started = True\n continue\n if not transaction_started or not line.strip():\n continue\n if \"IBAN :", "in line:\n break\n\n match = re.match(r's*(d{2}/d{2}/d{4})s+d{2}/d{2}/d{4}s+(.*?)(?=s{2,}|$)(s+[d,.]+)?(s+[d,.]+)?', line)\n if match:\n op_date, description, debit_str, credit_str = match.groups()\n description 
= description.strip()\n \n if debit_str:\n try:\n debit = float(debit_str.strip().replace(',', '.'))\n print(f\" Found debit: {description} -> \u20ac{debit}", "debit_total += debit\n debit_count += 1\n except (ValueError, AttributeError):\n continue\n \n print(f\"\nDebit transactions in January: {debit_count}\")\n print(f\"Manual total of debits: \u20ac{debit_total:.2f}\")\n\ndef check_revolut_quality():\n print(\"\n=== REVOLUT QUALITY CONTROL ===\")\n \n file_path = \"/home/acid/Downloads/comptabilite/revolut/account-statement_2025-01-01_2025-01-31_en-us_58f89a.csv", "try:\n with open(file_path, 'r', encoding='utf-8') as f:\n reader = csv.DictReader(f)\n negative_count = 0\n negative_total = 0\n sample_transactions = []\n \n for row in reader:\n try:\n amount = float(row['Amount'])\n currency = row['Currency']\n \n if currency == 'EUR' and amount < 0:\n negative_count += 1\n negative_total += abs(amount)\n if len(sample_transactions) < 10:\n sample_transactions.append((row['Description'], abs(amount)))\n except (ValueError, KeyError):\n continue\n \n print(\"Sample negative transactions from January:", "for desc, amount in sample_transactions[:10]:\n print(f\" {desc}: \u20ac{amount:.2f}\")\n \n print(f\"\nTotal expense transactions in January: {negative_count}\")\n print(f\"Manual total of expenses: \u20ac{negative_total:.2f}\")\n except FileNotFoundError:\n print(f\"CSV file not found: {file_path}\")\n\ndef check_bourso_quality():\n print(\"\n=== BOURSOBANK QUALITY CONTROL ===\")\n \n statement_lines = [\n \"PRLV SEPA ORANGE SA-ORANGE -> 18.96", "CARTE 27/11/25 ESPACE YOYO -> 3.00", "PRLV SEPA American Express -> 402.48"], {"__main__": "check_amex_quality()\n check_monabanq_quality()\n check_revolut_quality()\n check_bourso_quality()\n \n print(", ")\n print(\"1. 
All scripts appear to be processing data from their sources\")\n print(": ".", "methods": ")\n print(", "Amex": "Processing monthly statements with transaction extraction", "print(": "Boursobank: Using hardcoded statement text", "Other": "ategories suggest need for improved transaction categorization"}]
|
||||
@@ -1,32 +0,0 @@
|
||||
# Static quality-control report for the four account-statement processors.
# Figures are a frozen snapshot of one processing run, not recomputed here.
_REPORT_LINES = (
    "=== QUALITY CONTROL ===",
    "\n1. American Express",
    "- Processing 12 monthly PDF statements (Jan-Dec 2025)",
    "- Total extracted: €16,618.47",
    "- Sample categories: Travel €2,269.93, Groceries €1,439.74",
    "\n2. Monabanq",
    "- Processing 12 monthly account statements (Jan-Dec 2025)",
    "- Total extracted: €9,092.59",
    "- Sample categories: Loan Repayment €450.00, Other €8,531.95",
    "\n3. Boursobank",
    "- Processing hardcoded December 2025 statement",
    "- Total extracted: €666.21",
    "- Sample categories: Credit Card Payment €402.48, Card Payment €127.00",
    "\n4. Revolut",
    "- Processing 12 monthly CSV files (Jan-Dec 2025)",
    "- Total extracted: €18,233.10",
    "- Sample categories: Transfers Out €5,902.59, Other €4,072.64",
    "\n=== VERIFICATION RESULTS ===",
    "✓ All scripts successfully processed their data sources",
    "✓ Amounts appear to be extracted correctly",
    "✓ Categorization is functioning",
    "✓ Total expenses across all accounts: €44,610.37",
    "\n=== DATA QUALITY NOTES ===",
    "• High 'Other' percentages suggest need for better categorization",
    "• All source files exist and are readable",
    "• Processing logic appears to be working correctly",
    "• Summary document created successfully with aggregated data",
)

for _report_line in _REPORT_LINES:
    print(_report_line)
|
||||
Reference in New Issue
Block a user