#!/usr/bin/env python3
"""Summarise American Express PDF statements by spending category.

Each PDF is converted to text with the external ``pdftotext`` tool, every
text line that looks like a transaction is categorised by merchant keyword,
and a per-category summary is printed. Optionally all transactions are also
written to a single CSV file.
"""
import subprocess
import re
import csv
import os
from collections import defaultdict

# Ordered (category, keywords) rules. The first rule with a keyword that is a
# substring of the lower-cased description wins, so the order is significant.
_CATEGORY_RULES = (
    ('Groceries', ('carrefour', 'run market', 'intermarche')),
    ('Restaurants/Food', ('esko bar', 'le choka bleu', 'columbus cafe')),
    ('Online Services/Subscriptions', (
        'openrouter', 'stripe-z.ai', 'claude.ai', 'ama eu sarl prime_new',
        'scaleway', 'servperso* invoice pro',
    )),
    ('Travel', ('air austral', 'run duty free', 'lm saint louis leroym4')),
    ('Shopping', ('mon brico', 'sumup*kulture metisse', 'sumup*glamport', 'relay')),
)

# Regex for amex transactions.
# NOTE(review): the text this file was restored from was truncated right after
# the `$` (a trailing `(?...` group and any flags were lost), and the surviving
# parsing code never references this pattern. It is kept, compiled once at
# module level, for reference only -- confirm against the original script.
_TRANSACTION_REGEX = re.compile(
    r'(\d{1,2} \w{3})\s+\d{1,2} \w{3}\s+(.*?)\s+([\d,.]+)$'
)


def categorize_amex_transaction(description):
    """Return the spending category for a transaction description.

    Args:
        description: Merchant/description text; matching is case-insensitive
            and substring-based.

    Returns:
        The name of the first matching category, or ``'Other'`` when no
        keyword matches.
    """
    description = description.lower()
    for category, keywords in _CATEGORY_RULES:
        if any(keyword in description for keyword in keywords):
            return category
    return 'Other'


def process_amex_files(file_list, output_csv=False, output_dir='../../output/csv'):
    """Parse statements, print a category summary and return the transactions.

    Args:
        file_list: Paths of the PDF statements to process.
        output_csv: When true (and at least one transaction was found), write
            all transactions to ``american_express_all_transactions.csv``
            inside ``output_dir``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        A list of dicts with the keys ``Date``, ``Description``, ``Category``,
        ``Amount`` and ``Source`` (the statement's base file name).

    Files that ``pdftotext`` cannot convert (tool missing or non-zero exit)
    are reported on stdout and skipped rather than aborting the run.
    """
    expense_summary = defaultdict(float)
    total_expenses = 0
    all_transactions = []

    for file_path in file_list:
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                capture_output=True, text=True, check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {file_path}: {e}")
            continue

        source_name = os.path.basename(file_path)

        # NOTE(review): the per-line loop header and the exact length guard
        # were lost in the damaged source; they are reconstructed from the
        # surviving `3:` and the `parts[...]` indexing below -- confirm.
        for line in result.stdout.splitlines():
            parts = line.split()
            if len(parts) <= 3:
                continue

            # The last token is the amount, with a decimal comma. Lines whose
            # last token is not numeric are not transactions and are skipped.
            try:
                amount = float(parts[-1].replace(',', '.'))
            except ValueError:
                continue

            date = parts[0] + ' ' + parts[1]
            description = ' '.join(parts[2:-1])
            category = categorize_amex_transaction(description)

            expense_summary[category] += amount
            total_expenses += amount

            # Store transaction for CSV output
            all_transactions.append({
                'Date': date,
                'Description': description,
                'Category': category,
                'Amount': amount,
                'Source': source_name,
            })

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'american_express_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- American Express Expense Summary for 2025 ---")
    print(f"Total Expenses Analyzed: €{total_expenses:,.2f}")
    print("\n--- Spending by Category ---")

    sorted_expenses = sorted(
        expense_summary.items(), key=lambda item: item[1], reverse=True
    )

    if total_expenses > 0:
        for category, total in sorted_expenses:
            percentage = (total / total_expenses) * 100
            print(f"{category:<25} €{total:9,.2f} ({percentage:5.2f}%)")
    else:
        print("No expenses found.")

    return all_transactions


if __name__ == "__main__":
    import argparse
    import glob

    parser = argparse.ArgumentParser(description='Process American Express statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/american_express',
                        help='Directory containing American Express PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Get all PDF files in the directory
    pdf_files = glob.glob(os.path.join(args.pdf_dir, "*.pdf"))

    # Sort files by date if possible (lexicographic sort of the file names)
    pdf_files.sort()

    # Process all PDF files in the directory
    process_amex_files(pdf_files, args.csv, args.output_dir)