"""Extract transactions from Monabanq PDF statements and summarise expenses.

Each PDF is converted to text with the external ``pdftotext -layout`` tool,
transaction rows are pulled out with a regular expression, tagged with a
category, optionally written to a CSV file, and summarised on stdout.
"""
import argparse
import csv
import glob
import os
import re
import subprocess
from collections import defaultdict

# (substring, category) pairs checked in order against the lower-cased
# description; the first hit wins.
_CATEGORY_RULES = (
    ('ech pret', 'Loan Repayment'),
    ('f cotis pratiq+', 'Bank Fees'),
    ('google ireland', 'Google Services'),
    ('vir mr bataille kevin', 'Internal Transfer'),
)

# Operation date, a second date (matched but not captured), the description
# (cut at the first run of two or more spaces), then up to two numeric tokens.
# NOTE(review): the first numeric token is always treated as the debit, so a
# row carrying only a credit amount would be read as a debit -- confirm
# against a real statement layout.
_TRANSACTION_LINE_RE = re.compile(
    r'\s*(\d{2}/\d{2}/\d{4})\s+\d{2}/\d{2}/\d{4}\s+(.*?)(?=\s{2,}|$)'
    r'(\s+[\d,.]+)?(\s+[\d,.]+)?'
)

# Amounts with thousands grouping: "1.234,56" or "1,234.56".
_GROUPED_AMOUNT_RE = re.compile(
    r'\d{1,3}(?:\.\d{3})+,\d+|\d{1,3}(?:,\d{3})+\.\d+'
)

_CSV_FIELDNAMES = ['Date', 'Description', 'Category', 'Debit', 'Credit', 'Source']


def categorize_monabanq_transaction(description):
    """Return the category label for a transaction description.

    Matching is a case-insensitive substring test against a fixed rule list;
    descriptions that match no rule are labelled ``'Other'``.
    """
    lowered = description.lower()
    for needle, category in _CATEGORY_RULES:
        if needle in lowered:
            return category
    return 'Other'


def parse_monabanq_amount(raw):
    """Convert an amount token from a statement line to a number.

    Accepts a decimal comma or point (``"12,50"``, ``"12.50"``) and
    thousands-grouped amounts (``"1.234,56"``, ``"1,234.56"``). Surrounding
    whitespace is ignored.

    Returns:
        The amount as a ``float``, or ``0`` when *raw* is empty/``None`` or
        cannot be interpreted as a number.
    """
    if not raw:
        return 0
    text = raw.strip()
    if _GROUPED_AMOUNT_RE.fullmatch(text):
        # In a grouped amount the last separator is the decimal mark; every
        # earlier separator is a thousands separator and is dropped.
        cut = max(text.rfind(','), text.rfind('.'))
        whole = text[:cut].replace('.', '').replace(',', '')
        text = f"{whole}.{text[cut + 1:]}"
    else:
        text = text.replace(',', '.')
    try:
        return float(text)
    except ValueError:
        # Best effort: an unparseable token counts as "no amount".
        return 0


def parse_monabanq_statement_text(content, source_name):
    """Parse the text of one statement into a list of transaction dicts.

    Lines are ignored until one containing ``"SOLDE CREDITEUR AU"`` or
    ``"SOLDE DEBITEUR AU"`` has been seen. Parsing stops at the first
    subsequent line containing ``"IBAN :"``.

    Args:
        content: Statement text (as produced by ``pdftotext -layout``).
        source_name: Value stored in each transaction's ``'Source'`` field.

    Returns:
        A list of dicts with keys ``Date``, ``Description``, ``Category``,
        ``Debit``, ``Credit`` and ``Source``, in order of appearance.
    """
    transactions = []
    in_transactions = False
    for line in content.split('\n'):
        if "SOLDE CREDITEUR AU" in line or "SOLDE DEBITEUR AU" in line:
            in_transactions = True
            continue
        if not in_transactions or not line.strip():
            continue
        if "IBAN :" in line:
            break

        match = _TRANSACTION_LINE_RE.match(line)
        if not match:
            continue

        op_date, description, debit_str, credit_str = match.groups()
        description = description.strip()
        transactions.append({
            'Date': op_date,
            'Description': description,
            'Category': categorize_monabanq_transaction(description),
            'Debit': parse_monabanq_amount(debit_str),
            'Credit': parse_monabanq_amount(credit_str),
            'Source': source_name,
        })
    return transactions


def _write_transactions_csv(transactions, output_dir):
    """Write *transactions* to ``monabanq_all_transactions.csv`` in *output_dir*."""
    csv_file = os.path.join(output_dir, 'monabanq_all_transactions.csv')
    os.makedirs(output_dir, exist_ok=True)
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(transactions)
    print(f"\nTransaction data saved to {csv_file}")


def _print_expense_summary(expense_summary, total_expenses):
    """Print total expenses and the per-category breakdown, largest first."""
    print("--- Monabanq Expense Summary for 2025 ---")
    print(f"Total Expenses Analyzed: €{total_expenses:,.2f}")
    print("\n--- Spending by Category ---")

    if total_expenses > 0:
        ranked = sorted(expense_summary.items(), key=lambda item: item[1], reverse=True)
        for category, total in ranked:
            percentage = (total / total_expenses) * 100
            print(f"{category:<25} €{total:9,.2f} ({percentage:5.2f}%)")
    else:
        print("No expenses found.")


def process_monabanq_files(file_list, output_csv=False, output_dir='../../output/csv'):
    """Extract transactions from statement PDFs and print an expense summary.

    Each file is converted with ``pdftotext -layout``; a file whose
    conversion fails (tool missing or non-zero exit) is reported on stdout
    and skipped. Debits in every category except ``'Internal Transfer'``
    count as expenses.

    Args:
        file_list: Paths of the PDF statements to process.
        output_csv: When true and at least one transaction was found, write
            all transactions to ``monabanq_all_transactions.csv``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        The list of all transaction dicts, in processing order.
    """
    expense_summary = defaultdict(float)
    total_expenses = 0
    all_transactions = []

    for file_path in file_list:
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                capture_output=True, text=True, check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {file_path}: {e}")
            continue

        transactions = parse_monabanq_statement_text(
            result.stdout, os.path.basename(file_path)
        )
        all_transactions.extend(transactions)

        for transaction in transactions:
            debit = transaction['Debit']
            category = transaction['Category']
            if debit > 0 and category != 'Internal Transfer':
                expense_summary[category] += debit
                total_expenses += debit

    if output_csv and all_transactions:
        _write_transactions_csv(all_transactions, output_dir)

    _print_expense_summary(expense_summary, total_expenses)
    return all_transactions


def main():
    """Command-line entry point: process every ``*.pdf`` in ``--pdf-dir``."""
    parser = argparse.ArgumentParser(description='Process Monabanq statements')
    parser.add_argument('--pdf-dir', default='../data/pdf/monabanq',
                        help='Directory containing Monabanq PDF files')
    parser.add_argument('--output-dir', default='../../output/csv',
                        help='Directory to save CSV output files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Lexicographic sort of the paths (chronological only if the file names
    # sort that way).
    pdf_files = sorted(glob.glob(os.path.join(args.pdf_dir, "*.pdf")))
    process_monabanq_files(pdf_files, args.csv, args.output_dir)


if __name__ == "__main__":
    main()