#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""

import os
import subprocess
import sys
import glob
import re
from collections import defaultdict
import calendar
import argparse
from datetime import datetime


# Ordered (keywords, account type) rules: the first rule with a keyword
# contained in the lower-cased directory name wins.
_ACCOUNT_TYPE_RULES = (
    (('boursobank', 'releve-compte'), 'Boursobank'),
    (('american_express', 'amex'), 'American Express'),
    (('monabanq', 'extrait'), 'Monabanq'),
    (('sncf', 'salaire'), 'SNCF'),
    (('la_poste', 'la-poste', 'releve_ccp'), 'La Poste'),
    (('impots', 'impot'), 'Impôts'),
)

# Processing script for each account type. A value of None means the
# account type is known but has no processing script yet.
_SCRIPT_MAP = {
    'Boursobank': 'process_bourso.py',
    'American Express': 'process_amex.py',
    'Monabanq': 'process_monabanq.py',
    'SNCF': 'process_sncf_improved.py',
    'La Poste': 'process_laposte_improved.py',
    'Revolut': 'process_expenses.py',  # Special case: uses CSV input
    'Impôts': None,  # No processing script for tax documents yet
}


def _classify_account_type(dir_name):
    """Map a directory name to an account type label.

    Falls back to a title-cased version of the directory name (underscores
    replaced by spaces) when no keyword rule matches.
    """
    lowered = dir_name.lower()
    for keywords, account_type in _ACCOUNT_TYPE_RULES:
        if any(keyword in lowered for keyword in keywords):
            return account_type
    return dir_name.replace('_', ' ').title()


def _resolve_script(script_name, script_dir):
    """Prefer the script located in ``script_dir``; otherwise return the name as given.

    Returning the bare name keeps the previous behaviour (resolution
    relative to the current working directory) as a fallback.
    """
    sibling = os.path.join(script_dir, script_name)
    return sibling if os.path.isfile(sibling) else script_name


def discover_pdf_directories(base_data_dir):
    """
    Scan base data directory and return all subdirectories containing PDF files

    Args:
        base_data_dir: Directory whose immediate subdirectories are scanned.

    Returns:
        Dict mapping account type to ``{'path', 'count', 'files'}`` for every
        subdirectory holding at least one ``*.pdf`` file. Empty when the base
        directory does not exist.

    Note:
        The result is keyed by account type, so when two subdirectories map
        to the same account type the one sorting last replaces the other.
    """
    if not os.path.isdir(base_data_dir):
        # os.listdir would raise here; report it and let the caller continue.
        print(f"Warning: Data directory not found: {base_data_dir}")
        return {}

    pdf_dirs = {}

    # Sorted so discovery order does not depend on the filesystem.
    for item in sorted(os.listdir(base_data_dir)):
        dir_path = os.path.join(base_data_dir, item)
        if not os.path.isdir(dir_path):
            continue

        # Escape the directory part so names containing [, ] or * still match.
        pdf_files = sorted(glob.glob(os.path.join(glob.escape(dir_path), "*.pdf")))
        if not pdf_files:
            continue

        pdf_dirs[_classify_account_type(item)] = {
            'path': dir_path,
            'count': len(pdf_files),
            'files': pdf_files,
        }

    return pdf_dirs


def process_dynamic_pdf_files(process_script, pdf_directory, output_dir,
                              file_pattern="*.pdf"):
    """
    Generic function to process statement files in any directory

    Runs ``process_script`` with the current interpreter, passing
    ``--pdf-dir``, ``--output-dir`` and ``--csv``, and prints its output.

    Args:
        process_script: Path to the processing script. A falsy value means
            no script is configured and nothing is run.
        pdf_directory: Directory holding the input files.
        output_dir: Directory handed to the script via ``--output-dir``.
        file_pattern: Glob pattern an input file must match for the script
            to be run at all. Defaults to ``"*.pdf"``.

    Returns:
        True when the script ran and exited with status 0, False otherwise.
    """
    if not process_script:
        print(f"Warning: No processing script configured for {pdf_directory}")
        return False

    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False

    # Only launch the script when there is something to process.
    input_files = glob.glob(os.path.join(glob.escape(pdf_directory), file_pattern))
    if not input_files:
        print(f"No files matching {file_pattern} found in {pdf_directory}")
        return False

    # Build command
    script_path = os.path.abspath(process_script)
    cmd = [sys.executable, script_path,
           '--pdf-dir', pdf_directory,
           '--output-dir', output_dir,
           '--csv']

    # Run the processing script. text=True decodes the captured output so it
    # prints as text; errors="replace" keeps odd bytes from raising.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True,
                                text=True, errors="replace")
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        if e.stderr:
            # Show the child's own error output, which is otherwise lost.
            print(e.stderr.rstrip())
        return False
    except OSError as e:
        print(f"Error launching {script_path}: {e}")
        return False

    print(result.stdout)
    return True


def main():
    """
    Main function to dynamically discover and process all financial statements

    Parses ``--data-dir`` and ``--output-dir``, discovers the statement
    directories, runs the matching processing script for each account type
    and prints a summary.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir', help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None, help='Directory to save CSV output files')

    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory (relative paths are taken from the project root)
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f"  - {account_type}: {info['count']} files in {info['path']}")

    # Process each account type
    success_count = 0
    for account_type, info in pdf_dirs.items():
        # Covers both unknown account types and known ones mapped to None.
        script_name = _SCRIPT_MAP.get(account_type)
        if script_name is None:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        process_dir = info['path']
        file_pattern = '*.pdf'
        if account_type == 'Revolut':
            # For Revolut, use CSV directory instead of PDF directory.
            # abspath strips a trailing separator so dirname really goes up.
            process_dir = os.path.join(os.path.dirname(os.path.abspath(data_dir)), 'raw_csv')
            file_pattern = '*.csv'
            # NOTE(review): assumes process_expenses.py accepts the same
            # --pdf-dir/--output-dir/--csv flags as the others - confirm.

        if not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        success = process_dynamic_pdf_files(
            _resolve_script(script_name, script_dir),
            process_dir,
            output_dir,
            file_pattern=file_pattern,
        )

        if success:
            success_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()