#!/usr/bin/env python3
# Provenance: scripts/dynamic_all_processor.py, added by commit
# 48e290d5d0723c7540cbd804c58ce382f6922d44 (Kevin Bataille,
# Mon, 9 Feb 2026 14:12:40 +0100): "Add fully dynamic processor that
# auto-discovers all PDF directories".
"""
Fully dynamic script to auto-discover and process all financial statements.

Every sub-directory of the data directory that contains at least one
``*.pdf`` file is classified by name and handed to the matching
``process_*.py`` script located next to this file.
"""

import os
import subprocess
import sys
import glob
import re
import argparse
from datetime import datetime

# (keywords, account type, processing script). A directory matches a rule when
# any keyword occurs in its lower-cased name; the first matching rule wins.
# A script of None means "discovered, but deliberately not processed".
_ACCOUNT_RULES = (
    (('boursobank', 'releve-compte'), 'Boursobank', 'process_bourso.py'),
    (('american_express', 'amex'), 'American Express', 'process_amex.py'),
    (('monabanq', 'extrait'), 'Monabanq', 'process_monabanq.py'),
    (('sncf', 'salaire'), 'SNCF', 'process_sncf_improved.py'),
    (('la_poste', 'la-poste', 'releve_ccp'), 'La Poste',
     'process_laposte_improved.py'),
    (('impots', 'impot'), 'Impôts', None),  # Skip tax documents
)


def _classify_directory(dir_name):
    """Return ``(account_type, script_name)`` for a directory name."""
    lowered = dir_name.lower()
    for keywords, account_type, script_name in _ACCOUNT_RULES:
        if any(keyword in lowered for keyword in keywords):
            return account_type, script_name

    # Unknown directory: derive both the label and the script name from it.
    account_type = dir_name.replace('_', ' ').title()
    script_name = f'process_{account_type.lower().replace(" ", "_")}.py'
    return account_type, script_name


def _resolve_dir(path, project_root, default):
    """Return *path* (or *default*) as a path anchored at *project_root*.

    Absolute paths are kept; relative ones are joined to the project root.
    The result is normalised so a trailing slash cannot change what
    ``os.path.dirname`` returns for it later on.
    """
    if not path:
        path = default
    if not os.path.isabs(path):
        path = os.path.join(project_root, path)
    return os.path.normpath(path)


def _discover_pdf_dirs(data_dir):
    """Return one entry per sub-directory of *data_dir* holding PDF files.

    A list (rather than a dict keyed by account type) is used so that two
    directories classified as the same account type are both kept.
    Entries are sorted by directory name to make runs reproducible.
    """
    discovered = []
    for item in sorted(os.listdir(data_dir)):
        dir_path = os.path.join(data_dir, item)
        if not os.path.isdir(dir_path):
            continue

        # Escape the directory part: names may contain glob metacharacters.
        pdf_files = glob.glob(os.path.join(glob.escape(dir_path), "*.pdf"))
        if not pdf_files:
            continue

        account_type, script_name = _classify_directory(item)
        discovered.append({
            'name': item,
            'account_type': account_type,
            'path': dir_path,
            'count': len(pdf_files),
            'files': pdf_files,
            'script': script_name,
        })
    return discovered


def _run_processor(cmd, label):
    """Run one processing command, echo its output, return True on success."""
    try:
        # No check=True: the exit code is inspected below so that the child's
        # stdout and stderr can both be shown when it fails.
        result = subprocess.run(cmd, capture_output=True, text=True,
                                errors='replace')
    except OSError as e:
        print(f"✗ Error processing {label}: {e}")
        return False

    if result.stdout:
        print(result.stdout)

    if result.returncode == 0:
        print(f"✓ {label} processing completed successfully")
        return True

    print(f"✗ {label} processing failed with exit code {result.returncode}")
    if result.stderr:
        print(result.stderr)
    return False


def main(argv=None):
    """
    Main function to dynamically discover and process all financial statements

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 when every directory that has a processing script was processed
        successfully (or nothing was found), 1 when the data directory is
        missing or at least one directory could not be processed.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: ../data/pdf)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files (default: ../output/csv)')
    parser.add_argument('--csv', action='store_true',
                        help='Generate CSV output files')

    args = parser.parse_args(argv)

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Relative --data-dir / --output-dir values are relative to the project root
    data_dir = _resolve_dir(args.data_dir, project_root, 'data/pdf')
    output_dir = _resolve_dir(args.output_dir, project_root, 'output/csv')

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # isdir (not exists): a plain file here would make os.listdir raise.
    if not os.path.isdir(data_dir):
        print(f"Error: Data directory not found: {data_dir}")
        return 1

    # Discover all PDF directories
    pdf_dirs = _discover_pdf_dirs(data_dir)
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return 0

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for info in pdf_dirs:
        print(f"  - {info['account_type']} ({info['name']}): {info['count']} PDF files")

    # Process each discovered directory
    success_count = 0
    failure_count = 0

    for info in pdf_dirs:
        account_type = info['account_type']
        # The directory name disambiguates directories sharing an account type.
        label = f"{account_type} ({info['name']})"

        if not info['script']:
            print(f"\nSkipping {label}: No processing script available")
            continue

        script_path = os.path.join(script_dir, info['script'])
        if not os.path.isfile(script_path):
            # Script names for unrecognised directories are only guessed.
            print(f"\nSkipping {label}: processing script not found: {script_path}")
            failure_count += 1
            continue

        # For Revolut, use CSV directory instead of PDF directory
        is_revolut = account_type == 'Revolut'
        process_dir = info['path']
        if is_revolut:
            # CSV files are in raw_csv, a sibling of the data directory
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')

        if not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            failure_count += 1
            continue

        # Build command
        cmd = [sys.executable,
               script_path,
               '--csv-dir' if is_revolut else '--pdf-dir',
               process_dir,
               '--output-dir', output_dir]

        if args.csv:
            cmd.append('--csv')

        print(f"\nProcessing {label}...")
        print(f"Running: {' '.join(cmd[2:])}")

        if _run_processor(cmd, label):
            success_count += 1
        else:
            failure_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")

    return 1 if failure_count else 0


if __name__ == "__main__":
    sys.exit(main())