#!/usr/bin/env python3
"""
Main script to process all financial statements, one output file per
account type.

For every supported account type discovered under the data directory, the
matching ``process_<account>.py`` script located next to this file is run as
a subprocess. Revolut is a special case: it is handled by
``process_expenses.py`` working on a ``raw_csv`` directory.
"""

import os
import sys
import glob
import subprocess
import argparse
from datetime import datetime

# Import functionality from the dynamic processor.
# Add this script's directory to the path so the sibling module can be imported.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from dynamic_processor import discover_pdf_directories, process_dynamic_pdf_files

# Directory containing this script and the per-account processor scripts.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Account types this script knows how to dispatch. 'Revolut' is included so
# that its dedicated branch in main() is actually reachable.
SUPPORTED_ACCOUNT_TYPES = (
    'Boursobank',
    'American Express',
    'Monabanq',
    'SNCF',
    'La Poste',
    'Revolut',
)


def _run_command(cmd, error_prefix):
    """Run *cmd* as a subprocess and echo its output.

    Output is captured as text (not bytes) so it prints readably. A non-zero
    exit status is reported on stdout instead of being raised, so one failing
    account does not abort the remaining ones.

    Args:
        cmd: Argument list passed to ``subprocess.run`` (no shell involved).
        error_prefix: Text printed in front of the error description.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"{error_prefix}: {e}")
        # Surface whatever the child wrote; otherwise the cause is invisible.
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        return False

    if result.stdout:
        print(result.stdout)
    return True


def main():
    """
    Parse command-line options, discover the PDF directories and dispatch
    each supported account type to its processor.
    """
    parser = argparse.ArgumentParser(
        description='Process financial statements with one file per entity')
    parser.add_argument(
        '--data-dir',
        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument(
        '--output-dir', default=None,
        help='Directory to save CSV output files (default: auto-discovered)')
    parser.add_argument(
        '--csv', action='store_true', help='Generate CSV output files')
    parser.add_argument(
        '--single', action='store_true',
        help='Process only the current entity (for testing)')
    args = parser.parse_args()

    # The project root is the parent of the directory holding this script.
    project_root = os.path.dirname(SCRIPT_DIR)

    # Determine data directory; a relative --data-dir is resolved against the
    # project root.
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory (a relative --output-dir is used as given, i.e.
    # relative to the current working directory).
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Process each account type to its own file
    for account_type, info in pdf_dirs.items():
        if account_type not in SUPPORTED_ACCOUNT_TYPES:
            continue  # Skip unsupported types

        if account_type == 'Revolut':
            # Special case for Revolut (CSV files)
            process_revolut(data_dir, output_dir, args.csv, args.single)
        else:
            process_pdf_account(
                account_type, info, output_dir, args.csv, args.single)

    print(f"\n{'='*60}")
    print("Processing Complete")


def process_revolut(data_dir, output_dir, generate_csv, single_mode=False):
    """Process Revolut CSV files by running ``process_expenses.py``.

    Args:
        data_dir: Base PDF data directory; the CSV files are looked up in the
            ``raw_csv`` directory that sits next to it.
        output_dir: Directory passed to the script as ``--output-dir``.
        generate_csv: When true, ``--csv`` is passed to the script.
        single_mode: When true, ``--single`` is passed to the script.
    """
    # normpath drops a trailing separator, which would otherwise make
    # dirname() return data_dir itself instead of its parent.
    csv_dir = os.path.join(
        os.path.dirname(os.path.normpath(data_dir)), 'raw_csv')
    # Sorted by file name (chronological only if the names sort that way).
    csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

    if not csv_files:
        print(f"No Revolut CSV files found in {csv_dir}")
        return

    for csv_file in csv_files:
        print(f"Processing Revolut CSV: {os.path.basename(csv_file)}")

    # The script receives the whole directory, not a single file, so it is
    # run once for all files rather than once per file.
    cmd = [
        sys.executable,
        os.path.join(SCRIPT_DIR, 'process_expenses.py'),
        '--csv-dir', csv_dir,
        '--output-dir', output_dir,
    ]
    if generate_csv:
        cmd.append('--csv')
    if single_mode:
        cmd.append('--single')

    _run_command(cmd, f"Error processing {csv_dir}")


def process_pdf_account(account_type, info, output_dir, generate_csv, single_mode=False):
    """Run the dedicated processor script for a PDF-based account.

    The script is expected next to this file and named
    ``process_<account>.py``, where ``<account>`` is the account type in
    lower case with spaces replaced by underscores (e.g. ``La Poste`` ->
    ``process_la_poste.py``). It is invoked directly; no intermediate wrapper
    script is generated on disk.

    Args:
        account_type: Human-readable account name.
        info: Mapping describing the discovered directory; only
            ``info['path']`` is used here (passed as ``--pdf-dir``).
        output_dir: Directory passed to the script as ``--output-dir``.
        generate_csv: When true, ``--csv`` is passed to the script.
        single_mode: When true, ``--single`` is passed to the script.
    """
    slug = account_type.lower().replace(' ', '_')
    script_path = os.path.join(SCRIPT_DIR, f"process_{slug}.py")

    if not os.path.isfile(script_path):
        print(f"Error: processor script not found: {script_path}")
        return

    cmd = [
        sys.executable,
        script_path,
        '--pdf-dir', info['path'],
        '--output-dir', output_dir,
    ]
    if generate_csv:
        cmd.append('--csv')
    if single_mode:
        # NOTE(review): assumes the per-account scripts accept --single, as
        # process_expenses.py is given it above -- confirm.
        cmd.append('--single')

    shown = [os.path.basename(script_path)] + cmd[2:]
    print(f"Running: {' '.join(shown)}")

    _run_command(cmd, "Error")


if __name__ == "__main__":
    main()