Files
personnal-accounting/scripts/dynamic_processor.py
2026-02-09 14:15:15 +01:00

173 lines
6.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Dynamic script to auto-discover and process all financial statements
"""
import os
import subprocess
import sys
import glob
import re
from collections import defaultdict
import calendar
import argparse
from datetime import datetime
def discover_pdf_directories(base_data_dir):
    """
    Scan the base data directory for immediate subdirectories holding PDF files.

    Each subdirectory that contains at least one ``*.pdf`` file is labelled
    with an account type derived from its name by a case-insensitive keyword
    match. A name matching no keyword falls back to the directory name with
    underscores replaced by spaces and title-cased.

    Args:
        base_data_dir: Directory whose immediate subdirectories are scanned.

    Returns:
        A dict mapping account type to a dict with the keys ``'path'`` (the
        subdirectory), ``'count'`` (number of PDF files found) and ``'files'``
        (their paths, sorted). The dict is empty when ``base_data_dir`` is not
        a directory or when no subdirectory contains PDF files.
    """
    # Ordered keyword table: the first row with a matching keyword wins, so the
    # precedence is the same as a top-to-bottom if/elif chain.
    account_keywords = (
        ('Boursobank', ('boursobank', 'releve-compte')),
        ('American Express', ('american_express', 'amex')),
        ('Monabanq', ('monabanq', 'extrait')),
        ('SNCF', ('sncf', 'salaire')),
        ('La Poste', ('la_poste', 'la-poste', 'releve_ccp')),
        ('Impôts', ('impots', 'impot')),
    )

    # A missing base directory is reported instead of letting os.listdir raise.
    if not os.path.isdir(base_data_dir):
        print(f"Warning: Directory not found: {base_data_dir}")
        return {}

    pdf_dirs = {}
    # Sorted so the result (and which directory wins a label collision) does
    # not depend on the order the filesystem happens to list entries in.
    for item in sorted(os.listdir(base_data_dir)):
        dir_path = os.path.join(base_data_dir, item)
        if not os.path.isdir(dir_path):
            continue

        # Escape the directory part so characters such as '[' in a folder name
        # are matched literally rather than interpreted as glob syntax.
        pdf_files = sorted(glob.glob(os.path.join(glob.escape(dir_path), "*.pdf")))
        if not pdf_files:
            continue

        dir_name_lower = item.lower()
        account_type = next(
            (
                label
                for label, keywords in account_keywords
                if any(keyword in dir_name_lower for keyword in keywords)
            ),
            item.replace('_', ' ').title(),
        )

        # Only one directory can be stored per account type; make the
        # replacement visible instead of dropping a directory silently.
        if account_type in pdf_dirs:
            print(
                f"Warning: {pdf_dirs[account_type]['path']} and {dir_path} "
                f"both map to '{account_type}'; using {dir_path}"
            )

        pdf_dirs[account_type] = {
            'path': dir_path,
            'count': len(pdf_files),
            'files': pdf_files,
        }
    return pdf_dirs
def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """
    Run one statement-processing script against a directory of PDF files.

    The script is launched with the current Python interpreter as
    ``<script> --pdf-dir <pdf_directory> --output-dir <output_dir> --csv``.
    Its captured standard output is echoed once it finishes.

    Args:
        process_script: Path to the processing script. A relative path is
            resolved against the current working directory.
        pdf_directory: Directory expected to contain ``*.pdf`` files.
        output_dir: Directory passed to the script via ``--output-dir``.

    Returns:
        True when the script was run and exited with status 0. False when the
        directory is missing, holds no PDF files, no usable script was given,
        or the script exited with a non-zero status.
    """
    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False

    # Escape the directory so glob metacharacters in its name match literally.
    if not glob.glob(os.path.join(glob.escape(pdf_directory), "*.pdf")):
        print(f"No PDF files found in {pdf_directory}")
        return False

    # Callers may have no script for an account type; report it rather than
    # failing inside os.path.abspath(None).
    if not process_script:
        print(f"Warning: No processing script given for {pdf_directory}")
        return False

    script_path = os.path.abspath(process_script)
    if not os.path.isfile(script_path):
        print(f"Warning: Processing script not found: {script_path}")
        return False

    cmd = [sys.executable, script_path,
           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']

    try:
        # text=True decodes the captured streams so they print as text rather
        # than as a bytes repr; errors="replace" keeps an unexpected byte in
        # the child's output from aborting the whole run.
        result = subprocess.run(cmd, check=True, capture_output=True,
                                text=True, errors="replace")
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        # Surface whatever the child wrote; otherwise the cause of the failure
        # is lost with the captured streams.
        for stream in (e.stdout, e.stderr):
            if stream:
                print(stream, end="" if stream.endswith("\n") else "\n")
        return False

    print(result.stdout, end="")
    return True
def main():
    """
    Discover every statement directory and run its processing script.

    Command-line options:
        --data-dir: Base directory whose subdirectories hold the PDF files.
            A relative value is resolved against the project root (the parent
            of this script's directory). Defaults to ``data/pdf`` under the
            project root.
        --output-dir: Directory for the CSV output. Defaults to ``output/csv``
            under the project root; it is created if missing.

    Prints a banner, the discovered directories, per-account warnings and a
    final summary of how many accounts were processed successfully.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')
    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')
    # Drop any trailing separator: os.path.dirname(data_dir) is used below to
    # find a sibling directory, and with a trailing slash it would return the
    # data directory itself.
    data_dir = os.path.normpath(data_dir)

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)
    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Define processing scripts for each account type
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None,  # No processing script for tax documents yet
    }

    # Process each account type
    success_count = 0
    for account_type, info in pdf_dirs.items():
        # .get() covers both an unknown account type and a known one whose
        # script is None; neither may reach the processor.
        script_name = script_map.get(account_type)
        if not script_name:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        # Prefer the script that sits next to this file so the tool works from
        # any working directory; fall back to the bare name (resolved against
        # the working directory by the processor) when it is not there.
        script_path = os.path.join(script_dir, script_name)
        if not os.path.isfile(script_path):
            script_path = script_name

        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            # NOTE(review): process_dynamic_pdf_files only proceeds when the
            # directory it is given contains *.pdf files, so a raw_csv
            # directory holding only CSV files is reported as empty there.
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')  # CSV files are in raw_csv

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        success = process_dynamic_pdf_files(
            script_path,
            process_dir,
            output_dir
        )
        if success:
            success_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")


# Run the processor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()