Enhance SNCF script to extract NET PAYÉ EN EUROS amount
This commit is contained in:
173
scripts/dynamic_processor.py
Executable file
173
scripts/dynamic_processor.py
Executable file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dynamic script to auto-discover and process all financial statements
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
from collections import defaultdict
|
||||
import calendar
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
# Ordered (keywords, account type) rules. The first rule with a keyword that
# occurs in the lower-cased directory name decides the account type.
_ACCOUNT_TYPE_RULES = (
    (('boursobank', 'releve-compte'), 'Boursobank'),
    (('american_express', 'amex'), 'American Express'),
    (('monabanq', 'extrait'), 'Monabanq'),
    (('sncf', 'salaire'), 'SNCF'),
    (('la_poste', 'la-poste', 'releve_ccp'), 'La Poste'),
    (('impots', 'impot'), 'Impôts'),
)


def discover_pdf_directories(base_data_dir):
    """
    Scan base data directory and return all subdirectories containing PDF files.

    Only the immediate subdirectories of ``base_data_dir`` are inspected, and
    only files matching ``*.pdf`` directly inside them are counted.

    Args:
        base_data_dir: Directory whose subdirectories are scanned.

    Returns:
        A dict mapping an account type to a dict with the keys ``'path'``
        (the subdirectory), ``'count'`` (number of PDF files) and ``'files'``
        (sorted list of PDF paths). The dict is empty when ``base_data_dir``
        is not an existing directory or no subdirectory contains PDFs.
    """
    # A missing base directory is reported instead of letting os.listdir raise.
    if not os.path.isdir(base_data_dir):
        print(f"Warning: Directory not found: {base_data_dir}")
        return {}

    pdf_dirs = {}

    # Sorted so the result does not depend on the OS directory listing order.
    for item in sorted(os.listdir(base_data_dir)):
        dir_path = os.path.join(base_data_dir, item)
        if not os.path.isdir(dir_path):
            continue

        pdf_files = sorted(glob.glob(os.path.join(dir_path, "*.pdf")))
        if not pdf_files:
            continue

        # Determine account type based on directory name; unknown names fall
        # back to a title-cased version of the directory name.
        dir_name_lower = item.lower()
        account_type = item.replace('_', ' ').title()
        for keywords, label in _ACCOUNT_TYPE_RULES:
            if any(keyword in dir_name_lower for keyword in keywords):
                account_type = label
                break

        # The result holds one directory per account type, so make a
        # collision visible rather than dropping a directory silently.
        if account_type in pdf_dirs:
            print(f"Warning: {pdf_dirs[account_type]['path']} and {dir_path} "
                  f"both map to {account_type}; using {dir_path}")

        pdf_dirs[account_type] = {
            'path': dir_path,
            'count': len(pdf_files),
            'files': pdf_files,
        }

    return pdf_dirs
|
||||
|
||||
def _resolve_script_path(process_script):
    """Return the absolute path of the processing script to run.

    An absolute path, or a path that exists relative to the current working
    directory, is used as given. Any other relative name is looked up next to
    this file, so the processor also works when started from another
    directory.
    """
    candidate = os.path.abspath(process_script)
    if os.path.isabs(process_script) or os.path.exists(candidate):
        return candidate
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), process_script)


def process_dynamic_pdf_files(process_script, pdf_directory, output_dir):
    """
    Generic function to process PDF files in any directory.

    Runs ``process_script`` with the current Python interpreter and the
    arguments ``--pdf-dir <pdf_directory> --output-dir <output_dir> --csv``,
    then prints whatever the script wrote to stdout.

    Args:
        process_script: Path or file name of the processing script. A falsy
            value (e.g. ``None``) means no script is available.
        pdf_directory: Directory that must contain at least one ``*.pdf`` file.
        output_dir: Directory passed to the script as ``--output-dir``.

    Returns:
        ``True`` when the script ran and exited with status 0; ``False`` when
        there is no script, the directory is missing, it holds no PDF files,
        or the script exited with a non-zero status.
    """
    if not process_script:
        print(f"Warning: No processing script given for {pdf_directory}")
        return False

    if not os.path.exists(pdf_directory):
        print(f"Warning: Directory not found: {pdf_directory}")
        return False

    # Nothing to do when the directory holds no PDF files.
    if not glob.glob(os.path.join(pdf_directory, "*.pdf")):
        print(f"No PDF files found in {pdf_directory}")
        return False

    # Build command
    cmd = [sys.executable, _resolve_script_path(process_script),
           '--pdf-dir', pdf_directory, '--output-dir', output_dir, '--csv']

    # Run the processing script. Output is captured as text so it prints
    # readably rather than as a bytes repr; undecodable bytes are replaced
    # instead of raising.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True,
                                text=True, errors='replace')
    except subprocess.CalledProcessError as e:
        print(f"Error processing {pdf_directory}: {e}")
        # Show the script's own error output, which would otherwise be lost.
        if e.stderr:
            print(e.stderr)
        return False

    print(result.stdout)
    return True
|
||||
|
||||
def main():
    """
    Main function to dynamically discover and process all financial statements.

    Command-line options:
        --data-dir: Base directory containing the PDF subdirectories. A
            relative path is resolved against the project root (the parent of
            this script's directory). Defaults to ``<project_root>/data/pdf``.
        --output-dir: Directory for the CSV output, created if missing.
            Defaults to ``<project_root>/output/csv``.

    Prints a header, the discovered directories, and a final summary of how
    many account types were processed successfully.
    """
    parser = argparse.ArgumentParser(description='Dynamically process all financial statements')
    parser.add_argument('--data-dir',
                        help='Base directory containing PDF files (default: auto-discovered)')
    parser.add_argument('--output-dir', default=None,
                        help='Directory to save CSV output files')

    args = parser.parse_args()

    # Get paths
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Determine data directory
    if args.data_dir:
        data_dir = args.data_dir
        if not os.path.isabs(data_dir):
            data_dir = os.path.join(project_root, data_dir)
    else:
        data_dir = os.path.join(project_root, 'data/pdf')

    # Set output directory
    output_dir = args.output_dir or os.path.join(project_root, 'output/csv')
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print("Dynamic Financial Statement Processor")
    print(f"Data Directory: {os.path.abspath(data_dir)}")
    print(f"Output Directory: {os.path.abspath(output_dir)}")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'='*60}")

    # A missing data directory is reported here; discovery would otherwise
    # fail when it tries to list it.
    if not os.path.isdir(data_dir):
        print(f"Warning: Directory not found: {data_dir}")
        return

    # Discover all PDF directories
    pdf_dirs = discover_pdf_directories(data_dir)

    if not pdf_dirs:
        print("No directories with PDF files found!")
        return

    print(f"\nDiscovered {len(pdf_dirs)} directories with PDF files:")
    for account_type, info in pdf_dirs.items():
        print(f" - {account_type}: {info['count']} files in {info['path']}")

    # Define processing scripts for each account type
    script_map = {
        'Boursobank': 'process_bourso.py',
        'American Express': 'process_amex.py',
        'Monabanq': 'process_monabanq.py',
        'SNCF': 'process_sncf_improved.py',
        'La Poste': 'process_laposte_improved.py',
        'Revolut': 'process_expenses.py',  # Special case: uses CSV input
        'Impôts': None  # No processing script for tax documents yet
    }

    # Process each account type
    success_count = 0

    for account_type, info in pdf_dirs.items():
        # An entry mapped to None (e.g. 'Impôts') has no script either, so it
        # is skipped the same way as an unknown account type.
        script_name = script_map.get(account_type)
        if not script_name:
            print(f"\nWarning: No processing script available for {account_type}")
            continue

        # For Revolut, use CSV directory instead of PDF directory
        process_dir = info['path']
        if account_type == 'Revolut':
            # CSV files are in raw_csv, a sibling of the data directory.
            # NOTE(review): process_dynamic_pdf_files only looks for *.pdf in
            # the directory it is given, so a raw_csv directory holding only
            # CSV files is reported as having no PDFs - confirm how Revolut
            # is meant to be processed.
            process_dir = os.path.join(os.path.dirname(data_dir), 'raw_csv')

        if process_dir and not os.path.exists(process_dir):
            print(f"Warning: Directory not found: {process_dir}")
            continue

        # Pass the script's full path so it is found next to this file no
        # matter which directory the processor is started from.
        success = process_dynamic_pdf_files(
            os.path.join(script_dir, script_name),
            process_dir,
            output_dir
        )

        if success:
            success_count += 1

    print(f"\n{'='*60}")
    print(f"Processing Complete: {success_count}/{len(pdf_dirs)} accounts processed successfully")
    print(f"CSV files saved to: {os.path.abspath(output_dir)}")
    print(f"{'='*60}")
|
||||
|
||||
# Run the processor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user