"""Extract basic salary records from SNCF payslip PDFs, optionally as CSV."""

import subprocess
import re
import csv
import os
import glob
import unicodedata
from collections import defaultdict

# Ordered keyword rules: the first rule with a keyword found in the
# lower-cased, accent-stripped description wins.
_CATEGORY_RULES = (
    (('salaire',), 'Salary'),
    (('prime',), 'Bonus/Prime'),
    (('cotisation', 'retenue'), 'Deductions'),
    (('impot',), 'Tax'),
    (('avantage',), 'Benefits'),
)


def _strip_accents(text):
    """Return *text* with combining diacritical marks removed."""
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def categorize_sncf_transaction(description: str) -> str:
    """Map a payslip line description to a category label.

    Matching is case-insensitive and accent-insensitive, so both "impot"
    and "impôt" are categorized as 'Tax'.

    Args:
        description: Free-text description of a payslip line.

    Returns:
        One of 'Salary', 'Bonus/Prime', 'Deductions', 'Tax', 'Benefits',
        or 'Other' when no keyword matches.
    """
    normalized = _strip_accents(description.lower())
    for keywords, category in _CATEGORY_RULES:
        if any(keyword in normalized for keyword in keywords):
            return category
    return 'Other'


def process_sncf_pdf_files(directory: str, output_csv: bool = False) -> list:
    """Build one placeholder salary record per PDF found in *directory*.

    Each ``*.pdf`` file is converted to text with the external ``pdftotext``
    command. Files for which the command fails (or when the command is not
    installed) are reported on stdout and skipped.

    Args:
        directory: Directory that is searched (non-recursively) for PDFs.
        output_csv: When true and at least one record was produced, write
            the records to ``sncf_all_transactions.csv`` inside *directory*.

    Returns:
        A list of dicts with the keys 'Date', 'Description', 'Category',
        'Amount' and 'Source', ordered by PDF path.
    """
    # Sorted so the returned records and the CSV rows have a stable order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True,
                text=True,
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        file_name = os.path.basename(pdf_file)
        month = "Unknown"
        # The month is only read when the text mentions "salaire de"; it is
        # then taken from the third space-separated token of the file name.
        # NOTE(review): presumably files are named "<x> <y> <month> ..." -
        # confirm against the real naming convention.
        if 'salaire de' in result.stdout.lower():
            name_parts = file_name.split(' ')
            if len(name_parts) > 2:
                month = name_parts[2]

        all_transactions.append({
            'Date': f"01/{month}/2025",  # Simplified: day and year are fixed
            'Description': f"Salaire {month} 2025",
            'Category': 'Salary',
            'Amount': 0,  # Placeholder; amounts are not parsed yet
            'Source': file_name,
        })

    if output_csv and all_transactions:
        csv_file = os.path.join(directory, 'sncf_all_transactions.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")
    return all_transactions


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Process SNCF salary statements')
    parser.add_argument('--pdf-dir', default='1-sncf',
                        help='Directory containing SNCF PDF files')
    parser.add_argument('--csv', action='store_true',
                        help='Output transaction data to CSV files')
    args = parser.parse_args()

    # Process all PDF files in the directory
    process_sncf_pdf_files(args.pdf_dir, args.csv)