85 lines
3.2 KiB
Python
85 lines
3.2 KiB
Python
import subprocess
|
|
import re
|
|
import csv
|
|
import os
|
|
import glob
|
|
from collections import defaultdict
|
|
|
|
def categorize_sncf_transaction(description):
    """Map a salary-statement line description to a coarse category.

    Matching is case-insensitive and accent-insensitive, so "Impôt",
    "impot" and "IMPOT" are all recognised. Keyword groups are tested in a
    fixed priority order and the first group that matches wins.

    Args:
        description: Free-text label of a statement line.

    Returns:
        One of 'Salary', 'Bonus/Prime', 'Deductions', 'Tax', 'Benefits',
        or 'Other' when no keyword is found.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import unicodedata

    # Decompose accented characters and drop the combining marks; without
    # this the ASCII keyword 'impot' never matches the usual spelling "impôt".
    decomposed = unicodedata.normalize('NFKD', description.lower())
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Order matters: earlier entries take precedence over later ones.
    keyword_categories = (
        (('salaire',), 'Salary'),
        (('prime',), 'Bonus/Prime'),
        (('cotisation', 'retenue'), 'Deductions'),
        (('impot',), 'Tax'),
        (('avantage',), 'Benefits'),
    )
    for keywords, category in keyword_categories:
        if any(keyword in folded for keyword in keywords):
            return category

    return 'Other'
|
|
|
|
def process_sncf_pdf_files(directory, output_csv=False):
    """Build one placeholder salary record per SNCF statement PDF.

    Every ``*.pdf`` file directly inside *directory* (no recursion) is
    converted to text with the external ``pdftotext`` command. A file whose
    conversion fails, or for which ``pdftotext`` cannot be launched, is
    reported on stdout and skipped. Each successfully converted file yields
    one record; the amount is always 0 and the year is hard-coded to 2025
    because no detailed parsing of the statement is done yet.

    Args:
        directory: Folder to scan for PDF files.
        output_csv: When true and at least one record was produced, write
            the records to ``sncf_all_transactions.csv`` inside *directory*.

    Returns:
        A list of dicts with the keys 'Date', 'Description', 'Category',
        'Amount' and 'Source', in sorted file-path order.
    """
    # glob returns files in arbitrary order; sort so the returned list and
    # the CSV rows are reproducible from run to run.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)

        try:
            # pdftotext writes UTF-8 by default, so decode explicitly rather
            # than relying on the locale encoding (which may differ and
            # would raise or garble accented characters).
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True,
                encoding='utf-8',
                errors='replace',
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        month = "Unknown"
        # The statement text only gates the lookup; the month itself is the
        # third space-separated word of the file name.
        if 'salaire de' in result.stdout.lower():
            # Drop the extension first so a trailing word such as
            # "janvier.pdf" becomes "janvier".
            name_parts = os.path.splitext(source_name)[0].split(' ')
            if len(name_parts) > 2:
                month = name_parts[2]

        all_transactions.append({
            'Date': f"01/{month}/2025",  # Simplified date extraction
            'Description': f"Salaire {month} 2025",
            'Category': 'Salary',
            'Amount': 0,  # Would need more specific parsing
            'Source': source_name,
        })

    # Output CSV if requested (skipped when there is nothing to write).
    if output_csv and all_transactions:
        csv_file = os.path.join(directory, 'sncf_all_transactions.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")

    return all_transactions
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line entry point: choose the statement folder and, optionally,
    # request a CSV export of the extracted records.
    cli = argparse.ArgumentParser(
        description='Process SNCF salary statements',
    )
    cli.add_argument(
        '--pdf-dir',
        default='1-sncf',
        help='Directory containing SNCF PDF files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )
    options = cli.parse_args()

    # Convert every statement found in the selected directory.
    process_sncf_pdf_files(options.pdf_dir, options.csv)