136 lines
5.7 KiB
Python
136 lines
5.7 KiB
Python
import subprocess
|
|
import re
|
|
import csv
|
|
import os
|
|
import glob
|
|
from collections import defaultdict
|
|
|
|
def extract_month_from_filename(filename):
    """Return ``(year, month)`` parsed from an SNCF payslip filename.

    The month is detected by searching for a French month name anywhere in
    ``filename`` (case-insensitive, accents ignored). The year is the first
    ``20xx`` sequence found in the name.

    Args:
        filename: Name of the PDF file, e.g. ``"BULLETIN_MARS_2024.pdf"``.

    Returns:
        A ``(year, month)`` tuple: a four-digit year and a month number in
        1-12. The year falls back to 2025 when the name contains no ``20xx``
        sequence; the whole result falls back to ``(2025, 1)`` when no month
        name is found.
    """
    import unicodedata

    months = {
        'JANVIER': 1, 'FEVRIER': 2, 'MARS': 3, 'AVRIL': 4,
        'MAI': 5, 'JUIN': 6, 'JUILLET': 7, 'AOUT': 8,
        'SEPTEMBRE': 9, 'OCTOBRE': 10, 'NOVEMBRE': 11, 'DECEMBRE': 12,
    }

    # Strip accents so that "Février", "Août" or "Décembre" match the
    # unaccented keys above.
    decomposed = unicodedata.normalize('NFD', filename)
    filename_upper = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).upper()

    for month, num in months.items():
        if month in filename_upper:
            # Take all four digits: capturing only the last two would yield
            # 24 for "2024", which is inconsistent with the 2025 default.
            year_match = re.search(r'20\d{2}', filename)
            year = int(year_match.group(0)) if year_match else 2025
            return year, num

    return 2025, 1  # Default when no month name is recognised
|
|
|
|
# English month names indexed by month number (index 0 is unused padding).
_MONTH_NAMES = (
    '', 'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)


def _parse_french_amount(raw, from_end=False):
    """Convert a French-formatted amount such as ``"2 974,64"`` to a float.

    Args:
        raw: Text made of digits, whitespace, dots and commas.
        from_end: When ``raw`` turns out to hold several columns, try the
            last column first instead of the first one.

    Returns:
        The parsed value, or ``None`` when nothing in ``raw`` parses.
    """
    def to_float(text):
        # Remove every kind of whitespace, not just ASCII spaces: thousands
        # separators may be non-breaking spaces, which float() rejects.
        compact = re.sub(r'\s+', '', text)
        if ',' in compact and '.' in compact:
            # "2.974,64": the dots are thousands separators.
            compact = compact.replace('.', '')
        try:
            return float(compact.replace(',', '.'))
        except ValueError:
            return None

    amount = to_float(raw)
    if amount is not None:
        return amount

    # The text may span several layout columns ("2 974,64     3 123,36");
    # columns are separated by runs of at least two whitespace characters.
    chunks = re.split(r'\s{2,}', raw.strip())
    if from_end:
        chunks.reverse()
    for chunk in chunks:
        amount = to_float(chunk)
        if amount is not None:
            return amount
    return None


def _extract_salary_amount(lines):
    """Return the gross salary found in payslip text lines, or 0.0 if none."""
    amount = 0.0

    # 1) Preferred: the amount following the "SALAIRE BRUT MENSUEL" label.
    for line in lines:
        if 'SALAIRE BRUT MENSUEL' not in line:
            continue
        match = re.search(r'SALAIRE BRUT MENSUEL\s+([\d\s.,]+)', line)
        if match:
            parsed = _parse_french_amount(match.group(1))
            if parsed is not None:
                amount = parsed
                break

    # 2) Fallback: any line mentioning SALAIRE and BRUT with an amount
    #    immediately before a euro sign.
    if amount == 0.0:
        for line in lines:
            if 'SALAIRE' in line and 'BRUT' in line:
                match = re.search(r'([\d\s.,]+)\s*€', line)
                if match:
                    parsed = _parse_french_amount(match.group(1), from_end=True)
                    if parsed is not None:
                        amount = parsed
                        break

    # 3) Last resort: rows of the salary table starting with one of two
    #    hard-coded values. The first dotted, comma-free token of the row is
    #    used. NOTE: a later matching row overrides an earlier one.
    if amount == 0.0:
        for line in lines:
            if line.strip().startswith(('2974,64', '3123,36')):
                for part in line.split():
                    if '.' in part and ',' not in part and len(part) > 3:
                        try:
                            amount = float(part)
                            break
                        except ValueError:
                            continue

    return amount


def process_sncf_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Extract the gross salary from every SNCF payslip PDF in a directory.

    Each ``*.pdf`` file in ``directory`` is converted to text with the
    external ``pdftotext -layout`` command. The pay period is derived from
    the file name and the salary amount from the text. A summary is printed
    to stdout.

    Args:
        directory: Directory containing the payslip PDF files.
        output_csv: When true (and at least one payslip was processed), write
            the records to ``sncf_all_transactions.csv`` in ``output_dir``.
        output_dir: Directory for the CSV file; created if missing.

    Returns:
        A list with one dict per successfully converted PDF, with keys
        ``Date``, ``Description``, ``Category``, ``Amount`` and ``Source``.
        ``Amount`` is 0.0 when no salary could be found in the text. Files
        for which ``pdftotext`` fails or is not installed are reported on
        stdout and skipped.
    """
    # Sorted so that the returned list and the CSV rows have a stable order;
    # glob itself returns files in arbitrary order.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)

        try:
            # pdftotext writes UTF-8 by default; decode explicitly so the
            # euro sign survives whatever the locale encoding is.
            result = subprocess.run(
                ['pdftotext', '-layout', pdf_file, '-'],
                capture_output=True, text=True, check=True,
                encoding='utf-8', errors='replace',
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        year, month = extract_month_from_filename(source_name)
        month_name = _MONTH_NAMES[month]

        salary_amount = _extract_salary_amount(result.stdout.split('\n'))
        if salary_amount == 0.0:
            # Still recorded, but make the extraction failure visible.
            print(f"Warning: no salary amount found in {pdf_file}")

        all_transactions.append({
            'Date': f"01/{month_name}/{year}",
            'Description': f"Salaire {month_name} {year}",
            'Category': 'Salary',
            'Amount': salary_amount,
            'Source': source_name,
        })

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'sncf_all_transactions.csv')
        if output_dir:
            # os.makedirs('') raises; an empty output_dir means the cwd.
            os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- SNCF Salary Statements ---")
    print(f"Found {len(pdf_files)} salary statement files")
    total_salary = sum(t['Amount'] for t in all_transactions)
    if total_salary > 0:
        print(f"Total Salary Extracted: €{total_salary:,.2f}")

    return all_transactions
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line options as (flag, add_argument keyword arguments) pairs.
    option_specs = (
        ('--pdf-dir', {'default': '../data/pdf/sncf',
                       'help': 'Directory containing SNCF PDF files'}),
        ('--output-dir', {'default': '../../output/csv',
                          'help': 'Directory to save CSV output files'}),
        ('--csv', {'action': 'store_true',
                   'help': 'Output transaction data to CSV files'}),
    )

    cli = argparse.ArgumentParser(description='Process SNCF salary statements')
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    # Run the extraction over every PDF in the chosen directory.
    process_sncf_pdf_files(
        options.pdf_dir,
        options.csv,
        options.output_dir,
    )