- Fix SNCF NET PAYÉ EN EUROS extraction to correctly parse MENSUEL line - Extract month/year from PDF content instead of filename - Add new Revolut CSV processor to aggregate account statements - Organize Revolut data files into data/csv/revolut/ - Clean up redundant scripts and reports
129 lines
5.3 KiB
Python
Executable File
129 lines
5.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import subprocess
|
|
import re
|
|
import csv
|
|
import os
|
|
import glob
|
|
from collections import defaultdict
|
|
|
|
def categorize_laposte_transaction(description):
    """Map a La Poste transaction description to a category label.

    The description is lower-cased and tested against keyword groups in a
    fixed priority order; the first group with a keyword occurring as a
    substring decides the category.

    Args:
        description: Free-text transaction label.

    Returns:
        One of 'Transfer', 'Cash Withdrawal', 'Card Payment', 'Bank Fees',
        'Deductions', 'Tax', 'Utilities', or 'Other' when nothing matches.
    """
    # Order matters: the first matching group wins. In particular
    # 'cotisation' is tested before the shorter 'cotis', so the latter only
    # catches descriptions that do not contain the full word.
    keyword_rules = (
        (('virement',), 'Transfer'),
        (('retrait',), 'Cash Withdrawal'),
        (('carte', 'paiement'), 'Card Payment'),
        (('frais', 'cotisation'), 'Bank Fees'),
        (('cotis',), 'Deductions'),
        (('impot',), 'Tax'),
        (('edf', 'bouygues', 'orange'), 'Utilities'),
    )

    text = description.lower()
    for keywords, label in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return label
    return 'Other'
|
|
|
|
def _parse_laposte_amount(cells):
    """Return the first cell that parses as a number, or 0 when none does.

    Each cell has the '¤' and '€' symbols and plain spaces removed and ','
    replaced by '.' before conversion with ``float``.
    """
    for cell in cells:
        cleaned = cell.strip().replace('¤', '').replace('€', '').replace(' ', '')
        # Cheap pre-filter: the cell must at least start like a number.
        if not re.match(r'[\d.,]+', cleaned):
            continue
        try:
            return float(cleaned.replace(',', '.'))
        except ValueError:
            # e.g. "12RUE" or "1.234.56": not an amount, try the next cell.
            continue
    return 0


def _extract_laposte_transactions(content, source):
    """Extract transaction rows from the text of one statement.

    Rows are only considered after a line containing 'Opérations' has been
    seen. Lines containing 'Date', 'Total' or 'Page' are skipped. A row must
    start (after optional whitespace) with a ``dd/dd`` date and split into at
    least three columns separated by runs of two or more whitespace
    characters. Rows whose parsed amount is not strictly positive are dropped.

    Args:
        content: Text of the statement, as produced by ``pdftotext -layout``.
        source: Value stored under the 'Source' key of every row.

    Returns:
        A list of dicts with the keys 'Date', 'Description', 'Category',
        'Amount' and 'Source'.
    """
    transactions = []
    in_transaction_section = False

    for line in content.split('\n'):
        # The line announcing the transaction table switches parsing on.
        if 'Opérations' in line:
            in_transaction_section = True
            continue

        if not in_transaction_section:
            continue

        # Skip table headers and page footers.
        if 'Date' in line or 'Total' in line or 'Page' in line:
            continue

        if not re.match(r'\s*\d{2}/\d{2}', line):
            continue

        # Strip before splitting: the date pattern tolerates leading
        # whitespace, and splitting an indented line on whitespace runs would
        # otherwise yield an empty first column and shift the date into the
        # description slot.
        parts = re.split(r'\s{2,}', line.strip())
        if len(parts) < 3:
            continue

        amount = _parse_laposte_amount(parts[2:])
        # Only keep rows for which a valid amount was found.
        if amount <= 0:
            continue

        description = parts[1].strip()
        transactions.append({
            'Date': parts[0].strip(),
            'Description': description,
            'Category': categorize_laposte_transaction(description),
            'Amount': amount,
            'Source': source,
        })

    return transactions


def process_laposte_pdf_files(directory, output_csv=False, output_dir='../../output/csv'):
    """Process La Poste account PDF files and collect their transactions.

    Every ``*.pdf`` file directly inside ``directory`` is converted to text
    with the external ``pdftotext -layout`` command and scanned for
    transaction rows. A file whose conversion fails (non-zero exit status or
    ``pdftotext`` not found) is reported on stdout and skipped.

    Args:
        directory: Directory containing the PDF statements.
        output_csv: When true and at least one transaction was found, write
            all transactions to ``laposte_all_transactions.csv`` in
            ``output_dir`` (the directory is created if needed).
        output_dir: Destination directory for the CSV file.

    Returns:
        A list of dicts with the keys 'Date', 'Description', 'Category',
        'Amount' and 'Source' (the PDF file's base name).
    """
    # glob returns matches in arbitrary order; sort so that the result and
    # the CSV are reproducible from one run to the next.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    all_transactions = []

    for pdf_file in pdf_files:
        try:
            # Convert the PDF to text on stdout, preserving the page layout.
            result = subprocess.run(['pdftotext', '-layout', pdf_file, '-'],
                                    capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

        all_transactions.extend(
            _extract_laposte_transactions(result.stdout, os.path.basename(pdf_file))
        )

    # Output CSV if requested
    if output_csv and all_transactions:
        csv_file = os.path.join(output_dir, 'laposte_all_transactions.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Date', 'Description', 'Category', 'Amount', 'Source']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_transactions)
        print(f"\nTransaction data saved to {csv_file}")

    print("--- La Poste Account Statements ---")
    print(f"Found {len(pdf_files)} account statement files")
    print(f"Processed {len(all_transactions)} transactions")

    return all_transactions
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line entry point: read the input/output locations and the CSV
    # switch from the arguments, then process every PDF in the directory.
    cli = argparse.ArgumentParser(
        description='Process La Poste (CCP) account statements'
    )
    cli.add_argument(
        '--pdf-dir',
        default='../data/pdf/la_poste',
        help='Directory containing La Poste PDF files',
    )
    cli.add_argument(
        '--output-dir',
        default='../../output/csv',
        help='Directory to save CSV output files',
    )
    cli.add_argument(
        '--csv',
        action='store_true',
        help='Output transaction data to CSV files',
    )
    options = cli.parse_args()

    process_laposte_pdf_files(
        options.pdf_dir,
        output_csv=options.csv,
        output_dir=options.output_dir,
    )
|