# File listing metadata: 102 lines, 3.3 KiB, Python.
import pandas as pd
|
|
import numpy as np
|
|
import holidays
|
|
import xgboost as xgb
|
|
from sklearn.metrics import mean_absolute_error
|
|
|
|
print("Start training Model 1.5...")

# --- 1. LOAD DATA ---
# Reuse the CSV that was produced earlier; the relative path is resolved
# against the current working directory.
try:
    df = pd.read_csv("training_set_2024_2025.csv")
    # Parse the timestamp column, then use it as a chronologically sorted index.
    df = (
        df.assign(datum_tijd=pd.to_datetime(df['datum_tijd']))
        .set_index('datum_tijd')
        .sort_index()
    )
    print(f"Data geladen: {len(df)} rijen.")
except FileNotFoundError:
    # Tell the user which file is missing, then let the error propagate.
    print("Zorg dat 'training_set_2024_2025.csv' in de map staat.")
    raise

# --- 2. FEATURE ENGINEERING (v1.5) ---
print("Feature Engineering (v1.5) gestart...")

# --- 2a. PUBLIC-HOLIDAY FEATURE ---
# Dutch public-holiday calendar, requested for 2024 and 2025.
nl_holidays = holidays.Netherlands(years=[2024, 2025])
# 1 when the row's timestamp is found in the holiday calendar, otherwise 0.
df['is_feestdag'] = [int(tijdstip in nl_holidays) for tijdstip in df.index]
print("Feestdagen feature toegevoegd.")

# --- 2b. TIME FEATURES (basic) ---
# Calendar components derived from the datetime index. The hour and weekday
# columns are one-hot encoded in the next step; month and day-of-year stay
# numeric.
df = df.assign(
    uur_van_de_dag=df.index.hour,
    dag_van_de_week=df.index.dayofweek,  # 0 = Monday, 6 = Sunday
    maand=df.index.month,
    dag_van_het_jaar=df.index.dayofyear,
)

# --- 2c. ONE-HOT ENCODING ---
# Expand 'dag_van_de_week' and 'uur_van_de_dag' into one indicator column per
# distinct value, named 'dag_<n>' and 'uur_<n>'. The two source columns are
# replaced by their indicator columns.
print("One-Hot Encoding toepassen...")
df = pd.get_dummies(
    df,
    columns=['dag_van_de_week', 'uur_van_de_dag'],
    prefix={'dag_van_de_week': 'dag', 'uur_van_de_dag': 'uur'},
)

# --- 2d. Lag / rolling features ---
# NOTE(review): shift() and rolling(window=N) count rows, not hours. The
# "1u"/"24u"/"3u"/"6u" names therefore assume one row per hour with no gaps
# in the index - confirm against the CSV.

# Price one row and 24 rows before the current row.
df['prijs_1u_geleden'] = df['gemiddelde_prijs'].shift(1)
df['prijs_24u_geleden'] = df['gemiddelde_prijs'].shift(24)

# Mean temperature over the current row and the two rows before it.
# 'temperatuur' is itself used as a feature, so including the current row
# adds no information the model does not already receive.
df['temp_avg_3u'] = df['temperatuur'].rolling(window=3).mean()

# Mean price over the six rows *before* the current row. The shift(1) is
# essential: 'gemiddelde_prijs' is the prediction target, and a rolling mean
# that includes the current row would leak the target into its own feature.
df['prijs_avg_6u'] = df['gemiddelde_prijs'].shift(1).rolling(window=6).mean()

# --- 2e. Clean-up ---
# Drop every row that has a missing value in any column. The 24-row price lag
# alone leaves the first 24 rows incomplete; rows with gaps in the source data
# are dropped as well.
print(f"Rijen vóór opschonen: {len(df)}")
df_clean = df.dropna(axis=0, how='any')
print(f"Rijen ná opschonen: {len(df_clean)}")

# Preview the new features. The substring match on 'dag_' selects the one-hot
# weekday columns and also 'dag_van_het_jaar'.
print("\nVoorbeeld van de nieuwe 'dag' features:")
print(df_clean.filter(like='dag_').head(5))

# --- 3. DEFINE FEATURES (X) AND TARGET (y) ---
TARGET = 'gemiddelde_prijs'
# Every column except the target is used as a feature.
FEATURES = [kolom for kolom in df_clean if kolom != TARGET]

print(f"\nModel wordt getraind met {len(FEATURES)} features.")

X = df_clean.loc[:, FEATURES]
y = df_clean.loc[:, TARGET]

# --- 4. CHRONOLOGICAL SPLIT ---
# Rows before 1 Jan 2025 form the training set; rows from that moment onward
# form the test set.
SPLIT_DATE = '2025-01-01 00:00:00'
train_mask = X.index < SPLIT_DATE
test_mask = X.index >= SPLIT_DATE

X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

print(f"Trainingset: {len(X_train)} rijen")
print(f"Testset: {len(X_test)} rijen")

# --- 5. TRAIN MODEL ---
# Early stopping needs an evaluation set to decide when to stop adding trees.
# Using the test set for that would let the test data choose the model size,
# and the MAE later reported on that same test set would be optimistically
# biased. Instead, hold out the most recent part of the training period as a
# validation set. The data was sorted by timestamp when it was loaded, so a
# positional cut is a chronological cut.
VALIDATION_FRACTION = 0.2
n_fit = int(len(X_train) * (1 - VALIDATION_FRACTION))
X_fit, y_fit = X_train.iloc[:n_fit], y_train.iloc[:n_fit]
X_val, y_val = X_train.iloc[n_fit:], y_train.iloc[n_fit:]

# Up to 1000 boosting rounds at a small learning rate; training stops early
# once the validation metric has not improved for 50 consecutive rounds.
xgb_model_v1_5 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_rounds=50,
)

print("\nModel v1.5 aan het trainen...")
xgb_model_v1_5.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False,  # set to 100 to print progress every 100 rounds
)
print("Training voltooid.")

# --- 6. EVALUATION ---
# Mean absolute error of the model's predictions on the test set
# (rows at or after SPLIT_DATE).
voorspellingen_v1_5 = xgb_model_v1_5.predict(X_test)
mae_v1_5 = mean_absolute_error(y_true=y_test, y_pred=voorspellingen_v1_5)
print(f"\nNieuwe Model (v1.5) MAE: {mae_v1_5:.4f}")

# --- 7. SAVE MODEL ---
# Persist the trained model; the '.json' extension selects XGBoost's JSON
# model format.
MODEL_FILE = 'price_forecast_model_v1_5.json'
xgb_model_v1_5.save_model(fname=MODEL_FILE)
print(f"\nModel 1.5 opgeslagen als: {MODEL_FILE}")