"""Train price-forecast model v1.5 (XGBoost) and save it to disk.

Pipeline, executed top to bottom when the script runs:

1. Load ``training_set_2024_2025.csv`` and index it by ``datum_tijd``.
2. Add holiday, calendar, one-hot and lag/rolling features.
3. Split chronologically on ``SPLIT_DATE``; the tail of the training period
   is held out as a validation set for early stopping.
4. Fit an ``XGBRegressor``, report the MAE on the test period and save the
   model as JSON.

The CSV must contain at least the columns ``datum_tijd``,
``gemiddelde_prijs`` (the target) and ``temperatuur``.
"""
import pandas as pd
import numpy as np
import holidays
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

print("Start training Model 1.5...")

# --- 1. LOAD DATA ---
# Only the read itself can raise FileNotFoundError, so only that call is
# guarded; the hint is printed and the original exception is re-raised.
try:
    df = pd.read_csv("training_set_2024_2025.csv")
except FileNotFoundError:
    print("Zorg dat 'training_set_2024_2025.csv' in de map staat.")
    raise

df['datum_tijd'] = pd.to_datetime(df['datum_tijd'])
# Sort chronologically: the lag/rolling features and the split below all
# depend on row order.
df = df.set_index('datum_tijd').sort_index()
print(f"Data geladen: {len(df)} rijen.")

# --- 2. FEATURE ENGINEERING (v1.5) ---
print("Feature Engineering (v1.5) gestart...")

# --- 2a. HOLIDAY FEATURE ---
# 1 when the row's timestamp falls on a Dutch public holiday, else 0.
nl_holidays = holidays.Netherlands(years=[2024, 2025])
df['is_feestdag'] = df.index.to_series().apply(
    lambda ts: 1 if ts in nl_holidays else 0
)
print("Feestdagen feature toegevoegd.")

# --- 2b. CALENDAR FEATURES ---
df['uur_van_de_dag'] = df.index.hour
df['dag_van_de_week'] = df.index.dayofweek  # 0 = Monday, 6 = Sunday
df['maand'] = df.index.month
df['dag_van_het_jaar'] = df.index.dayofyear

# --- 2c. ONE-HOT ENCODING ---
# Replace 'dag_van_de_week' and 'uur_van_de_dag' with one indicator column
# per value ('dag_<n>' / 'uur_<n>') so they are treated as categories
# rather than ordered numbers.
print("One-Hot Encoding toepassen...")
df = pd.get_dummies(
    df,
    columns=['dag_van_de_week', 'uur_van_de_dag'],
    prefix=['dag', 'uur'],
)

# --- 2d. LAG / ROLLING FEATURES ---
# NOTE(review): shift(n) moves by n ROWS, not n hours. The feature names
# assume one row per hour without gaps or duplicates - confirm for the CSV.
df['prijs_1u_geleden'] = df['gemiddelde_prijs'].shift(1)
df['prijs_24u_geleden'] = df['gemiddelde_prijs'].shift(24)
df['temp_avg_3u'] = df['temperatuur'].rolling(window=3).mean()
# The rolling window must not contain the current row: 'gemiddelde_prijs' is
# the target, so an unshifted rolling mean leaks the value being predicted
# into its own feature. Shift by one row first so only the six previous
# prices are averaged.
df['prijs_avg_6u'] = (
    df['gemiddelde_prijs'].shift(1).rolling(window=6).mean()
)

# --- 2e. CLEAN UP ---
# The lag/rolling features leave NaN in the first rows; the 24-row lag is the
# longest, so 24 rows are dropped if the CSV itself has no missing values.
print(f"Rijen vóór opschonen: {len(df)}")
df_clean = df.dropna()
print(f"Rijen ná opschonen: {len(df_clean)}")

# Show a sample of the new features (every column whose name contains 'dag_').
print("\nVoorbeeld van de nieuwe 'dag' features:")
print(df_clean.filter(like='dag_').head())

# --- 3. DEFINE FEATURES (X) AND TARGET (y) ---
TARGET = 'gemiddelde_prijs'
# Every column except the target is used as a feature.
# NOTE(review): this includes any extra column present in the CSV - confirm
# that all of them are numeric and legitimately known at prediction time.
FEATURES = [col for col in df_clean.columns if col != TARGET]

print(f"\nModel wordt getraind met {len(FEATURES)} features.")

X = df_clean[FEATURES]
y = df_clean[TARGET]

# --- 4. CHRONOLOGICAL SPLIT ---
# Everything before SPLIT_DATE is training data, the rest is the test set.
SPLIT_DATE = '2025-01-01 00:00:00'
train_mask = X.index < SPLIT_DATE
test_mask = X.index >= SPLIT_DATE

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

if len(X_train) < 2:
    raise ValueError(
        f"Te weinig data vóór {SPLIT_DATE} om te trainen en te valideren."
    )
if len(X_test) == 0:
    raise ValueError(f"Geen testdata vanaf {SPLIT_DATE}.")

# Early stopping needs an evaluation set. Using the test set for that would
# tune the number of trees on the very data the MAE is reported on, making
# the score optimistic. Hold out the most recent part of the training period
# instead, so the test set stays unseen until the final evaluation.
VALIDATION_FRACTION = 0.1
n_val = max(1, int(len(X_train) * VALIDATION_FRACTION))
X_fit, y_fit = X_train.iloc[:-n_val], y_train.iloc[:-n_val]
X_val, y_val = X_train.iloc[-n_val:], y_train.iloc[-n_val:]

print(f"Trainingset: {len(X_train)} rijen")
print(f"  waarvan validatie (early stopping): {len(X_val)} rijen")
print(f"Testset: {len(X_test)} rijen")

# --- 5. TRAIN MODEL ---
xgb_model_v1_5 = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_rounds=50,
)

print("\nModel v1.5 aan het trainen...")
xgb_model_v1_5.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False,  # set to e.g. 100 to see training progress
)
print("Training voltooid.")

# --- 6. EVALUATION ---
# The test period was used neither for fitting nor for early stopping.
voorspellingen_v1_5 = xgb_model_v1_5.predict(X_test)
mae_v1_5 = mean_absolute_error(y_test, voorspellingen_v1_5)
print(f"\nNieuwe Model (v1.5) MAE: {mae_v1_5:.4f}")

# --- 7. SAVE MODEL ---
MODEL_FILE = 'price_forecast_model_v1_5.json'
xgb_model_v1_5.save_model(MODEL_FILE)
print(f"\nModel 1.5 opgeslagen als: {MODEL_FILE}")