661 lines
193 KiB
Plaintext
661 lines
193 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "6487331d-ee31-46b1-bc35-a9c215e75de4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Collecting xgboost\n",
|
|
" Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)\n",
|
|
"Requirement already satisfied: matplotlib in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (3.10.3)\n",
|
|
"Requirement already satisfied: numpy in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from xgboost) (2.2.4)\n",
|
|
"Requirement already satisfied: scipy in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from xgboost) (1.15.2)\n",
|
|
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (1.3.2)\n",
|
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (0.12.1)\n",
|
|
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (4.58.5)\n",
|
|
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (1.4.8)\n",
|
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (25.0)\n",
|
|
"Requirement already satisfied: pillow>=8 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (11.2.1)\n",
|
|
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (3.2.3)\n",
|
|
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from matplotlib) (2.9.0.post0)\n",
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)\n",
|
|
"Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)\n",
|
|
" ---------------------------------------- 0.0/72.0 MB ? eta -:--:--\n",
|
|
" ---------------------------------------- 0.8/72.0 MB 13.0 MB/s eta 0:00:06\n",
|
|
" --- ------------------------------------ 6.8/72.0 MB 25.0 MB/s eta 0:00:03\n",
|
|
" --------- ------------------------------ 16.3/72.0 MB 33.4 MB/s eta 0:00:02\n",
|
|
" -------------- ------------------------- 26.2/72.0 MB 38.0 MB/s eta 0:00:02\n",
|
|
" ------------------- -------------------- 35.9/72.0 MB 39.9 MB/s eta 0:00:01\n",
|
|
" ------------------------- -------------- 45.1/72.0 MB 40.1 MB/s eta 0:00:01\n",
|
|
" ----------------------------- ---------- 53.5/72.0 MB 40.4 MB/s eta 0:00:01\n",
|
|
" ---------------------------------- ----- 62.7/72.0 MB 40.6 MB/s eta 0:00:01\n",
|
|
" --------------------------------------- 71.0/72.0 MB 40.8 MB/s eta 0:00:01\n",
|
|
" ---------------------------------------- 72.0/72.0 MB 38.5 MB/s 0:00:02\n",
|
|
"Installing collected packages: xgboost\n",
|
|
"Successfully installed xgboost-3.1.1\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install xgboost matplotlib"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "9cfedaeb-32a4-4eec-9e6a-22716054c27b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Start verwerking voor training_set_2024_2025.csv...\n",
|
|
"Stap 1: Feature Engineering...\n",
|
|
"Data opgeschoond. 11554 rijen overgebleven.\n",
|
|
"Stap 2: Definiëren Features (X) en Target (y)...\n",
|
|
"Stap 3: Data splitsen op 2025-01-01 00:00:00...\n",
|
|
"Trainingset: 4001 rijen\n",
|
|
"Testset: 7553 rijen\n",
|
|
"Stap 4: XGBoost Model trainen...\n",
|
|
"Model training voltooid.\n",
|
|
"Stap 5: Model evalueren...\n",
|
|
"\n",
|
|
"--- RESULTAAT ---\n",
|
|
"Gemiddelde Fout (MAE): 0.0122\n",
|
|
"Dit betekent dat het model er gemiddeld 1.22 cent naast zat.\n",
|
|
"Stap 6: Plot genereren...\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1500x600 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Stap 7: Belangrijkste Features tonen...\n",
|
|
"\n",
|
|
"Belangrijkste features volgens het model:\n",
|
|
"prijs_1u_geleden 0.771534\n",
|
|
"prijs_24u_geleden 0.037891\n",
|
|
"uur_van_de_dag 0.027163\n",
|
|
"temp_avg_3u 0.021784\n",
|
|
"prijs_avg_6u 0.020315\n",
|
|
"dag_van_de_week 0.016608\n",
|
|
"wind_snelheid 0.016497\n",
|
|
"luchtdruk 0.014006\n",
|
|
"maand 0.013830\n",
|
|
"dag_van_het_jaar 0.012987\n",
|
|
"dtype: float32\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1000x600 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"--- Analyse voltooid ---\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import xgboost as xgb\n",
|
|
"from sklearn.metrics import mean_absolute_error\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"# --- CONFIGURATIE ---\n",
|
|
"bestandsnaam = 'training_set_2024_2025.csv'\n",
|
|
"split_datum = '2025-01-01 00:00:00'\n",
|
|
"TARGET = 'gemiddelde_prijs'\n",
|
|
"\n",
|
|
"print(f\"Start verwerking voor {bestandsnaam}...\")\n",
|
|
"\n",
|
|
"# --- STAP 1: DATA LADEN & FEATURE ENGINEERING ---\n",
|
|
"try:\n",
|
|
" df = pd.read_csv(bestandsnaam)\n",
|
|
"except FileNotFoundError:\n",
|
|
" print(f\"FOUT: Bestand '{bestandsnaam}' niet gevonden.\")\n",
|
|
" # Stop de rest van het script als het bestand mist\n",
|
|
" raise\n",
|
|
"\n",
|
|
"print(\"Stap 1: Feature Engineering...\")\n",
|
|
"df['datum_tijd'] = pd.to_datetime(df['datum_tijd'])\n",
|
|
"df = df.set_index('datum_tijd').sort_index() # Sorteer op datum en maak het de index\n",
|
|
"\n",
|
|
"# Tijd-features\n",
|
|
"df['uur_van_de_dag'] = df.index.hour\n",
|
|
"df['dag_van_de_week'] = df.index.dayofweek\n",
|
|
"df['maand'] = df.index.month\n",
|
|
"df['dag_van_het_jaar'] = df.index.dayofyear\n",
|
|
"\n",
|
|
"# Lag-features (historie)\n",
|
|
"df['prijs_1u_geleden'] = df[TARGET].shift(1)\n",
|
|
"df['prijs_24u_geleden'] = df[TARGET].shift(24)\n",
|
|
"\n",
|
|
"# Rolling-features (trends)\n",
|
|
"df['temp_avg_3u'] = df['temperatuur'].rolling(window=3).mean()\n",
|
|
"df['prijs_avg_6u'] = df[TARGET].rolling(window=6).mean()\n",
|
|
"\n",
|
|
"# Verwijder NaNs die door .shift() en .rolling() zijn ontstaan\n",
|
|
"df_clean = df.dropna()\n",
|
|
"print(f\"Data opgeschoond. {len(df_clean)} rijen overgebleven.\")\n",
|
|
"\n",
|
|
"# --- STAP 2: DEFINIEER FEATURES (X) en TARGET (y) ---\n",
|
|
"print(\"Stap 2: Definiëren Features (X) en Target (y)...\")\n",
|
|
"FEATURES = [col for col in df_clean.columns if col not in [TARGET]]\n",
|
|
"\n",
|
|
"X = df_clean[FEATURES]\n",
|
|
"y = df_clean[TARGET]\n",
|
|
"\n",
|
|
"# --- STAP 3: CHRONOLOGISCHE TRAIN-TEST SPLIT ---\n",
|
|
"print(f\"Stap 3: Data splitsen op {split_datum}...\")\n",
|
|
"train_mask = X.index < split_datum\n",
|
|
"test_mask = X.index >= split_datum\n",
|
|
"\n",
|
|
"X_train, y_train = X[train_mask], y[train_mask]\n",
|
|
"X_test, y_test = X[test_mask], y[test_mask]\n",
|
|
"\n",
|
|
"print(f\"Trainingset: {X_train.shape[0]} rijen\")\n",
|
|
"print(f\"Testset: {X_test.shape[0]} rijen\")\n",
|
|
"\n",
|
|
"# --- STAP 4: MODEL TRAINEN (XGBoost) ---\n",
|
|
"print(\"Stap 4: XGBoost Model trainen...\")\n",
|
|
"# Dit is het model dat we wilden gebruiken\n",
|
|
"xgb_model = xgb.XGBRegressor(\n",
|
|
" n_estimators=1000, # Max 1000 \"bomen\"\n",
|
|
" learning_rate=0.01, # Leer langzaam\n",
|
|
" early_stopping_rounds=50 # Stop als de score 50 rondes niet verbetert\n",
|
|
")\n",
|
|
"\n",
|
|
"# Train het model en gebruik de testset om 'early stopping' toe te passen\n",
|
|
"xgb_model.fit(\n",
|
|
" X_train, y_train,\n",
|
|
" eval_set=[(X_test, y_test)],\n",
|
|
" verbose=False # Zet op True (of 100) als je de voortgang wilt zien\n",
|
|
")\n",
|
|
"print(\"Model training voltooid.\")\n",
|
|
"\n",
|
|
"# --- STAP 5: EVALUATIE ---\n",
|
|
"print(\"Stap 5: Model evalueren...\")\n",
|
|
"# Genereer voorspellingen op de testset\n",
|
|
"voorspellingen = xgb_model.predict(X_test)\n",
|
|
"\n",
|
|
"# Bereken de fout\n",
|
|
"mae = mean_absolute_error(y_test, voorspellingen)\n",
|
|
"print(f\"\\n--- RESULTAAT ---\")\n",
|
|
"print(f\"Gemiddelde Fout (MAE): {mae:.4f}\")\n",
|
|
"print(f\"Dit betekent dat het model er gemiddeld {mae*100:.2f} cent naast zat.\")\n",
|
|
"\n",
|
|
"# Maak een dataframe met de resultaten voor de plot\n",
|
|
"resultaten = pd.DataFrame({'Echte_Prijs': y_test, 'Voorspelde_Prijs': voorspellingen})\n",
|
|
"\n",
|
|
"# --- STAP 6: VISUALISATIE (PLOT) ---\n",
|
|
"print(\"Stap 6: Plot genereren...\")\n",
|
|
"# Plot de eerste week van de voorspellingen\n",
|
|
"resultaten.head(168).plot(figsize=(15, 6)) # 7 dagen * 24 uur = 168\n",
|
|
"plt.title('Voorspelling vs. Echte Prijzen (Eerste Week van 2025)')\n",
|
|
"plt.ylabel('Gemiddelde Prijs')\n",
|
|
"plt.grid(True)\n",
|
|
"plt.show() # Dit toont de grafiek direct in je notebook\n",
|
|
"\n",
|
|
"# --- STAP 7: FEATURE IMPORTANCE ---\n",
|
|
"print(\"Stap 7: Belangrijkste Features tonen...\")\n",
|
|
"# Wat vond het model het belangrijkst?\n",
|
|
"feature_importance = pd.Series(xgb_model.feature_importances_, index=FEATURES).sort_values(ascending=False)\n",
|
|
"\n",
|
|
"print(\"\\nBelangrijkste features volgens het model:\")\n",
|
|
"print(feature_importance.head(10)) # Top 10\n",
|
|
"\n",
|
|
"# Plot de feature importance\n",
|
|
"feature_importance.head(10).sort_values(ascending=True).plot(\n",
|
|
" kind='barh', \n",
|
|
" figsize=(10, 6), \n",
|
|
" title='Top 10 Belangrijkste Features'\n",
|
|
")\n",
|
|
"plt.xlabel('Belangrijkheid (Importance)')\n",
|
|
"plt.show() # Toont de grafiek\n",
|
|
"\n",
|
|
"print(\"\\n--- Analyse voltooid ---\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "b7fa76a8-9cea-4209-9f4d-80dada3e812b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model succesvol opgeslagen als: price_forecast_model.json\n",
|
|
"Je kunt nu het 'voorspel_prijzen.py' script uitvoeren.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Definieer de bestandsnaam\n",
|
|
"model_bestandsnaam = 'price_forecast_model.json'\n",
|
|
"\n",
|
|
"# Sla het model op\n",
|
|
"# (De variabele 'xgb_model' bestaat nog van de vorige cel die je hebt gerund)\n",
|
|
"xgb_model.save_model(model_bestandsnaam)\n",
|
|
"\n",
|
|
"print(f\"Model succesvol opgeslagen als: {model_bestandsnaam}\")\n",
|
|
"print(\"Je kunt nu het 'voorspel_prijzen.py' script uitvoeren.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fe34a15b-0df5-43df-91a6-c66864992f93",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7221a2bd-fcbb-433f-b5aa-c76b010ab581",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Laden van model: price_forecast_model.json\n",
|
|
"Model succesvol geladen.\n",
|
|
"Simuleren van recente historische data...\n",
|
|
"Simuleren van toekomstige weersvoorspelling (volgende 24u)...\n",
|
|
"Feature Engineering toepassen op nieuwe data...\n",
|
|
"\n",
|
|
"--- Voorspelling wordt gemaakt ---\n",
|
|
" Voorspelde_Prijs\n",
|
|
"2025-11-12 23:31:02.684646 0.289561\n",
|
|
"2025-11-13 00:31:02.684646 0.304078\n",
|
|
"2025-11-13 01:31:02.684646 0.299526\n",
|
|
"2025-11-13 02:31:02.684646 0.311754\n",
|
|
"2025-11-13 03:31:02.684646 0.303218\n",
|
|
"2025-11-13 04:31:02.684646 0.305205\n",
|
|
"2025-11-13 05:31:02.684646 0.327496\n",
|
|
"2025-11-13 06:31:02.684646 0.317999\n",
|
|
"2025-11-13 07:31:02.684646 0.315852\n",
|
|
"2025-11-13 08:31:02.684646 0.311190\n",
|
|
"2025-11-13 09:31:02.684646 0.303526\n",
|
|
"2025-11-13 10:31:02.684646 0.303281\n",
|
|
"2025-11-13 11:31:02.684646 0.300676\n",
|
|
"2025-11-13 12:31:02.684646 0.300875\n",
|
|
"2025-11-13 13:31:02.684646 0.309934\n",
|
|
"2025-11-13 14:31:02.684646 0.316070\n",
|
|
"2025-11-13 15:31:02.684646 0.311042\n",
|
|
"2025-11-13 16:31:02.684646 0.327661\n",
|
|
"2025-11-13 17:31:02.684646 0.311183\n",
|
|
"2025-11-13 18:31:02.684646 0.309919\n",
|
|
"2025-11-13 19:31:02.684646 0.314924\n",
|
|
"2025-11-13 20:31:02.684646 0.301626\n",
|
|
"2025-11-13 21:31:02.684646 0.289536\n",
|
|
"2025-11-13 22:31:02.684646 0.297345\n",
|
|
"\n",
|
|
"Klaar. Je kunt dit script elke keer runnen als je een nieuwe weersvoorspelling hebt.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import xgboost as xgb\n",
|
|
"\n",
|
|
"# --- CONFIGURATIE ---\n",
|
|
"MODEL_FILE = 'price_forecast_model.json'\n",
|
|
"TARGET = 'gemiddelde_prijs'\n",
|
|
"\n",
|
|
"# Dit zijn de features die het model MOET hebben (kopieer van je trainingsscript)\n",
|
|
"FEATURES = [\n",
|
|
" 'temperatuur', 'gevoelstemperatuur', 'neerslag', 'wind_richting', \n",
|
|
" 'wind_snelheid', 'bewolking', 'luchtdruk', 'luchtvochtigheid', \n",
|
|
" 'uur_van_de_dag', 'dag_van_de_week', 'maand', 'dag_van_het_jaar', \n",
|
|
" 'prijs_1u_geleden', 'prijs_24u_geleden', 'temp_avg_3u', 'prijs_avg_6u'\n",
|
|
"]\n",
|
|
"\n",
|
|
"print(f\"Laden van model: {MODEL_FILE}\")\n",
|
|
"# --- 1. Laad het getrainde model ---\n",
|
|
"model = xgb.XGBRegressor()\n",
|
|
"model.load_model(MODEL_FILE)\n",
|
|
"print(\"Model succesvol geladen.\")\n",
|
|
"\n",
|
|
"\n",
|
|
"# --- 2. Verzamel de benodigde data ---\n",
|
|
"# Dit is de data die je \"live\" moet ophalen.\n",
|
|
"# We simuleren het nu, maar in de praktijk haal je dit uit je database of een API.\n",
|
|
"\n",
|
|
"# Je hebt de data van de afgelopen 24 uur nodig voor de Lag/Rolling features\n",
|
|
"# (HIER GEBRUIKEN WE NEP-DATA TER ILLUSTRATIE)\n",
|
|
"print(\"Simuleren van recente historische data...\")\n",
|
|
"historische_data = {\n",
|
|
" 'datum_tijd': pd.to_datetime(pd.date_range(end=pd.Timestamp.now(), periods=24, freq='h')),\n",
|
|
" TARGET: np.random.uniform(0.15, 0.35, 24),\n",
|
|
" 'temperatuur': np.random.uniform(5, 15, 24)\n",
|
|
"}\n",
|
|
"hist_df = pd.DataFrame(historische_data).set_index('datum_tijd')\n",
|
|
"\n",
|
|
"\n",
|
|
"# ==============================================================================\n",
|
|
"# === HIER VUL JE DE ECHTE WEERSVOORSPELLING IN ===\n",
|
|
"# ==============================================================================\n",
|
|
"# Dit is de *toekomstige* weersvoorspelling die je van een API (KNMI, OpenWeather) haalt.\n",
|
|
"# We simuleren nu een voorspelling voor de komende 24 uur.\n",
|
|
"print(\"Simuleren van toekomstige weersvoorspelling (volgende 24u)...\")\n",
|
|
"\n",
|
|
"toekomstige_datums = pd.date_range(start=pd.Timestamp.now() + pd.Timedelta(hours=1), periods=24, freq='h')\n",
|
|
"\n",
|
|
"simulated_forecast_df = pd.DataFrame({\n",
|
|
" 'datum_tijd': toekomstige_datums,\n",
|
|
" 'temperatuur': [10.1, 10.0, 9.8, 9.5, 9.2, 9.0, 8.8, 9.1, 10.2, 11.5, 12.0, 12.5, 13.0, 12.8, 12.0, 11.5, 11.0, 10.5, 10.2, 10.0, 9.8, 9.6, 9.4, 9.2],\n",
|
|
" 'gevoelstemperatuur': [7.1, 7.0, 6.8, 6.5, 6.2, 6.0, 5.8, 6.1, 7.2, 8.5, 9.0, 9.5, 10.0, 9.8, 9.0, 8.5, 8.0, 7.5, 7.2, 7.0, 6.8, 6.6, 6.4, 6.2],\n",
|
|
" 'neerslag': [0.0]*24,\n",
|
|
" 'wind_richting': [210]*24,\n",
|
|
" 'wind_snelheid': [18.0]*24,\n",
|
|
" 'bewolking': [80]*24,\n",
|
|
" 'luchtdruk': [1015]*24,\n",
|
|
" 'luchtvochtigheid': [90]*24,\n",
|
|
"}).set_index('datum_tijd')\n",
|
|
"\n",
|
|
"\n",
|
|
"# --- 3. Combineer historie en toekomst voor Feature Engineering ---\n",
|
|
"# We hebben de staart van de historie nodig om de 'lag' en 'rolling' features \n",
|
|
"# voor de *eerste* toekomstige uren te berekenen.\n",
|
|
"combined_df = pd.concat([hist_df, simulated_forecast_df])\n",
|
|
"\n",
|
|
"# --- 4. Pas EXACT dezelfde Feature Engineering toe ---\n",
|
|
"print(\"Feature Engineering toepassen op nieuwe data...\")\n",
|
|
"# Tijd-features\n",
|
|
"combined_df['uur_van_de_dag'] = combined_df.index.hour\n",
|
|
"combined_df['dag_van_de_week'] = combined_df.index.dayofweek\n",
|
|
"combined_df['maand'] = combined_df.index.month\n",
|
|
"combined_df['dag_van_het_jaar'] = combined_df.index.dayofyear\n",
|
|
"\n",
|
|
"# Lag-features (historie)\n",
|
|
"combined_df['prijs_1u_geleden'] = combined_df[TARGET].shift(1)\n",
|
|
"combined_df['prijs_24u_geleden'] = combined_df[TARGET].shift(24)\n",
|
|
"\n",
|
|
"# Rolling-features (trends)\n",
|
|
"combined_df['temp_avg_3u'] = combined_df['temperatuur'].rolling(window=3).mean()\n",
|
|
"combined_df['prijs_avg_6u'] = combined_df[TARGET].rolling(window=6).mean()\n",
|
|
"\n",
|
|
"# --- 5. Selecteer de data die we willen voorspellen ---\n",
|
|
"# We willen alleen de toekomstige rijen voorspellen.\n",
|
|
"# We pakken alleen de rijen waarvoor we een voorspelling willen doen (de toekomst)\n",
|
|
"X_toekomst = combined_df.loc[toekomstige_datums]\n",
|
|
"\n",
|
|
"# Check of alle benodigde kolommen aanwezig zijn\n",
|
|
"X_voorspelling_input = X_toekomst[FEATURES]\n",
|
|
"\n",
|
|
"# BELANGRIJK: De eerste paar rijen kunnen NaNs hebben (van de rolling features)\n",
|
|
"# We vullen die hier op een simpele manier (forward fill)\n",
|
|
"X_voorspelling_input = X_voorspelling_input.ffill()\n",
|
|
"\n",
|
|
"# --- 6. Maak de Voorspelling! ---\n",
|
|
"print(\"\\n--- Voorspelling wordt gemaakt ---\")\n",
|
|
"voorspelde_prijzen = model.predict(X_voorspelling_input)\n",
|
|
"\n",
|
|
"# --- 7. Toon de resultaten ---\n",
|
|
"resultaat = pd.DataFrame({\n",
|
|
" 'Voorspelde_Prijs': voorspelde_prijzen\n",
|
|
"}, index=toekomstige_datums)\n",
|
|
"\n",
|
|
"print(resultaat)\n",
|
|
"\n",
|
|
"print(\"\\nKlaar. Je kunt dit script elke keer runnen als je een nieuwe weersvoorspelling hebt.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "fb9a1193-745d-4cd7-afa6-6909a2136b4c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Collecting holidays\n",
|
|
" Downloading holidays-0.84-py3-none-any.whl.metadata (50 kB)\n",
|
|
"Requirement already satisfied: python-dateutil in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from holidays) (2.9.0.post0)\n",
|
|
"Requirement already satisfied: six>=1.5 in c:\\users\\markk\\appdata\\roaming\\python\\python313\\site-packages (from python-dateutil->holidays) (1.17.0)\n",
|
|
"Downloading holidays-0.84-py3-none-any.whl (1.3 MB)\n",
|
|
" ---------------------------------------- 0.0/1.3 MB ? eta -:--:--\n",
|
|
" ---------------------------------------- 1.3/1.3 MB 12.5 MB/s 0:00:00\n",
|
|
"Installing collected packages: holidays\n",
|
|
"Successfully installed holidays-0.84\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install holidays"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "f15f6814-aa0e-402d-afd4-c52c8a8d16ae",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Start training Model 1.5...\n",
|
|
"Data geladen: 11578 rijen.\n",
|
|
"Feature Engineering (v1.5) gestart...\n",
|
|
"Feestdagen feature toegevoegd.\n",
|
|
"One-Hot Encoding toepassen...\n",
|
|
"Rijen vóór opschonen: 11578\n",
|
|
"Rijen ná opschonen: 11554\n",
|
|
"\n",
|
|
"Voorbeeld van de nieuwe 'dag' features:\n",
|
|
" dag_van_het_jaar dag_0 dag_1 dag_2 dag_3 dag_4 \\\n",
|
|
"datum_tijd \n",
|
|
"2024-07-18 00:00:00 200 False False False True False \n",
|
|
"2024-07-18 01:00:00 200 False False False True False \n",
|
|
"2024-07-18 02:00:00 200 False False False True False \n",
|
|
"2024-07-18 03:00:00 200 False False False True False \n",
|
|
"2024-07-18 04:00:00 200 False False False True False \n",
|
|
"\n",
|
|
" dag_5 dag_6 \n",
|
|
"datum_tijd \n",
|
|
"2024-07-18 00:00:00 False False \n",
|
|
"2024-07-18 01:00:00 False False \n",
|
|
"2024-07-18 02:00:00 False False \n",
|
|
"2024-07-18 03:00:00 False False \n",
|
|
"2024-07-18 04:00:00 False False \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import holidays\n",
|
|
"import xgboost as xgb\n",
|
|
"from sklearn.metrics import mean_absolute_error\n",
|
|
"\n",
|
|
"print(\"Start training Model 1.5...\")\n",
|
|
"\n",
|
|
"# --- 1. DATA LADEN ---\n",
|
|
"# We gebruiken de CSV die we al hadden\n",
|
|
"try:\n",
|
|
" df = pd.read_csv(\"training_set_2024_2025.csv\")\n",
|
|
" df['datum_tijd'] = pd.to_datetime(df['datum_tijd'])\n",
|
|
" df = df.set_index('datum_tijd').sort_index()\n",
|
|
" print(f\"Data geladen: {len(df)} rijen.\")\n",
|
|
"except FileNotFoundError:\n",
|
|
" print(\"Zorg dat 'training_set_2024_2025.csv' in de map staat.\")\n",
|
|
" raise\n",
|
|
"\n",
|
|
"# --- 2. FEATURE ENGINEERING (v1.5) ---\n",
|
|
"print(\"Feature Engineering (v1.5) gestart...\")\n",
|
|
"\n",
|
|
"# --- 2a. FEESTDAGEN FEATURE ---\n",
|
|
"# Maak een lijst van Nederlandse feestdagen voor de relevante jaren\n",
|
|
"nl_holidays = holidays.Netherlands(years=[2024, 2025])\n",
|
|
"df['is_feestdag'] = df.index.to_series().apply(lambda x: 1 if x in nl_holidays else 0)\n",
|
|
"print(\"Feestdagen feature toegevoegd.\")\n",
|
|
"\n",
|
|
"# --- 2b. TIJD-FEATURES (Basis) ---\n",
|
|
"df['uur_van_de_dag'] = df.index.hour\n",
|
|
"df['dag_van_de_week'] = df.index.dayofweek # 0=Maandag, 6=Zondag\n",
|
|
"df['maand'] = df.index.month\n",
|
|
"df['dag_van_het_jaar'] = df.index.dayofyear\n",
|
|
"\n",
|
|
"# --- 2c. ONE-HOT ENCODING (DE BELANGRIJKSTE FIX) ---\n",
|
|
"# Converteer 'dag_van_de_week' en 'uur_van_de_dag' naar losse kolommen\n",
|
|
"print(\"One-Hot Encoding toepassen...\")\n",
|
|
"df = pd.get_dummies(df, columns=['dag_van_de_week', 'uur_van_de_dag'], \n",
|
|
" prefix=['dag', 'uur'])\n",
|
|
"\n",
|
|
"# --- 2d. Basis Lag/Rolling Features (deze hadden we al) ---\n",
|
|
"df['prijs_1u_geleden'] = df['gemiddelde_prijs'].shift(1)\n",
|
|
"df['prijs_24u_geleden'] = df['gemiddelde_prijs'].shift(24)\n",
|
|
"df['temp_avg_3u'] = df['temperatuur'].rolling(window=3).mean()\n",
|
|
"df['prijs_avg_6u'] = df['gemiddelde_prijs'].rolling(window=6).mean()\n",
|
|
"\n",
|
|
"# --- 2e. Opschonen ---\n",
|
|
"# We verliezen nu maar 24 rijen (van de 'prijs_24u_geleden' lag)\n",
|
|
"print(f\"Rijen vóór opschonen: {len(df)}\")\n",
|
|
"df_clean = df.dropna()\n",
|
|
"print(f\"Rijen ná opschonen: {len(df_clean)}\")\n",
|
|
"\n",
|
|
"# Toon ons de nieuwe features\n",
|
|
"print(\"\\nVoorbeeld van de nieuwe 'dag' features:\")\n",
|
|
"print(df_clean.filter(like='dag_').head())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "00c9d26c-4078-4577-9575-d263ca22bf60",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Model wordt getraind met 46 features.\n",
|
|
"Trainingset: 4001 rijen\n",
|
|
"Testset: 7553 rijen\n",
|
|
"\n",
|
|
"Model v1.5 aan het trainen...\n",
|
|
"Training voltooid.\n",
|
|
"\n",
|
|
"Nieuwe Model (v1.5) MAE: 0.0122\n",
|
|
"\n",
|
|
"Model 1.5 opgeslagen als: price_forecast_model_v1_5.json\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# --- 3. DEFINIEER FEATURES (X) en TARGET (y) ---\n",
|
|
"TARGET = 'gemiddelde_prijs'\n",
|
|
"# Automatisch alle kolommen als feature gebruiken\n",
|
|
"FEATURES = [col for col in df_clean.columns if col not in [TARGET]]\n",
|
|
"\n",
|
|
"print(f\"\\nModel wordt getraind met {len(FEATURES)} features.\")\n",
|
|
"\n",
|
|
"X = df_clean[FEATURES]\n",
|
|
"y = df_clean[TARGET]\n",
|
|
"\n",
|
|
"# --- 4. CHRONOLOGISCHE SPLIT ---\n",
|
|
"# We splitsen op 1 jan 2025, net als de vorige keer\n",
|
|
"SPLIT_DATE = '2025-01-01 00:00:00'\n",
|
|
"train_mask = X.index < SPLIT_DATE\n",
|
|
"test_mask = X.index >= SPLIT_DATE\n",
|
|
"\n",
|
|
"X_train, y_train = X[train_mask], y[train_mask]\n",
|
|
"X_test, y_test = X[test_mask], y[test_mask]\n",
|
|
"\n",
|
|
"print(f\"Trainingset: {len(X_train)} rijen\")\n",
|
|
"print(f\"Testset: {len(X_test)} rijen\")\n",
|
|
"\n",
|
|
"# --- 5. MODEL TRAINEN ---\n",
|
|
"xgb_model_v1_5 = xgb.XGBRegressor(\n",
|
|
" n_estimators=1000,\n",
|
|
" learning_rate=0.01,\n",
|
|
" early_stopping_rounds=50\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nModel v1.5 aan het trainen...\")\n",
|
|
"xgb_model_v1_5.fit(\n",
|
|
" X_train, y_train,\n",
|
|
" eval_set=[(X_test, y_test)],\n",
|
|
" verbose=False # Zet op 100 om voortgang te zien\n",
|
|
")\n",
|
|
"print(\"Training voltooid.\")\n",
|
|
"\n",
|
|
"# --- 6. EVALUATIE ---\n",
|
|
"voorspellingen_v1_5 = xgb_model_v1_5.predict(X_test)\n",
|
|
"mae_v1_5 = mean_absolute_error(y_test, voorspellingen_v1_5)\n",
|
|
"print(f\"\\nNieuwe Model (v1.5) MAE: {mae_v1_5:.4f}\")\n",
|
|
"\n",
|
|
"# --- 7. MODEL OPSLAAN ---\n",
|
|
"MODEL_FILE = 'price_forecast_model_v1_5.json'\n",
|
|
"xgb_model_v1_5.save_model(MODEL_FILE)\n",
|
|
"print(f\"\\nModel 1.5 opgeslagen als: {MODEL_FILE}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "eb4e1adc-007a-4329-906b-9925a7f11ddf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|