Predicting the eolian energy production#
This Notebook aims at predicting the energy producte by wind turbines.
It uses weather data extracted from the MeteoFrance numerical models, as well as history of productions provided by RTE.
[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
[2]:
filename_wind_regions = "../../data/silver/mean_daily_wind_j0.csv"
filename_energy_preduction = "../../data/rte_agg_daily_2014_2024.csv"
[3]:
df_ssrd_regions = pd.read_csv(filename_wind_regions, parse_dates=["time"]).set_index(
"time"
)
# sanitise the column names
region_names = [
col.replace(" ", "_").replace("'", "_").replace("-", "_").lower()
for col in df_ssrd_regions.columns
]
df_ssrd_regions.columns = region_names
region_names = df_ssrd_regions.columns
df_ssrd_regions.plot(figsize=(15, 10))
df_ssrd_regions["days_from_start"] = [
(date - df_ssrd_regions.index[0]).days for date in df_ssrd_regions.index
]
df_ssrd_regions.head()
[3]:
auvergne_rhône_alpes | bourgogne_franche_comté | bretagne | centre_val_de_loire | corse | grand_est | hauts_de_france | normandie | nouvelle_aquitaine | occitanie | pays_de_la_loire | provence_alpes_côte_d_azur | île_de_france | days_from_start | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
time | ||||||||||||||
2022-02-01 | 4.205852 | 4.060221 | 5.199642 | 4.778342 | 3.561165 | 5.454614 | 6.432882 | 6.416779 | 3.152589 | 6.048412 | 4.748808 | 5.744050 | 4.977423 | 0 |
2022-02-02 | 3.156469 | 3.178754 | 3.410363 | 3.398772 | 2.383473 | 4.102714 | 4.987001 | 4.406740 | 2.294659 | 4.454043 | 3.110102 | 5.584663 | 3.663799 | 1 |
2022-02-03 | 2.234994 | 2.016168 | 3.373047 | 2.801909 | 2.217947 | 2.946621 | 4.726071 | 4.218961 | 2.206917 | 2.502523 | 3.111282 | 2.187303 | 3.396305 | 2 |
2022-02-04 | 2.453711 | 3.737635 | 4.975356 | 4.658530 | 2.061253 | 5.098637 | 6.986221 | 6.097625 | 3.139985 | 2.935683 | 4.232595 | 2.280711 | 5.306362 | 3 |
2022-02-05 | 2.609659 | 2.231746 | 4.131737 | 3.149658 | 1.859197 | 3.742508 | 5.961911 | 5.229400 | 2.279602 | 3.726291 | 3.058013 | 3.321954 | 3.946324 | 4 |
[4]:
df_energy_preduction = pd.read_csv(filename_energy_preduction, index_col=0)[
["Eolien", "Solaire"]
]
df_energy_preduction.index = pd.to_datetime(df_energy_preduction.index)
df_energy_preduction.head(), df_energy_preduction.tail()
[4]:
( Eolien Solaire
Date
2015-01-01 51127.0 11370.5
2015-01-02 78933.0 8297.5
2015-01-03 105299.0 5860.5
2015-01-04 30061.0 6926.0
2015-01-05 16004.0 9786.5,
Eolien Solaire
Date
2024-04-04 285321.0 76581.5
2024-04-05 232208.5 72847.5
2024-04-06 225106.0 61577.5
2024-04-07 138049.5 46718.5
2024-04-08 53990.0 26677.0)
[5]:
df_energy_preduction.index
[5]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
'2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
'2015-01-09', '2015-01-10',
...
'2024-03-30', '2024-03-31', '2024-04-01', '2024-04-02',
'2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06',
'2024-04-07', '2024-04-08'],
dtype='datetime64[ns]', name='Date', length=3386, freq=None)
[6]:
# align the indexes of the two dataframes
data = pd.concat([df_ssrd_regions, df_energy_preduction], join="inner", axis=1)
data.head()
[6]:
auvergne_rhône_alpes | bourgogne_franche_comté | bretagne | centre_val_de_loire | corse | grand_est | hauts_de_france | normandie | nouvelle_aquitaine | occitanie | pays_de_la_loire | provence_alpes_côte_d_azur | île_de_france | days_from_start | Eolien | Solaire | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2022-02-01 | 4.205852 | 4.060221 | 5.199642 | 4.778342 | 3.561165 | 5.454614 | 6.432882 | 6.416779 | 3.152589 | 6.048412 | 4.748808 | 5.744050 | 4.977423 | 0 | 227954.0 | 21938.5 |
2022-02-02 | 3.156469 | 3.178754 | 3.410363 | 3.398772 | 2.383473 | 4.102714 | 4.987001 | 4.406740 | 2.294659 | 4.454043 | 3.110102 | 5.584663 | 3.663799 | 1 | 138768.0 | 21271.0 |
2022-02-03 | 2.234994 | 2.016168 | 3.373047 | 2.801909 | 2.217947 | 2.946621 | 4.726071 | 4.218961 | 2.206917 | 2.502523 | 3.111282 | 2.187303 | 3.396305 | 2 | 63557.5 | 20527.5 |
2022-02-04 | 2.453711 | 3.737635 | 4.975356 | 4.658530 | 2.061253 | 5.098637 | 6.986221 | 6.097625 | 3.139985 | 2.935683 | 4.232595 | 2.280711 | 5.306362 | 3 | 178764.0 | 19051.0 |
2022-02-05 | 2.609659 | 2.231746 | 4.131737 | 3.149658 | 1.859197 | 3.742508 | 5.961911 | 5.229400 | 2.279602 | 3.726291 | 3.058013 | 3.321954 | 3.946324 | 4 | 145138.0 | 41271.5 |
[8]:
from statsmodels.formula.api import ols
# split test for time series
from sklearn.model_selection import TimeSeriesSplit
Modeling#
4 models are tested : - Only Total wind speed (no region details) - Only regions Wind Speed - Total Wind Speed + time - Regions wind Speed + tim
[9]:
exo_vars = region_names
data["mean_wind"] = data[exo_vars].mean(axis=1)
endog_var = "Eolien"
[10]:
tscv = TimeSeriesSplit(n_splits=60, test_size=1) # testing on 3 days forcast
[11]:
def test_model(formula="Eolien ~ mean_wind -1"):
mod_1_mape = []
for i, (train_index, test_index) in enumerate(tscv.split(data)):
model_1 = ols(formula, data=data.iloc[train_index]).fit()
if i == 0:
first_test_index = test_index
first_model_1 = model_1
predictions = model_1.predict(data.iloc[test_index])
error = data.iloc[test_index]["Eolien"] - predictions
mape = (error.abs() / data.iloc[test_index]["Eolien"]).mean()
mod_1_mape.append(mape)
last_test_index = test_index
last_model_1 = model_1
return mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1
formula_1 = "Eolien ~ mean_wind -1"
mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1 = test_model(
formula=formula_1
)
[12]:
ax = data.plot(y="Eolien", label="True")
first_model_1.predict(data.iloc[first_test_index]).plot(
ax=ax, label="First Test Predicted"
)
last_model_1.predict(data.iloc[last_test_index]).plot(
ax=ax, label="Last Test Predicted"
)
ax.legend()
[12]:
<matplotlib.legend.Legend at 0x705b33af1a80>
[13]:
fig, ax = plt.subplots()
ax.hist(mod_1_mape, bins=20)
ax.set_title("MAPE distribution for model 1")
ax.set_xlabel("MAPE")
[13]:
Text(0.5, 0, 'MAPE')
[14]:
formula_2 = f"Eolien ~ {' + '.join(exo_vars)} -1"
print(formula_2)
mod_2_mape, first_test_index, first_model_2, last_test_index, last_model_2 = test_model(
formula_2
)
Eolien ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france -1
[15]:
fig, ax = plt.subplots()
ax.hist(mod_2_mape, bins=20)
ax.set_title("MAPE distribution for model 2")
[15]:
Text(0.5, 1.0, 'MAPE distribution for model 2')
[16]:
formula_3 = formula_1 + " + days_from_start"
mod_3_mape, first_test_index, first_model_3, last_test_index, last_model_3 = test_model(
formula_3
)
formula_4 = formula_2 + " + days_from_start"
mod_4_mape, first_test_index, first_model_4, last_test_index, last_model_4 = test_model(
formula_4
)
[17]:
# display the MAPE distribution for all models (KDE)
fig, ax = plt.subplots()
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
pd.Series(mape).plot.kde(ax=ax, label=f"Model {i+1}")
ax.set_title("MAPE distribution for all models")
ax.legend()
[17]:
<matplotlib.legend.Legend at 0x705b1e782e00>
[18]:
# print mean MAPE for all models
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
print(f"Model {i+1} mean MAPE: {np.mean(mape):.2%}")
Model 1 mean MAPE: 27.37%
Model 2 mean MAPE: 18.63%
Model 3 mean MAPE: 24.38%
Model 4 mean MAPE: 19.03%
Conclusion#
In contrast with the photo-voltaic power prediction, the eolien is a bit more consistent with the expected trend : - using regional data features is better than global wind values (even with the time trend added to the global value) - adding the time trend to the model improve the performances
The mean performance of model 4 (11% error) is quite good !
[19]:
data[["Eolien", "Solaire"]].mean()
[19]:
Eolien 121818.205577
Solaire 55192.725032
dtype: float64
As the production of the wind turbine is around 2 time higher than the Sun production, the performance of the wind energy prediction model is more important for the overall performance of the project.
[20]:
last_model_2.params.to_csv("wind_model_2_params.csv")