Predicting the eolian energy production#

This Notebook aims at predicting the energy producte by wind turbines.

It uses weather data extracted from the MeteoFrance numerical models, as well as history of productions provided by RTE.

[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
[2]:
filename_wind_regions = "../../data/silver/mean_daily_wind_j0.csv"

filename_energy_preduction = "../../data/rte_agg_daily_2014_2024.csv"
[3]:
df_ssrd_regions = pd.read_csv(filename_wind_regions, parse_dates=["time"]).set_index(
    "time"
)
# sanitise the column names
region_names = [
    col.replace(" ", "_").replace("'", "_").replace("-", "_").lower()
    for col in df_ssrd_regions.columns
]
df_ssrd_regions.columns = region_names
region_names = df_ssrd_regions.columns
df_ssrd_regions.plot(figsize=(15, 10))
df_ssrd_regions["days_from_start"] = [
    (date - df_ssrd_regions.index[0]).days for date in df_ssrd_regions.index
]
df_ssrd_regions.head()
[3]:
auvergne_rhône_alpes bourgogne_franche_comté bretagne centre_val_de_loire corse grand_est hauts_de_france normandie nouvelle_aquitaine occitanie pays_de_la_loire provence_alpes_côte_d_azur île_de_france days_from_start
time
2022-02-01 4.205852 4.060221 5.199642 4.778342 3.561165 5.454614 6.432882 6.416779 3.152589 6.048412 4.748808 5.744050 4.977423 0
2022-02-02 3.156469 3.178754 3.410363 3.398772 2.383473 4.102714 4.987001 4.406740 2.294659 4.454043 3.110102 5.584663 3.663799 1
2022-02-03 2.234994 2.016168 3.373047 2.801909 2.217947 2.946621 4.726071 4.218961 2.206917 2.502523 3.111282 2.187303 3.396305 2
2022-02-04 2.453711 3.737635 4.975356 4.658530 2.061253 5.098637 6.986221 6.097625 3.139985 2.935683 4.232595 2.280711 5.306362 3
2022-02-05 2.609659 2.231746 4.131737 3.149658 1.859197 3.742508 5.961911 5.229400 2.279602 3.726291 3.058013 3.321954 3.946324 4
../../_images/user_guide_notebooks_2_eolen_prediction_3_1.png
[4]:
df_energy_preduction = pd.read_csv(filename_energy_preduction, index_col=0)[
    ["Eolien", "Solaire"]
]
df_energy_preduction.index = pd.to_datetime(df_energy_preduction.index)
df_energy_preduction.head(), df_energy_preduction.tail()
[4]:
(              Eolien  Solaire
 Date
 2015-01-01   51127.0  11370.5
 2015-01-02   78933.0   8297.5
 2015-01-03  105299.0   5860.5
 2015-01-04   30061.0   6926.0
 2015-01-05   16004.0   9786.5,
               Eolien  Solaire
 Date
 2024-04-04  285321.0  76581.5
 2024-04-05  232208.5  72847.5
 2024-04-06  225106.0  61577.5
 2024-04-07  138049.5  46718.5
 2024-04-08   53990.0  26677.0)
[5]:
df_energy_preduction.index
[5]:
DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
               '2015-01-09', '2015-01-10',
               ...
               '2024-03-30', '2024-03-31', '2024-04-01', '2024-04-02',
               '2024-04-03', '2024-04-04', '2024-04-05', '2024-04-06',
               '2024-04-07', '2024-04-08'],
              dtype='datetime64[ns]', name='Date', length=3386, freq=None)
[6]:
# align the indexes of the two dataframes
data = pd.concat([df_ssrd_regions, df_energy_preduction], join="inner", axis=1)
data.head()
[6]:
auvergne_rhône_alpes bourgogne_franche_comté bretagne centre_val_de_loire corse grand_est hauts_de_france normandie nouvelle_aquitaine occitanie pays_de_la_loire provence_alpes_côte_d_azur île_de_france days_from_start Eolien Solaire
2022-02-01 4.205852 4.060221 5.199642 4.778342 3.561165 5.454614 6.432882 6.416779 3.152589 6.048412 4.748808 5.744050 4.977423 0 227954.0 21938.5
2022-02-02 3.156469 3.178754 3.410363 3.398772 2.383473 4.102714 4.987001 4.406740 2.294659 4.454043 3.110102 5.584663 3.663799 1 138768.0 21271.0
2022-02-03 2.234994 2.016168 3.373047 2.801909 2.217947 2.946621 4.726071 4.218961 2.206917 2.502523 3.111282 2.187303 3.396305 2 63557.5 20527.5
2022-02-04 2.453711 3.737635 4.975356 4.658530 2.061253 5.098637 6.986221 6.097625 3.139985 2.935683 4.232595 2.280711 5.306362 3 178764.0 19051.0
2022-02-05 2.609659 2.231746 4.131737 3.149658 1.859197 3.742508 5.961911 5.229400 2.279602 3.726291 3.058013 3.321954 3.946324 4 145138.0 41271.5
[8]:
from statsmodels.formula.api import ols

# split test for time series
from sklearn.model_selection import TimeSeriesSplit

Modeling#

4 models are tested : - Only Total wind speed (no region details) - Only regions Wind Speed - Total Wind Speed + time - Regions wind Speed + tim

[9]:
exo_vars = region_names
data["mean_wind"] = data[exo_vars].mean(axis=1)
endog_var = "Eolien"
[10]:
tscv = TimeSeriesSplit(n_splits=60, test_size=1)  # testing on 3 days forcast
[11]:
def test_model(formula="Eolien ~ mean_wind -1"):
    mod_1_mape = []
    for i, (train_index, test_index) in enumerate(tscv.split(data)):
        model_1 = ols(formula, data=data.iloc[train_index]).fit()
        if i == 0:
            first_test_index = test_index
            first_model_1 = model_1
        predictions = model_1.predict(data.iloc[test_index])
        error = data.iloc[test_index]["Eolien"] - predictions
        mape = (error.abs() / data.iloc[test_index]["Eolien"]).mean()
        mod_1_mape.append(mape)
    last_test_index = test_index
    last_model_1 = model_1
    return mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1


formula_1 = "Eolien ~ mean_wind -1"
mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1 = test_model(
    formula=formula_1
)
[12]:
ax = data.plot(y="Eolien", label="True")
first_model_1.predict(data.iloc[first_test_index]).plot(
    ax=ax, label="First Test Predicted"
)
last_model_1.predict(data.iloc[last_test_index]).plot(
    ax=ax, label="Last Test Predicted"
)
ax.legend()
[12]:
<matplotlib.legend.Legend at 0x705b33af1a80>
../../_images/user_guide_notebooks_2_eolen_prediction_12_1.png
[13]:
fig, ax = plt.subplots()
ax.hist(mod_1_mape, bins=20)
ax.set_title("MAPE distribution for model 1")
ax.set_xlabel("MAPE")
[13]:
Text(0.5, 0, 'MAPE')
../../_images/user_guide_notebooks_2_eolen_prediction_13_1.png
[14]:
formula_2 = f"Eolien ~ {' + '.join(exo_vars)} -1"
print(formula_2)
mod_2_mape, first_test_index, first_model_2, last_test_index, last_model_2 = test_model(
    formula_2
)
Eolien ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france -1
[15]:
fig, ax = plt.subplots()
ax.hist(mod_2_mape, bins=20)
ax.set_title("MAPE distribution for model 2")
[15]:
Text(0.5, 1.0, 'MAPE distribution for model 2')
../../_images/user_guide_notebooks_2_eolen_prediction_15_1.png
[16]:
formula_3 = formula_1 + " + days_from_start"
mod_3_mape, first_test_index, first_model_3, last_test_index, last_model_3 = test_model(
    formula_3
)
formula_4 = formula_2 + " + days_from_start"
mod_4_mape, first_test_index, first_model_4, last_test_index, last_model_4 = test_model(
    formula_4
)
[17]:
# display the MAPE distribution for all models (KDE)
fig, ax = plt.subplots()
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
    pd.Series(mape).plot.kde(ax=ax, label=f"Model {i+1}")
ax.set_title("MAPE distribution for all models")
ax.legend()
[17]:
<matplotlib.legend.Legend at 0x705b1e782e00>
../../_images/user_guide_notebooks_2_eolen_prediction_17_1.png
[18]:
# print mean MAPE for all models
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
    print(f"Model {i+1} mean MAPE: {np.mean(mape):.2%}")
Model 1 mean MAPE: 27.37%
Model 2 mean MAPE: 18.63%
Model 3 mean MAPE: 24.38%
Model 4 mean MAPE: 19.03%

Conclusion#

In contrast with the photo-voltaic power prediction, the eolien is a bit more consistent with the expected trend : - using regional data features is better than global wind values (even with the time trend added to the global value) - adding the time trend to the model improve the performances

The mean performance of model 4 (11% error) is quite good !

[19]:
data[["Eolien", "Solaire"]].mean()
[19]:
Eolien     121818.205577
Solaire     55192.725032
dtype: float64

As the production of the wind turbine is around 2 time higher than the Sun production, the performance of the wind energy prediction model is more important for the overall performance of the project.

[20]:
last_model_2.params.to_csv("wind_model_2_params.csv")