Predicting the Photo-voltaic energy production#

This Notebook aims at predicting the energy producte by solar pannel.

It uses weather data extracted from the MeteoFrance numerical models, as well as history of productions provided by RTE.

[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

[2]:

filename_ssrd_regions = "../../data/silver/mean_daily_sun_j0.csv"
filename_energy_preduction = "../../data/rte_agg_daily_2014_2024.csv"

[3]:

df_ssrd_regions = pd.read_csv(filename_ssrd_regions, parse_dates=["time"]).set_index(
    "time"
)
# sanitise the column names
region_names = [
    col.replace(" ", "_").replace("'", "_").replace("-", "_").lower()
    for col in df_ssrd_regions.columns
]
df_ssrd_regions.columns = region_names
region_names = df_ssrd_regions.columns


# Doing so to add empty rows for missing days
df_ssrd_regions.plot(figsize=(15, 10))
df_ssrd_regions["days_from_start"] = [
    (date - df_ssrd_regions.index[0]).days for date in df_ssrd_regions.index
]
df_ssrd_regions.head()

[3]:

	auvergne_rhône_alpes	bourgogne_franche_comté	bretagne	centre_val_de_loire	corse	grand_est	hauts_de_france	normandie	nouvelle_aquitaine	occitanie	pays_de_la_loire	provence_alpes_côte_d_azur	île_de_france	days_from_start
time
2022-02-01	261520.851689	90775.394750	163552.580967	34723.093875	456686.957700	44975.603958	21341.962363	42435.778336	185091.563333	315764.897917	122142.333333	425822.506650	15171.129004	0
2022-02-02	197822.709267	59674.926817	225057.752062	163614.112946	455816.711071	36565.521433	66463.473763	155318.998325	240519.501958	292222.562750	244625.748700	430004.543860	73755.427321	1
2022-02-03	299190.206288	190444.998208	81208.290680	150123.865750	408849.293058	124497.688083	48622.630200	47198.020700	218038.002000	342835.312917	119590.019712	383634.419163	87619.219558	2
2022-02-04	284906.858032	131882.476333	228456.231833	92169.886950	330461.622500	87107.948417	120341.102958	164888.956167	112435.209250	238153.647625	168470.041229	390108.203765	113925.635167	3
2022-02-05	362909.540192	346769.893083	318892.106625	308881.768208	310414.561742	243847.895724	259733.252833	262925.207583	402942.371958	325491.808458	338302.897500	442079.500878	274746.580333	4

../../_images/user_guide_notebooks_3_PV_prediction_3_1.png

[4]:

df_energy_preduction = pd.read_csv(filename_energy_preduction, index_col=0)[["Solaire"]]
df_energy_preduction.index = pd.to_datetime(df_energy_preduction.index)
df_energy_preduction.head(), df_energy_preduction.tail()

[4]:

(            Solaire
 Date
 2015-01-01  11370.5
 2015-01-02   8297.5
 2015-01-03   5860.5
 2015-01-04   6926.0
 2015-01-05   9786.5,
             Solaire
 Date
 2024-04-04  76581.5
 2024-04-05  72847.5
 2024-04-06  61577.5
 2024-04-07  46718.5
 2024-04-08  26677.0)

[5]:

# align the indexes of the two dataframes
data = pd.concat([df_ssrd_regions, df_energy_preduction], join="inner", axis=1)
data.head()

[5]:

	auvergne_rhône_alpes	bourgogne_franche_comté	bretagne	centre_val_de_loire	corse	grand_est	hauts_de_france	normandie	nouvelle_aquitaine	occitanie	pays_de_la_loire	provence_alpes_côte_d_azur	île_de_france	days_from_start	Solaire
2022-02-01	261520.851689	90775.394750	163552.580967	34723.093875	456686.957700	44975.603958	21341.962363	42435.778336	185091.563333	315764.897917	122142.333333	425822.506650	15171.129004	0	21938.5
2022-02-02	197822.709267	59674.926817	225057.752062	163614.112946	455816.711071	36565.521433	66463.473763	155318.998325	240519.501958	292222.562750	244625.748700	430004.543860	73755.427321	1	21271.0
2022-02-03	299190.206288	190444.998208	81208.290680	150123.865750	408849.293058	124497.688083	48622.630200	47198.020700	218038.002000	342835.312917	119590.019712	383634.419163	87619.219558	2	20527.5
2022-02-04	284906.858032	131882.476333	228456.231833	92169.886950	330461.622500	87107.948417	120341.102958	164888.956167	112435.209250	238153.647625	168470.041229	390108.203765	113925.635167	3	19051.0
2022-02-05	362909.540192	346769.893083	318892.106625	308881.768208	310414.561742	243847.895724	259733.252833	262925.207583	402942.371958	325491.808458	338302.897500	442079.500878	274746.580333	4	41271.5

[6]:

from statsmodels.formula.api import ols

# split test for time series
from sklearn.model_selection import TimeSeriesSplit

Modeling#

4 models are tested : - Only Total sun flux (no region details) - Only regions sun flux (no time feature) - Total sun flux + time - Regions sun flux + time

The Time feature allows to model a trend in the number of installed solar pannels.

[7]:

exo_vars = df_ssrd_regions.columns
exo_vars = exo_vars.drop("days_from_start")
data["mean_sun"] = data[exo_vars].mean(axis=1)
endog_var = "Solaire"

[8]:

tscv = TimeSeriesSplit(n_splits=60, test_size=1)  # testing on 1 days forcast

[9]:

def test_model(formula="Solaire ~ mean_sun -1"):
    mod_1_mape = []
    for i, (train_index, test_index) in enumerate(tscv.split(data)):
        model_1 = ols(formula, data=data.iloc[train_index]).fit()
        if i == 0:
            first_test_index = test_index
            first_model_1 = model_1
        predictions = model_1.predict(data.iloc[test_index])
        error = data.iloc[test_index]["Solaire"] - predictions
        mape = (error.abs() / data.iloc[test_index]["Solaire"]).mean()
        mod_1_mape.append(mape)
    last_test_index = test_index
    last_model_1 = model_1
    return mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1


mod_1_mape, first_test_index, first_model_1, last_test_index, last_model_1 = (
    test_model()
)

[10]:

ax = data.plot(y="Solaire", label="True")
first_model_1.predict(data.iloc[first_test_index]).plot(
    ax=ax, label="First Test Predicted"
)
last_model_1.predict(data.iloc[last_test_index]).plot(
    ax=ax, label="Last Test Predicted", lw=2
)

ax.legend()

[10]:

<matplotlib.legend.Legend at 0x7fccc45d9de0>

../../_images/user_guide_notebooks_3_PV_prediction_11_1.png

[11]:

fig, ax = plt.subplots()
ax.hist(mod_1_mape, bins=20)
ax.set_title("MAPE distribution for model 1")
ax.set_xlabel("MAPE")

[11]:

Text(0.5, 0, 'MAPE')

../../_images/user_guide_notebooks_3_PV_prediction_12_1.png

[14]:

formula_2 = f"Solaire ~ {' + '.join(exo_vars)} -1"
print(formula_2)
mod_2_mape, first_test_index, first_model_2, last_test_index, last_model_2 = test_model(
    formula_2
)

Solaire ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france -1

[15]:

fig, ax = plt.subplots()
ax.hist(mod_2_mape, bins=20)
ax.set_title("MAPE distribution for model 2")

[15]:

Text(0.5, 1.0, 'MAPE distribution for model 2')

../../_images/user_guide_notebooks_3_PV_prediction_14_1.png

[16]:

formula_3 = "Solaire ~ mean_sun + days_from_start -1"
mod_3_mape, first_test_index, first_model_3, last_test_index, last_model_3 = test_model(
    formula_3
)
formula_4 = f"Solaire ~ {' + '.join(exo_vars)} + days_from_start -1"
mod_4_mape, first_test_index, first_model_4, last_test_index, last_model_4 = test_model(
    formula_4
)
print("formula_3", formula_3)
print("formula_4", formula_4)

formula_3 Solaire ~ mean_sun + days_from_start -1
formula_4 Solaire ~ auvergne_rhône_alpes + bourgogne_franche_comté + bretagne + centre_val_de_loire + corse + grand_est + hauts_de_france + normandie + nouvelle_aquitaine + occitanie + pays_de_la_loire + provence_alpes_côte_d_azur + île_de_france + days_from_start -1

[17]:

# display the MAPE distribution for all models (KDE)
fig, ax = plt.subplots()
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
    pd.Series(mape).plot.kde(ax=ax, label=f"Model {i+1}")
ax.set_title("MAPE distribution for all models")
ax.legend()

[17]:

<matplotlib.legend.Legend at 0x7fcc834ec7c0>

../../_images/user_guide_notebooks_3_PV_prediction_16_1.png

[18]:

# print mean MAPE for all models
for i, mape in enumerate([mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape]):
    print(f"Model {i+1} mean MAPE: {np.mean(mape):.2%}")

Model 1 mean MAPE: 30.85%
Model 2 mean MAPE: 21.88%
Model 3 mean MAPE: 18.18%
Model 4 mean MAPE: 13.79%

Conclusion#

The model performances doesn’t really follow the expected order (more feature => better performance)

Maybe this is due to some sort of over-fitting, even thought there are not so mainy features…

[19]:

last_model_2.params.to_csv("sun_model_2_params.csv")

Table of Contents

Predicting the Photo-voltaic energy production#

Modeling#

Conclusion#