"""
Modeling with interval averages
===============================

Transposing interval-averaged irradiance data
"""

# %%
# This example shows how failing to account for the difference between
# instantaneous and interval-averaged time series data can introduce
# error in the modeling process. An instantaneous time series
# represents discrete measurements taken at each timestamp, while
# an interval-averaged time series represents the average value across
# each data interval.  For example, the value of an interval-averaged
# hourly time series at 11:00 represents the average value between
# 11:00 (inclusive) and 12:00 (exclusive), assuming the series is left-labeled.
# For a right-labeled time series it would be the average value
# between 10:00 (exclusive) and 11:00 (inclusive).  Sometimes timestamps
# are center-labeled, in which case it would be the
# average value between 10:30 and 11:30.
# Interval-averaged time series are common in
# field data, where the datalogger averages high-frequency measurements
# into low-frequency averages for archiving purposes.
#
# It is important to account for this difference when using
# interval-averaged weather data for modeling.  This example
# focuses on calculating solar position appropriately for
# irradiance transposition, but this concept is relevant for
# other steps in the modeling process as well.
#
# This example calculates a POA irradiance timeseries at 1-second
# resolution as a "ground truth" value.  Then it performs the
# transposition again at lower resolution using interval-averaged
# irradiance components, once using a half-interval shift and
# once just using the unmodified timestamps. The difference
# affects the solar position calculation: for example, assuming
# we have average irradiance for the interval 11:00 to 12:00,
# and it came from a left-labeled time series, naively using
# the unmodified timestamp will calculate solar position for 11:00,
# meaning the calculated solar position is used to represent
# times as far as an hour away.  A better option would be to
# calculate the solar position at 11:30 to reduce the maximum
# timing error to only half an hour.

import pvlib
import pandas as pd
import matplotlib.pyplot as plt

# %%
# First, we'll define a helper function that we can re-use several
# times in the following code:


def transpose(irradiance, timeshift):
    """
    Transpose irradiance components to plane-of-array, incorporating
    a timeshift in the solar position calculation.

    Parameters
    ----------
        irradiance: DataFrame
            Has columns dni, ghi, dhi
        timeshift: float
            Number of minutes to shift for solar position calculation
    Outputs:
        Series of POA irradiance
    """
    idx = irradiance.index
    # calculate solar position for shifted timestamps:
    idx = idx + pd.Timedelta(timeshift, unit='min')
    solpos = location.get_solarposition(idx)
    # but still report the values with the original timestamps:
    solpos.index = irradiance.index

    poa_components = pvlib.irradiance.get_total_irradiance(
        surface_tilt=20,
        surface_azimuth=180,
        solar_zenith=solpos['apparent_zenith'],
        solar_azimuth=solpos['azimuth'],
        dni=irradiance['dni'],
        ghi=irradiance['ghi'],
        dhi=irradiance['dhi'],
        model='isotropic',
    )
    return poa_components['poa_global']


# %%
# Now, calculate the "ground truth" irradiance data.  We'll simulate
# clear-sky irradiance components at 1-second intervals and calculate
# the corresponding POA irradiance.  At such a short timescale, the
# difference between instantaneous and interval-averaged irradiance
# is negligible.

# baseline: all calculations done at 1-second scale
location = pvlib.location.Location(40, -80, tz='Etc/GMT+5')
times = pd.date_range('2019-06-01 05:00', '2019-06-01 19:00',
                      freq='1s', tz='Etc/GMT+5')
solpos = location.get_solarposition(times)
clearsky = location.get_clearsky(times, solar_position=solpos)
poa_1s = transpose(clearsky, timeshift=0)  # no shift needed for 1s data

# %%
# Now, we will aggregate the 1-second values into interval averages.
# To see how the averaging interval affects results, we'll loop over
# a few common data intervals and accumulate the results.

fig, ax = plt.subplots(figsize=(5, 3))

results = []

for timescale_minutes in [1, 5, 10, 15, 30, 60]:

    timescale_str = f'{timescale_minutes}min'
    # get the "true" interval average of poa as the baseline for comparison
    poa_avg = poa_1s.resample(timescale_str).mean()
    # get interval averages of irradiance components to use for transposition
    clearsky_avg = clearsky.resample(timescale_str).mean()

    # low-res interval averages of 1-second data, with NO shift
    poa_avg_noshift = transpose(clearsky_avg, timeshift=0)

    # low-res interval averages of 1-second data, with half-interval shift
    poa_avg_halfshift = transpose(clearsky_avg, timeshift=timescale_minutes/2)

    df = pd.DataFrame({
        'ground truth': poa_avg,
        'modeled, half shift': poa_avg_halfshift,
        'modeled, no shift': poa_avg_noshift,
    })
    error = df.subtract(df['ground truth'], axis=0)
    # add another trace to the error plot
    error['modeled, no shift'].plot(ax=ax, label=timescale_str)
    # calculate error statistics and save for later
    stats = error.abs().mean()  # average absolute error across daylight hours
    stats['timescale_minutes'] = timescale_minutes
    results.append(stats)

ax.legend(ncol=2)
ax.set_ylabel('Transposition Error [W/m$^2$]')
fig.tight_layout()

df_results = pd.DataFrame(results).set_index('timescale_minutes')
print(df_results)

# %%
# The errors shown above are the average absolute difference in :math:`W/m^2`.
# In this example, using the timestamps unadjusted creates an error that
# increases with increasing interval length, up to a ~40% error
# at hourly resolution.  In contrast, incorporating a half-interval shift
# so that solar position is calculated in the middle of the interval
# instead of the edge reduces the error by one or two orders of magnitude:

fig, ax = plt.subplots(figsize=(5, 3))
df_results[['modeled, no shift', 'modeled, half shift']].plot.bar(rot=0, ax=ax)
ax.set_ylabel('Mean Absolute Error [W/m$^2$]')
ax.set_xlabel('Transposition Timescale [minutes]')
fig.tight_layout()

# %%
# We can also plot the underlying time series results of the last
# iteration (hourly in this case).  The modeled irradiance using
# no shift is effectively time-lagged compared with ground truth.
# In contrast, the half-shift model is nearly identical to the ground
# truth irradiance.

fig, ax = plt.subplots(figsize=(5, 3))
ax = df.plot(ax=ax, style=[None, ':', None], lw=3)
ax.set_ylabel('Irradiance [W/m$^2$]')
fig.tight_layout()
