Source code for pvlib.iotools.bsrn

"""Functions to read data from the Baseline Surface Radiation Network (BSRN).
.. codeauthor:: Adam R. Jensen<adam-r-j@hotmail.com>
"""

import pandas as pd
import gzip
import ftplib
import warnings
import io
import os

# Host name of the BSRN FTP server that get_bsrn connects to.
BSRN_FTP_URL = "ftp.bsrn.awi.de"

# Fixed-width column boundaries, given as (start, stop) character offsets and
# passed as ``colspecs`` to pandas.read_fwf in parse_bsrn. There is one list
# per supported logical record (LR).
# LR0100 and LR0500 are read as pairs of consecutive lines which parse_bsrn
# unstacks into a single row, so their column-name lists below hold twice as
# many entries as the corresponding colspecs list.
BSRN_LR0100_COL_SPECS = [(0, 3), (4, 9), (10, 16), (16, 22), (22, 27),
                         (27, 32), (32, 39), (39, 45), (45, 50), (50, 55),
                         (55, 64), (64, 70), (70, 75)]

BSRN_LR0300_COL_SPECS = [(1, 3), (4, 9), (10, 16), (16, 22), (22, 27),
                         (27, 31), (31, 38), (38, 44), (44, 49), (49, 54),
                         (54, 61), (61, 67), (67, 72), (72, 78)]

BSRN_LR0500_COL_SPECS = [(0, 3), (3, 8), (8, 14), (14, 20), (20, 26), (26, 32),
                         (32, 38), (38, 44), (44, 50), (50, 56), (56, 62),
                         (62, 68), (68, 74), (74, 80)]

# Column names assigned to the parsed records. The first two entries ('day'
# and 'minute') are only used to build the datetime index and are dropped
# afterwards, as are all columns named 'empty'.
BSRN_LR0100_COLUMNS = ['day', 'minute',
                       'ghi', 'ghi_std', 'ghi_min', 'ghi_max',
                       'dni', 'dni_std', 'dni_min', 'dni_max',
                       'empty', 'empty', 'empty', 'empty', 'empty',
                       'dhi', 'dhi_std', 'dhi_min', 'dhi_max',
                       'lwd', 'lwd_std', 'lwd_min', 'lwd_max',
                       'temp_air', 'relative_humidity', 'pressure']

# LR0300 is read one line per timestamp, so this list has the same length as
# BSRN_LR0300_COL_SPECS.
BSRN_LR0300_COLUMNS = ['day', 'minute', 'gri', 'gri_std', 'gri_min', 'gri_max',
                       'lwu', 'lwu_std', 'lwu_min', 'lwu_max', 'net_radiation',
                       'net_radiation_std', 'net_radiation_min',
                       'net_radiation_max']

# NOTE(review): 'uvb_diffuse' and 'uvb_diffuse_std' appear twice below, which
# yields duplicate column labels in the parsed LR0500 DataFrame. The list
# length (28) must stay equal to 2 * len(BSRN_LR0500_COL_SPECS) for the column
# assignment in parse_bsrn to succeed, so the duplicates may be standing in
# for two unused leading fields of the second line - TODO confirm against the
# LR0500 format description before renaming or removing them.
BSRN_LR0500_COLUMNS = ['day', 'minute', 'uva_global', 'uva_global_std',
                       'uva_global_min', 'uva_global_max', 'uvb_direct',
                       'uvb_direct_std', 'uvb_direct_min', 'uvb_direct_max',
                       'empty', 'empty', 'empty', 'empty',
                       'uvb_global', 'uvb_global_std', 'uvb_global_min',
                       'uvb_global_max', 'uvb_diffuse', 'uvb_diffuse_std',
                       'uvb_diffuse', 'uvb_diffuse_std',
                       'uvb_diffuse_min', 'uvb_diffuse_max',
                       'uvb_reflected', 'uvb_reflected_std',
                       'uvb_reflected_min', 'uvb_reflected_max']

# Lookup from the 4-digit logical record identifier to its column names.
BSRN_COLUMNS = {'0100': BSRN_LR0100_COLUMNS, '0300': BSRN_LR0300_COLUMNS,
                '0500': BSRN_LR0500_COLUMNS}


def _empty_dataframe_from_logical_records(logical_records):
    """Build an empty DataFrame with the columns of the requested records.

    For every logical record the leading 'day' and 'minute' entries are
    skipped and placeholder columns named 'empty' are left out, mirroring
    the columns that remain after a record has been parsed.
    """
    names = []
    for record in logical_records:
        data_columns = BSRN_COLUMNS[record][2:]  # skip 'day' and 'minute'
        names.extend(name for name in data_columns if name != 'empty')
    return pd.DataFrame(columns=names)


def get_bsrn(station, start, end, username, password,
             logical_records=('0100',), save_path=None):
    """
    Retrieve ground measured irradiance data from the BSRN FTP server.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    Data is retrieved from the BSRN FTP server [2]_.

    Data is returned for the entire months between and including start and
    end.

    Parameters
    ----------
    station: str
        3-letter BSRN station abbreviation
    start: datetime-like
        First day of the requested period. Anything accepted by
        :func:`pandas.to_datetime` (e.g. a date string) may be passed.
    end: datetime-like
        Last day of the requested period. Anything accepted by
        :func:`pandas.to_datetime` (e.g. a date string) may be passed.
    username: str
        username for accessing the BSRN FTP server
    password: str
        password for accessing the BSRN FTP server
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.
    save_path: str or path-like, optional
        If specified, a directory path of where to save each monthly file.

    Returns
    -------
    data: DataFrame
        timeseries data from the BSRN archive, see
        :func:`pvlib.iotools.read_bsrn` for fields. An empty DataFrame is
        returned if no data was found for the time period.
    metadata: dict
        metadata for the last available monthly file.

    Raises
    ------
    KeyError
        If the specified station does not exist on the FTP server.

    Warns
    -----
    UserWarning
        If one or more requested files are missing a UserWarning is returned
        with a list of the filenames missing. If no files match the specified
        station and timeframe a separate UserWarning is given.

    Notes
    -----
    The username and password for the BSRN FTP server can be obtained for free
    as described in the BSRN's Data Release Guidelines [3]_.

    Currently only parsing of logical records 0100, 0300 and 0500 is supported.
    Note not all stations measure LR0300 and LR0500. However, LR0100 is
    mandatory as it contains the basic irradiance and auxiliary measurements.
    See [4]_ for a description of the different logical records. Future updates
    may include parsing of additional data and metadata.

    Important
    ---------
    While data from the BSRN is generally of high-quality, measurement data
    should always be quality controlled before usage!

    Examples
    --------
    >>> # Retrieve two months irradiance data from the Cabauw BSRN station
    >>> data, metadata = pvlib.iotools.get_bsrn(  # doctest: +SKIP
    >>>     start=pd.Timestamp(2020,1,1), end=pd.Timestamp(2020,2,1),   # doctest: +SKIP
    >>>     station='cab', username='yourusername', password='yourpassword')  # doctest: +SKIP

    See Also
    --------
    pvlib.iotools.read_bsrn, pvlib.iotools.parse_bsrn

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
        Network (BSRN)
        <https://bsrn.awi.de/>`_
    .. [2] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [3] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    .. [4] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    """  # noqa: E501
    # The FTP server uses lowercase station abbreviations
    station = station.lower()

    # Coerce to Timestamps so that date strings and datetime objects work too
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    # One file per calendar month from start's month through end's month
    # (SSSMMYY.dat.gz). Month-start stamps are used because the 'M'
    # (month-end) frequency alias is deprecated in recent pandas versions.
    months = pd.date_range(start.normalize().replace(day=1),
                           end.normalize().replace(day=1), freq='MS')
    filenames = months.strftime(f"{station}%m%y.dat.gz").tolist()

    dfs = []  # Monthly dataframes
    non_existing_files = []  # Files that were not found on the server
    metadata = {}  # Overwritten by each parsed file; last one is returned

    # The context manager closes the connection (QUIT) on exit
    with ftplib.FTP(BSRN_FTP_URL, username, password) as ftp:
        # Change to station sub-directory (checks that the station exists)
        try:
            ftp.cwd(f'/{station}')
        except ftplib.error_perm as e:
            raise KeyError('Station sub-directory does not exist. Specified '
                           'station is probably not a proper three letter '
                           'station abbreviation.') from e

        for filename in filenames:
            try:
                bio = io.BytesIO()
                # Retrieve binary file from server and write to BytesIO object
                response = ftp.retrbinary(f'RETR {filename}', bio.write)
                # Check that transfer was successful
                if not response.startswith('226 Transfer complete'):
                    raise ftplib.Error(response)
                # Save file locally if save_path is specified
                if save_path is not None:
                    with open(os.path.join(save_path, filename), 'wb') as f:
                        f.write(bio.getbuffer())
                # Decompress and decode the downloaded file for parsing
                bio.seek(0)  # reset buffer to start of file
                gzip_file = io.TextIOWrapper(gzip.GzipFile(fileobj=bio),
                                             encoding='latin1')
                dfi, metadata = parse_bsrn(gzip_file, logical_records)
                dfs.append(dfi)
            # FTP client raises an error if the file does not exist on server
            except ftplib.error_perm as e:
                if str(e) == '550 Failed to open file.':
                    non_existing_files.append(filename)
                else:
                    raise  # re-raise unchanged, keeping the traceback

    # Raise user warnings
    if not dfs:  # If no files were found
        warnings.warn('No files were available for the specified timeframe.')
    elif non_existing_files:  # If only some files were missing
        warnings.warn(f'The following files were not found: {non_existing_files}')  # noqa: E501

    # Concatenate monthly dataframes to one dataframe
    if dfs:
        data = pd.concat(dfs, axis='rows')
    else:  # Return empty dataframe and empty metadata
        data = _empty_dataframe_from_logical_records(logical_records)
        metadata = {}
    # Return dataframe and metadata (metadata belongs to last available file)
    return data, metadata


def parse_bsrn(fbuf, logical_records=('0100',)):
    """
    Parse a file-like buffer of a BSRN station-to-archive file.

    Parameters
    ----------
    fbuf: file-like buffer
        Buffer of a BSRN station-to-archive data file. The buffer must be
        seekable as it is rewound and read several times.
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.

    Returns
    -------
    data: DataFrame
        timeseries data from the BSRN archive, see
        :func:`pvlib.iotools.read_bsrn` for fields. An empty DataFrame is
        returned if the specified logical records were not found.
    metadata: dict
        Dictionary containing metadata (primarily from LR0004). An empty
        dictionary is returned if none of the specified logical records were
        found.

    Raises
    ------
    ValueError
        If the mandatory logical record LR0004 is not present in the buffer
        or if ``logical_records`` contains an unsupported record.

    See Also
    --------
    pvlib.iotools.read_bsrn, pvlib.iotools.get_bsrn
    """
    # Parse metadata
    fbuf.readline()  # first line should be *U0001, so read it and discard
    date_line = fbuf.readline()  # second line contains important metadata
    start_date = pd.Timestamp(year=int(date_line[7:11]),
                              month=int(date_line[3:6]), day=1,
                              tz='UTC')  # BSRN timestamps are UTC

    metadata = {}  # Initialize dictionary containing metadata
    metadata['start date'] = start_date
    metadata['station identification number'] = int(date_line[:3])
    metadata['version of data'] = int(date_line.split()[-1])

    # Advance to the LR0004 header. readline() returns '' at end of file, at
    # which point the else-branch reports the missing record.
    for line in iter(fbuf.readline, ''):
        if line[2:6] == '0004':  # stop once LR0004 has been reached
            break
    else:
        raise ValueError('Mandatory record LR0004 not found.')

    metadata['date when station description changed'] = fbuf.readline().strip()
    metadata['surface type'] = int(fbuf.readline(3))
    metadata['topography type'] = int(fbuf.readline())
    metadata['address'] = fbuf.readline().strip()
    metadata['telephone no. of station'] = fbuf.readline(20).strip()
    metadata['FAX no. of station'] = fbuf.readline().strip()
    metadata['TCP/IP no. of station'] = fbuf.readline(15).strip()
    metadata['e-mail address of station'] = fbuf.readline().strip()
    metadata['latitude_bsrn'] = float(fbuf.readline(8))  # BSRN convention
    metadata['latitude'] = metadata['latitude_bsrn'] - 90  # ISO 19115
    metadata['longitude_bsrn'] = float(fbuf.readline(8))  # BSRN convention
    metadata['longitude'] = metadata['longitude_bsrn'] - 180  # ISO 19115
    metadata['altitude'] = int(fbuf.readline(5))
    metadata['identification of "SYNOP" station'] = fbuf.readline().strip()
    metadata['date when horizon changed'] = fbuf.readline().strip()

    # Last section of LR0004: alternating azimuth / elevation integers that
    # continue until the next record header ('*') or the end of the file
    horizon = []  # list for raw horizon elevation data
    while True:
        line = fbuf.readline()
        if '*' in line or line == '':
            break
        horizon += [int(i) for i in line.split()]
    horizon = pd.Series(horizon[1::2], horizon[::2], name='horizon_elevation',
                        dtype=int).drop(-1, errors='ignore').sort_index()
    horizon.index.name = 'azimuth'
    metadata['horizon'] = horizon

    # Read file and store the starting line number and number of lines for
    # each logical record (LR)
    fbuf.seek(0)  # reset buffer to start of file
    lr_startrow = {}  # Line number of the header of each LR
    lr_nrows = {}  # Number of data lines following the header of each LR
    current_lr = None  # LR whose data lines are currently being counted
    num = -1
    for num, line in enumerate(fbuf):
        if line.startswith('*'):  # Find start of all logical records
            if current_lr is not None:
                # The previous record ends on the line before this header
                lr_nrows[current_lr] = num - lr_startrow[current_lr] - 1
            current_lr = line[2:6]  # string of 4 digit LR number
            lr_startrow[current_lr] = num
    if current_lr is not None:
        # The last record runs until the final line of the file
        lr_nrows[current_lr] = num - lr_startrow[current_lr]

    for record in logical_records:
        if record not in ['0100', '0300', '0500']:
            raise ValueError(f"Logical record {record} not in "
                             "['0100', '0300','0500'].")

    dfs = []  # Initialize empty list for dataframe

    # Parse LR0100 - basic measurements including GHI, DNI, DHI and temperature
    if '0100' in lr_startrow and '0100' in logical_records:
        fbuf.seek(0)  # reset buffer to start of file
        LR_0100 = pd.read_fwf(fbuf, skiprows=lr_startrow['0100'] + 1,
                              nrows=lr_nrows['0100'], header=None,
                              colspecs=BSRN_LR0100_COL_SPECS,
                              na_values=[-999.0, -99.9])
        # Each timestamp spans two lines: create a multi-index and unstack,
        # resulting in 1 col for each variable
        LR_0100 = LR_0100.set_index([LR_0100.index // 2, LR_0100.index % 2])
        LR_0100 = LR_0100.unstack(level=1).swaplevel(i=0, j=1, axis='columns')
        # Sort columns to match original order and assign column names
        LR_0100 = LR_0100.reindex(sorted(LR_0100.columns), axis='columns')
        LR_0100.columns = BSRN_LR0100_COLUMNS
        # Set datetime index
        LR_0100.index = (start_date
                         + pd.to_timedelta(LR_0100['day'] - 1, unit='D')
                         + pd.to_timedelta(LR_0100['minute'], unit='min'))
        # Drop empty, minute, and day columns
        LR_0100 = LR_0100.drop(columns=['empty', 'day', 'minute'])
        dfs.append(LR_0100)

    # Parse LR0300 - other time series data, including upward and net radiation
    if '0300' in lr_startrow and '0300' in logical_records:
        fbuf.seek(0)  # reset buffer to start of file
        LR_0300 = pd.read_fwf(fbuf, skiprows=lr_startrow['0300'] + 1,
                              nrows=lr_nrows['0300'], header=None,
                              na_values=[-999.0, -99.9],
                              colspecs=BSRN_LR0300_COL_SPECS,
                              names=BSRN_LR0300_COLUMNS)
        LR_0300.index = (start_date
                         + pd.to_timedelta(LR_0300['day'] - 1, unit='D')
                         + pd.to_timedelta(LR_0300['minute'], unit='min'))
        LR_0300 = LR_0300.drop(columns=['day', 'minute']).astype(float)
        dfs.append(LR_0300)

    # Parse LR0500 - UV measurements
    if '0500' in lr_startrow and '0500' in logical_records:
        fbuf.seek(0)  # reset buffer to start of file
        LR_0500 = pd.read_fwf(fbuf, skiprows=lr_startrow['0500'] + 1,
                              nrows=lr_nrows['0500'], na_values=[-99.9],
                              header=None, colspecs=BSRN_LR0500_COL_SPECS)
        # Each timestamp spans two lines: create a multi-index and unstack,
        # resulting in 1 col for each variable
        LR_0500 = LR_0500.set_index([LR_0500.index // 2, LR_0500.index % 2])
        LR_0500 = LR_0500.unstack(level=1).swaplevel(i=0, j=1, axis='columns')
        # Sort columns to match original order and assign column names
        LR_0500 = LR_0500.reindex(sorted(LR_0500.columns), axis='columns')
        LR_0500.columns = BSRN_LR0500_COLUMNS
        LR_0500.index = (start_date
                         + pd.to_timedelta(LR_0500['day'] - 1, unit='D')
                         + pd.to_timedelta(LR_0500['minute'], unit='min'))
        LR_0500 = LR_0500.drop(columns=['empty', 'day', 'minute'])
        dfs.append(LR_0500)

    if dfs:
        data = pd.concat(dfs, axis='columns')
    else:
        # None of the requested records are present in the file; existing
        # callers receive an empty frame together with empty metadata.
        data = _empty_dataframe_from_logical_records(logical_records)
        metadata = {}
    return data, metadata


def read_bsrn(filename, logical_records=('0100',)):
    """
    Read a BSRN station-to-archive file into a DataFrame.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    The function is able to parse logical records (LR) 0100, 0300, and 0500.
    LR0100 contains the basic measurements, which include global, diffuse, and
    direct irradiance, as well as downwelling long-wave radiation [2]_. Future
    updates may include parsing of additional data and metadata.

    BSRN files are freely available and can be accessed via FTP [3]_. The
    username and password for the BSRN FTP server can be obtained for free as
    described in the BSRN's Data Release Guidelines [4]_.

    Parameters
    ----------
    filename: str or path-like
        Name or path of a BSRN station-to-archive data file. Files whose name
        ends in '.gz' are decompressed on the fly.
    logical_records: list or tuple, default: ('0100',)
        List of the logical records (LR) to parse. Options include: '0100',
        '0300', and '0500'.

    Returns
    -------
    data: DataFrame
        A DataFrame with the columns as described below. For a more extensive
        description of the variables, consult [2]_. An empty DataFrame is
        returned if the specified logical records were not found.
    metadata: dict
        Dictionary containing metadata (primarily from LR0004).

    Notes
    -----
    The data DataFrame for LR0100 includes the following fields:

    =======================  ======  ==========================================
    Key                      Format  Description
    =======================  ======  ==========================================
    **Logical record 0100**
    ---------------------------------------------------------------------------
    ghi†                     float   Mean global horizontal irradiance [W/m^2]
    dni†                     float   Mean direct normal irradiance [W/m^2]
    dhi†                     float   Mean diffuse horizontal irradiance [W/m^2]
    lwd†                     float   Mean downward long-wave radiation [W/m^2]
    temp_air                 float   Air temperature [°C]
    relative_humidity        float   Relative humidity [%]
    pressure                 float   Atmospheric pressure [hPa]
    -----------------------  ------  ------------------------------------------
    **Logical record 0300**
    ---------------------------------------------------------------------------
    gri†                     float   Mean ground-reflected irradiance [W/m^2]
    lwu†                     float   Mean long-wave upwelling irradiance [W/m^2]
    net_radiation†           float   Mean net radiation (net radiometer) [W/m^2]
    -----------------------  ------  ------------------------------------------
    **Logical record 0500**
    ---------------------------------------------------------------------------
    uva_global†              float   Mean UV-A global irradiance [W/m^2]
    uvb_direct†              float   Mean UV-B direct irradiance [W/m^2]
    uvb_global†              float   Mean UV-B global irradiance [W/m^2]
    uvb_diffuse†             float   Mean UV-B diffuse irradiance [W/m^2]
    uvb_reflected†           float   Mean UV-B reflected irradiance [W/m^2]
    =======================  ======  ==========================================

    † Marked variables have corresponding columns for the standard deviation
    (_std), minimum (_min), and maximum (_max) calculated from the 60 samples
    that are averaged into each 1-minute measurement.

    Hint
    ----
    According to [2]_ "All time labels in the station-to-archive files denote
    the start of a time interval." This corresponds to left bin edge labeling.

    See Also
    --------
    pvlib.iotools.parse_bsrn, pvlib.iotools.get_bsrn

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
        Network (BSRN)
        <https://bsrn.awi.de/>`_
    .. [2] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    .. [3] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [4] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    """  # noqa: E501
    if str(filename).endswith('.gz'):  # check if file is a gzipped (.gz) file
        open_func, mode = gzip.open, 'rt'
    else:
        open_func, mode = open, 'r'
    # Decode as latin1, the encoding get_bsrn uses for the same files, rather
    # than the platform default, so results do not depend on the locale.
    with open_func(filename, mode, encoding='latin1') as f:
        content = parse_bsrn(f, logical_records)
    return content