Source code for pvlib.iotools.crn

"""Functions to read data from the US Climate Reference Network (CRN).
"""

import pandas as pd
import numpy as np


# Column names assigned to the fields of a CRN record, in file order.
HEADERS = [
    'WBANNO',
    'UTC_DATE',
    'UTC_TIME',
    'LST_DATE',
    'LST_TIME',
    'CRX_VN',
    'LONGITUDE',
    'LATITUDE',
    'AIR_TEMPERATURE',
    'PRECIPITATION',
    'SOLAR_RADIATION',
    'SR_FLAG',
    'SURFACE_TEMPERATURE',
    'ST_TYPE',
    'ST_FLAG',
    'RELATIVE_HUMIDITY',
    'RH_FLAG',
    'SOIL_MOISTURE_5',
    'SOIL_TEMPERATURE_5',
    'WETNESS',
    'WET_FLAG',
    'WIND_1_5',
    'WIND_FLAG',
]

# CRN column name -> pvlib variable name, for the columns that have one.
VARIABLE_MAP = {
    # site coordinates
    'LONGITUDE': 'longitude',
    'LATITUDE': 'latitude',
    # measurements and their quality flags
    'AIR_TEMPERATURE': 'temp_air',
    'SOLAR_RADIATION': 'ghi',
    'SR_FLAG': 'ghi_flag',
    'RELATIVE_HUMIDITY': 'relative_humidity',
    'RH_FLAG': 'relative_humidity_flag',
    'WIND_1_5': 'wind_speed',
    'WIND_FLAG': 'wind_speed_flag',
}

# Per-column values to be treated as missing: the numeric sentinel used for
# that column, plus a run of NUL characters, which is added as a possible
# missing value for every column listed here.
NAN_DICT = {
    column: [sentinel, '\x00\x00\x00\x00\x00\x00']
    for column, sentinel in {
        'CRX_VN': -99999,
        'AIR_TEMPERATURE': -9999,
        'PRECIPITATION': -9999,
        'SOLAR_RADIATION': -99999,
        'SURFACE_TEMPERATURE': -9999,
        'RELATIVE_HUMIDITY': -9999,
        'SOIL_MOISTURE_5': -99,
        'SOIL_TEMPERATURE_5': -9999,
        'WETNESS': -9999,
        'WIND_1_5': -99,
    }.items()
}

# Field widths as specified in the CRN README.txt file; these exclude the
# single space that separates adjacent columns.
WIDTHS = [5, 8, 4, 8, 4, 6, 7, 7, 7, 7, 6, 1, 7, 1, 1, 5, 1, 7, 7, 5, 1, 6, 1]
# pandas.read_fwf needs contiguous fields, so every field except the last
# (which has no trailing space) is widened by one to absorb its separator.
WIDTHS[:-1] = [width + 1 for width in WIDTHS[:-1]]

# Explicit dtype for each column, in HEADERS order, for potentially
# problematic values.
DTYPES = [
    'int64',    # WBANNO
    'int64',    # UTC_DATE
    'int64',    # UTC_TIME
    'int64',    # LST_DATE
    'int64',    # LST_TIME
    'str',      # CRX_VN
    'float64',  # LONGITUDE
    'float64',  # LATITUDE
    'float64',  # AIR_TEMPERATURE
    'float64',  # PRECIPITATION
    'float64',  # SOLAR_RADIATION
    'int64',    # SR_FLAG
    'float64',  # SURFACE_TEMPERATURE
    'O',        # ST_TYPE
    'int64',    # ST_FLAG
    'float64',  # RELATIVE_HUMIDITY
    'int64',    # RH_FLAG
    'float64',  # SOIL_MOISTURE_5
    'float64',  # SOIL_TEMPERATURE_5
    'int64',    # WETNESS
    'int64',    # WET_FLAG
    'float64',  # WIND_1_5
    'int64',    # WIND_FLAG
]



[docs]
def read_crn(filename, map_variables=True):
    """Parse a NOAA USCRN fixed-width data file into a pandas DataFrame.

    The U.S. Climate Reference Network (CRN) is made up of more than 100
    meteorological stations across the U.S.; see [1]_ and [2]_. Its main
    purpose is long-term monitoring of temperature, precipitation, and
    soil moisture and temperature. Each site also measures global
    horizontal irradiance (GHI) with a photodiode pyranometer.

    Parameters
    ----------
    filename: str, path object, or file-like
        File path or URL of the fixed-width file to read.
    map_variables: boolean, default: True
        If True, columns that have a pvlib equivalent are renamed to the
        pvlib variable name. See variable :const:`VARIABLE_MAP`.

    Returns
    -------
    data: DataFrame
        All of the variables in the file, indexed by a UTC DatetimeIndex
        built from the ``UTC_DATE`` and ``UTC_TIME`` columns.

    Notes
    -----
    CRN files hold 5 minute averages, each labeled with the time at which
    its interval ends. Missing data is returned as NaN instead of the
    sentinel value the file uses for the field (e.g. -9999 or -99). Air
    temperature is in deg C; wind speed is in m/s, measured 1.5 m above
    ground level.

    By default, variables that correspond to standard pvlib variables are
    renamed, e.g. `SOLAR_RADIATION` becomes `ghi`. The full mapping is the
    :const:`pvlib.iotools.crn.VARIABLE_MAP` dict.

    Occasionally a line of a CRN file holds a set of null characters
    instead of valid data. Such lines are dropped by this function. The
    null characters may be alone on a line or may share a line with valid
    data; in the latter case the valid data is not returned. Users who
    need that line may remove the null characters by hand and parse the
    file again.

    References
    ----------
    .. [1] U.S. Climate Reference Network
       `https://www.ncdc.noaa.gov/crn/qcdatasets.html
       <https://www.ncdc.noaa.gov/crn/qcdatasets.html>`_

    .. [2] Diamond, H. J. et. al., 2013: U.S. Climate Reference Network
       after one decade of operations: status and assessment. Bull.
       Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
    """
    column_dtypes = dict(zip(HEADERS, DTYPES))

    # TODO: once our minimum pandas is >= 1.2.0 (earlier versions have a
    # skip_blank_lines bug), replace the string-parsing workaround below with
    # pd.read_fwf(..., dtype=column_dtypes, skip_blank_lines=True).
    # Workaround: read every field as a string, drop the all-NaN rows, cast
    # to the real dtypes, and only then mask the "sentinel" missing values
    # (e.g. -9999.0).
    frame = pd.read_fwf(filename, header=None, names=HEADERS,
                        widths=WIDTHS, dtype=str)

    # rows without a single value are empty (bad) lines
    frame = frame.dropna(how='all')

    # dtypes cannot be passed to read_fwf because integer columns cannot
    # hold NaN, so the cast happens after the bad rows are gone
    frame = frame.astype(column_dtypes)

    # swap the per-column missing-data sentinels for NaN
    frame = frame.replace(NAN_DICT, value=np.nan)

    # Build the UTC index. UTC_TIME has no leading zeros, so it is padded
    # to four digits to satisfy the %H%M part of the format.
    date_text = frame['UTC_DATE'].astype(str)
    time_text = frame['UTC_TIME'].astype(str).str.zfill(4)
    utc_index = pd.to_datetime(date_text + time_text,
                               format='%Y%m%d%H%M', utc=True)
    frame = frame.set_index(utc_index)

    if not map_variables:
        return frame
    return frame.rename(columns=VARIABLE_MAP)