Source code for pytesmo.validation_framework.data_scalers

# Copyright (c) 2017, Vienna University of Technology,
# Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of the Vienna University of Technology, Department of
#     Geodesy and Geoinformation nor the names of its contributors may be used
#     to endorse or promote products derived from this software without specific
#     prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY, DEPARTMENT OF
# GEODESY AND GEOINFORMATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

'''
Data scaler classes to be used together with the validation framework.
'''

import numpy as np
import pandas as pd
import pytesmo.scaling as scaling
from pytesmo.scaling import lin_cdf_match_stored_params
from pytesmo.utils import unique_percentiles_interpolate
from pynetcf.point_data import GriddedPointData


class DefaultScaler(object):
    """
    Scaling class that implements scaling based on a given method from
    the pytesmo.scaling module.

    Parameters
    ----------
    method: string
        The data will be scaled into the reference space using the
        method specified by this string.
    """

    def __init__(self, method):
        self.method = method

    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            all columns scaled to the reference column

        Raises
        ------
        ValueError
            if scaling is not successful
        """
        return scaling.scale(data,
                             method=self.method,
                             reference_index=reference_index)


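# A minimal usage sketch (not part of the original module): scale two
# hypothetical satellite columns to an in-situ reference column with
# DefaultScaler. All column names and values below are made up, and
# 'mean_std' is just one of the methods offered by pytesmo.scaling.
def _example_default_scaler():
    index = pd.date_range('2017-01-01', periods=100, freq='D')
    rng = np.random.RandomState(0)
    data = pd.DataFrame({'insitu': 0.30 + 0.05 * rng.randn(100),
                         'sat_a': 0.50 + 0.10 * rng.randn(100),
                         'sat_b': 0.20 + 0.02 * rng.randn(100)},
                        index=index)
    scaler = DefaultScaler(method='mean_std')
    # scale 'sat_a' and 'sat_b' into the space of column 0 ('insitu');
    # gpi_info is not used by DefaultScaler but is part of the interface
    return scaler.scale(data, reference_index=0, gpi_info=(0, 16.37, 48.21))

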
class CDFStoreParamsScaler(object):
    """
    CDF scaling using stored parameters if available. If stored parameters
    are not available they are calculated and written to disk.

    Parameters
    ----------
    path: string
        Path where the scaling parameters are/should be stored.
    grid: :py:class:`pygeogrids.grids.CellGrid` instance
        Grid on which the parameters are stored. Should be the same as the
        spatial reference grid of the validation framework instance in
        which this scaler is used.
    percentiles: list or np.ndarray
        Percentiles to use for CDF matching.
    """

    def __init__(self, path, grid,
                 percentiles=[0, 5, 10, 30, 50, 70, 90, 95, 100]):
        self.path = path
        self.grid = grid
        self.percentiles = np.asanyarray(percentiles)
        self.io = GriddedPointData(
            path, grid, mode='a',
            ioclass_kws={'add_dims':
                         {'percentiles': self.percentiles.size}})

    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            all columns scaled to the reference column

        Raises
        ------
        ValueError
            if scaling is not successful
        """
        gpi = gpi_info[0]
        parameters = self.get_parameters(data, gpi)

        reference_name = data.columns.values[reference_index]
        reference = data[reference_name]
        data = data.drop([reference_name], axis=1)
        for series in data:
            src_percentiles = parameters[series]
            ref_percentiles = parameters[reference_name]
            data[series] = pd.Series(
                lin_cdf_match_stored_params(data[series].values,
                                            src_percentiles,
                                            ref_percentiles),
                index=data.index)

        data.insert(reference_index, reference.name, reference)
        return data

    def calc_parameters(self, data):
        """
        Calculate the percentiles used for CDF matching.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset

        Returns
        -------
        parameters: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        parameters = {}
        for column in data.columns:
            c_data = data[column].values
            perc = np.percentile(c_data, self.percentiles)
            perc = unique_percentiles_interpolate(
                perc, percentiles=self.percentiles)
            parameters[column] = perc

        return parameters

    def get_parameters(self, data, gpi):
        """
        Get the scaling parameters: try to load them from disk; if they
        are not found, calculate them from the data and store them.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset
        gpi: int
            grid point index of self.grid

        Returns
        -------
        params: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        params = self.load_parameters(gpi)
        if params is None:
            params = self.calc_parameters(data)
            self.store_parameters(gpi, params)
        return params

    def load_parameters(self, gpi):
        """
        Load the stored scaling parameters for a grid point, or return
        None if no parameters have been stored for it yet.

        Parameters
        ----------
        gpi: int
            grid point index of self.grid

        Returns
        -------
        params: dictionary or None
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = self.io.read(gpi)
        if data is not None:
            unwanted_keys = ['lat', 'lon', 'alt', 'time', 'location_id']
            for key in unwanted_keys:
                del data[key]

            # remove the extra dimension introduced by reading from netCDF
            for key in data:
                data[key] = np.squeeze(data[key])

        return data

    def store_parameters(self, gpi, parameters):
        """
        Store the parameters for a gpi in the netCDF file.

        Parameters
        ----------
        gpi: int
            grid point index of self.grid
        parameters: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = []
        dtypes = []
        dim_info = {'dims': {}}
        for key in parameters:
            dim_info['dims'][key] = ('obs', 'percentiles')
            dtypes.append(
                (key, parameters[key].dtype, (parameters[key].size, )))
            data.append(parameters[key])

        data = np.core.records.fromarrays(
            data, dtype=np.dtype(dtypes, metadata=dim_info))
        self.io.write(gpi, data)


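# A minimal usage sketch (not part of the original module): scale against
# CDF matching percentiles that are persisted on disk and reused across
# runs. The grid construction, file path, gpi and column names below are
# hypothetical; a real setup would use the spatial reference grid of the
# validation framework instance this scaler is plugged into.
def _example_cdf_store_params_scaler():
    import os
    import pygeogrids.grids as grids

    path = '/tmp/cdf_params'
    os.makedirs(path, exist_ok=True)
    # 1-degree regular grid, grouped into 5-degree cells for storage
    grid = grids.genreg_grid(1, 1).to_cell_grid(cellsize=5.)
    scaler = CDFStoreParamsScaler(path=path, grid=grid)

    index = pd.date_range('2017-01-01', periods=365, freq='D')
    rng = np.random.RandomState(1)
    data = pd.DataFrame({'reference': 0.30 + 0.05 * rng.randn(365),
                         'candidate': 0.50 + 0.10 * rng.randn(365)},
                        index=index)
    # the first call computes the percentiles for gpi 100 and writes them
    # to disk; subsequent calls for the same gpi load them instead of
    # recomputing them
    return scaler.scale(data, reference_index=0, gpi_info=(100, 0.0, 0.0))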