Source code for pytesmo.validation_framework.data_scalers

# Copyright (c) 2020, TU Wien, Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the TU Wien, Department of Geodesy and
#      Geoinformation nor the names of its contributors may be used to endorse
#      or promote products derived from this software without specific prior
#      written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Data scaler classes to be used together with the validation framework.
"""

import numpy as np
import pandas as pd
import pytesmo.scaling as scaling
from pytesmo.cdf_matching import CDFMatching
from pynetcf.point_data import GriddedPointData


class DefaultScaler(object):
    """
    Scaling class that scales the data using a given method from the
    pytesmo.scaling module.

    Parameters
    ----------
    method: string
        The data will be scaled into the reference space using the
        method specified by this string.
    """

    def __init__(self, method):
        self.method = method
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            The input data scaled to the reference column.

        Raises
        ------
        ValueError
            If scaling is not successful.
        """
        return scaling.scale(
            data, method=self.method, reference_index=reference_index
        )
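
# A minimal usage sketch for DefaultScaler, assuming hypothetical column
# names and random data for illustration only:
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({
#         "reference": np.random.rand(100),
#         "candidate": np.random.rand(100) * 0.5 + 0.2,
#     })
#     scaler = DefaultScaler(method="mean_std")
#     # column 0 ("reference") is the scaling reference; gpi_info is not
#     # used by this scaler but is required by the interface
#     scaled = scaler.scale(df, reference_index=0, gpi_info=(0, 16.4, 48.2))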
class CDFStoreParamsScaler(object):
    """
    CDF scaling using stored parameters if available. If stored parameters
    are not available, they are calculated and written to disk.

    Parameters
    ----------
    path : string
        Path where the scaling parameters are/should be stored.
    grid : :py:class:`pygeogrids.grids.CellGrid` instance
        Grid on which the data is stored. Should be the same as the spatial
        reference grid of the validation framework instance in which this
        scaler is used.
    percentiles : list or np.ndarray
        Percentiles to use for CDF matching.
    **matcher_kwargs : keyword arguments
        Passed on to :py:class:`pytesmo.cdf_matching.CDFMatching`.
    """

    def __init__(
        self,
        path,
        grid,
        percentiles=[0, 5, 10, 30, 50, 70, 90, 95, 100],
        **matcher_kwargs
    ):
        self.path = path
        self.grid = grid
        self.percentiles = np.asanyarray(percentiles)
        self.io = GriddedPointData(
            path,
            grid,
            mode="a",
            ioclass_kws={
                "add_dims": {"src_ref": 2, "percentiles": len(percentiles)}
            },
        )
        self.matcher_kwargs = matcher_kwargs
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            The input data scaled to the reference column.

        Raises
        ------
        ValueError
            If scaling is not successful.
        """
        gpi = gpi_info[0]
        parameters = self.get_parameters(data, reference_index, gpi)

        refname = data.columns.values[reference_index]
        for column in data:
            if column == refname:
                continue
            params = parameters[f"{column}_{refname}"]
            matcher = CDFMatching(
                percentiles=self.percentiles, **self.matcher_kwargs
            )
            # restore the fitted state from the stored percentile arrays
            matcher.x_perc_ = params[0, :]
            matcher.y_perc_ = params[1, :]
            data[column] = pd.Series(
                matcher.predict(data[column]), index=data.index
            )
        return data
    def calc_parameters(self, data, reference_index):
        """
        Calculate the percentiles used for CDF matching.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index : int
            Index of the reference column in the dataset.

        Returns
        -------
        parameters: dictionary
            keys -> "<column>_<refname>" for each non-reference column in
            the input data frame
            values -> 2 x (nbins + 1) numpy.ndarrays with the rows x_perc
            and y_perc
        """
        parameters = {}
        refname = data.columns[reference_index]
        for column in data.columns:
            if column == refname:
                continue
            matcher = CDFMatching(
                percentiles=self.percentiles, **self.matcher_kwargs
            )
            matcher.fit(data[column], data[refname])
            nperc = matcher.nbins + 1
            params = np.zeros((2, nperc), matcher.x_perc_.dtype)
            params[0, :] = matcher.x_perc_
            params[1, :] = matcher.y_perc_
            parameters[f"{column}_{refname}"] = params
        return parameters
    def get_parameters(self, data, reference_index, gpi):
        """
        Get the scaling parameters: try to load them from disk and, if they
        are not found, calculate and store them.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index : int
            Index of the reference column in the dataset.
        gpi: int
            Grid point index of self.grid.

        Returns
        -------
        params: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        params = self.load_parameters(gpi)
        if params is None:
            params = self.calc_parameters(data, reference_index)
            self.store_parameters(gpi, params)
        return params
    def load_parameters(self, gpi):
        """
        Load the stored scaling parameters for a grid point from the netCDF
        file, or return None if nothing is stored yet.

        Parameters
        ----------
        gpi: int
            Grid point index of self.grid.

        Returns
        -------
        data: dictionary or None
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = self.io.read(gpi)
        if data is not None:
            # drop location metadata that is read along with the parameters
            unwanted_keys = ["lat", "lon", "alt", "time", "location_id"]
            for key in unwanted_keys:
                del data[key]

            # remove extra dimension introduced by reading from netCDF
            for key in data:
                data[key] = np.squeeze(data[key])

        return data
    def store_parameters(self, gpi, parameters):
        """
        Store parameters for gpi into the netCDF file.

        Parameters
        ----------
        gpi: int
            Grid point index of self.grid.
        parameters: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = []
        dtypes = []
        dim_info = {"dims": {}}
        for key in parameters:
            dim_info["dims"][key] = ("obs", "src_ref", "percentiles")
            dtypes.append(
                (key, parameters[key].dtype, (parameters[key].shape[-1],))
            )
            data.append(parameters[key])

        data = np.core.records.fromarrays(
            data, dtype=np.dtype(dtypes, metadata=dim_info)
        )
        self.io.write(gpi, data)
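
# A minimal usage sketch for CDFStoreParamsScaler, assuming a hypothetical
# storage path and a regular 1 degree grid for illustration. On the first
# call for a grid point the CDF matching percentiles are computed and
# written to ``path``; later calls for the same grid point reuse them:
#
#     import pygeogrids.grids as grids
#
#     grid = grids.genreg_grid(1, 1).to_cell_grid()
#     scaler = CDFStoreParamsScaler("/tmp/cdf_params", grid)
#     gpi, _ = grid.find_nearest_gpi(16.4, 48.2)
#     # df is a temporally matched DataFrame as in the sketch above
#     scaled = scaler.scale(df, reference_index=0, gpi_info=(gpi, 16.4, 48.2))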