Source code for pytesmo.validation_framework.data_scalers

# Copyright (c) 2020, TU Wien, Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the TU Wien, Department of Geodesy and
#      Geoinformation nor the names of its contributors may be used to endorse
#      or promote products derived from this software without specific prior
#      written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Data scaler classes to be used together with the validation framework.
"""

import numpy as np
import pandas as pd
import pytesmo.scaling as scaling
from pytesmo.cdf_matching import CDFMatching
from pynetcf.point_data import GriddedPointData


class DefaultScaler(object):
    """
    Scaling class that scales the data using a given method from the
    pytesmo.scaling module.

    Parameters
    ----------
    method: string
        The data will be scaled into the reference space using the
        method specified by this string.
    """

    def __init__(self, method):
        self.method = method
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            The input data scaled to the reference column.

        Raises
        ------
        ValueError
            If scaling is not successful.
        """
        return scaling.scale(
            data, method=self.method, reference_index=reference_index
        )
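
# A minimal usage sketch for DefaultScaler, assuming hypothetical column
# names and random data for illustration only:
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({
#         "reference": np.random.rand(100),
#         "candidate": np.random.rand(100) * 0.5 + 0.2,
#     })
#     scaler = DefaultScaler(method="mean_std")
#     # column 0 ("reference") is the scaling reference; gpi_info is not
#     # used by this scaler but is required by the interface
#     scaled = scaler.scale(df, reference_index=0, gpi_info=(0, 16.4, 48.2))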
class CDFStoreParamsScaler(object):
    """
    CDF scaling using stored parameters if available. If stored parameters
    are not available, they are calculated and written to disk.

    Parameters
    ----------
    path : string
        Path where the scaling parameters are/should be stored.
    grid : :py:class:`pygeogrids.grids.CellGrid` instance
        Grid on which the data is stored. Should be the same as the spatial
        reference grid of the validation framework instance in which this
        scaler is used.
    percentiles : list or np.ndarray
        Percentiles to use for CDF matching.
    **matcher_kwargs : keyword arguments
        Passed on to :py:class:`pytesmo.cdf_matching.CDFMatching`.
    """

    def __init__(
        self,
        path,
        grid,
        percentiles=[0, 5, 10, 30, 50, 70, 90, 95, 100],
        **matcher_kwargs
    ):
        self.path = path
        self.grid = grid
        self.percentiles = np.asanyarray(percentiles)
        self.io = GriddedPointData(
            path,
            grid,
            mode="a",
            ioclass_kws={
                "add_dims": {"src_ref": 2, "percentiles": len(percentiles)}
            },
        )
        self.matcher_kwargs = matcher_kwargs
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index: int
            Which column of the data contains the scaling reference.
        gpi_info: tuple
            Tuple of at least (gpi, lon, lat), where gpi has to be a grid
            point index of the grid of this scaler.

        Returns
        -------
        data: pandas.DataFrame
            The input data scaled to the reference column.

        Raises
        ------
        ValueError
            If scaling is not successful.
        """
        gpi = gpi_info[0]
        parameters = self.get_parameters(data, reference_index, gpi)

        refname = data.columns.values[reference_index]
        for column in data:
            if column == refname:
                continue
            params = parameters[f"{column}_{refname}"]
            matcher = CDFMatching(
                percentiles=self.percentiles, **self.matcher_kwargs
            )
            # restore the fitted state from the stored percentile arrays
            matcher.x_perc_ = params[0, :]
            matcher.y_perc_ = params[1, :]
            data[column] = pd.Series(
                matcher.predict(data[column]), index=data.index
            )
        return data
    def calc_parameters(self, data, reference_index):
        """
        Calculate the percentiles used for CDF matching.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index : int
            Index of the reference column in the dataset.

        Returns
        -------
        parameters: dictionary
            keys -> "<column>_<refname>" for each non-reference column in
            the input data frame
            values -> 2 x (nbins + 1) numpy.ndarrays with the rows x_perc
            and y_perc
        """
        parameters = {}
        refname = data.columns[reference_index]
        for column in data.columns:
            if column == refname:
                continue
            matcher = CDFMatching(
                percentiles=self.percentiles, **self.matcher_kwargs
            )
            matcher.fit(data[column], data[refname])
            nperc = matcher.nbins + 1
            params = np.zeros((2, nperc), matcher.x_perc_.dtype)
            params[0, :] = matcher.x_perc_
            params[1, :] = matcher.y_perc_
            parameters[f"{column}_{refname}"] = params
        return parameters
    def get_parameters(self, data, reference_index, gpi):
        """
        Get the scaling parameters: try to load them from disk and, if they
        are not found, calculate and store them.

        Parameters
        ----------
        data: pandas.DataFrame
            Temporally matched dataset.
        reference_index : int
            Index of the reference column in the dataset.
        gpi: int
            Grid point index of self.grid.

        Returns
        -------
        params: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        params = self.load_parameters(gpi)
        if params is None:
            params = self.calc_parameters(data, reference_index)
            self.store_parameters(gpi, params)
        return params
    def load_parameters(self, gpi):
        """
        Load the stored scaling parameters for a grid point from the netCDF
        file, or return None if nothing is stored yet.

        Parameters
        ----------
        gpi: int
            Grid point index of self.grid.

        Returns
        -------
        data: dictionary or None
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = self.io.read(gpi)
        if data is not None:
            # drop location metadata that is read along with the parameters
            unwanted_keys = ["lat", "lon", "alt", "time", "location_id"]
            for key in unwanted_keys:
                del data[key]

            # remove extra dimension introduced by reading from netCDF
            for key in data:
                data[key] = np.squeeze(data[key])

        return data
    def store_parameters(self, gpi, parameters):
        """
        Store parameters for gpi into the netCDF file.

        Parameters
        ----------
        gpi: int
            Grid point index of self.grid.
        parameters: dictionary
            keys -> Names of columns in the input data frame
            values -> numpy.ndarrays with the percentiles
        """
        data = []
        dtypes = []
        dim_info = {"dims": {}}
        for key in parameters:
            dim_info["dims"][key] = ("obs", "src_ref", "percentiles")
            dtypes.append(
                (key, parameters[key].dtype, (parameters[key].shape[-1],))
            )
            data.append(parameters[key])

        data = np.core.records.fromarrays(
            data, dtype=np.dtype(dtypes, metadata=dim_info)
        )
        self.io.write(gpi, data)
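
# A minimal usage sketch for CDFStoreParamsScaler, assuming a hypothetical
# storage path and a regular 1 degree grid for illustration. On the first
# call for a grid point the CDF matching percentiles are computed and
# written to ``path``; later calls for the same grid point reuse them:
#
#     import pygeogrids.grids as grids
#
#     grid = grids.genreg_grid(1, 1).to_cell_grid()
#     scaler = CDFStoreParamsScaler("/tmp/cdf_params", grid)
#     gpi, _ = grid.find_nearest_gpi(16.4, 48.2)
#     # df is a temporally matched DataFrame as in the sketch above
#     scaled = scaler.scale(df, reference_index=0, gpi_info=(gpi, 16.4, 48.2))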