# Source code for pytesmo.scaling

'''
Created on Apr 17, 2013

@author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at
'''

from scipy import stats
import numpy as np
import pandas as pd
import scipy.interpolate as sc_int
from warnings import warn
from pytesmo.utils import unique_percentiles_interpolate


def add_scaled(df, method='linreg', label_in=None, label_scale=None):
    """
    Append a scaled version of one column to a DataFrame.

    If no labels are given, the first column is scaled to the second
    column of the DataFrame. The new column is written into the DataFrame
    that was passed in, which is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        input dataframe, modified in place
    method : string
        scaling method name, resolved through get_scaling_function
    label_in: string, optional
        the column of the dataframe that should be scaled to that with
        label_scale, default is the first column
    label_scale : string, optional
        the column of the dataframe the label_in column should be scaled to,
        default is the second column

    Returns
    -------
    df : pandas.DataFrame
        input dataframe with new column labeled label_in+'_scaled_'+method
    """
    # Identity comparison is the correct test for "argument not given";
    # ``== None`` would be delegated to the label object's own __eq__.
    if label_in is None:
        label_in = df.columns.values[0]
    if label_scale is None:
        label_scale = df.columns.values[1]

    scaling_func = get_scaling_function(method)
    scaled = scaling_func(df[label_in].values, df[label_scale].values)

    new_label = label_in + '_scaled_' + method
    # re-attach the original index so the new column aligns row by row
    df[new_label] = pd.Series(scaled, index=df.index)

    return df


def scale(df, method='linreg', reference_index=0):
    """
    Scale every column of a DataFrame to a reference column.

    All columns except the one at reference_index are passed through the
    chosen scaling function together with the reference column. The
    reference column itself is returned unchanged at its original position.

    Parameters
    ----------
    df : pandas.DataFrame
        containing matched time series that should be scaled
    method : string, optional
        scaling method name, resolved through get_scaling_function; the
        resulting function takes 2 numpy.array as input and returns one
        numpy.array of same length
    reference_index : int, optional
        default 0, column index of reference dataset in dataframe

    Returns
    -------
    scaled data : pandas.DataFrame
        all time series of the input DataFrame scaled to the one specified by
        reference_index
    """
    scaling_func = get_scaling_function(method)

    reference_label = df.columns.values[reference_index]
    reference = df[reference_label]
    # drop() without inplace=True hands back a new DataFrame, so the columns
    # overwritten below belong to that new object
    df = df.drop([reference_label], axis=1)
    for series in df:
        df[series] = pd.Series(
            scaling_func(df[series].values, reference.values),
            index=df.index)

    # put the unscaled reference back where it was taken from
    df.insert(reference_index, reference.name, reference)

    return df


def get_scaling_method_lut():
    """
    Get all defined scaling methods and their functions.

    A new dictionary is built on every call.

    Returns
    -------
    lut: dictionary
       key: scaling method name
       value: function
    """
    return {
        'linreg': linreg,
        'mean_std': mean_std,
        'min_max': min_max,
        'lin_cdf_match': lin_cdf_match,
        'cdf_match': cdf_match,
    }


def get_scaling_function(method):
    """
    Get scaling function based on method name.

    Parameters
    ----------
    method: string
        method name as string

    Returns
    -------
    scaling_func: function
        function(src:numpy.ndarray, ref:numpy.ndarray) > scaled_src:np.ndarray

    Raises
    ------
    KeyError:
        if method is not found; a warning is issued before the error
        is re-raised
    """
    lut = get_scaling_method_lut()
    try:
        scaling_func = lut[method]
    except KeyError:
        warn('scaling method not found')
        # bare raise re-raises the active KeyError unchanged
        raise
    return scaling_func


def min_max(src, ref):
    """
    Scale src so that it has the same minimum and maximum as ref.

    src is first normalised to the range 0..1 using its own minimum and
    maximum and then stretched to the range of ref. If all values of src
    are equal, the normalisation divides by zero.

    Parameters
    ----------
    src : numpy.array
        input dataset which will be scaled
    ref : numpy.array
        src will be scaled to this dataset

    Returns
    -------
    scaled dataset : numpy.array
        dataset src with same maximum and minimum as ref
    """
    src_min = np.min(src)
    ref_min = np.min(ref)

    # position of every value inside the range of src
    normalised = (src - src_min) / (np.max(src) - src_min)

    return normalised * (np.max(ref) - ref_min) + ref_min


def linreg(src, ref):
    """
    Scale src using the linear regression of ref on src.

    The regression is computed with scipy.stats.linregress(src, ref).
    The absolute value of the fitted slope is applied to src, so the
    scaled series is never inverted, even if src and ref are negatively
    correlated. The fitted intercept is added unchanged.

    Parameters
    ----------
    src : numpy.array
        input dataset which will be scaled
    ref : numpy.array
        src will be scaled to this dataset

    Returns
    -------
    scaled dataset : numpy.array
        dataset scaled using linear regression
    """
    # only slope and intercept are needed, they are the first two entries
    # of the linregress result
    slope, intercept = stats.linregress(src, ref)[:2]

    return np.abs(slope) * src + intercept


def mean_std(src, ref):
    """
    Scale src so that it has the same mean and standard deviation as ref.

    src is standardised with its own mean and standard deviation and then
    rescaled with those of ref. Standard deviations are computed with
    numpy.std using its default settings. If src has a standard deviation
    of zero, the standardisation divides by zero.

    Parameters
    ----------
    src : numpy.array
        input dataset which will be scaled
    ref : numpy.array
        src will be scaled to this dataset

    Returns
    -------
    scaled dataset : numpy.array
        dataset src with same mean and standard deviation as ref
    """
    standardised = (src - np.mean(src)) / np.std(src)

    return standardised * np.std(ref) + np.mean(ref)


def lin_cdf_match(src, ref,
                  min_val=None, max_val=None,
                  percentiles=(0, 5, 10, 30, 50, 70, 90, 95, 100)):
    """
    Match the CDF of src to the CDF of ref using linear interpolation.

    The values of src and ref at the given percentiles are computed with
    numpy.percentile and handed to lin_cdf_match_stored_params.

    This function does not make sure that the percentiles are unique so
    it can happen that multiple measurements are scaled to one point or that
    there are NaN values in the output array.

    Parameters
    ----------
    src: numpy.array
        input dataset which will be scaled
    ref: numpy.array
        src will be scaled to this dataset
    min_val: float, optional
        Minimum allowed value, output data is capped at this value
    max_val: float, optional
        Maximum allowed value, output data is capped at this value
    percentiles: sequence of numbers or numpy.ndarray, optional
        Percentiles to use for CDF matching

    Returns
    -------
    CDF matched values: numpy.array
        dataset src with CDF as ref
    """
    # The default is an immutable tuple so it cannot be altered between calls.
    perc_src = np.array(np.percentile(src, percentiles))
    perc_ref = np.array(np.percentile(ref, percentiles))

    return lin_cdf_match_stored_params(src, perc_src, perc_ref,
                                       min_val=min_val, max_val=max_val)


def lin_cdf_match_stored_params(src, perc_src, perc_ref,
                                min_val=None, max_val=None):
    """
    Performs linear cdf matching using given percentiles.

    Thin wrapper around gen_cdf_match with the spline order fixed to 1.

    Parameters
    ----------
    src: numpy.array
        input data to scale
    perc_src: numpy.array
        percentiles of src estimated through method of choice
    perc_ref: numpy.array
        percentiles of reference data
        estimated through method of choice, must be same size as
        perc_src
    min_val: float, optional
        Minimum allowed value, output data is capped at this value
    max_val: float, optional
        Maximum allowed value, output data is capped at this value

    Returns
    -------
    CDF matched values: numpy.array
        result of gen_cdf_match for src with k=1
    """
    linear_order = 1

    return gen_cdf_match(src, perc_src, perc_ref,
                         min_val=min_val, max_val=max_val,
                         k=linear_order)


def cdf_match(src, ref,
              min_val=None, max_val=None,
              nbins=100):
    """
    Match the CDF of src to the CDF of ref using 5th order splines.

    The values of src and ref are computed at nbins evenly spaced
    percentiles between 0 and 100. Both percentile arrays are passed
    through pytesmo.utils.unique_percentiles_interpolate (presumably to
    replace repeated percentile values by interpolated ones; see that
    function for details) before gen_cdf_match is called with k=5.

    Parameters
    ----------
    src: numpy.array
        input dataset which will be scaled
    ref: numpy.array
        src will be scaled to this dataset
    min_val: float, optional
        Minimum allowed value, output data is capped at this value
    max_val: float, optional
        Maximum allowed value, output data is capped at this value
    nbins: int, optional
        Number of evenly spaced percentile points between 0 and 100 that
        are used for estimation of the CDF

    Returns
    -------
    CDF matched values: numpy.array
        dataset src with CDF as ref
    """
    percentiles = np.linspace(0, 100, nbins)

    perc_src = unique_percentiles_interpolate(
        np.array(np.percentile(src, percentiles)),
        percentiles=percentiles)
    perc_ref = unique_percentiles_interpolate(
        np.array(np.percentile(ref, percentiles)),
        percentiles=percentiles)

    return gen_cdf_match(src, perc_src, perc_ref,
                         min_val=min_val, max_val=max_val,
                         k=5)


def gen_cdf_match(src,
                  perc_src, perc_ref,
                  min_val=None, max_val=None,
                  k=1):
    """
    General cdf matching:

    1. takes the discrete cumulative distribution functions of
       src and ref, given as values at matching percentiles
    2. fits a k-th order interpolating spline that maps perc_src
       to perc_ref
    3. evaluates the spline at src to match its CDF to that of ref

    If the spline can not be constructed a warning is issued and an
    array of NaN values is returned.

    Parameters
    ----------
    src: numpy.array
        input dataset which will be scaled
    perc_src: numpy.array
        percentiles of src
    perc_ref: numpy.array
        percentiles of reference data
        estimated through method of choice, must be same size as
        perc_src
    min_val: float, optional
        Minimum allowed value, output data is capped at this value
    max_val: float, optional
        Maximum allowed value, output data is capped at this value
    k : int, optional
        Order of spline to fit

    Returns
    -------
    CDF matched values: numpy.array
        dataset src with CDF as ref, or floating point NaN values in the
        shape of src if the spline fit failed
    """
    # InterpolatedUnivariateSpline uses extrapolation
    # outside of boundaries so all values can be rescaled
    # This is important if the stored percentiles were generated
    # using a subset of the data and the new data has values outside
    # of this original range
    try:
        inter = sc_int.InterpolatedUnivariateSpline(perc_src,
                                                    perc_ref,
                                                    k=k)
    except Exception:
        # here we must catch all exceptions since scipy does not raise a proper
        # Exception
        warn("Too few percentiles for chosen k.")
        # NaN only exists for floating point dtypes; keep the dtype of src
        # if it is floating point and fall back to float64 otherwise so
        # integer input does not end up with meaningless integer values
        src_dtype = np.asarray(src).dtype
        if np.issubdtype(src_dtype, np.floating):
            nan_dtype = src_dtype
        else:
            nan_dtype = np.float64
        return np.full_like(src, np.nan, dtype=nan_dtype)

    scaled = inter(src)
    # cap the output, upper limit first and lower limit second
    if max_val is not None:
        scaled[scaled > max_val] = max_val
    if min_val is not None:
        scaled[scaled < min_val] = min_val

    return scaled