'''
pytesmo.scaling

Created on Apr 17, 2013

@author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at
'''

from scipy import stats
import numpy as np
import pandas as pd
import scipy.interpolate as sc_int
from warnings import warn
from pytesmo.utils import unique_percentiles_interpolate


def add_scaled(df, method='linreg', label_in=None, label_scale=None):
    """
    Takes a DataFrame and appends a scaled time series to it. If no labels
    are given the first column will be scaled to the second column of the
    DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe.
    method : string
        Scaling method.
    label_in : string, optional
        The column of the dataframe that should be scaled to the column
        with label_scale. Default is the first column.
    label_scale : string, optional
        The column of the dataframe the label_in column should be scaled
        to. Default is the second column.

    Returns
    -------
    df : pandas.DataFrame
        Input dataframe with a new column labeled
        label_in + '_scaled_' + method.
    """
    if label_in is None:
        label_in = df.columns.values[0]
    if label_scale is None:
        label_scale = df.columns.values[1]

    scaling_func = get_scaling_function(method)
    scaled = scaling_func(df[label_in].values, df[label_scale].values)

    new_label = label_in + '_scaled_' + method
    df[new_label] = pd.Series(scaled, index=df.index)

    return df
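

# Illustrative usage sketch, not part of the original module: appends a
# mean/std-scaled copy of one column to a DataFrame. The helper name and
# the synthetic data are made up for demonstration.
def _example_add_scaled():
    df = pd.DataFrame({'satellite': np.random.rand(365),
                       'insitu': np.random.rand(365) * 0.5 + 0.2})
    df = add_scaled(df, method='mean_std',
                    label_in='satellite', label_scale='insitu')
    # df now contains an extra column named 'satellite_scaled_mean_std'.
    return df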


def scale(df, method='linreg', reference_index=0):
    """
    Takes a pandas.DataFrame and scales all columns to the column specified
    by reference_index with the chosen method.

    Parameters
    ----------
    df : pandas.DataFrame
        Containing matched time series that should be scaled.
    method : string, optional
        Method definition; has to be one of the keys of the lookup table
        returned by get_scaling_method_lut(), i.e. a function that takes
        two numpy.arrays as input and returns one numpy.array of the same
        length.
    reference_index : int, optional
        Default: 0. Column index of the reference dataset in the dataframe.

    Returns
    -------
    scaled data : pandas.DataFrame
        All time series of the input DataFrame scaled to the one specified
        by reference_index.
    """
    scaling_func = get_scaling_function(method)

    reference = df[df.columns.values[reference_index]]
    df = df.drop([df.columns.values[reference_index]], axis=1)
    for series in df:
        df[series] = pd.Series(
            scaling_func(df[series].values, reference.values),
            index=df.index)

    df.insert(reference_index, reference.name, reference)
    return df
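

# Illustrative usage sketch, not part of the original module: scales every
# column of a matched DataFrame to the reference column. The helper name
# and the synthetic data are made up for demonstration.
def _example_scale():
    df = pd.DataFrame({'reference': np.random.rand(365),
                       'candidate_1': np.random.rand(365) * 2.0,
                       'candidate_2': np.random.rand(365) + 1.0})
    # Column 0 ('reference') stays unchanged; the other columns are
    # rescaled to it with piecewise-linear CDF matching.
    return scale(df, method='lin_cdf_match', reference_index=0)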


def get_scaling_method_lut():
    """
    Get all defined scaling methods and their function names.

    Returns
    -------
    lut : dictionary
        key: scaling method name
        value: function
    """
    lut = {'linreg': linreg,
           'mean_std': mean_std,
           'min_max': min_max,
           'lin_cdf_match': lin_cdf_match,
           'cdf_match': cdf_match}

    return lut


def get_scaling_function(method):
    """
    Get scaling function based on method name.

    Parameters
    ----------
    method : string
        Method name as string.

    Returns
    -------
    scaling_func : function
        function(src: numpy.ndarray, ref: numpy.ndarray) -> scaled_src: np.ndarray

    Raises
    ------
    KeyError
        If method is not found.
    """
    lut = get_scaling_method_lut()
    try:
        return lut[method]
    except KeyError as e:
        warn('scaling method not found')
        raise e
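

# Illustrative usage sketch, not part of the original module: looks up a
# scaling function by name and applies it directly. An unknown method name
# would raise a KeyError. Helper name and data are made up.
def _example_get_scaling_function():
    src = np.random.rand(100) * 10.0
    ref = np.random.rand(100)
    func = get_scaling_function('mean_std')
    return func(src, ref)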


def min_max(src, ref):
    """
    Scales the input dataset so that it has the same minimum and maximum
    as the reference dataset afterwards.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset src with the same maximum and minimum as ref.
    """
    return ((src - np.min(src)) / (np.max(src) - np.min(src)) *
            (np.max(ref) - np.min(ref)) + np.min(ref))
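

# Illustrative usage sketch, not part of the original module: after min/max
# scaling the result spans exactly the range of the reference. Helper name
# and data are made up.
def _example_min_max():
    src = np.random.rand(100) * 50.0 + 10.0
    ref = np.random.rand(100)
    scaled = min_max(src, ref)
    assert np.isclose(np.min(scaled), np.min(ref))
    assert np.isclose(np.max(scaled), np.max(ref))
    return scaled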


def linreg(src, ref):
    """
    Scales the input dataset using linear regression.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset scaled using linear regression.
    """
    slope, intercept, r_value, p_value, std_err = stats.linregress(src, ref)

    return np.abs(slope) * src + intercept
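

# Illustrative usage sketch, not part of the original module: regression
# scaling of noisy, correlated synthetic data. Because the slope is applied
# as an absolute value, an inverse relationship is not flipped. Helper name
# and data are made up.
def _example_linreg():
    ref = np.random.rand(200)
    src = 2.0 * ref + 0.5 + np.random.normal(0.0, 0.05, 200)
    return linreg(src, ref)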


def mean_std(src, ref):
    """
    Scales the input dataset so that it has the same mean and standard
    deviation as the reference dataset afterwards.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset src with the same mean and standard deviation as ref.
    """
    return ((src - np.mean(src)) / np.std(src)) * np.std(ref) + np.mean(ref)
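

# Illustrative usage sketch, not part of the original module: after mean/std
# scaling the result has exactly the sample mean and standard deviation of
# the reference. Helper name and data are made up.
def _example_mean_std():
    src = np.random.normal(5.0, 2.0, 1000)
    ref = np.random.normal(0.0, 1.0, 1000)
    scaled = mean_std(src, ref)
    assert np.isclose(np.mean(scaled), np.mean(ref))
    assert np.isclose(np.std(scaled), np.std(ref))
    return scaled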


def lin_cdf_match(src, ref,
                  min_val=None, max_val=None,
                  percentiles=[0, 5, 10, 30, 50, 70, 90, 95, 100]):
    '''
    Computes the cumulative distribution functions of src and ref at their
    respective bin edges by linear interpolation, then matches the CDF of
    src to the CDF of ref.

    This function does not make sure that the percentiles are unique, so
    it can happen that multiple measurements are scaled to one point or
    that there are NaN values in the output array.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.
    min_val : float, optional
        Minimum allowed value; output data is capped at this value.
    max_val : float, optional
        Maximum allowed value; output data is capped at this value.
    percentiles : list or numpy.ndarray
        Percentiles to use for CDF matching.

    Returns
    -------
    CDF matched values : numpy.array
        Dataset src with the same CDF as ref.
    '''
    perc_src = np.array(np.percentile(src, percentiles))
    perc_ref = np.array(np.percentile(ref, percentiles))

    return lin_cdf_match_stored_params(src, perc_src, perc_ref,
                                       min_val=min_val, max_val=max_val)
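

# Illustrative usage sketch, not part of the original module:
# piecewise-linear CDF matching with the default percentile edges, with the
# output capped to a valid range. Helper name and data are made up.
def _example_lin_cdf_match():
    src = np.random.rand(500) * 100.0
    ref = np.random.rand(500) * 0.5
    return lin_cdf_match(src, ref, min_val=0.0, max_val=1.0)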


def lin_cdf_match_stored_params(src, perc_src, perc_ref,
                                min_val=None, max_val=None):
    """
    Performs CDF matching using given percentiles.

    Parameters
    ----------
    src : numpy.array
        Input data to scale.
    perc_src : numpy.array
        Percentiles of src estimated through a method of choice.
    perc_ref : numpy.array
        Percentiles of the reference data estimated through a method of
        choice; must be the same size as perc_src.
    min_val : float, optional
        Minimum allowed value; output data is capped at this value.
    max_val : float, optional
        Maximum allowed value; output data is capped at this value.
    """
    return gen_cdf_match(src, perc_src, perc_ref,
                         min_val=min_val, max_val=max_val,
                         k=1)
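

# Illustrative usage sketch, not part of the original module: the
# percentiles can be estimated once on a training period and then reused to
# scale later data. Helper name and data are made up.
def _example_lin_cdf_match_stored_params():
    percentiles = [0, 5, 10, 30, 50, 70, 90, 95, 100]
    train_src = np.random.rand(1000)
    train_ref = np.random.rand(1000) * 0.5
    perc_src = np.percentile(train_src, percentiles)
    perc_ref = np.percentile(train_ref, percentiles)
    # New data is scaled with the stored percentiles; values outside the
    # training range are handled by spline extrapolation in gen_cdf_match.
    new_src = np.random.rand(100) * 1.2
    return lin_cdf_match_stored_params(new_src, perc_src, perc_ref)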


def cdf_match(src, ref,
              min_val=None, max_val=None,
              nbins=100):
    '''
    Computes the cumulative distribution functions of src and ref at their
    respective bin edges by 5th order spline interpolation, then matches
    the CDF of src to the CDF of ref.

    Unlike lin_cdf_match, the estimated percentile values are passed
    through unique_percentiles_interpolate before the spline fit, so that
    they are unique.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.
    min_val : float, optional
        Minimum allowed value; output data is capped at this value.
    max_val : float, optional
        Maximum allowed value; output data is capped at this value.
    nbins : int, optional
        Number of bins to use for estimation of the CDF.

    Returns
    -------
    CDF matched values : numpy.array
        Dataset src with the same CDF as ref.
    '''
    percentiles = np.linspace(0, 100, nbins)

    perc_src = np.array(np.percentile(src, percentiles))
    perc_src = unique_percentiles_interpolate(perc_src,
                                              percentiles=percentiles)
    perc_ref = np.array(np.percentile(ref, percentiles))
    perc_ref = unique_percentiles_interpolate(perc_ref,
                                              percentiles=percentiles)

    return gen_cdf_match(src, perc_src, perc_ref,
                         min_val=min_val, max_val=max_val,
                         k=5)
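

# Illustrative usage sketch, not part of the original module: spline-based
# CDF matching; a coarser nbins smooths the estimated CDFs. Helper name and
# data are made up.
def _example_cdf_match():
    src = np.random.normal(10.0, 3.0, 2000)
    ref = np.random.normal(0.0, 1.0, 2000)
    return cdf_match(src, ref, nbins=50)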


def gen_cdf_match(src,
                  perc_src, perc_ref,
                  min_val=None, max_val=None,
                  k=1):
    """
    General CDF matching:

    1. Computes discrete cumulative distribution functions of src and ref
       at the given percentiles.
    2. Computes continuous CDFs by k-th order spline fitting.
    3. Matches the CDF of src to the CDF of ref.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    perc_src : numpy.array
        Percentiles of src.
    perc_ref : numpy.array
        Percentiles of the reference data estimated through a method of
        choice; must be the same size as perc_src.
    min_val : float, optional
        Minimum allowed value; output data is capped at this value.
    max_val : float, optional
        Maximum allowed value; output data is capped at this value.
    k : int, optional
        Order of the spline to fit.

    Returns
    -------
    CDF matched values : numpy.array
        Dataset src with the same CDF as ref.
    """
    # InterpolatedUnivariateSpline extrapolates outside of the boundaries,
    # so all values can be rescaled. This is important if the stored
    # percentiles were generated using a subset of the data and the new
    # data has values outside of this original range.
    try:
        inter = sc_int.InterpolatedUnivariateSpline(perc_src,
                                                    perc_ref,
                                                    k=k)
    except Exception:
        # We must catch all exceptions here since scipy does not raise a
        # specific exception type in this case.
        warn("Too few percentiles for chosen k.")
        return np.full_like(src, np.nan)

    scaled = inter(src)

    if max_val is not None:
        scaled[scaled > max_val] = max_val
    if min_val is not None:
        scaled[scaled < min_val] = min_val

    return scaled
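

# Illustrative usage sketch, not part of the original module: gen_cdf_match
# with hand-picked percentiles and a quadratic (k=2) spline. If there were
# too few percentiles for the chosen k, the function would warn and return
# an all-NaN array. Helper name and data are made up.
def _example_gen_cdf_match():
    percentiles = [0, 25, 50, 75, 100]
    src = np.random.rand(300)
    ref = np.random.rand(300) * 2.0
    perc_src = np.percentile(src, percentiles)
    perc_ref = np.percentile(ref, percentiles)
    return gen_cdf_match(src, perc_src, perc_ref, k=2)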