
"""
Created on Apr 17, 2013

@author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at
"""

from scipy import stats
import numpy as np
import pandas as pd
import pytesmo.utils as utils
from pytesmo.cdf_matching import CDFMatching


def add_scaled(df, method="linreg", label_in=None, label_scale=None, **kwargs):
    """
    Takes a DataFrame and appends a scaled time series to it. If no labels
    are given, the first column will be scaled to the second column of the
    DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe.
    method : string
        Scaling method.
    label_in : string, optional
        The column of the dataframe that should be scaled to the column
        with label_scale. Default is the first column.
    label_scale : string, optional
        The column of the dataframe that the label_in column should be
        scaled to. Default is the second column.

    Returns
    -------
    df : pandas.DataFrame
        Input dataframe with a new column labeled
        label_in + '_scaled_' + method.
    """
    if label_in is None:
        label_in = df.columns.values[0]
    if label_scale is None:
        label_scale = df.columns.values[1]

    scaling_func = get_scaling_function(method)
    scaled = scaling_func(
        df[label_in].values, df[label_scale].values, **kwargs
    )

    new_label = label_in + "_scaled_" + method
    df[new_label] = pd.Series(scaled, index=df.index)

    return df


def scale(df, method="linreg", reference_index=0, **kwargs):
    """
    Takes a pandas.DataFrame and scales all columns to the column specified
    by reference_index with the chosen method.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing matched time series that should be scaled.
    method : string, optional
        Method definition; has to be a function in globals() that takes
        two numpy.array as input and returns one numpy.array of the same
        length.
    reference_index : int, optional
        Default 0, column index of the reference dataset in the dataframe.

    Returns
    -------
    scaled data : pandas.DataFrame
        All time series of the input DataFrame scaled to the one specified
        by reference_index.
    """
    scaling_func = get_scaling_function(method)

    reference = df[df.columns.values[reference_index]]
    df = df.drop([df.columns.values[reference_index]], axis=1)
    for series in df:
        df[series] = pd.Series(
            scaling_func(df[series].values, reference.values, **kwargs),
            index=df.index,
        )

    df.insert(reference_index, reference.name, reference)
    return df


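# Usage sketch (illustrative, not part of the original module): scale the
# "src" column of a synthetic two-column DataFrame to the "ref" column.
# Column names and data are made up for this example.
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> from pytesmo.scaling import scale
# >>> rng = np.random.default_rng(0)
# >>> index = pd.date_range("2020-01-01", periods=365, freq="D")
# >>> df = pd.DataFrame({"ref": rng.normal(0.3, 0.05, 365),
# ...                    "src": rng.normal(0.5, 0.10, 365)}, index=index)
# >>> scaled = scale(df, method="mean_std", reference_index=0)
# >>> bool(np.isclose(scaled["src"].mean(), df["ref"].mean()))
# True

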
def get_scaling_method_lut():
    """
    Get all defined scaling methods and their function names.

    Returns
    -------
    lut : dictionary
        key: scaling method name
        value: function
    """
    lut = {
        "linreg": linreg,
        "mean_std": mean_std,
        "min_max": min_max,
        "cdf_match": cdf_match,
        "cdf_beta_match": cdf_beta_match,
    }
    return lut


def get_scaling_function(method):
    """
    Get scaling function based on method name.

    Parameters
    ----------
    method : string
        Method name as string.

    Returns
    -------
    scaling_func : function
        function(src: numpy.ndarray, ref: numpy.ndarray) -> scaled_src: np.ndarray

    Raises
    ------
    KeyError
        If method is not found.
    """
    lut = get_scaling_method_lut()
    try:
        return lut[method]
    except KeyError:
        raise KeyError(f"Scaling method {method} not found.")


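# Example (illustrative sketch): look up a scaling function by name and apply
# it to perfectly linearly related synthetic arrays.
#
# >>> import numpy as np
# >>> from pytesmo.scaling import get_scaling_function
# >>> func = get_scaling_function("linreg")
# >>> src = np.array([0.1, 0.2, 0.3, 0.4])
# >>> ref = np.array([0.2, 0.4, 0.6, 0.8])
# >>> bool(np.allclose(func(src, ref), ref))
# True

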
def min_max(src, ref, **kwargs):
    """
    Scales the input dataset so that it has the same minimum and maximum
    as the reference afterwards.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset src with the same maximum and minimum as ref.
    """
    return (src - np.min(src)) / (np.max(src) - np.min(src)) * (
        np.max(ref) - np.min(ref)
    ) + np.min(ref)


def linreg_stored_params(src, slope, intercept):
    """
    Scale the input data with passed correction values.

    Parameters
    ----------
    src : numpy.array
        Candidate values that are scaled.
    slope : float
        Multiplicative correction value.
    intercept : float
        Additive correction value.

    Returns
    -------
    src_scaled : numpy.array
        The scaled input values.
    """
    # The absolute value of the slope is applied, so an anti-correlated
    # candidate is not inverted by a negative regression slope.
    return np.abs(slope) * src + intercept


def linreg_params(src, ref):
    """
    Calculate additive and multiplicative correction parameters based on
    linear regression models.

    Parameters
    ----------
    src : numpy.array
        Candidate data (to which the corrections apply).
    ref : numpy.array
        Reference data (which the candidate is scaled to).

    Returns
    -------
    slope : float
        Multiplicative correction value.
    intercept : float
        Additive correction value.
    """
    slope, intercept, r_value, p_value, std_err = stats.linregress(src, ref)
    return slope, intercept


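# Example sketch (not from the original docs): estimate regression parameters
# on a matched calibration period, then re-apply them to new candidate data
# via linreg_stored_params. All arrays below are synthetic.
#
# >>> import numpy as np
# >>> from pytesmo.scaling import linreg_params, linreg_stored_params
# >>> rng = np.random.default_rng(42)
# >>> src = rng.uniform(0.0, 1.0, 200)
# >>> ref = 0.5 * src + 0.2 + rng.normal(0.0, 0.01, 200)
# >>> slope, intercept = linreg_params(src, ref)
# >>> new_src = rng.uniform(0.0, 1.0, 50)
# >>> scaled = linreg_stored_params(new_src, slope, intercept)

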
def linreg(src, ref, **kwargs):
    """
    Scales the input dataset using linear regression.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset scaled using linear regression.
    """
    slope, intercept = linreg_params(src, ref)
    return linreg_stored_params(src, slope, intercept)


def mean_std(src, ref, **kwargs):
    """
    Scales the input dataset so that it has the same mean and standard
    deviation as the reference afterwards.

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.

    Returns
    -------
    scaled dataset : numpy.array
        Dataset src with the same mean and standard deviation as ref.
    """
    return ((src - np.mean(src)) / np.std(src)) * np.std(ref) + np.mean(ref)


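# Quick check (illustrative, synthetic data): after mean_std scaling, the
# result carries the reference's mean and standard deviation.
#
# >>> import numpy as np
# >>> from pytesmo.scaling import mean_std
# >>> rng = np.random.default_rng(7)
# >>> src = rng.normal(10.0, 2.0, 500)
# >>> ref = rng.normal(0.0, 1.0, 500)
# >>> out = mean_std(src, ref)
# >>> bool(np.isclose(out.mean(), ref.mean()) and np.isclose(out.std(), ref.std()))
# True

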
@utils.deprecated("Use the new implementation 'cdf_match' instead.")
def cdf_beta_match(*args, **kwargs):
    """Deprecated wrapper around :py:func:`cdf_match`."""
    return cdf_match(*args, **kwargs)


def cdf_match(
    src,
    ref,
    nbins=100,
    minobs=20,
    linear_edge_scaling=True,
    percentiles=None,
    combine_invalid=True,
    max_val=None,
    min_val=None,
):
    """
    Rescales by CDF matching.

    This calculates the empirical CDFs for the source and reference dataset
    using a specified number of bins. In case of non-unique percentile
    values, a beta distribution is fitted to the CDF. For more robust
    estimation of the lower and upper bins, linear edge scaling is used
    (see Moesinger et al. (2020) for details).

    Parameters
    ----------
    src : numpy.array
        Input dataset which will be scaled.
    ref : numpy.array
        src will be scaled to this dataset.
    nbins : int, optional
        Number of bins to use for estimation of the CDF.
    minobs : int, optional
        Minimum desired number of observations in a bin for bin resizing.
        If it is ``None``, bins will not be resized. Default is 20.
    linear_edge_scaling : bool, optional
        Whether to derive the edge parameters via linear regression (more
        robust, see Moesinger et al. (2020) for more info). Default is
        ``True``.
        Note that this way only the outliers in the reference (y) CDF are
        handled. Outliers in the input data (x) will not be removed and
        will still show up in the data.
    percentiles : sequence, optional
        Percentile values to use. If this is given, `nbins` is ignored.
        The percentiles might still be changed if `minobs` is given and
        the number of data points per bin is lower. Default is ``None``.
    combine_invalid : bool, optional
        Optional feature to combine the masks of invalid data (NaN, Inf)
        of both source (X) and reference (y) data passed to `fit`. This
        only makes sense if X and y are both time series data
        corresponding to the same index. In this case, this makes sure
        that data is only used if values for X and y are available, so
        that seasonal patterns in missing values in one of them do not
        lead to distortions. (For example, if X is available the whole
        year, but y is only available during summer, the distribution of
        y should not be matched against the whole-year CDF of X, because
        that could introduce systematic seasonal biases.)
        Default is ``True``.
    max_val, min_val : float, optional
        Maximum and minimum values to enforce.

    Returns
    -------
    CDF matched values : numpy.array
        Dataset src with the CDF of ref.
    """
    matcher = CDFMatching(
        nbins=nbins,
        minobs=minobs,
        linear_edge_scaling=linear_edge_scaling,
        percentiles=percentiles,
        combine_invalid=combine_invalid,
    )
    matcher.fit(src, ref)
    scaled = matcher.predict(src)
    if max_val is not None:
        scaled[scaled > max_val] = max_val
    if min_val is not None:
        scaled[scaled < min_val] = min_val
    return scaled
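

# Usage sketch (illustrative, synthetic data): CDF-match a biased candidate
# series to a reference with a different distribution, clipping the result
# to a valid range via min_val/max_val.
#
# >>> import numpy as np
# >>> from pytesmo.scaling import cdf_match
# >>> rng = np.random.default_rng(1)
# >>> ref = rng.beta(2.0, 5.0, 1000)                         # reference
# >>> src = 0.8 * ref + 0.1 + rng.normal(0.0, 0.02, 1000)    # biased candidate
# >>> scaled = cdf_match(src, ref, nbins=100, min_val=0.0, max_val=1.0)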