# Source code for pytesmo.df_metrics

# Copyright (c) 2013,Vienna University of Technology,
# Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the Vienna University of Technology,
#      Department of Geodesy and Geoinformation nor the
#      names of its contributors may be used to endorse or promote products
#      derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Module contains wrappers for methods in pytesmo.metrics which can be given
pandas.DataFrames instead of single numpy.arrays.
If the DataFrame has more columns than the function has input parameters
the function will be applied pairwise, resp. to triples.
"""

from collections import namedtuple, OrderedDict
from collections.abc import Iterable
import itertools
import numpy as np
import pandas as pd
from scipy import stats
import warnings

import pytesmo.metrics as metrics
from pytesmo.utils import array_dropna, deprecated


def n_combinations(iterable, n, must_include=None, permutations=False):
    """
    Create possible combinations of an input iterable.

    Parameters
    ----------
    iterable : Iterable
        Elements from this iterable are combined.
    n : int
        Number of elements per combination.
    must_include : Iterable, optional (default: None)
        One or more element(s) of iterable that MUST be in each combination.
    permutations : bool, optional (default: False)
        Create combinations of n elements, order matters: e.g. AB -> AB, BA
        If this is False, the output combinations will be sorted.

    Returns
    -------
    combs : list of tuple
        The possible combinations of n elements.
    """
    if must_include:
        # Wrap a single element (or a bare string, which is itself iterable)
        # into a list so the membership test below iterates over elements.
        if (not isinstance(must_include, Iterable)) or isinstance(
            must_include, str
        ):
            must_include = [must_include]

    if permutations:
        combs = list(itertools.permutations(iterable, n))
    else:
        combs = list(itertools.combinations(iterable, n))

    if must_include:
        # keep only combinations that contain ALL required elements
        combs = [
            comb for comb in combs if all(i in comb for i in must_include)
        ]

    return combs
def _wrap_metric(metric, symmetric=True, name=None):
    """
    Build a dataframe-level wrapper around a pytesmo metric function.

    Parameters
    ----------
    metric : callable
        Metric function from pytesmo.metrics
    symmetric : bool, optional
        Whether the metric is symmetrical w.r.t to the order of input
        arguments. Default is ``True``.
    name : str or None, optional
        The name of the namedtuple. If it is None (default), the name of
        the metric will be used.

    Returns
    -------
    wrapped : callable
        New function that takes a dataframe as input and returns the
        metric value(s) as named tuple. The name of the function is the
        same as the input function name.
    """
    metric_name = metric.__name__

    def df_metric(df):
        # evaluate the metric for all column pairs of df
        pairwise_results = nwise_apply(df, metric, n=2, comm=symmetric)
        return _dict_to_namedtuple(pairwise_results, metric_name)

    # attach name and docstring so the wrapper is self-describing
    df_metric.__name__ = metric_name if name is None else name
    df_metric.__doc__ = f"""
    Wrapper to call :py:func:`pytesmo.metrics.{metric_name}` on a dataframe

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe for whose columns combinations the metric should be
        evaluated.

    Returns
    -------
    result : namedtuple
        Metric values for the different combinations. Member names are
        `df`'s column names separated by '_and_'.

    See also :py:func:`pytesmo.metrics.{metric_name}` docstring.
    """
    return df_metric


# add functions that are simple to wrap to the module
bias = _wrap_metric(metrics.bias, symmetric=False)
msd = _wrap_metric(metrics.msd)
rmsd = _wrap_metric(metrics.rmsd)
nrmsd = _wrap_metric(metrics.nrmsd)
ubrmsd = _wrap_metric(metrics.ubrmsd)
mse_corr = _wrap_metric(metrics.mse_corr)
mse_var = _wrap_metric(metrics.mse_var)
mse_bias = _wrap_metric(metrics.mse_bias)
pearson_r = _wrap_metric(metrics.pearson_r)
spearman_r = _wrap_metric(metrics.spearman_r)
kendall_tau = _wrap_metric(metrics.kendall_tau)
nash_sutcliffe = _wrap_metric(metrics.nash_sutcliffe, name="Nash_Sutcliffe")
RSS = _wrap_metric(metrics.RSS)
def mse_decomposition(df):
    """
    Mean square error (MSE) and decomposition of the MSE into individual
    error components.

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the element
        separated by '_and_'

    See Also
    --------
    pytesmo.metrics.mse_decomposition
    """
    # local names chosen to avoid shadowing the module-level wrapped
    # metrics mse_corr / mse_bias / mse_var
    total, corr_comp, bias_comp, var_comp = nwise_apply(
        df, metrics.mse_decomposition, n=2, comm=True
    )
    return (
        _dict_to_namedtuple(total, "MSE"),
        _dict_to_namedtuple(corr_comp, "MSEcorr"),
        _dict_to_namedtuple(bias_comp, "MSEbias"),
        _dict_to_namedtuple(var_comp, "MSEvar"),
    )
@deprecated()
def mse(df):
    """
    Deprecated: use :py:func:`pytesmo.df_metrics.msd` and the functions
    for the individual components instead, or
    :py:func:`pytesmo.df_metrics.msd_decomposition` for the old
    functionality with better performance.

    Mean square error (MSE) as a decomposition of the RMSD into
    individual error components

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the element
        separated by '_and_'

    See Also
    --------
    pytesmo.metrics.mse
    """
    components = nwise_apply(df, metrics.mse, n=2, comm=True)
    labels = ("MSE", "MSEcorr", "MSEbias", "MSEvar")
    return tuple(
        _dict_to_namedtuple(comp, label)
        for comp, label in zip(components, labels)
    )
@deprecated()
def tcol_error(df):
    """
    Deprecated: use :py:func:`pytesmo.df_metrics.tcol_metrics` instead.

    Triple collocation error estimate, applied to triples of columns of
    the passed data frame.

    Returns
    -------
    triple_collocation_error_x : namedtuple
        Error for the first dataset
    triple_collocation_error_y : namedtuple
        Error for the second dataset
    triple_collocation_error_z : namedtuple
        Error for the third dataset

    See Also
    --------
    pytesmo.metrics.tcol_error
    """
    # For TC, the input order has NO effect --> comm=True
    err0, err1, err2 = nwise_apply(df, metrics.tcol_error, n=3, comm=True)

    trips = list(err0.keys())
    # all three error dicts must contain the same triples
    assert trips == list(err0.keys()) == list(err1.keys()) == list(err2.keys())

    def _as_namedtuple(trip):
        # one named tuple per triple, fields named after the datasets
        values = (err0[trip], err1[trip], err2[trip])
        Result = namedtuple(
            "triple_collocation_error", OrderedDict(zip(trip, values))
        )
        return Result(*values)

    return tuple(_as_namedtuple(trip) for trip in trips)
@deprecated()
def tcol_snr(df, ref_ind=0):
    """
    DEPRECATED: use :py:func:`pytesmo.df_metrics.tcol_metrics` instead.

    Parameters
    ----------
    df : pd.DataFrame
        Contains the input values as time series in the df columns.
    ref_ind : int, optional (default: 0)
        Index of the reference column in df, forwarded to tcol_metrics.
    """
    # BUG FIX: previously this passed the hard-coded value ref_ind=0,
    # silently ignoring the ref_ind argument supplied by the caller.
    return tcol_metrics(df, ref_ind=ref_ind)
def tcol_metrics(df, ref_ind=0):
    """
    Triple Collocation metrics applied to triples of dataframe columns.

    Parameters
    ----------
    df : pd.DataFrame
        Contains the input values as time series in the df columns
    ref_ind : int or None, optional (default: 0)
        The index of the column in df that contains the reference data set.
        If None is passed, we use the first column of each triple as the
        reference, otherwise only triples that contain the reference
        dataset are considered during processing.

    Returns
    -------
    snr : namedtuple
        signal-to-noise (variance) ratio [dB] from the named columns.
    err_std_dev : namedtuple
        **SCALED** error standard deviation from the named columns
    beta : namedtuple
        Scaling coefficients (i_scaled = i * beta_i)
    """
    # For TC, the input order has NO effect --> comm=True
    if ref_ind is None:
        # All unique triples are processed, the first dataset of a triple
        # is the reference.
        incl = None
        ref_ind = 0
    else:
        # This column must be part of each triple and is always used as
        # the reference.
        incl = [ref_ind]

    snr, err, beta = nwise_apply(
        df,
        metrics.tcol_metrics,
        n=3,
        comm=True,
        must_include=incl,
        ref_ind=ref_ind,
    )

    # build one list of named tuples per output variable
    collected = []
    for var_name, var_vals in (
        ("snr", snr),
        ("err_std_dev", err),
        ("beta", beta),
    ):
        tuples = []
        for trip, res in var_vals.items():
            Result = namedtuple(var_name, OrderedDict(zip(trip, res)))
            tuples.append(Result(*res))
        collected.append(tuples)

    return tuple(collected)
def pearsonr(df):
    """
    Wrapper for scipy.stats.pearsonr

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the element
        separated by '_and_'

    See Also
    --------
    pytesmo.metrics.pearsonr
    scipy.stats.pearsonr
    """
    corr, pval = nwise_apply(df, stats.pearsonr, n=2, comm=True)
    return (
        _dict_to_namedtuple(corr, "Pearsons_r"),
        _dict_to_namedtuple(pval, "p_value"),
    )
def spearmanr(df):
    """
    Wrapper for scipy.stats.spearmanr

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the element
        separated by '_and_'

    See Also
    --------
    pytesmo.metrics.spearman_r
    scipy.stats.spearmanr
    """
    corr, pval = nwise_apply(df, stats.spearmanr, n=2, comm=True)
    return (
        _dict_to_namedtuple(corr, "Spearman_r"),
        _dict_to_namedtuple(pval, "p_value"),
    )
def kendalltau(df):
    """
    Wrapper for scipy.stats.kendalltau

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the element
        separated by '_and_'

    See Also
    --------
    pytesmo.metrics.kendalltau
    scipy.stats.kendalltau
    """
    tau, pval = nwise_apply(df, stats.kendalltau, n=2, comm=True)
    return (
        _dict_to_namedtuple(tau, "Kendall_tau"),
        _dict_to_namedtuple(pval, "p_value"),
    )
def pairwise_apply(df, method, comm=False):
    """
    Compute given method pairwise for all columns, excluding NA/null values

    Parameters
    ----------
    df : pd.DataFrame
        input data, method will be applied to each column pair
    method : function
        method to apply to each column pair. has to take 2 input arguments
        of type np.array and return one value or tuple of values
    comm : bool, optional (default: False)
        Also fills the lower part of the results matrix

    Returns
    -------
    results : pd.DataFrame
    """
    warnings.warn(
        "pairwise_apply() is deprecated, use nwise_apply(..., n=2) instead",
        DeprecationWarning,
    )
    numeric_df = df._get_numeric_data()
    cols = numeric_df.columns
    data = numeric_df.values.T
    n_cols = len(cols)

    # probe call on the first column (paired with itself) only to learn
    # how many values the method returns
    probe = np.atleast_1d(method(data[0], data[0]))
    result = np.full((len(probe), n_cols, n_cols), np.nan, dtype=float)

    finite = np.isfinite(data)

    for i, col_i in enumerate(data):
        for j, col_j in enumerate(data):
            if i == j:
                # diagonal stays NaN
                continue
            if comm and np.isfinite(result[0][i, j]):
                # symmetric metric: the mirrored cell was already copied
                continue
            valid = finite[i] & finite[j]
            if not valid.any():
                continue
            if valid.all():
                out = method(col_i, col_j)
            else:
                # restrict both columns to rows where both are finite
                out = method(col_i[valid], col_j[valid])
            for k, value in enumerate(np.atleast_1d(out)):
                result[k][i, j] = value
                if comm:
                    result[k][j, i] = value

    frames = [
        df._constructor(plane, index=cols, columns=cols) for plane in result
    ]
    if len(frames) == 1:
        return frames[0]
    return tuple(frames)
def nwise_apply(
    df,
    method,
    n=2,
    comm=False,
    as_df=False,
    ds_names=True,
    must_include=None,
    **method_kwargs,
):
    """
    Compute given method for column combinations of a data frame, excluding
    NA/null values.

    Parameters
    ----------
    df : pd.DataFrame
        Input data, method will be applied to combinations of columns of
        this df.
    method : function
        method to apply to each column pair. Has to take 2 input arguments
        of type numpy.array and return one value or tuple of values
    n : int, optional (default: 2)
        Number of columns that are combined. The default n=2 is the same as
        the previous pairwise_apply() function.
    comm : bool, optional (default: False)
        Metrics do NOT depend on the order of input values. In these cases
        we can skip unnecessary calculations and simply copy the results if
        necessary (faster).
    as_df : bool, optional (default: False)
        Return matrix structure, same as for previous pairwise_apply(),
        only available for n=2. By default, the return value will be a list
        of ordered dicts.
    ds_names : bool, optional (default: True)
        Use the column names of df to identify the dataset instead of using
        their index.
    must_include : list, optional (default: None)
        The index of one or multiple columns in df that MUST be in part of
        each combination that is processed.
    method_kwargs :
        Keyword arguments that are passed to method.

    Returns
    -------
    results : pd.DataFrame or dict or tuple
    """
    numeric_df = df._get_numeric_data()
    cols = numeric_df.columns.values
    mat = numeric_df.values
    mat = mat.T
    applyf = method

    mask = np.isfinite(mat)

    # create the possible combinations of lines
    counter = list(range(mat.shape[0]))  # get the number of lines?
    # ALL possible combinations of lines (order matters unless symmetric)
    perm = True if not comm else False
    combs = n_combinations(
        counter, n, must_include=must_include, permutations=perm
    )

    # find out how many variables the applyf returns
    result = []
    # probe call with the first n datasets, only to learn the shape of c
    # NOTE: method_kwargs are not passed here, so the method must also work
    # with its default keyword arguments
    c = applyf(*array_dropna(*[mat[i] for i in range(n)]))
    for index, value in enumerate(np.atleast_1d(c)):
        result.append(OrderedDict([(c, np.nan) for c in combs]))
    result = np.array(result)  # array of OrderedDicts

    # each return value result is a dict that gets filled with dicts that
    # have the cols and keys and the results as values
    lut_comb_cols = dict()

    for comb in combs:
        # BUG FIX: the previous np.logical_and(*[mask[i] for i in comb])
        # passed, for n=3, the third mask row as the ufunc `out` argument,
        # overwriting that mask row and producing a wrong validity mask.
        # np.all(..., axis=0) handles any n correctly.
        valid = np.all([mask[i] for i in comb], axis=0)  # all finite rows
        lut_comb_cols.update(dict(zip(comb, tuple(np.take(cols, comb)))))

        if not valid.any():
            continue

        if not valid.all():
            c = applyf(*[mat[i, :][valid] for i in comb], **method_kwargs)
        else:
            c = applyf(*[mat[i, :] for i in comb], **method_kwargs)

        for index, value in enumerate(np.atleast_1d(c)):
            result[index][comb] = value

    if as_df:
        if n != 2:
            raise ValueError("Array structure only available for n=2")
        else:
            if not ds_names:
                lut_comb_cols = None
            result = [
                _to_df(r, comm=comm, lut_names=lut_comb_cols) for r in result
            ]
    else:
        if ds_names:
            # replace the integer column indices in the keys by col names
            formatted_results = []
            for r in result:
                formatted = OrderedDict()
                for k, v in r.items():
                    formatted[tuple([lut_comb_cols[i] for i in k])] = v
                formatted_results.append(formatted)
            result = formatted_results

    if len(result) == 1:
        result = result[0]
    else:
        result = tuple(result)

    return result
def _to_df(result, comm=False, lut_names=None): """ Create a 2d results matrix/dataframe from the result dictionary to reproduce the output structure of the previous pairwise_apply() function. Parameters --------- result : OrderedDict The results as the are calculated in nwise_apply() comm : bool, optional (default: False) Copy elements from the upper diagonal matrix in the lower diagonal. lut_names: dict, optional (default: None) A LUT that applies nice names to the columns and lines in the data frame, e.g. {1:'ds1', 2:'ds2', 3:'ds3') """ # find out how large the matrix is imax = max([max(r) for r in list(result.keys())]) # create and fill the matrix res = np.full((imax + 1, imax + 1), np.nan) for k, v in result.items(): res[k[::-1]] = v res = res.transpose() if comm: i_upper = np.triu_indices(res.shape[0], 1) i_lower = np.tril_indices(res.shape[0], -1) res[i_lower] = res[i_upper] if lut_names is not None: res = pd.DataFrame( data={lut_names[i]: res[:, i] for i in list(range(max(res.shape)))} ) else: res = pd.DataFrame( data={i: res[:, i] for i in list(range(max(res.shape)))} ) res.index = res.columns return res def _dict_to_namedtuple(res_dict, name): """ Takes the OrderedDictionary produced by nwise_apply(..., as_df=False) and produces named tuples, using the dictionary keys. """ names = [] values = [] for k, v in res_dict.items(): names.append("_and_".join(k)) values.append(v) result = namedtuple(name, names) return result._make(values)