Source code for pytesmo.metrics.pairwise

"""
Pairwise metrics and analytical confidence intervals.

Metrics
-------

The metrics function implemented here all have the signature::

    def metric(x : np.ndarray, y : np.ndarray) -> float

Confidence intervals
--------------------

Formulas for confidence intervals have in general been taken from from
Gilleland (2010), 10.5065/D6WD3XJM,
https://opensky.ucar.edu/islandora/object/technotes:491

Other references are cited in the docstring of the respective function.

Analytical confidence interval functions implemented here are named
``<metric>_ci``, e.g. for ``bias``, the CI function is ``bias_ci``.
The signature is be::

    def metric_ci(x : np.ndarray, y : np.ndarray, m : float,
                  alpha=0.05 : float) -> float, float

where m is the metric value that has been calculated for x and y.

Typically, you should use
:py:func:`pytesmo.metrics.confidence_intervals.with_analytical_ci` for
calculating a metric CI.
"""

# IMPORTANT: DEVELOPERS NOTES
#
# Some of the RMSD methods take ddof as keyword argument. Although there are
# some warnings in the docstrings (and also when using it) that it is
# deprecated, we want to keep it for now, in order to reproduce previous
# results.


import numpy as np
from scipy import stats
import warnings

from pytesmo.metrics._fast_pairwise import (  # noqa: F401
    bias,
    mse_bias,
    mse_var,
    mse_corr,
    mse_decomposition,
    RSS,
    _ubrmsd,
    rolling_pr_rmsd,
)


has_ci = [
    "bias",
    "ubrmsd",
    "pearson_r",
    "spearman_r",
    "kendall_tau",
]

no_ci = [
    "aad",
    "mad",
    "mse_bias",
    "msd",
    "rmsd",
    "nrmsd",
    "mse_corr",
    "mse_var",
    "nash_sutcliffe",
    "index_of_agreement",
]


[docs]def bias_ci(x, y, b, alpha=0.05): """ Confidence interval for bias. The confidence interval is the same as the confidence interval for a mean. Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. b : float bias alpha : float, optional 1 - confidence level, default is 0.05 Returns ------- lower, upper : float Lower and upper confidence interval bounds. """ n = len(x) delta = np.std(x - y, ddof=1) / np.sqrt(n) * stats.t.ppf(1 - alpha / 2, n - 1) return b - delta, b + delta
def _bias_ci_from_moments(alpha, mx, my, varx, vary, cov, n): # This is based on the fact that: # var(x - y) = var(x) + var(y) - 2*cov(x,y) # and therefore # std(x - y, ddof=1) = sqrt(var(x - y, ddof=1)) # = sqrt(n/(n-1) * var(x-y)) std = np.sqrt(n / (n - 1) * (varx + vary - 2 * cov)) delta = std / np.sqrt(n) * stats.t.ppf(1 - alpha / 2, n - 1) b = mx - my return b - delta, b + delta
[docs]def aad(x, y): """ Average (=mean) absolute deviation (AAD). Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. Returns ------- d : float Mean absolute deviation. """ return np.mean(np.abs(x - y))
[docs]def mad(x, y): """ Median absolute deviation (MAD). Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. Returns ------- d : float Median absolute deviation. """ return np.median(np.abs(x - y))
[docs]def msd(x, y): r""" Mean square deviation/mean square error. For validation, MSD (same as MSE) is defined as ..math:: MSD = \frac{1}{n}\sum\limits_{i=1}^n (x_i - y_i)^2 MSD can be decomposed into a term describing the deviation of x and y attributable to non-perfect correlation (r < 1), a term depending on the difference in variances between x and y, and the difference in means between x and y (bias). ..math:: MSD &= MSD_{corr} + MSD_{var} + MSD_{bias}\\ &= 2\sigma_x\sigma_y (1-r) + (\sigma_x - \sigma_y)^2 + (\mu_x - \mu_y)^2 This function calculates the full MSD, the function `msd_corr`, `msd_var`, and `msd_bias` can be used to calculate the individual components. Parameters ---------- x : numpy.ndarray First input vector y : numpy.ndarray Second input vector Returns ------- msd : float Mean square deviation """ return RSS(x, y) / len(x)
[docs]def rmsd(x, y, ddof=0): """ Root-mean-square deviation (RMSD). This is the root of MSD (see :py:func:`pytesmo.metrics.msd`). If `x` and `y` have the same mean (i.e. `mean(x - y = 0`) RMSD corresponds to the square root of the variance of `x - y`. Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. ddof : int, optional, DEPRECATED Delta degree of freedom.The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is zero. DEPRECATED: ddof is deprecated and might be removed in future versions. Returns ------- rmsd : float Root-mean-square deviation. """ if ddof == 0: return np.sqrt(msd(x, y)) else: warnings.warn( "ddof is deprecated and might be removed in future versions of" " pytesmo.", category=DeprecationWarning, ) return np.sqrt(RSS(x, y) / (len(x) - ddof))
[docs]def nrmsd(x, y, ddof=0): """ Normalized root-mean-square deviation (nRMSD). This is normalizes RMSD by ``max(max(x), max(y)) - min(min(x), min(y))``. Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. ddof : int, optional Delta degree of freedom.The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is zero. DEPRECATED: ddof is deprecated and might be removed in future versions. Returns ------- nrmsd : float Normalized root-mean-square deviation (nRMSD). """ return rmsd(x, y, ddof) / (np.max([x, y]) - np.min([x, y]))
[docs]def ubrmsd(x, y, ddof=0): r""" Unbiased root-mean-square deviation (uRMSD). This corresponds to RMSD with mean biases removed beforehand, that is ..math:: ubRMSD = \sqrt{\frac{1}{n}\sum\limits_{i=1}^n \left((x - \bar{x}) - (y - \bar{y}))^2} NOTE: If you are scaling the data beforehand to have zero mean bias, this is exactly the same as RMSD. Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. ddof : int, optional Delta degree of freedom.The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is zero. DEPRECATED: ddof is deprecated and might be removed in future versions. Returns ------- ubrmsd : float Unbiased root-mean-square deviation (uRMSD). """ if ddof != 0: warnings.warn( "ddof is deprecated and might be removed in future versions of" " pytesmo.", DeprecationWarning, ) return np.sqrt(RSS(x - np.mean(x), y - np.mean(y)) / (len(x) - ddof)) else: return _ubrmsd(x, y)
[docs]def ubrmsd_ci(x, y, ubrmsd, alpha=0.05): """ Confidende interval for unbiased root-mean-square deviation (uRMSD). Parameters ---------- x : numpy.ndarray First input vector y : numpy.ndarray Second input vector ubrmsd : float ubRMSD for this data alpha : float, optional 1 - confidence level, default is 0.05 Returns ------- lower, upper : float Lower and upper confidence interval bounds. """ n = len(x) ubMSD = ubrmsd ** 2 lb_ubMSD = n * ubMSD / stats.chi2.ppf(1 - alpha / 2, n - 1) ub_ubMSD = n * ubMSD / stats.chi2.ppf(alpha / 2, n - 1) return np.sqrt(lb_ubMSD), np.sqrt(ub_ubMSD)
[docs]def pearson_r(x, y): """ Pearson's linear correlation coefficient. Parameters ---------- x : numpy.ndarray First input vector. y : numpy.ndarray Second input vector. Returns ------- r : float Pearson's correlation coefficent. See Also -------- scipy.stats.pearsonr """ return stats.pearsonr(x, y)[0]
[docs]def pearson_r_ci(x, y, r, alpha=0.05): """ Confidence interval for Pearson correlation coefficient. Parameters ---------- x : numpy.ndarray First input vector y : numpy.ndarray Second input vector r : float Pearson r for this data alpha : float, optional 1 - confidence level, default is 0.05 Returns ------- lower, upper : float Lower and upper confidence interval bounds. References ---------- Bonett, D. G., & Wright, T. A. (2000). Sample size requirements for estimating Pearson, Kendall and Spearman correlations. Psychometrika, 65(1), 23-28. """ n = len(x) v = np.arctanh(r) z = stats.norm.ppf(1 - alpha / 2) cl = v - z / np.sqrt(n - 3) cu = v + z / np.sqrt(n - 3) return np.tanh(cl), np.tanh(cu)
[docs]def spearman_r(x, y): """ Spearman's rank correlation coefficient. Parameters ---------- x : numpy.array First input vector. y : numpy.array Second input vector. Returns ------- rho : float Spearman correlation coefficient See Also -------- scipy.stats.spearmenr """ return stats.spearmanr(x, y)[0]
[docs]def spearman_r_ci(x, y, r, alpha=0.05): """ Confidence interval for Spearman rank correlation coefficient. Parameters ---------- x : numpy.ndarray First input vector y : numpy.ndarray Second input vector r : float Spearman's r for this data alpha : float, optional 1 - confidence level, default is 0.05 Returns ------- lower, upper : float Lower and upper confidence interval bounds. References ---------- Bonett, D. G., & Wright, T. A. (2000). Sample size requirements for estimating Pearson, Kendall and Spearman correlations. Psychometrika, 65(1), 23-28. """ n = len(x) v = np.arctanh(r) z = stats.norm.ppf(1 - alpha / 2) # see reference for this formula cl = v - z * np.sqrt(1 + r ** 2 / 2) / np.sqrt(n - 3) cu = v + z * np.sqrt(1 + r ** 2 / 2) / np.sqrt(n - 3) return np.tanh(cl), np.tanh(cu)
[docs]def kendall_tau(x, y): """ Wrapper for scipy.stats.kendalltau Parameters ---------- x : numpy.array First input vector. y : numpy.array Second input vector. Returns ------- tau : float Kendall's tau statistic See Also -------- scipy.stats.kendalltau """ return stats.kendalltau(x, y)[0]
[docs]def kendall_tau_ci(x, y, tau, alpha=0.05): r""" Confidence intervall for Kendall's rank coefficient. Parameters ---------- x : numpy.ndarray First input vector y : numpy.ndarray Second input vector tau : float Kendall tau for this data alpha : float, optional 1 - confidence level, default is 0.05 Returns ------- lower, upper : float Lower and upper confidence interval bounds. References ---------- Bonett, D. G., & Wright, T. A. (2000). Sample size requirements for estimating Pearson, Kendall and Spearman correlations. Psychometrika, 65(1), 23-28. """ n = len(x) v = np.arctanh(tau) z = stats.norm.ppf(1 - alpha / 2) # see reference for this formula cl = v - z * 0.431 / np.sqrt(n - 3) cu = v + z * 0.431 / np.sqrt(n - 3) return np.tanh(cl), np.tanh(cu)
[docs]def index_of_agreement(o, p): """ Index of agreement was proposed by Willmot (1981), to overcome the insenstivity of Nash-Sutcliffe efficiency E and R^2 to differences in the observed and predicted means and variances (Legates and McCabe, 1999). The index of agreement represents the ratio of the mean square error and the potential error (Willmot, 1984). The potential error in the denominator represents the largest value that the squared difference of each pair can attain. The range of d is similar to that of R^2 and lies between 0 (no correlation) and 1 (perfect fit). Parameters ---------- o : numpy.ndarray Observations. p : numpy.ndarray Predictions. Returns ------- d : float Index of agreement. """ denom = np.sum((np.abs(p - np.mean(o)) + np.abs(o - np.mean(o))) ** 2) d = 1 - RSS(o, p) / denom return d
[docs]def nash_sutcliffe(o, p): """ Nash Sutcliffe model efficiency coefficient E. The range of E lies between 1.0 (perfect fit) and -inf. Parameters ---------- o : numpy.ndarray Observations. p : numpy.ndarray Predictions. Returns ------- E : float Nash Sutcliffe model efficiency coefficient E. """ return 1 - (np.sum((o - p) ** 2)) / (np.sum((o - np.mean(o)) ** 2))