Source code for pytesmo.df_metrics

# Copyright (c) 2013,Vienna University of Technology,
# Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the Vienna University of Technology,
#      Department of Geodesy and Geoinformation nor the
#      names of its contributors may be used to endorse or promote products
#      derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
Module contains wrappers for methods in pytesmo.metrics
which can be given pandas.DataFrames
instead of single numpy.arrays . If the DataFrame has more columns
than the function has input parameters
the function will be applied pairwise

Created on Aug 14, 2013

@author: Christoph Paulik Christoph.Paulik@geo.tuwien.ac.at
'''

import numpy as np
import pytesmo.metrics as metrics
from collections import namedtuple


[docs]class DataFrameDimensionError(Exception):
    pass


[docs]def bias(df):
    """Bias

    Returns
    -------
    bias : pandas.Dataframe
        of shape (len(df.columns),len(df.columns))
    See Also
    --------
    pytesmo.metrics.bias
    """
    return _to_namedtuple(pairwise_apply(df, metrics.bias), 'bias')


[docs]def rmsd(df):
    """Root-mean-square deviation

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.rmsd
    """
    return _to_namedtuple(pairwise_apply(df, metrics.rmsd, comm=True), 'rmsd')


[docs]def nrmsd(df):
    """Normalized root-mean-square deviation

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.nrmsd
    """
    return _to_namedtuple(pairwise_apply(df, metrics.nrmsd,
                                         comm=True), 'nrmsd')


[docs]def ubrmsd(df):
    """Unbiased root-mean-square deviation

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.ubrmsd
    """
    return _to_namedtuple(pairwise_apply(df, metrics.ubrmsd,
                                         comm=True), 'ubrmsd')


[docs]def mse(df):
    """Mean square error (MSE) as a decomposition of the RMSD into
    individual error components

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.mse

    """
    MSE, MSEcorr, MSEbias, MSEvar = pairwise_apply(df, metrics.mse, comm=True)
    return (_to_namedtuple(MSE, 'MSE'),
            _to_namedtuple(MSEcorr, 'MSEcorr'),
            _to_namedtuple(MSEbias, 'MSEbias'),
            _to_namedtuple(MSEvar, 'MSEvar'))


[docs]def tcol_error(df):
    """Triple collocation error estimate
    In this case df has to have exactly 3 columns, since triple wise
    application of a function is not yet implemented and
    would probably return a complicated structure

    Returns
    -------
    result : namedtuple
        with column names of df

    See Also
    --------
    pytesmo.metrics.tcol_error
    """

    if len(df.columns) != 3:
        raise DataFrameDimensionError("DataFrame has to have 3 columns")

    tcol_result = namedtuple('triple_collocation_error', df.columns)

    return tcol_result._make(metrics.tcol_error(df.ix[:, 0].values,
                                                df.ix[:, 1].values,
                                                df.ix[:, 2].values))


[docs]def nash_sutcliffe(df):
    """Nash Sutcliffe model efficiency coefficient

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.nash_sutcliffe
    """
    return _to_namedtuple(pairwise_apply(df, metrics.nash_sutcliffe,
                                         comm=True), 'Nash_Sutcliffe')


[docs]def RSS(df):
    """Redidual sum of squares

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.RSS
    """
    return _to_namedtuple(pairwise_apply(df, metrics.RSS, comm=True), 'RSS')


[docs]def pearsonr(df):
    """
    Wrapper for scipy.stats.pearsonr

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.pearsonr
    scipy.stats.pearsonr
    """
    r, p = pairwise_apply(df, metrics.pearsonr, comm=True)
    return _to_namedtuple(r, 'Pearsons_r'), _to_namedtuple(p, 'p_value')


[docs]def spearmanr(df):
    """
    Wrapper for scipy.stats.spearmanr

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.spearmenr
    scipy.stats.spearmenr
    """
    r, p = pairwise_apply(df, metrics.spearmanr, comm=True)
    return _to_namedtuple(r, 'Spearman_r'), _to_namedtuple(p, 'p_value')


[docs]def kendalltau(df):
    """
    Wrapper for scipy.stats.kendalltau

    Returns
    -------
    result : namedtuple
        with column names of df for which the calculation
        was done as name of the
        element separated by '_and_'

    See Also
    --------
    pytesmo.metrics.kendalltau
    scipy.stats.kendalltau
    """
    r, p = pairwise_apply(df, metrics.kendalltau, comm=True)
    return _to_namedtuple(r, 'Kendall_tau'), _to_namedtuple(p, 'p_value')


[docs]def pairwise_apply(df, method, comm=False):
    """
    Compute given method pairwise for all columns, excluding NA/null values

    Parameters
    ----------
    df : pandas.DataFrame
        input data, method will be applied to each column pair
    method : function
        method to apply to each column pair. has to take 2 input arguments of
        type numpy.array and return one value or tuple of values

    Returns
    -------
    results : pandas.DataFrame
    """
    numeric_df = df._get_numeric_data()
    cols = numeric_df.columns
    mat = numeric_df.values
    mat = mat.T
    applyf = method
    K = len(cols)
    result_empty = np.empty((K, K), dtype=float)
    result_empty.fill(np.nan)

    # find out how many variables the applyf returns
    c = applyf(mat[0], mat[0])
    result = []
    for index, value in enumerate(np.atleast_1d(c)):
        result.append(result_empty)
    result = np.array(result)
    mask = np.isfinite(mat)
    for i, ac in enumerate(mat):
        for j, bc in enumerate(mat):
            if i == j:
                continue
            if comm and np.isfinite(result[0][i, j]):
                continue
            valid = mask[i] & mask[j]
            if not valid.any():
                continue
            if not valid.all():
                c = applyf(ac[valid], bc[valid])
            else:
                c = applyf(ac, bc)

            for index, value in enumerate(np.atleast_1d(c)):
                result[index][i, j] = value
                if comm:
                    result[index][j, i] = value
    return_list = []
    for data in result:
        return_list.append(df._constructor(data, index=cols, columns=cols))

    if len(return_list) == 1:
        return return_list[0]
    else:
        return tuple(return_list)


def _to_namedtuple(df, name):
    """
    takes df produced by pairwise apply and produces named tuple
    of the non duplicate values for commutative operations(the triangle
    above the diagonal)
    """

    names = []
    values = []
    for i, column in enumerate(df.columns[:-1]):
        for column_names in df.columns[i + 1:]:
            names.append('_and_'.join([df.index[i], column_names]))
        values.extend(df[column].values[i + 1:])

    result = namedtuple(name, names)
    return result._make(values)