Source code for skgstat.estimators

"""
This file collects different functions for estimating the semivariance of a group of values. These functions
can be used to fit a experimental variogram to a list of points. Each of the given functions just calculate
the estimate for one bin. If you want a Variogram, use the variogram functions or Class from the Variogram and vario
submodule, or order the bins yourself
"""
import numpy as np
from scipy.special import binom
from numba import njit, jit

from skgstat.util import shannon_entropy



[docs]
@njit
def matheron(x):
    r"""Matheron Semi-Variance

    Calculates the Matheron Semi-Variance from an array of pairwise differences.
    Returns the semi-variance for the whole array. In case a semi-variance is
    needed for multiple groups, this function has to be mapped on each group.
    That is the typical use case in geostatistics.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.

    Returns
    -------
    numpy.float64

    Notes
    -----

    This implementation follows the original publication [1]_ and the
    notes on their application [2]_. Following the 1962 publication [1]_,
    the semi-variance is calculated as:

    .. math::
        \gamma (h) = \frac{1}{2N(h)} * \sum_{i=1}^{N(h)}(x)^2

    with:

    .. math::
        x = Z(x_i) - Z(x_{i+h})

    where x is exactly the input array x.

    References
    ----------

    .. [1] Matheron, G. (1962): Traité de Géostatistique Appliqué, Tonne 1.
       Memoires de Bureau de Recherches Géologiques et Miniéres, Paris.

    .. [2] Matheron, G. (1965): Les variables regionalisées et leur estimation.
       Editions Masson et Cie, 212 S., Paris.

    """
    # prevent ZeroDivisionError
    if x.size == 0:
        return np.nan

    return (1. / (2 * x.size)) * np.sum(np.power(x, 2))




[docs]
@njit
def cressie(x):
    r""" Cressie-Hawkins Semi-Variance

    Calculates the Cressie-Hawkins Semi-Variance from an array of pairwise
    differences. Returns the semi-variance for the whole array. In case a
    semi-variance is needed for multiple groups, this function has to be
    mapped on each group. That is the typical use case in geostatistics.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.

    Returns
    -------
    numpy.float64

    Notes
    -----

    This implementation is done after the publication by Cressie and Hawkins
    from 1980 [3]_:

    .. math::
        2\gamma (h) = \frac{(\frac{1}{N(h)} \sum_{i=1}^{N(h)} |x|^{0.5})^4}
        {0.457 + \frac{0.494}{N(h)} + \frac{0.045}{N^2(h)}}

    with:

    .. math::
        x = Z(x_i) - Z(x_{i+h})

    where x is exactly the input array x.

    References
    ----------

    .. [3] Cressie, N., and D. Hawkins (1980): Robust estimation of the
       variogram. Math. Geol., 12, 115-125.


    """
    # get the length
    n = x.size

    # prevent ZeroDivisionError
    if n == 0:
        return np.nan

    # Nominator
    nominator = np.power((1 / n) * np.sum(np.power(x, 0.5)), 4)

    # Denominator
    denominator = 0.457 + (0.494 / n) + (0.045 / n**2)

    return nominator / (2 * denominator)




[docs]
def dowd(x):
    r"""Dowd semi-variance

    Calculates the Dowd semi-variance from an array of pairwise
    differences. Returns the semi-variance for the whole array. In case a
    semi-variance is needed for multiple groups, this function has to be
    mapped on each group. That is the typical use case in geostatistics.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.

    Returns
    -------
    numpy.float64

    Notes
    -----
    The Dowd estimator is based on the median of all pairwise differences in
    each lag class and is therefore robust to exteme values at the cost of
    variability.
    This implementation follows Dowd's publication [4]_:

    .. math::

        2\gamma (h) = 2.198 * {median(x)}^2

    with:

    .. math::
        x = Z(x_i) - Z(x_{i+h})

    where x is exactly the input array x.

    References
    ----------

    .. [4] Dowd, P. A., (1984): The variogram and kriging: Robust and resistant
       estimators, in Geostatistics for Natural Resources Characterization.
       Edited by G. Verly et al., pp. 91 - 106, D. Reidel, Dordrecht.

    """
    # convert
    return 2.198 * np.nanmedian(x)**2 / 2




[docs]
@jit(forceobj=True)
def genton(x):
    r""" Genton robust semi-variance estimator

    Return the Genton semi-variance of the given sample x. Genton is a highly
    robust varigram estimator, that is designed to be location free and
    robust on extreme values in x.
    Genton is based on calculating kth order statistics and will for large
    data sets be close or equal to the 25% quartile of all ordered point pairs
    in X.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.

    Returns
    -------
    numpy.float64

    Notes
    -----
    The Genton estimator is described in great detail in the original
    publication [5]_ and is defined as:

    .. math:: Q_{N_h} = 2.2191\{|V_i(h) - V_j(h)|; i < j\}_{(k)}

    and

     .. math:: k = \binom{[N_h / 2] + 1}{2}

     and

     .. math:: q = \binom{N_h}{2}

     where k is the kth quantile of all q point pairs. For large N (k/q) will be
     close to 0.25. For N >= 500, (k/q) is close to 0.25 by two decimals and
     will therefore be set to 0.5 and the two binomial coefficients k,
     q are not calculated.

    References
    ----------

    ..  [5] Genton, M. G., (1998): Highly robust variogram estimation,
        Math. Geol., 30, 213 - 221.

    """
    # get length
    n = x.size

    if n < 2:
        return np.nan

    # calculate
    y = []
    for i in range(n):
        for j in range(i + 1, n):
            y.append(np.abs(x[i] - x[j]))

    # if N > 500, (k/q) will be ~ 1/4 anyway
    if n >= 500:
        k, q, = 1, 4
    else:
        # get k  k is binom(N(x)/2+1, 2)
        k = binom(n / 2 + 1, 2)

        # get q. Genton needs the kth quantile of q
        q = binom(n, 2)

    # return the kth percentile
    return 0.5 * np.power(2.219 * np.quantile(y, (k / q)), 2)




[docs]
def minmax(x):
    """Minimum - Maximum Estimator

    Returns a custom value. This estimator is the difference of maximum and
    minimum pairwise differences, normalized by the mean. MinMax will be very
    sensitive to extreme values.

    Do only use this estimator, in case you know what you are doing. It is
    experimental and might change its behaviour in a future version.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.

    Returns
    -------
    numpy.float64

    """
    return (np.nanmax(x) - np.nanmin(x)) / np.nanmean(x)




[docs]
def percentile(x, p=50):
    """Percentile estimator

    Returns a given percentile as semi-variance. Do only use this estimator,
    in case you know what you are doing.

    Do only use this estimator, in case you know what you are doing. It is
    experimental and might change its behaviour in a future version.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.
    p : int
        Desired percentile. Should be given as whole numbers 0 < p < 100.

    Returns
    -------
    np.float64

    """
    return np.percentile(x, q=p)




[docs]
def entropy(x, bins=None):
    """Shannon Entropy estimator

    Calculates the Shannon Entropy H as a variogram estimator. It is highly
    recommended to calculate the bins and explicitly set them as a list.
    In case this function is called for more than one lag class in a
    variogram, setting bins to None would result in different bin edges in
    each lag class. This would be very difficult to interpret.

    Parameters
    ----------
    x : numpy.ndarray
        Array of pairwise differences. These values should be the distances
        between pairwise observations in value space. If xi and x[i+h] fall
        into the h separating distance class, x should contain abs(xi - x[i+h])
        as an element.
    bins : int, list, str
        list of the bin edges used to calculate the empirical distribution of x.
        If bins is a list, these values are used directly. In case bins is a
        integer, as many even width bins will be calculated between the
        minimum and maximum value of x. In case bins is a string, it will be
        passed as bins argument to numpy.histograms function.

    Returns
    -------
    entropy : numpy.float64
        Shannon entropy of the given pairwise differences.

    Notes
    -----

    """
    # this is actually a bad idea
    if bins is None:
        bins = 15

    return shannon_entropy(x, bins)