Source code for skgstat.estimators

"""
This file collects different functions for estimating the semivariance of a group of values. These functions
can be used to fit a experimental variogram to a list of points. Each of the given functions just calculate
the estimate for one bin. If you want a Variogram, use the variogram functions or Class from the Variogram and vario
submodule, or order the bins yourself
"""
import numpy as np
from scipy.special import binom
from numba import njit, jit

from skgstat.util import shannon_entropy


[docs] @njit def matheron(x): r"""Matheron Semi-Variance Calculates the Matheron Semi-Variance from an array of pairwise differences. Returns the semi-variance for the whole array. In case a semi-variance is needed for multiple groups, this function has to be mapped on each group. That is the typical use case in geostatistics. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. Returns ------- numpy.float64 Notes ----- This implementation follows the original publication [1]_ and the notes on their application [2]_. Following the 1962 publication [1]_, the semi-variance is calculated as: .. math:: \gamma (h) = \frac{1}{2N(h)} * \sum_{i=1}^{N(h)}(x)^2 with: .. math:: x = Z(x_i) - Z(x_{i+h}) where x is exactly the input array x. References ---------- .. [1] Matheron, G. (1962): Traité de Géostatistique Appliqué, Tonne 1. Memoires de Bureau de Recherches Géologiques et Miniéres, Paris. .. [2] Matheron, G. (1965): Les variables regionalisées et leur estimation. Editions Masson et Cie, 212 S., Paris. """ # prevent ZeroDivisionError if x.size == 0: return np.nan return (1. / (2 * x.size)) * np.sum(np.power(x, 2))
[docs] @njit def cressie(x): r""" Cressie-Hawkins Semi-Variance Calculates the Cressie-Hawkins Semi-Variance from an array of pairwise differences. Returns the semi-variance for the whole array. In case a semi-variance is needed for multiple groups, this function has to be mapped on each group. That is the typical use case in geostatistics. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. Returns ------- numpy.float64 Notes ----- This implementation is done after the publication by Cressie and Hawkins from 1980 [3]_: .. math:: 2\gamma (h) = \frac{(\frac{1}{N(h)} \sum_{i=1}^{N(h)} |x|^{0.5})^4} {0.457 + \frac{0.494}{N(h)} + \frac{0.045}{N^2(h)}} with: .. math:: x = Z(x_i) - Z(x_{i+h}) where x is exactly the input array x. References ---------- .. [3] Cressie, N., and D. Hawkins (1980): Robust estimation of the variogram. Math. Geol., 12, 115-125. """ # get the length n = x.size # prevent ZeroDivisionError if n == 0: return np.nan # Nominator nominator = np.power((1 / n) * np.sum(np.power(x, 0.5)), 4) # Denominator denominator = 0.457 + (0.494 / n) + (0.045 / n**2) return nominator / (2 * denominator)
[docs] def dowd(x): r"""Dowd semi-variance Calculates the Dowd semi-variance from an array of pairwise differences. Returns the semi-variance for the whole array. In case a semi-variance is needed for multiple groups, this function has to be mapped on each group. That is the typical use case in geostatistics. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. Returns ------- numpy.float64 Notes ----- The Dowd estimator is based on the median of all pairwise differences in each lag class and is therefore robust to exteme values at the cost of variability. This implementation follows Dowd's publication [4]_: .. math:: 2\gamma (h) = 2.198 * {median(x)}^2 with: .. math:: x = Z(x_i) - Z(x_{i+h}) where x is exactly the input array x. References ---------- .. [4] Dowd, P. A., (1984): The variogram and kriging: Robust and resistant estimators, in Geostatistics for Natural Resources Characterization. Edited by G. Verly et al., pp. 91 - 106, D. Reidel, Dordrecht. """ # convert return 2.198 * np.nanmedian(x)**2 / 2
[docs] @jit(forceobj=True) def genton(x): r""" Genton robust semi-variance estimator Return the Genton semi-variance of the given sample x. Genton is a highly robust varigram estimator, that is designed to be location free and robust on extreme values in x. Genton is based on calculating kth order statistics and will for large data sets be close or equal to the 25% quartile of all ordered point pairs in X. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. Returns ------- numpy.float64 Notes ----- The Genton estimator is described in great detail in the original publication [5]_ and is defined as: .. math:: Q_{N_h} = 2.2191\{|V_i(h) - V_j(h)|; i < j\}_{(k)} and .. math:: k = \binom{[N_h / 2] + 1}{2} and .. math:: q = \binom{N_h}{2} where k is the kth quantile of all q point pairs. For large N (k/q) will be close to 0.25. For N >= 500, (k/q) is close to 0.25 by two decimals and will therefore be set to 0.5 and the two binomial coefficients k, q are not calculated. References ---------- .. [5] Genton, M. G., (1998): Highly robust variogram estimation, Math. Geol., 30, 213 - 221. """ # get length n = x.size if n < 2: return np.nan # calculate y = [] for i in range(n): for j in range(i + 1, n): y.append(np.abs(x[i] - x[j])) # if N > 500, (k/q) will be ~ 1/4 anyway if n >= 500: k, q, = 1, 4 else: # get k k is binom(N(x)/2+1, 2) k = binom(n / 2 + 1, 2) # get q. Genton needs the kth quantile of q q = binom(n, 2) # return the kth percentile return 0.5 * np.power(2.219 * np.quantile(y, (k / q)), 2)
[docs] def minmax(x): """Minimum - Maximum Estimator Returns a custom value. This estimator is the difference of maximum and minimum pairwise differences, normalized by the mean. MinMax will be very sensitive to extreme values. Do only use this estimator, in case you know what you are doing. It is experimental and might change its behaviour in a future version. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. Returns ------- numpy.float64 """ return (np.nanmax(x) - np.nanmin(x)) / np.nanmean(x)
[docs] def percentile(x, p=50): """Percentile estimator Returns a given percentile as semi-variance. Do only use this estimator, in case you know what you are doing. Do only use this estimator, in case you know what you are doing. It is experimental and might change its behaviour in a future version. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. p : int Desired percentile. Should be given as whole numbers 0 < p < 100. Returns ------- np.float64 """ return np.percentile(x, q=p)
[docs] def entropy(x, bins=None): """Shannon Entropy estimator Calculates the Shannon Entropy H as a variogram estimator. It is highly recommended to calculate the bins and explicitly set them as a list. In case this function is called for more than one lag class in a variogram, setting bins to None would result in different bin edges in each lag class. This would be very difficult to interpret. Parameters ---------- x : numpy.ndarray Array of pairwise differences. These values should be the distances between pairwise observations in value space. If xi and x[i+h] fall into the h separating distance class, x should contain abs(xi - x[i+h]) as an element. bins : int, list, str list of the bin edges used to calculate the empirical distribution of x. If bins is a list, these values are used directly. In case bins is a integer, as many even width bins will be calculated between the minimum and maximum value of x. In case bins is a string, it will be passed as bins argument to numpy.histograms function. Returns ------- entropy : numpy.float64 Shannon entropy of the given pairwise differences. Notes ----- """ # this is actually a bad idea if bins is None: bins = 15 return shannon_entropy(x, bins)