from ..core import numeric as np class GaussianKDE(object): """ Representation of a kernel-density estimate using Gaussian kernels. Parameters ---------- dataset : array_like Datapoints to estimate from. In case of univariate data this is a 1-D array, otherwise a 2-D array with shape (# of dims, # of data). bw_method : str, scalar or callable, optional The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If a scalar, this will be used directly as `kde.factor`. If a callable, it should take a `GaussianKDE` instance as only parameter and return a scalar. If None (default), 'scott' is used. Attributes ---------- dataset : ndarray The dataset with which `gaussian_kde` was initialized. dim : int Number of dimensions. num_dp : int Number of datapoints. factor : float The bandwidth factor, obtained from `kde.covariance_factor`, with which the covariance matrix is multiplied. covariance : ndarray The covariance matrix of `dataset`, scaled by the calculated bandwidth (`kde.factor`). inv_cov : ndarray The inverse of `covariance`. Methods ------- kde.evaluate(points) : ndarray Evaluate the estimated pdf on a provided set of points. kde(points) : ndarray Same as kde.evaluate(points) """ # This implementation with minor modification was too good to pass up. # from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py def __init__(self, dataset, bw_method=None): self.dataset = np.atleast_2d(dataset) if not np.array(self.dataset).size > 1: raise ValueError("`dataset` input should have multiple elements.") self.dim, self.num_dp = np.array(self.dataset).shape isString = isinstance(bw_method, str) if bw_method is None: pass elif (isString and bw_method == 'scott'): self.covariance_factor = self.scotts_factor elif (isString and bw_method == 'silverman'): self.covariance_factor = self.silverman_factor elif (np.isscalar(bw_method) and not isString): self._bw_method = 'use constant' self.covariance_factor = lambda: bw_method elif callable(bw_method): self._bw_method = bw_method self.covariance_factor = lambda: self._bw_method(self) else: raise ValueError("`bw_method` should be 'scott', 'silverman', a " "scalar or a callable") # Computes the covariance matrix for each Gaussian kernel using # covariance_factor(). self.factor = self.covariance_factor() # Cache covariance and inverse covariance of the data if not hasattr(self, '_data_inv_cov'): self.data_covariance = np.atleast_2d(np.stats.cov(self.dataset, rowvar=1, bias=False)) self.data_inv_cov = np.linalg.inv(self.data_covariance) self.covariance = self.data_covariance * self.factor ** 2 self.inv_cov = self.data_inv_cov / self.factor ** 2 self.norm_factor = np.sqrt(np.linalg.det(2 * np.pi * self.covariance)) * self.num_dp def scotts_factor(self): return np.power(self.num_dp, -1. / (self.dim + 4)) def silverman_factor(self): return np.power( self.num_dp * (self.dim + 2.0) / 4.0, -1. / (self.dim + 4)) # Default method to calculate bandwidth, can be overwritten by subclass covariance_factor = scotts_factor def evaluate(self, points): """Evaluate the estimated pdf on a set of points. Parameters ---------- points : (# of dimensions, # of points)-array Alternatively, a (# of dimensions,) vector can be passed in and treated as a single point. Returns ------- values : (# of points,)-array The values at each point. Raises ------ ValueError : if the dimensionality of the input points is different than the dimensionality of the KDE. """ points = np.atleast_2d(points) dim, num_m = np.array(points).shape if dim != self.dim: raise ValueError("points have dimension {}, dataset has dimension " "{}".format(dim, self.dim)) result = np.zeros(num_m) if num_m >= self.num_dp: # there are more points than data, so loop over data for i in range(self.num_dp): diff = self.dataset[:, i, np.newaxis] - points tdiff = np.dot(self.inv_cov, diff) energy = np.sum(diff * tdiff, axis=0) / 2.0 result = result + np.exp(-energy) else: # loop over points for i in range(num_m): diff = self.dataset - points[:, i, np.newaxis] tdiff = np.dot(self.inv_cov, diff) energy = np.sum(diff * tdiff, axis=0) / 2.0 result[i] = np.sum(np.exp(-energy), axis=0) result = result / self.norm_factor return result __call__ = evaluate