// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stats

import (
	"fmt"
	"math"
)

// A KDE is a distribution that estimates the underlying distribution
// of a Sample using kernel density estimation.
//
// Kernel density estimation is a method for constructing an estimate
// ƒ̂(x) of an unknown distribution ƒ(x) given a sample from that
// distribution. Unlike many techniques, kernel density estimation is
// non-parametric: in general, it doesn't assume any particular true
// distribution (note, however, that the resulting distribution
// depends deeply on the selected bandwidth, and many bandwidth
// estimation techniques assume normal reference rules).
//
// A kernel density estimate is similar to a histogram, except that it
// is a smooth probability estimate and does not require choosing a
// bin size and discretizing the data.
//
// Sample is the only required field. All others have reasonable
// defaults.
type KDE struct {
	// Sample is the data sample underlying this KDE.
	Sample Sample

	// Kernel is the kernel to use for the KDE.
	Kernel KDEKernel

	// Bandwidth is the bandwidth to use for the KDE.
	//
	// If this is zero, the bandwidth is computed from the
	// provided data using a default bandwidth estimator
	// (currently BandwidthScott).
	Bandwidth float64

	// BoundaryMethod is the boundary correction method to use for
	// the KDE. The default value is BoundaryReflect; however, the
	// default bounds are effectively +/-inf, which is equivalent
	// to performing no boundary correction.
	BoundaryMethod KDEBoundaryMethod

	// [BoundaryMin, BoundaryMax) specify a bounded support for
	// the KDE. If both are 0 (their default values), they are
	// treated as +/-inf.
	//
	// To specify a half-bounded support, set Min to math.Inf(-1)
	// or Max to math.Inf(1).
	BoundaryMin float64
	BoundaryMax float64
}

// BandwidthSilverman is a bandwidth estimator implementing
// Silverman's Rule of Thumb. It's fast, but not very robust to
// outliers as it assumes the data is approximately normal.
//
// Silverman, B. W. (1986) Density Estimation.
func BandwidthSilverman(data interface {
	StdDev() float64
	Weight() float64
}) float64 {
	return 1.06 * data.StdDev() * math.Pow(data.Weight(), -1.0/5)
}

// BandwidthScott is a bandwidth estimator implementing Scott's Rule.
// This is generally robust to outliers: it chooses the minimum
// between the sample's standard deviation and a robust estimator of
// a Gaussian distribution's standard deviation.
//
// Scott, D. W. (1992) Multivariate Density Estimation: Theory,
// Practice, and Visualization.
func BandwidthScott(data interface {
	StdDev() float64
	Weight() float64
	Quantile(float64) float64
}) float64 {
	iqr := data.Quantile(0.75) - data.Quantile(0.25)
	hScale := 1.06 * math.Pow(data.Weight(), -1.0/5)
	stdDev := data.StdDev()
	if stdDev < iqr/1.349 {
		// Use Silverman's Rule of Thumb.
		return hScale * stdDev
	} else {
		// Use IQR/1.349 as a robust estimator of the standard
		// deviation of a Gaussian distribution.
		return hScale * (iqr / 1.349)
	}
}

// TODO(austin) Implement bandwidth estimator from Botev, Grotowski,
// Kroese. (2010) Kernel Density Estimation via Diffusion.
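
// The sketch below is only an illustration of how the two estimators
// above behave; the helper name and sample values are invented and
// are not part of this package's API. With a clear outlier in the
// sample, Scott's Rule falls back to the IQR-based spread estimate
// and so chooses a much smaller bandwidth here than Silverman's Rule
// of Thumb does.
func exampleBandwidths() (silverman, scott float64) {
	// Nine tightly clustered values plus one outlier.
	s := Sample{Xs: []float64{1, 2, 2, 3, 3, 3, 4, 4, 5, 50}}
	return BandwidthSilverman(s), BandwidthScott(s)
}
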
// KDEKernel represents a kernel to use for a KDE.
type KDEKernel int

//go:generate stringer -type=KDEKernel

const (
	// An EpanechnikovKernel is a smooth kernel with bounded
	// support. As a result, the KDE will also have bounded
	// support. It is "optimal" in the sense that it minimizes the
	// asymptotic mean integrated squared error (AMISE).
	EpanechnikovKernel KDEKernel = iota

	// A GaussianKernel is a Gaussian (normal) kernel.
	GaussianKernel

	// A DeltaKernel is a Dirac delta function. The PDF of such a
	// KDE is not well-defined, but the CDF will represent each
	// sample as an instantaneous increase. This kernel ignores
	// bandwidth and never requires boundary correction.
	DeltaKernel
)

// KDEBoundaryMethod represents a boundary correction method for
// constructing a KDE with bounded support.
type KDEBoundaryMethod int

//go:generate stringer -type=KDEBoundaryMethod

const (
	// BoundaryReflect reflects the density estimate at the
	// boundaries. For example, for a KDE with support [0, inf),
	// this is equivalent to ƒ̂ᵣ(x)=ƒ̂(x)+ƒ̂(-x) for x>=0. This is a
	// simple and fast technique, but enforces that ƒ̂ᵣ'(0)=0, so
	// it may not be applicable to all distributions.
	BoundaryReflect KDEBoundaryMethod = iota
)

type kdeKernel interface {
	pdfEach(xs []float64) []float64
	cdfEach(xs []float64) []float64
}

func (k *KDE) prepare() (kdeKernel, bool) {
	// Compute bandwidth.
	if k.Bandwidth == 0 {
		k.Bandwidth = BandwidthScott(k.Sample)
	}

	// Construct kernel.
	kernel := kdeKernel(nil)
	switch k.Kernel {
	default:
		panic(fmt.Sprint("unknown kernel", k))
	case EpanechnikovKernel:
		kernel = epanechnikovKernel{k.Bandwidth}
	case GaussianKernel:
		kernel = NormalDist{0, k.Bandwidth}
	case DeltaKernel:
		kernel = DeltaDist{0}
	}

	// Use boundary correction?
	bc := k.BoundaryMin != 0 || k.BoundaryMax != 0

	return kernel, bc
}

// TODO: For KDEs of histograms, make histograms able to create a
// weighted Sample and simply require the caller to provide a
// good bandwidth from a StreamStats.

// normalizedXs returns x - kde.Sample.Xs. Evaluating kernels shifted
// by kde.Sample.Xs all at x is equivalent to evaluating one unshifted
// kernel at x - kde.Sample.Xs.
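// For example, if kde.Sample.Xs is {1, 3, 6}, then normalizedXs(5)
// returns {4, 2, -1}.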
func (kde *KDE) normalizedXs(x float64) []float64 {
	txs := make([]float64, len(kde.Sample.Xs))
	for i, xi := range kde.Sample.Xs {
		txs[i] = x - xi
	}
	return txs
}

func (kde *KDE) PDF(x float64) float64 {
	kernel, bc := kde.prepare()

	// Apply boundary correction.
	if bc && (x < kde.BoundaryMin || x >= kde.BoundaryMax) {
		return 0
	}

	y := func(x float64) float64 {
		// Shift kernel to each of kde.Sample.Xs and evaluate at x.
		ys := kernel.pdfEach(kde.normalizedXs(x))
		// Kernel samples are weighted according to the weights of xs.
		wys := Sample{Xs: ys, Weights: kde.Sample.Weights}
		return wys.Sum() / wys.Weight()
	}
	if !bc {
		return y(x)
	}
	switch kde.BoundaryMethod {
	default:
		panic("unknown boundary correction method")
	case BoundaryReflect:
		if math.IsInf(kde.BoundaryMax, 1) {
			return y(x) + y(2*kde.BoundaryMin-x)
		} else if math.IsInf(kde.BoundaryMin, -1) {
			return y(x) + y(2*kde.BoundaryMax-x)
		} else {
			d := 2 * (kde.BoundaryMax - kde.BoundaryMin)
			w := 2 * (x - kde.BoundaryMin)
			return series(func(n float64) float64 {
				// Points >= x
				return y(x+n*d) + y(x+n*d-w)
			}) + series(func(n float64) float64 {
				// Points < x
				return y(x-(n+1)*d+w) + y(x-(n+1)*d)
			})
		}
	}
}

func (kde *KDE) CDF(x float64) float64 {
	kernel, bc := kde.prepare()

	// Apply boundary correction.
	if bc {
		if x < kde.BoundaryMin {
			return 0
		} else if x >= kde.BoundaryMax {
			return 1
		}
	}

	y := func(x float64) float64 {
		// Shift kernel integral to each of kde.Sample.Xs and
		// evaluate at x.
		ys := kernel.cdfEach(kde.normalizedXs(x))
		// Kernel samples are weighted according to the weights of xs.
		wys := Sample{Xs: ys, Weights: kde.Sample.Weights}
		return wys.Sum() / wys.Weight()
	}
	if !bc {
		return y(x)
	}
	switch kde.BoundaryMethod {
	default:
		panic("unknown boundary correction method")
	case BoundaryReflect:
		if math.IsInf(kde.BoundaryMax, 1) {
			return y(x) - y(2*kde.BoundaryMin-x)
		} else if math.IsInf(kde.BoundaryMin, -1) {
			return y(x) + (1 - y(2*kde.BoundaryMax-x))
		} else {
			d := 2 * (kde.BoundaryMax - kde.BoundaryMin)
			w := 2 * (x - kde.BoundaryMin)
			return series(func(n float64) float64 {
				// Windows >= x-w
				return y(x+n*d) - y(x+n*d-w)
			}) + series(func(n float64) float64 {
				// Windows < x-w
				return y(x-(n+1)*d) - y(x-(n+1)*d-w)
			})
		}
	}
}

func (kde *KDE) Bounds() (low float64, high float64) {
	_, bc := kde.prepare()

	// TODO(austin) If this KDE came from a histogram, we'd better
	// not sample at a significantly higher rate than the
	// histogram. Maybe we want to just return the bounds of the
	// histogram?

	// TODO(austin) It would be nice if this could be instructed
	// to include all original data points, even if they are in
	// the tail. Probably that should just be up to the caller to
	// pass an axis derived from the bounds of the original data.

	// Use the lowest and highest samples as starting points.
	lowX, highX := kde.Sample.Bounds()
	if lowX == highX {
		lowX -= 1
		highX += 1
	}

	// Find the end points that contain 99% of the CDF's weight.
	// Since bisect requires that the root be bracketed, start by
	// expanding our range if necessary. TODO(austin) This can
	// definitely be done faster.
	const (
		lowY      = 0.005
		highY     = 0.995
		tolerance = 0.001
	)
	for kde.CDF(lowX) > lowY {
		lowX -= highX - lowX
	}
	for kde.CDF(highX) < highY {
		highX += highX - lowX
	}
	// Explicitly accept discontinuities, since we may be using a
	// discontinuous kernel.
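	// (A DeltaKernel, for example, yields a stepped CDF, so the
	// target quantile may fall on a jump rather than at an exact
	// root; in that case we simply take the point bisect
	// converges to.)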
	low, _ = bisect(func(x float64) float64 { return kde.CDF(x) - lowY }, lowX, highX, tolerance)
	high, _ = bisect(func(x float64) float64 { return kde.CDF(x) - highY }, lowX, highX, tolerance)

	// Expand width by 20% to give some margins.
	width := high - low
	low, high = low-0.1*width, high+0.1*width

	// Limit to bounds.
	if bc {
		low = math.Max(low, kde.BoundaryMin)
		high = math.Min(high, kde.BoundaryMax)
	}

	return
}

type epanechnikovKernel struct {
	h float64
}

func (d epanechnikovKernel) pdfEach(xs []float64) []float64 {
	ys := make([]float64, len(xs))
	a := 0.75 / d.h
	invhh := 1 / (d.h * d.h)
	for i, x := range xs {
		if -d.h < x && x < d.h {
			ys[i] = a * (1 - x*x*invhh)
		}
	}
	return ys
}

func (d epanechnikovKernel) cdfEach(xs []float64) []float64 {
	ys := make([]float64, len(xs))
	invh := 1 / d.h
	for i, x := range xs {
		if x > d.h {
			ys[i] = 1
		} else if x > -d.h {
			u := x * invh
			ys[i] = 0.25 * (2 + 3*u - u*u*u)
		}
	}
	return ys
}
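
// The function below is a minimal usage sketch, not part of the
// package API: the helper name and the sample values are invented
// for illustration. It builds a KDE over a small sample with a
// half-bounded support of [0, +inf) and evaluates the estimate; the
// printed numbers depend on the sample and the default bandwidth.
func exampleKDEUsage() {
	kde := &KDE{
		// Sample is the only required field; Bandwidth is left
		// 0, so BandwidthScott is used.
		Sample: Sample{Xs: []float64{0.5, 1.2, 1.3, 2.1, 3.7}},
		Kernel: GaussianKernel,
		// Half-bounded support [0, +inf) with the default
		// BoundaryReflect correction.
		BoundaryMin: 0,
		BoundaryMax: math.Inf(1),
	}
	low, high := kde.Bounds()
	fmt.Printf("plot range [%g, %g], pdf(1)=%g, cdf(1)=%g\n",
		low, high, kde.PDF(1), kde.CDF(1))
}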