route/vendor/github.com/aclements/go-moremath/stats/sample.go

336 lines
7.2 KiB
Go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package stats
import (
"math"
"sort"
"github.com/aclements/go-moremath/vec"
)
// Sample is a collection of possibly weighted data points.
type Sample struct {
// Xs is the slice of sample values.
Xs []float64
// Weights[i] is the weight of sample Xs[i]. If Weights is
// nil, all Xs have weight 1. Weights must have the same
// length of Xs and all values must be non-negative.
Weights []float64
// Sorted indicates that Xs is sorted in ascending order.
Sorted bool
}
// Bounds returns the minimum and maximum values of xs.
func Bounds(xs []float64) (min float64, max float64) {
if len(xs) == 0 {
return math.NaN(), math.NaN()
}
min, max = xs[0], xs[0]
for _, x := range xs {
if x < min {
min = x
}
if x > max {
max = x
}
}
return
}
// Bounds returns the minimum and maximum values of the Sample.
//
// If the Sample is weighted, this ignores samples with zero weight.
//
// This is constant time if s.Sorted and there are no zero-weighted
// values.
func (s Sample) Bounds() (min float64, max float64) {
if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) {
return Bounds(s.Xs)
}
if s.Sorted {
if s.Weights == nil {
return s.Xs[0], s.Xs[len(s.Xs)-1]
}
min, max = math.NaN(), math.NaN()
for i, w := range s.Weights {
if w != 0 {
min = s.Xs[i]
break
}
}
if math.IsNaN(min) {
return
}
for i := range s.Weights {
if s.Weights[len(s.Weights)-i-1] != 0 {
max = s.Xs[len(s.Weights)-i-1]
break
}
}
} else {
min, max = math.Inf(1), math.Inf(-1)
for i, x := range s.Xs {
w := s.Weights[i]
if x < min && w != 0 {
min = x
}
if x > max && w != 0 {
max = x
}
}
if math.IsInf(min, 0) {
min, max = math.NaN(), math.NaN()
}
}
return
}
// Sum returns the (possibly weighted) sum of the Sample.
func (s Sample) Sum() float64 {
if s.Weights == nil {
return vec.Sum(s.Xs)
}
sum := 0.0
for i, x := range s.Xs {
sum += x * s.Weights[i]
}
return sum
}
// Weight returns the total weight of the Sasmple.
func (s Sample) Weight() float64 {
if s.Weights == nil {
return float64(len(s.Xs))
}
return vec.Sum(s.Weights)
}
// Mean returns the arithmetic mean of xs.
func Mean(xs []float64) float64 {
if len(xs) == 0 {
return math.NaN()
}
m := 0.0
for i, x := range xs {
m += (x - m) / float64(i+1)
}
return m
}
// Mean returns the arithmetic mean of the Sample.
func (s Sample) Mean() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return Mean(s.Xs)
}
m, wsum := 0.0, 0.0
for i, x := range s.Xs {
// Use weighted incremental mean:
// m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i
// = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i)
w := s.Weights[i]
wsum += w
m += (x - m) * w / wsum
}
return m
}
// GeoMean returns the geometric mean of xs. xs must be positive.
func GeoMean(xs []float64) float64 {
if len(xs) == 0 {
return math.NaN()
}
m := 0.0
for i, x := range xs {
if x <= 0 {
return math.NaN()
}
lx := math.Log(x)
m += (lx - m) / float64(i+1)
}
return math.Exp(m)
}
// GeoMean returns the geometric mean of the Sample. All samples
// values must be positive.
func (s Sample) GeoMean() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return GeoMean(s.Xs)
}
m, wsum := 0.0, 0.0
for i, x := range s.Xs {
w := s.Weights[i]
wsum += w
lx := math.Log(x)
m += (lx - m) * w / wsum
}
return math.Exp(m)
}
// Variance returns the sample variance of xs.
func Variance(xs []float64) float64 {
if len(xs) == 0 {
return math.NaN()
} else if len(xs) <= 1 {
return 0
}
// Based on Wikipedia's presentation of Welford 1962
// (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm).
// This is more numerically stable than the standard two-pass
// formula and not prone to massive cancellation.
mean, M2 := 0.0, 0.0
for n, x := range xs {
delta := x - mean
mean += delta / float64(n+1)
M2 += delta * (x - mean)
}
return M2 / float64(len(xs)-1)
}
func (s Sample) Variance() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return Variance(s.Xs)
}
// TODO(austin)
panic("Weighted Variance not implemented")
}
// StdDev returns the sample standard deviation of xs.
func StdDev(xs []float64) float64 {
return math.Sqrt(Variance(xs))
}
// StdDev returns the sample standard deviation of the Sample.
func (s Sample) StdDev() float64 {
if len(s.Xs) == 0 || s.Weights == nil {
return StdDev(s.Xs)
}
// TODO(austin)
panic("Weighted StdDev not implemented")
}
// Quantile returns the sample value X at which q*weight of the sample
// is <= X. This uses interpolation method R8 from Hyndman and Fan
// (1996).
//
// q will be capped to the range [0, 1]. If len(xs) == 0 or all
// weights are 0, returns NaN.
//
// Quantile(0.5) is the median. Quantile(0.25) and Quantile(0.75) are
// the first and third quartiles, respectively. Quantile(P/100) is the
// P'th percentile.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) Quantile(q float64) float64 {
if len(s.Xs) == 0 {
return math.NaN()
} else if q <= 0 {
min, _ := s.Bounds()
return min
} else if q >= 1 {
_, max := s.Bounds()
return max
}
if !s.Sorted {
// TODO(austin) Use select algorithm instead
s = *s.Copy().Sort()
}
if s.Weights == nil {
N := float64(len(s.Xs))
//n := q * (N + 1) // R6
n := 1/3.0 + q*(N+1/3.0) // R8
kf, frac := math.Modf(n)
k := int(kf)
if k <= 0 {
return s.Xs[0]
} else if k >= len(s.Xs) {
return s.Xs[len(s.Xs)-1]
}
return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1])
} else {
// TODO(austin): Implement interpolation
target := s.Weight() * q
// TODO(austin) If we had cumulative weights, we could
// do this in log time.
for i, weight := range s.Weights {
target -= weight
if target < 0 {
return s.Xs[i]
}
}
return s.Xs[len(s.Xs)-1]
}
}
// IQR returns the interquartile range of the Sample.
//
// This is constant time if s.Sorted and s.Weights == nil.
func (s Sample) IQR() float64 {
if !s.Sorted {
s = *s.Copy().Sort()
}
return s.Quantile(0.75) - s.Quantile(0.25)
}
type sampleSorter struct {
xs []float64
weights []float64
}
func (p *sampleSorter) Len() int {
return len(p.xs)
}
func (p *sampleSorter) Less(i, j int) bool {
return p.xs[i] < p.xs[j]
}
func (p *sampleSorter) Swap(i, j int) {
p.xs[i], p.xs[j] = p.xs[j], p.xs[i]
p.weights[i], p.weights[j] = p.weights[j], p.weights[i]
}
// Sort sorts the samples in place in s and returns s.
//
// A sorted sample improves the performance of some algorithms.
func (s *Sample) Sort() *Sample {
if s.Sorted || sort.Float64sAreSorted(s.Xs) {
// All set
} else if s.Weights == nil {
sort.Float64s(s.Xs)
} else {
sort.Sort(&sampleSorter{s.Xs, s.Weights})
}
s.Sorted = true
return s
}
// Copy returns a copy of the Sample.
//
// The returned Sample shares no data with the original, so they can
// be modified (for example, sorted) independently.
func (s Sample) Copy() *Sample {
xs := make([]float64, len(s.Xs))
copy(xs, s.Xs)
weights := []float64(nil)
if s.Weights != nil {
weights = make([]float64, len(s.Weights))
copy(weights, s.Weights)
}
return &Sample{xs, weights, s.Sorted}
}