336 lines
7.2 KiB
Go
336 lines
7.2 KiB
Go
// Copyright 2015 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package stats
|
|
|
|
import (
|
|
"math"
|
|
"sort"
|
|
|
|
"github.com/aclements/go-moremath/vec"
|
|
)
|
|
|
|
// Sample is a collection of possibly weighted data points.
|
|
type Sample struct {
|
|
// Xs is the slice of sample values.
|
|
Xs []float64
|
|
|
|
// Weights[i] is the weight of sample Xs[i]. If Weights is
|
|
// nil, all Xs have weight 1. Weights must have the same
|
|
// length of Xs and all values must be non-negative.
|
|
Weights []float64
|
|
|
|
// Sorted indicates that Xs is sorted in ascending order.
|
|
Sorted bool
|
|
}
|
|
|
|
// Bounds returns the minimum and maximum values of xs.
|
|
func Bounds(xs []float64) (min float64, max float64) {
|
|
if len(xs) == 0 {
|
|
return math.NaN(), math.NaN()
|
|
}
|
|
min, max = xs[0], xs[0]
|
|
for _, x := range xs {
|
|
if x < min {
|
|
min = x
|
|
}
|
|
if x > max {
|
|
max = x
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Bounds returns the minimum and maximum values of the Sample.
|
|
//
|
|
// If the Sample is weighted, this ignores samples with zero weight.
|
|
//
|
|
// This is constant time if s.Sorted and there are no zero-weighted
|
|
// values.
|
|
func (s Sample) Bounds() (min float64, max float64) {
|
|
if len(s.Xs) == 0 || (!s.Sorted && s.Weights == nil) {
|
|
return Bounds(s.Xs)
|
|
}
|
|
|
|
if s.Sorted {
|
|
if s.Weights == nil {
|
|
return s.Xs[0], s.Xs[len(s.Xs)-1]
|
|
}
|
|
min, max = math.NaN(), math.NaN()
|
|
for i, w := range s.Weights {
|
|
if w != 0 {
|
|
min = s.Xs[i]
|
|
break
|
|
}
|
|
}
|
|
if math.IsNaN(min) {
|
|
return
|
|
}
|
|
for i := range s.Weights {
|
|
if s.Weights[len(s.Weights)-i-1] != 0 {
|
|
max = s.Xs[len(s.Weights)-i-1]
|
|
break
|
|
}
|
|
}
|
|
} else {
|
|
min, max = math.Inf(1), math.Inf(-1)
|
|
for i, x := range s.Xs {
|
|
w := s.Weights[i]
|
|
if x < min && w != 0 {
|
|
min = x
|
|
}
|
|
if x > max && w != 0 {
|
|
max = x
|
|
}
|
|
}
|
|
if math.IsInf(min, 0) {
|
|
min, max = math.NaN(), math.NaN()
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Sum returns the (possibly weighted) sum of the Sample.
|
|
func (s Sample) Sum() float64 {
|
|
if s.Weights == nil {
|
|
return vec.Sum(s.Xs)
|
|
}
|
|
sum := 0.0
|
|
for i, x := range s.Xs {
|
|
sum += x * s.Weights[i]
|
|
}
|
|
return sum
|
|
}
|
|
|
|
// Weight returns the total weight of the Sasmple.
|
|
func (s Sample) Weight() float64 {
|
|
if s.Weights == nil {
|
|
return float64(len(s.Xs))
|
|
}
|
|
return vec.Sum(s.Weights)
|
|
}
|
|
|
|
// Mean returns the arithmetic mean of xs.
|
|
func Mean(xs []float64) float64 {
|
|
if len(xs) == 0 {
|
|
return math.NaN()
|
|
}
|
|
m := 0.0
|
|
for i, x := range xs {
|
|
m += (x - m) / float64(i+1)
|
|
}
|
|
return m
|
|
}
|
|
|
|
// Mean returns the arithmetic mean of the Sample.
|
|
func (s Sample) Mean() float64 {
|
|
if len(s.Xs) == 0 || s.Weights == nil {
|
|
return Mean(s.Xs)
|
|
}
|
|
|
|
m, wsum := 0.0, 0.0
|
|
for i, x := range s.Xs {
|
|
// Use weighted incremental mean:
|
|
// m_i = (1 - w_i/wsum_i) * m_(i-1) + (w_i/wsum_i) * x_i
|
|
// = m_(i-1) + (x_i - m_(i-1)) * (w_i/wsum_i)
|
|
w := s.Weights[i]
|
|
wsum += w
|
|
m += (x - m) * w / wsum
|
|
}
|
|
return m
|
|
}
|
|
|
|
// GeoMean returns the geometric mean of xs. xs must be positive.
|
|
func GeoMean(xs []float64) float64 {
|
|
if len(xs) == 0 {
|
|
return math.NaN()
|
|
}
|
|
m := 0.0
|
|
for i, x := range xs {
|
|
if x <= 0 {
|
|
return math.NaN()
|
|
}
|
|
lx := math.Log(x)
|
|
m += (lx - m) / float64(i+1)
|
|
}
|
|
return math.Exp(m)
|
|
}
|
|
|
|
// GeoMean returns the geometric mean of the Sample. All samples
|
|
// values must be positive.
|
|
func (s Sample) GeoMean() float64 {
|
|
if len(s.Xs) == 0 || s.Weights == nil {
|
|
return GeoMean(s.Xs)
|
|
}
|
|
|
|
m, wsum := 0.0, 0.0
|
|
for i, x := range s.Xs {
|
|
w := s.Weights[i]
|
|
wsum += w
|
|
lx := math.Log(x)
|
|
m += (lx - m) * w / wsum
|
|
}
|
|
return math.Exp(m)
|
|
}
|
|
|
|
// Variance returns the sample variance of xs.
|
|
func Variance(xs []float64) float64 {
|
|
if len(xs) == 0 {
|
|
return math.NaN()
|
|
} else if len(xs) <= 1 {
|
|
return 0
|
|
}
|
|
|
|
// Based on Wikipedia's presentation of Welford 1962
|
|
// (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm).
|
|
// This is more numerically stable than the standard two-pass
|
|
// formula and not prone to massive cancellation.
|
|
mean, M2 := 0.0, 0.0
|
|
for n, x := range xs {
|
|
delta := x - mean
|
|
mean += delta / float64(n+1)
|
|
M2 += delta * (x - mean)
|
|
}
|
|
return M2 / float64(len(xs)-1)
|
|
}
|
|
|
|
func (s Sample) Variance() float64 {
|
|
if len(s.Xs) == 0 || s.Weights == nil {
|
|
return Variance(s.Xs)
|
|
}
|
|
// TODO(austin)
|
|
panic("Weighted Variance not implemented")
|
|
}
|
|
|
|
// StdDev returns the sample standard deviation of xs.
|
|
func StdDev(xs []float64) float64 {
|
|
return math.Sqrt(Variance(xs))
|
|
}
|
|
|
|
// StdDev returns the sample standard deviation of the Sample.
|
|
func (s Sample) StdDev() float64 {
|
|
if len(s.Xs) == 0 || s.Weights == nil {
|
|
return StdDev(s.Xs)
|
|
}
|
|
// TODO(austin)
|
|
panic("Weighted StdDev not implemented")
|
|
}
|
|
|
|
// Quantile returns the sample value X at which q*weight of the sample
|
|
// is <= X. This uses interpolation method R8 from Hyndman and Fan
|
|
// (1996).
|
|
//
|
|
// q will be capped to the range [0, 1]. If len(xs) == 0 or all
|
|
// weights are 0, returns NaN.
|
|
//
|
|
// Quantile(0.5) is the median. Quantile(0.25) and Quantile(0.75) are
|
|
// the first and third quartiles, respectively. Quantile(P/100) is the
|
|
// P'th percentile.
|
|
//
|
|
// This is constant time if s.Sorted and s.Weights == nil.
|
|
func (s Sample) Quantile(q float64) float64 {
|
|
if len(s.Xs) == 0 {
|
|
return math.NaN()
|
|
} else if q <= 0 {
|
|
min, _ := s.Bounds()
|
|
return min
|
|
} else if q >= 1 {
|
|
_, max := s.Bounds()
|
|
return max
|
|
}
|
|
|
|
if !s.Sorted {
|
|
// TODO(austin) Use select algorithm instead
|
|
s = *s.Copy().Sort()
|
|
}
|
|
|
|
if s.Weights == nil {
|
|
N := float64(len(s.Xs))
|
|
//n := q * (N + 1) // R6
|
|
n := 1/3.0 + q*(N+1/3.0) // R8
|
|
kf, frac := math.Modf(n)
|
|
k := int(kf)
|
|
if k <= 0 {
|
|
return s.Xs[0]
|
|
} else if k >= len(s.Xs) {
|
|
return s.Xs[len(s.Xs)-1]
|
|
}
|
|
return s.Xs[k-1] + frac*(s.Xs[k]-s.Xs[k-1])
|
|
} else {
|
|
// TODO(austin): Implement interpolation
|
|
|
|
target := s.Weight() * q
|
|
|
|
// TODO(austin) If we had cumulative weights, we could
|
|
// do this in log time.
|
|
for i, weight := range s.Weights {
|
|
target -= weight
|
|
if target < 0 {
|
|
return s.Xs[i]
|
|
}
|
|
}
|
|
return s.Xs[len(s.Xs)-1]
|
|
}
|
|
}
|
|
|
|
// IQR returns the interquartile range of the Sample.
|
|
//
|
|
// This is constant time if s.Sorted and s.Weights == nil.
|
|
func (s Sample) IQR() float64 {
|
|
if !s.Sorted {
|
|
s = *s.Copy().Sort()
|
|
}
|
|
return s.Quantile(0.75) - s.Quantile(0.25)
|
|
}
|
|
|
|
type sampleSorter struct {
|
|
xs []float64
|
|
weights []float64
|
|
}
|
|
|
|
func (p *sampleSorter) Len() int {
|
|
return len(p.xs)
|
|
}
|
|
|
|
func (p *sampleSorter) Less(i, j int) bool {
|
|
return p.xs[i] < p.xs[j]
|
|
}
|
|
|
|
func (p *sampleSorter) Swap(i, j int) {
|
|
p.xs[i], p.xs[j] = p.xs[j], p.xs[i]
|
|
p.weights[i], p.weights[j] = p.weights[j], p.weights[i]
|
|
}
|
|
|
|
// Sort sorts the samples in place in s and returns s.
|
|
//
|
|
// A sorted sample improves the performance of some algorithms.
|
|
func (s *Sample) Sort() *Sample {
|
|
if s.Sorted || sort.Float64sAreSorted(s.Xs) {
|
|
// All set
|
|
} else if s.Weights == nil {
|
|
sort.Float64s(s.Xs)
|
|
} else {
|
|
sort.Sort(&sampleSorter{s.Xs, s.Weights})
|
|
}
|
|
s.Sorted = true
|
|
return s
|
|
}
|
|
|
|
// Copy returns a copy of the Sample.
|
|
//
|
|
// The returned Sample shares no data with the original, so they can
|
|
// be modified (for example, sorted) independently.
|
|
func (s Sample) Copy() *Sample {
|
|
xs := make([]float64, len(s.Xs))
|
|
copy(xs, s.Xs)
|
|
|
|
weights := []float64(nil)
|
|
if s.Weights != nil {
|
|
weights = make([]float64, len(s.Weights))
|
|
copy(weights, s.Weights)
|
|
}
|
|
|
|
return &Sample{xs, weights, s.Sorted}
|
|
}
|