route/vendor/github.com/aclements/go-moremath/stats/utest.go

277 lines
8.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package stats
import (
"math"
"sort"
"github.com/aclements/go-moremath/mathx"
)
// LocationHypothesis selects the alternative hypothesis used by a
// location test (for example a t-test or a Mann-Whitney U-test).
// Its zero value is LocationDiffers, i.e. the two-tailed alternative
// that the sample locations are not equal.
type LocationHypothesis int

//go:generate stringer -type LocationHypothesis

// The three alternatives are numbered -1, 0, 1 so that the zero value
// is the two-tailed test.
const (
	// LocationLess is the one-tailed alternative that the location
	// of the first sample is less than that of the second.
	LocationLess LocationHypothesis = iota - 1

	// LocationDiffers is the two-tailed alternative that the
	// locations of the two samples are not equal.
	LocationDiffers

	// LocationGreater is the one-tailed alternative that the
	// location of the first sample is greater than that of the
	// second.
	LocationGreater
)
// MannWhitneyUTestResult holds the outcome of a Mann-Whitney U-test.
type MannWhitneyUTestResult struct {
	// N1 is the size of the first input sample.
	N1 int
	// N2 is the size of the second input sample.
	N2 int

	// U is the Mann-Whitney U statistic of the test, generalized
	// to handle ties by counting each tie as 0.5.
	//
	// Over the Cartesian product of the two samples, U counts the
	// pairs whose first-sample value is greater than the
	// second-sample value, and adds 0.5 for every pair whose two
	// values are equal. U is therefore always an integer multiple
	// of 0.5 (a whole number when there are no ties) and lies in
	// the range [0, N1*N2].
	//
	// U statistics come in pairs, one per choice of which sample
	// is "first". The mirror U for the other sample is
	// N1*N2 - U.
	//
	// Several equivalent statistics exist with slightly different
	// definitions. The Wilcoxon (1945) W statistic (generalized
	// for ties) is U + (N1(N1+1))/2. 2U is also commonly used to
	// eliminate the half steps, and Smid (1956) uses N1*N2 - 2U to
	// additionally center the distribution.
	U float64

	// AltHypothesis is the alternative hypothesis this test was
	// run against; the null hypothesis is that the locations of
	// the samples do not differ.
	AltHypothesis LocationHypothesis

	// P is the p-value of the Mann-Whitney test for the given
	// null hypothesis.
	P float64
}
// Sample-size thresholds below which the Mann-Whitney U-test uses the
// exact U distribution. Both are variables so callers can tune them.
var (
	// MannWhitneyExactLimit is the largest sample size for which
	// the Mann-Whitney U-test uses the exact U distribution.
	//
	// Small samples need the exact distribution because it is
	// highly irregular. For large samples computing it is both
	// computationally expensive and unnecessary, since it quickly
	// approaches a normal approximation. For two samples of 50
	// values each, computing the distribution takes a few
	// milliseconds on a 2014 laptop.
	MannWhitneyExactLimit = 50

	// MannWhitneyTiesExactLimit is the largest sample size for
	// which the Mann-Whitney U-test uses the exact U distribution
	// when the samples contain ties.
	//
	// The distribution with ties is more expensive to compute
	// than the one without, so this limit is lower. For two
	// samples of 25 values each, computing it takes about ten
	// milliseconds on a 2014 laptop.
	MannWhitneyTiesExactLimit = 25
)
// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null
// hypothesis that two samples come from the same population against
// the alternative hypothesis that one sample tends to have larger or
// smaller values than the other.
//
// This is similar to a t-test, but unlike the t-test, the
// Mann-Whitney U-test is non-parametric (it does not assume a normal
// distribution). It has very slightly lower efficiency than the
// t-test on normal distributions.
//
// Computing the exact U distribution is expensive for large sample
// sizes, so this uses a normal approximation for sample sizes larger
// than MannWhitneyExactLimit if there are no ties or
// MannWhitneyTiesExactLimit if there are ties. This normal
// approximation uses both the tie correction and the continuity
// correction.
//
// x1 and x2 are not modified. alt should be one of LocationLess,
// LocationDiffers or LocationGreater; for any other value no branch
// assigns the p-value and the result's P is left at 0.
//
// This can fail with ErrSampleSize if either sample is empty or
// ErrSamplesEqual if all sample values are equal.
//
// If either sample contains a NaN, ranks are undefined. In that case
// the returned result has U and P set to NaN and the error is nil.
//
// This is also known as a Mann-Whitney-Wilcoxon test and is
// equivalent to the Wilcoxon rank-sum test, though the Wilcoxon
// rank-sum test differs in nomenclature.
//
// [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of
// Whether one of Two Random Variables is Stochastically Larger than
// the Other". Annals of Mathematical Statistics 18 (1): 50-60.
//
// [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer".
// Journal of the American Statistical Association 61 (315): 772-787.
func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) {
	n1, n2 := len(x1), len(x2)
	if n1 == 0 || n2 == 0 {
		return nil, ErrSampleSize
	}

	// NaN never compares equal to itself, so the tie-grouping loop
	// below would fail to consume a NaN and never terminate.
	// Propagate the NaN to the result instead.
	for _, xs := range [][]float64{x1, x2} {
		for _, v := range xs {
			if math.IsNaN(v) {
				return &MannWhitneyUTestResult{N1: n1, N2: n2, U: math.NaN(),
					AltHypothesis: alt, P: math.NaN()}, nil
			}
		}
	}

	// Compute the U statistic and tie vector T. Sort copies so the
	// caller's slices are left untouched.
	x1 = append([]float64(nil), x1...)
	x2 = append([]float64(nil), x2...)
	sort.Float64s(x1)
	sort.Float64s(x2)
	merged, labels := labeledMerge(x1, x2)

	R1 := 0.0 // Sum of the (tie-averaged) ranks of the x1 values.
	T, hasTies := make([]int, 0, len(merged)), false
	for i := 0; i < len(merged); {
		rank1, nx1, v1 := i+1, 0, merged[i]
		// Consume samples that tie this sample (including itself).
		for ; i < len(merged) && merged[i] == v1; i++ {
			if labels[i] == 1 {
				nx1++
			}
		}
		// Assign all tied samples the average rank of the
		// samples, where merged[0] has rank 1. The group covers
		// ranks rank1..i, whose mean is (rank1+i)/2.
		if nx1 != 0 {
			rank := float64(i+rank1) / 2
			R1 += rank * float64(nx1)
		}
		// Record the size of this tie group.
		T = append(T, i-rank1+1)
		if i > rank1 {
			hasTies = true
		}
	}

	// Convert to float64 before multiplying so that the products
	// cannot overflow int (notably on 32-bit platforms).
	fn1, fn2 := float64(n1), float64(n2)
	U1 := R1 - fn1*(fn1+1)/2

	// Compute the smaller of U1 and U2.
	U2 := fn1*fn2 - U1
	Usmall := math.Min(U1, U2)

	var p float64
	if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit ||
		hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit {
		// Use exact U distribution.
		if len(T) == 1 {
			// All values are equal. Test is meaningless.
			return nil, ErrSamplesEqual
		}

		dist := UDist{N1: n1, N2: n2, T: T}
		switch alt {
		case LocationDiffers:
			if U1 == U2 {
				// The distribution is symmetric about
				// Usmall. Since the distribution is
				// discrete, the CDF is discontinuous
				// and if we simply double CDF(Usmall),
				// we'll double count the
				// (non-infinitesimal) probability
				// mass at Usmall. What we want is
				// just the integral of the whole CDF,
				// which is 1.
				p = 1
			} else {
				p = dist.CDF(Usmall) * 2
			}

		case LocationLess:
			p = dist.CDF(U1)

		case LocationGreater:
			p = 1 - dist.CDF(U1-1)
		}
	} else {
		// Use normal approximation (with tie and continuity
		// correction).
		t := tieCorrection(T)
		N := fn1 + fn2
		meanU := fn1 * fn2 / 2
		sdU := math.Sqrt(fn1 * fn2 * ((N + 1) - t/(N*(N-1))) / 12)
		if sdU == 0 {
			return nil, ErrSamplesEqual
		}
		numer := U1 - meanU
		// Perform continuity correction: move the statistic half
		// a step toward the mean (two-tailed) or against the
		// tested tail (one-tailed).
		switch alt {
		case LocationDiffers:
			numer -= mathx.Sign(numer) * 0.5
		case LocationLess:
			numer += 0.5
		case LocationGreater:
			numer -= 0.5
		}
		z := numer / sdU
		switch alt {
		case LocationDiffers:
			p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z))
		case LocationLess:
			p = StdNormal.CDF(z)
		case LocationGreater:
			p = 1 - StdNormal.CDF(z)
		}
	}

	return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1,
		AltHypothesis: alt, P: p}, nil
}
// labeledMerge interleaves the sorted slices x1 and x2 into a single
// sorted slice, merged. For every position k, labels[k] records the
// origin of merged[k]: 1 if it was taken from x1, 2 if it was taken
// from x2. When the heads of x1 and x2 are equal, the x2 value is
// emitted first.
func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) {
	total := len(x1) + len(x2)
	merged = make([]float64, total)
	labels = make([]byte, total)

	a, b := 0, 0 // Read positions in x1 and x2.
	for k := 0; k < total; k++ {
		// Take from x1 once x2 is exhausted, or while x1's head
		// is strictly smaller than x2's head; otherwise take
		// from x2.
		if b >= len(x2) || (a < len(x1) && x1[a] < x2[b]) {
			merged[k], labels[k] = x1[a], 1
			a++
		} else {
			merged[k], labels[k] = x2[b], 2
			b++
		}
	}
	return merged, labels
}
// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j)
// where t_j is the number of ties in the j'th rank.
//
// Each term is computed in float64 rather than int. Go integer
// arithmetic wraps silently on overflow, and t_j³ no longer fits in a
// 32-bit int once t_j reaches 1291 (or in a 64-bit int once t_j
// reaches 2097152). For inputs that do not overflow, the float64
// result is identical because every intermediate value is an exactly
// representable integer.
func tieCorrection(ties []int) float64 {
	sum := 0.0
	for _, tie := range ties {
		t := float64(tie)
		sum += t*t*t - t
	}
	return sum
}