// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stats

import (
	"math"
	"sort"

	"github.com/aclements/go-moremath/mathx"
)

// A LocationHypothesis specifies the alternative hypothesis of a
// location test such as a t-test or a Mann-Whitney U-test. The
// default (zero) value is LocationDiffers, which tests against the
// alternative hypothesis that the locations of the two samples
// differ.
type LocationHypothesis int

//go:generate stringer -type LocationHypothesis

const (
	// LocationLess specifies the alternative hypothesis that the
	// location of the first sample is less than the second. This
	// is a one-tailed test.
	LocationLess LocationHypothesis = -1

	// LocationDiffers specifies the alternative hypothesis that
	// the locations of the two samples are not equal. This is a
	// two-tailed test.
	LocationDiffers LocationHypothesis = 0

	// LocationGreater specifies the alternative hypothesis that
	// the location of the first sample is greater than the
	// second. This is a one-tailed test.
	LocationGreater LocationHypothesis = 1
)

// A MannWhitneyUTestResult is the result of a Mann-Whitney U-test.
type MannWhitneyUTestResult struct {
	// N1 and N2 are the sizes of the first and second input
	// samples, respectively.
	N1, N2 int

	// U is the value of the Mann-Whitney U statistic for this
	// test, generalized by counting ties as 0.5. It is the
	// statistic of the first sample.
	//
	// Given the Cartesian product of the two samples, this is the
	// number of pairs in which the value from the first sample is
	// greater than the value of the second, plus 0.5 times the
	// number of pairs where the values from the two samples are
	// equal. Hence, U is always an integer multiple of 0.5 (it is
	// a whole integer if there are no ties) in the range [0, N1*N2].
	//
	// U statistics always come in pairs, depending on which
	// sample is "first". The mirror U for the other sample can be
	// calculated as N1*N2 - U.
	//
	// There are many equivalent statistics with slightly
	// different definitions. The Wilcoxon (1945) W statistic
	// (generalized for ties) is U + (N1(N1+1))/2. It is also
	// common to use 2U to eliminate the half steps and Smid
	// (1956) uses N1*N2 - 2U to additionally center the
	// distribution.
	U float64

	// AltHypothesis specifies the alternative hypothesis tested
	// by this test against the null hypothesis that there is no
	// difference in the locations of the samples.
	AltHypothesis LocationHypothesis

	// P is the p-value of the Mann-Whitney test of the null
	// hypothesis against the alternative given by AltHypothesis.
	P float64
}

// MannWhitneyExactLimit gives the largest sample size for which the
// exact U distribution will be used for the Mann-Whitney U-test when
// there are no ties. Both samples must be no larger than this limit
// for the exact distribution to be used.
//
// Using the exact distribution is necessary for small sample sizes
// because the distribution is highly irregular. However, computing
// the distribution for large sample sizes is both computationally
// expensive and unnecessary because it quickly approaches a normal
// approximation. Computing the distribution for two 50 value samples
// takes a few milliseconds on a 2014 laptop.
var MannWhitneyExactLimit = 50

// MannWhitneyTiesExactLimit gives the largest sample size for which
// the exact U distribution will be used for the Mann-Whitney U-test
// in the presence of ties. Both samples must be no larger than this
// limit for the exact distribution to be used.
//
// Computing this distribution is more expensive than computing the
// distribution without ties, so this is set lower. Computing this
// distribution for two 25 value samples takes about ten milliseconds
// on a 2014 laptop.
var MannWhitneyTiesExactLimit = 25

// MannWhitneyUTest performs a Mann-Whitney U-test [1,2] of the null
// hypothesis that two samples come from the same population against
// the alternative hypothesis that one sample tends to have larger or
// smaller values than the other.
//
// This is similar to a t-test, but unlike the t-test, the
// Mann-Whitney U-test is non-parametric (it does not assume a normal
// distribution). It has very slightly lower efficiency than the
// t-test on normal distributions.
// // Computing the exact U distribution is expensive for large sample // sizes, so this uses a normal approximation for sample sizes larger // than MannWhitneyExactLimit if there are no ties or // MannWhitneyTiesExactLimit if there are ties. This normal // approximation uses both the tie correction and the continuity // correction. // // This can fail with ErrSampleSize if either sample is empty or // ErrSamplesEqual if all sample values are equal. // // This is also known as a Mann-Whitney-Wilcoxon test and is // equivalent to the Wilcoxon rank-sum test, though the Wilcoxon // rank-sum test differs in nomenclature. // // [1] Mann, Henry B.; Whitney, Donald R. (1947). "On a Test of // Whether one of Two Random Variables is Stochastically Larger than // the Other". Annals of Mathematical Statistics 18 (1): 50–60. // // [2] Klotz, J. H. (1966). "The Wilcoxon, Ties, and the Computer". // Journal of the American Statistical Association 61 (315): 772-787. func MannWhitneyUTest(x1, x2 []float64, alt LocationHypothesis) (*MannWhitneyUTestResult, error) { n1, n2 := len(x1), len(x2) if n1 == 0 || n2 == 0 { return nil, ErrSampleSize } // Compute the U statistic and tie vector T. x1 = append([]float64(nil), x1...) x2 = append([]float64(nil), x2...) sort.Float64s(x1) sort.Float64s(x2) merged, labels := labeledMerge(x1, x2) R1 := 0.0 T, hasTies := []int{}, false for i := 0; i < len(merged); { rank1, nx1, v1 := i+1, 0, merged[i] // Consume samples that tie this sample (including itself). for ; i < len(merged) && merged[i] == v1; i++ { if labels[i] == 1 { nx1++ } } // Assign all tied samples the average rank of the // samples, where merged[0] has rank 1. 
if nx1 != 0 { rank := float64(i+rank1) / 2 R1 += rank * float64(nx1) } T = append(T, i-rank1+1) if i > rank1 { hasTies = true } } U1 := R1 - float64(n1*(n1+1))/2 // Compute the smaller of U1 and U2 U2 := float64(n1*n2) - U1 Usmall := math.Min(U1, U2) var p float64 if !hasTies && n1 <= MannWhitneyExactLimit && n2 <= MannWhitneyExactLimit || hasTies && n1 <= MannWhitneyTiesExactLimit && n2 <= MannWhitneyTiesExactLimit { // Use exact U distribution. U1 will be an integer. if len(T) == 1 { // All values are equal. Test is meaningless. return nil, ErrSamplesEqual } dist := UDist{N1: n1, N2: n2, T: T} switch alt { case LocationDiffers: if U1 == U2 { // The distribution is symmetric about // Usmall. Since the distribution is // discrete, the CDF is discontinuous // and if simply double CDF(Usmall), // we'll double count the // (non-infinitesimal) probability // mass at Usmall. What we want is // just the integral of the whole CDF, // which is 1. p = 1 } else { p = dist.CDF(Usmall) * 2 } case LocationLess: p = dist.CDF(U1) case LocationGreater: p = 1 - dist.CDF(U1-1) } } else { // Use normal approximation (with tie and continuity // correction). t := tieCorrection(T) N := float64(n1 + n2) μ_U := float64(n1*n2) / 2 σ_U := math.Sqrt(float64(n1*n2) * ((N + 1) - t/(N*(N-1))) / 12) if σ_U == 0 { return nil, ErrSamplesEqual } numer := U1 - μ_U // Perform continuity correction. switch alt { case LocationDiffers: numer -= mathx.Sign(numer) * 0.5 case LocationLess: numer += 0.5 case LocationGreater: numer -= 0.5 } z := numer / σ_U switch alt { case LocationDiffers: p = 2 * math.Min(StdNormal.CDF(z), 1-StdNormal.CDF(z)) case LocationLess: p = StdNormal.CDF(z) case LocationGreater: p = 1 - StdNormal.CDF(z) } } return &MannWhitneyUTestResult{N1: n1, N2: n2, U: U1, AltHypothesis: alt, P: p}, nil } // labeledMerge merges sorted lists x1 and x2 into sorted list merged. // labels[i] is 1 or 2 depending on whether merged[i] is a value from // x1 or x2, respectively. 
//
// Where a value occurs in both inputs, the copies from x2 are placed
// ahead of the copies from x1. Neither input is modified, and both
// results always have length len(x1)+len(x2).
func labeledMerge(x1, x2 []float64) (merged []float64, labels []byte) {
	total := len(x1) + len(x2)
	merged = make([]float64, 0, total)
	labels = make([]byte, 0, total)

	i, j := 0, 0
	for i < len(x1) && j < len(x2) {
		// Only a strictly smaller x1 value goes first, so on equal
		// values the x2 element is emitted before the x1 element.
		if x1[i] < x2[j] {
			merged = append(merged, x1[i])
			labels = append(labels, 1)
			i++
		} else {
			merged = append(merged, x2[j])
			labels = append(labels, 2)
			j++
		}
	}

	// At most one of the inputs still has elements left; copy its
	// tail across unchanged.
	for ; i < len(x1); i++ {
		merged = append(merged, x1[i])
		labels = append(labels, 1)
	}
	for ; j < len(x2); j++ {
		merged = append(merged, x2[j])
		labels = append(labels, 2)
	}
	return merged, labels
}

// tieCorrection computes the tie correction factor Σ_j (t_j³ - t_j)
// where t_j is the number of ties in the j'th rank.
//
// The sum is accumulated in float64 rather than int because t_j³
// exceeds the range of int for large tie groups (above 1290 when int
// is 32 bits wide), and integer overflow would silently wrap. The
// result is exact as long as the total stays below 2^53.
func tieCorrection(ties []int) float64 {
	sum := 0.0
	for _, tie := range ties {
		t := float64(tie)
		sum += t*t*t - t
	}
	return sum
}