2017-04-05 21:31:15 +00:00
|
|
|
package onlinestats
|
|
|
|
|
|
|
|
// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U
|
|
|
|
|
|
|
|
import (
|
|
|
|
"math"
|
|
|
|
"sort"
|
|
|
|
)
|
|
|
|
|
|
|
|
// MannWhitney performs a Mann-Whitney U test (also known as the Wilcoxon
// rank-sum test) on the two samples xs and ys.
//
// It returns the two-tailed p-value for the null hypothesis that the two
// samples come from the same distribution (commonly read as "the medians of
// the two samples are the same"). The p-value is derived from the normal
// approximation of U, which is more accurate if the number of samples is >30.
// Tied observations receive the average of the ranks they span, and the
// variance of U is corrected for those ties. No continuity correction is
// applied.
//
// Special cases:
//   - if either sample is empty, or any observation is NaN, the result is NaN;
//   - if every observation has the same value, the result is 1.
func MannWhitney(xs, ys []float64) float64 {
	n1, n2 := len(xs), len(ys)
	if n1 == 0 || n2 == 0 {
		// U is undefined without observations on both sides.
		return math.NaN()
	}

	// Pool both samples, remembering which sample each value came from
	// (0 for xs, 1 for ys). NaN cannot be ordered or ranked, so it is
	// propagated instead of silently distorting the rank sums.
	type observation struct {
		value  float64
		sample int
	}
	pooled := make([]observation, 0, n1+n2)
	for _, x := range xs {
		if math.IsNaN(x) {
			return math.NaN()
		}
		pooled = append(pooled, observation{value: x, sample: 0})
	}
	for _, y := range ys {
		if math.IsNaN(y) {
			return math.NaN()
		}
		pooled = append(pooled, observation{value: y, sample: 1})
	}
	sort.Slice(pooled, func(i, j int) bool {
		return pooled[i].value < pooled[j].value
	})

	// Walk runs of equal values. A run covering the 1-based ranks
	// start+1..end gives each member the average rank (start+1+end)/2.
	var rankSum [2]float64
	var tieTerm float64 // sum of t^3 - t over all runs of t tied values
	for start := 0; start < len(pooled); {
		end := start + 1
		for end < len(pooled) && pooled[end].value == pooled[start].value {
			end++
		}
		rank := float64(start+1+end) / 2
		for _, o := range pooled[start:end] {
			rankSum[o.sample] += rank
		}
		t := float64(end - start)
		tieTerm += t*t*t - t
		start = end
	}

	// Work in float64 from here on so that products of the sample sizes
	// cannot overflow int.
	fn1, fn2 := float64(n1), float64(n2)
	n := fn1 + fn2
	n1n2 := fn1 * fn2

	// The test statistic is the smaller of the two U values.
	u := math.Min(
		n1n2+fn1*(fn1+1)/2-rankSum[0],
		n1n2+fn2*(fn2+1)/2-rankSum[1],
	)

	mu := n1n2 / 2
	// Tie-corrected variance of U; reduces to n1*n2*(n+1)/12 without ties.
	variance := n1n2 * ((n + 1) - tieTerm/(n*(n-1))) / 12
	if variance <= 0 {
		// Every observation is tied: the samples are indistinguishable.
		return 1
	}
	z := math.Abs(u-mu) / math.Sqrt(variance)

	// 2*(1 - Phi(z)) == erfc(z/sqrt(2)). Using erfc directly avoids the
	// cancellation that would round very small p-values down to exactly 0.
	return math.Erfc(z / math.Sqrt2)
}
|
|
|
|
|
|
|
|
// cdf evaluates, at x, the cumulative distribution function of the normal
// distribution with the given mean and standard deviation. It is expressed
// through the error function as 0.5 + 0.5*erf((x-mean)/(stddev*sqrt(2))).
// stddev is not validated.
func cdf(mean, stddev, x float64) float64 {
	// Distance from the mean in units of stddev*sqrt(2), the argument erf expects.
	scaled := (x - mean) / (stddev * math.Sqrt2)
	half := 0.5 * math.Erf(scaled)
	return 0.5 + half
}
|