425 lines
9.5 KiB
Go
425 lines
9.5 KiB
Go
package onlinestats
|
|
|
|
import (
|
|
"math"
|
|
"sort"
|
|
"strconv"
|
|
)
|
|
|
|
func SWilk(x []float64) (float64, float64, error) {
|
|
|
|
data := make([]float64, len(x)+1)
|
|
copy(data[1:], x)
|
|
sort.Float64s(data[1:])
|
|
data[0] = math.NaN()
|
|
|
|
length := len(x)
|
|
w, pw, err := swilkHelper(data, length, nil)
|
|
return w, pw, err
|
|
}
|
|
|
|
// Calculate the Shapiro-Wilk W test and its significance level
|
|
// Based on the public domain code at https://joinup.ec.europa.eu/svn/sextante/soft/sextante_lib/trunk/algorithms/src/es/unex/sextante/tables/normalityTest/SWilk.java
|
|
|
|
// Constants and polynomial coefficients for swilk().
//
// NOTE: FORTRAN counts the elements of an array of the given length as
// x[1] through x[length], not x[0] through x[length-1]. To avoid making
// pervasive, subtle changes to the algorithm (which would inevitably
// introduce pervasive, subtle bugs) the referenced arrays are padded with an
// unused 0th element (NaN here), and the algorithm is ported so as to
// continue accessing from [1] through [length].
var (
	c1 = []float64{math.NaN(), 0, 0.221157, -0.147981, -2.07119, 4.434685, -2.706056}
	c2 = []float64{math.NaN(), 0, 0.042981, -0.293762, -1.752461, 5.682633, -3.582633}
	c3 = []float64{math.NaN(), 0.5440, -0.39978, 0.025054, -0.0006714}
	c4 = []float64{math.NaN(), 1.3822, -0.77857, 0.062767, -0.0020322}
	c5 = []float64{math.NaN(), -1.5861, -0.31082, -0.083751, 0.0038915}
	c6 = []float64{math.NaN(), -0.4803, -0.082676, 0.0030302}
	c7 = []float64{math.NaN(), 0.164, 0.533}
	c8 = []float64{math.NaN(), 0.1736, 0.315}
	c9 = []float64{math.NaN(), 0.256, -0.00635}
	g  = []float64{math.NaN(), -2.273, 0.459}
)
|
|
|
|
// Scalar constants for swilk().
const (
	// z90 through xx95 are not referenced by the code visible in this file.
	z90  = 1.2816
	z95  = 1.6449
	z99  = 2.3263
	zm   = 1.7509
	zss  = 0.56268
	bf1  = 0.8378
	xx90 = 0.556
	xx95 = 0.622

	sqrth = 0.70711 // coefficient a[1] stored by SwilkCoeffs when n == 3
	th    = 0.375   // offset subtracted from i in the ppnd argument in SwilkCoeffs
	small = 1e-19   // zero-range threshold; also the p-value returned when y >= gamma
	pi6   = 1.909859 // scale factor of the exact n == 3 p-value (numerically close to 6/pi)
	stqr  = 1.047198 // offset of the exact n == 3 p-value (numerically close to pi/3)
	upper = true     // passed to alnorm to request the upper tail
)
|
|
|
|
/**
|
|
* ALGORITHM AS R94 APPL. STATIST. (1995) VOL.44, NO.4
|
|
*
|
|
* Calculates Shapiro-Wilk normality test and P-value for sample sizes 3 <= n <= 5000 .
|
|
* Corrects AS 181, which was found to be inaccurate for n > 50.
|
|
*
|
|
* As described above with the constants, the data slices x and a are indexed from a base element of 1 (like FORTRAN)
* instead of 0 (like Go) so that the ported algorithm stays unchanged. To pass in 100 data points, allocate x with
* length 101 and fill elements x[1] through x[100] with data. x[0] will be ignored.
|
|
*
|
|
* @param x
|
|
* Input; Data set to analyze; 100 points go in x[101] array from x[1] through x[100]
|
|
* @param n
|
|
* Input; Number of data points in x
|
|
* @param a
|
|
* Shapiro-Wilk coefficients. Can be nil, or pre-computed by swilkCoeffs and passed in.
|
|
*/
|
|
|
|
// SwilkFault is the error type of the Shapiro-Wilk routines; its integer
// value is the fault code. swilkHelper produces code 1 for n < 3, code 2 for
// n > 5000 and code 6 for data with (near) zero range.
type SwilkFault int

// Error implements the error interface: the fixed prefix "swilk fault "
// followed by the fault code in decimal.
func (fault SwilkFault) Error() string {
	code := int(fault)
	return "swilk fault " + strconv.Itoa(code)
}
|
|
|
|
func swilkHelper(x []float64, n int, a []float64) (w float64, pw float64, err error) {
|
|
|
|
if n > 5000 {
|
|
return 0, 0, SwilkFault(2)
|
|
}
|
|
|
|
pw = 1.0
|
|
if w >= 0.0 {
|
|
w = 1.0
|
|
}
|
|
an := float64(n)
|
|
if n < 3 {
|
|
return 0, 0, SwilkFault(1)
|
|
}
|
|
|
|
if a == nil {
|
|
a = SwilkCoeffs(n)
|
|
}
|
|
|
|
if n < 3 {
|
|
return
|
|
}
|
|
|
|
// If W input as negative, calculate significance level of -W
|
|
var w1, xx float64
|
|
if w < 0.0 {
|
|
w1 = 1.0 + w
|
|
} else {
|
|
|
|
// Check for zero range
|
|
|
|
range_ := x[n] - x[1]
|
|
if range_ < small {
|
|
return 0, 0, SwilkFault(6)
|
|
}
|
|
|
|
// Check for correct sort order on range - scaled X
|
|
// TODO(dgryski): did the FORTRAN code puke on out-of-order X ? with ifault=7 ?
|
|
xx = x[1] / range_
|
|
sx := xx
|
|
sa := -a[1]
|
|
j := n - 1
|
|
for i := 2; i <= n; i++ {
|
|
xi := x[i] / range_
|
|
// IF (XX-XI .GT. SMALL) PRINT *,' ANYTHING'
|
|
sx += xi
|
|
if i != j {
|
|
sa += float64(sign(1, i-j)) * a[imin(i, j)]
|
|
}
|
|
xx = xi
|
|
j--
|
|
}
|
|
|
|
// Calculate W statistic as squared correlation between data and coefficients
|
|
sa /= float64(n)
|
|
sx /= float64(n)
|
|
ssa := 0.0
|
|
ssx := 0.0
|
|
sax := 0.0
|
|
j = n
|
|
var asa float64
|
|
for i := 1; i <= n; i++ {
|
|
if i != j {
|
|
asa = float64(sign(1, i-j))*a[imin(i, j)] - sa
|
|
} else {
|
|
asa = -sa
|
|
}
|
|
xsx := x[i]/range_ - sx
|
|
ssa += asa * asa
|
|
ssx += xsx * xsx
|
|
sax += asa * xsx
|
|
j--
|
|
}
|
|
|
|
// W1 equals (1-W) calculated to avoid excessive rounding error
|
|
// for W very near 1 (a potential problem in very large samples)
|
|
|
|
ssassx := math.Sqrt(ssa * ssx)
|
|
w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx)
|
|
}
|
|
w = 1.0 - w1
|
|
|
|
// Calculate significance level for W (exact for N=3)
|
|
|
|
if n == 3 {
|
|
pw = pi6 * (math.Asin(math.Sqrt(w)) - stqr)
|
|
return w, pw, nil
|
|
}
|
|
y := math.Log(w1)
|
|
xx = math.Log(an)
|
|
m := 0.0
|
|
s := 1.0
|
|
if n <= 11 {
|
|
gamma := poly(g, 2, an)
|
|
if y >= gamma {
|
|
pw = small
|
|
return w, pw, nil
|
|
}
|
|
y = -math.Log(gamma - y)
|
|
m = poly(c3, 4, an)
|
|
s = math.Exp(poly(c4, 4, an))
|
|
} else {
|
|
m = poly(c5, 4, xx)
|
|
s = math.Exp(poly(c6, 3, xx))
|
|
}
|
|
pw = alnorm((y-m)/s, upper)
|
|
|
|
return w, pw, nil
|
|
}
|
|
|
|
// Precomputes the coefficients array a for SWilk
|
|
func SwilkCoeffs(n int) []float64 {
|
|
|
|
a := make([]float64, n+1)
|
|
|
|
an := float64(n)
|
|
|
|
n2 := n / 2
|
|
|
|
if n == 3 {
|
|
a[1] = sqrth
|
|
} else {
|
|
an25 := an + 0.25
|
|
summ2 := 0.0
|
|
for i := 1; i <= n2; i++ {
|
|
a[i] = ppnd((float64(i) - th) / an25)
|
|
summ2 += a[i] * a[i]
|
|
}
|
|
summ2 *= 2.0
|
|
ssumm2 := math.Sqrt(summ2)
|
|
rsn := 1.0 / math.Sqrt(an)
|
|
a1 := poly(c1, 6, rsn) - a[1]/ssumm2
|
|
|
|
// Normalize coefficients
|
|
|
|
var i1 int
|
|
var fac float64
|
|
if n > 5 {
|
|
i1 = 3
|
|
a2 := -a[2]/ssumm2 + poly(c2, 6, rsn)
|
|
fac = math.Sqrt((summ2 - 2.0*a[1]*a[1] - 2.0*a[2]*a[2]) / (1.0 - 2.0*a1*a1 - 2.0*a2*a2))
|
|
a[1] = a1
|
|
a[2] = a2
|
|
} else {
|
|
i1 = 2
|
|
fac = math.Sqrt((summ2 - 2.0*a[1]*a[1]) / (1.0 - 2.0*a1*a1))
|
|
a[1] = a1
|
|
}
|
|
for i := i1; i <= n2; i++ {
|
|
a[i] = -a[i] / fac
|
|
}
|
|
}
|
|
|
|
return a
|
|
}
|
|
|
|
// sign returns an int with the absolute value of x and the sign of y.
// A y of zero is treated as non-negative, so the result is then |x|.
func sign(x int, y int) int {
	magnitude := x
	if magnitude < 0 {
		magnitude = -magnitude
	}
	if y >= 0 {
		return magnitude
	}
	return -magnitude
}
|
|
|
|
// Constants & polynomial coefficients for ppnd(), slightly renamed to avoid
// conflicts with the names used by swilk().
const (
	// Coefficients for P close to 0.5.
	a0_p = 3.3871327179e+00
	a1_p = 5.0434271938e+01
	a2_p = 1.5929113202e+02
	a3_p = 5.9109374720e+01
	b1_p = 1.7895169469e+01
	b2_p = 7.8757757664e+01
	b3_p = 6.7187563600e+01

	// Coefficients for P not close to 0, 0.5 or 1.
	c0_p = 1.4234372777e+00
	c1_p = 2.7568153900e+00
	c2_p = 1.3067284816e+00
	c3_p = 1.7023821103e-01
	d1_p = 7.3700164250e-01
	d2_p = 1.2021132975e-01

	// Coefficients for P near 0 or 1.
	e0_p = 6.6579051150e+00
	e1_p = 3.0812263860e+00
	e2_p = 4.2868294337e-01
	e3_p = 1.7337203997e-02
	f1_p = 2.4197894225e-01
	f2_p = 1.2258202635e-02

	// Region boundaries and shifts.
	split1 = 0.425    // |p - 0.5| at or below this uses the central approximation
	split2 = 5.0      // sqrt(-log(tail)) at or below this uses the middle approximation
	const1 = 0.180625 // r = const1 - q*q in the central region
	const2 = 1.6      // shift applied to r in the middle region
)

// ppnd produces the normal deviate Z corresponding to a given lower tail area
// of p (ALGORITHM AS 241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477-484); Z is
// accurate to about 1 part in 10**7.
//
// When the tail area is not positive (p <= 0 or p >= 1) the function returns
// 0 rather than signalling an error.
func ppnd(p float64) float64 {
	q := p - 0.5

	// Central region: rational approximation in r = const1 - q*q.
	if math.Abs(q) <= split1 {
		r := const1 - q*q
		num := ((a3_p*r+a2_p)*r+a1_p)*r + a0_p
		den := ((b3_p*r+b2_p)*r+b1_p)*r + 1.0
		return q * num / den
	}

	// Tails: work with the area on the side p lies on.
	tail := 1.0 - p
	if q < 0.0 {
		tail = p
	}
	if tail <= 0.0 {
		return 0.0
	}

	r := math.Sqrt(-math.Log(tail))
	var deviate float64
	if r <= split2 {
		r -= const2
		deviate = (((c3_p*r+c2_p)*r+c1_p)*r + c0_p) / ((d2_p*r+d1_p)*r + 1.0)
	} else {
		r -= split2
		deviate = (((e3_p*r+e2_p)*r+e1_p)*r + e0_p) / ((f2_p*r+f1_p)*r + 1.0)
	}

	// The approximations yield the magnitude; the lower tail is negative.
	if q < 0.0 {
		return -deviate
	}
	return deviate
}
|
|
|
|
// poly calculates the algebraic polynomial of order nord-1 in x with array of
// coefficients c (Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2).
//
// The coefficients are stored 1-based: c[1] is the zero order coefficient and
// c[nord] multiplies the highest power of x. c[0] is not read when nord >= 1.
func poly(c []float64, nord int, x float64) float64 {
	constant := c[1]
	if nord == 1 {
		return constant
	}

	// Horner's scheme over c[nord] down to c[2]; the accumulated value already
	// carries one factor of x, so the constant term is added at the very end.
	acc := x * c[nord]
	for k := nord - 1; k >= 2; k-- {
		acc = (acc + c[k]) * x
	}
	return constant + acc
}
|
|
|
|
// Constants & polynomial coefficients for alnorm(), slightly renamed to avoid
// conflicts with other names in this file.
const (
	con_a    = 1.28  // z at or below this uses the first approximation
	ltone_a  = 7.0   // beyond this z the lower-tail result is taken as exactly 1
	utzero_a = 18.66 // beyond this z the upper-tail result is taken as exactly 0

	p_a = 0.398942280444
	q_a = 0.39990348504
	r_a = 0.398942280385

	a1_a = 5.75885480458
	a2_a = 2.62433121679
	a3_a = 5.92885724438

	b1_a = -29.8213557807
	b2_a = 48.6959930692

	c1_a = -3.8052e-8
	c2_a = 3.98064794e-4
	c3_a = -0.151679116635
	c4_a = 4.8385912808
	c5_a = 0.742380924027
	c6_a = 3.99019417011

	d1_a = 1.00000615302
	d2_a = 1.98615381364
	d3_a = 5.29330324926
	d4_a = -15.1508972451
	d5_a = 30.789933034
)

// alnorm evaluates the tail area of the standardised normal curve from x to
// infinity if upper is true, or from minus infinity to x if upper is false
// (Algorithm AS66 Applied Statistics (1973) vol.22, no.3).
func alnorm(x float64, upper bool) float64 {
	// Fold negative x onto the positive axis; by symmetry that swaps which
	// tail is being asked for.
	z, wantUpper := x, upper
	if x < 0.0 {
		z, wantUpper = -x, !upper
	}

	// area is always the upper tail of z at this point.
	y := 0.5 * z * z
	var area float64
	switch {
	case z > ltone_a && (!wantUpper || z > utzero_a):
		area = 0.0
	case z <= con_a:
		area = 0.5 - z*(p_a-q_a*y/(y+a1_a+b1_a/(y+a2_a+b2_a/(y+a3_a))))
	default:
		area = r_a * math.Exp(-y) / (z + c1_a + d1_a/(z+c2_a+d2_a/(z+c3_a+d3_a/(z+c4_a+d4_a/(z+c5_a+d5_a/(z+c6_a))))))
	}

	if wantUpper {
		return area
	}
	return 1.0 - area
}
|
|
|
|
// imin returns the smaller of the ints i and j.
func imin(i, j int) int {
	smaller := j
	if i < j {
		smaller = i
	}
	return smaller
}
|