route/vendor/github.com/dgryski/go-onlinestats/swilk.go

package onlinestats

import (
	"math"
	"sort"
	"strconv"
)

// SWilk calculates the Shapiro-Wilk W test and its significance level.
// Based on the public domain code at https://joinup.ec.europa.eu/svn/sextante/soft/sextante_lib/trunk/algorithms/src/es/unex/sextante/tables/normalityTest/SWilk.java
func SWilk(x []float64) (float64, float64, error) {
	// Copy the data into a 1-based, padded array (see the indexing note below) and sort it.
	data := make([]float64, len(x)+1)
	copy(data[1:], x)
	sort.Float64s(data[1:])
	data[0] = math.NaN()

	length := len(x)
	w, pw, err := swilkHelper(data, length, nil)
	return w, pw, err
}
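
// swilkUsageExample is an illustrative sketch, not part of the original
// library API: it shows one way SWilk might be called. The 0.05 cutoff is an
// assumed significance level chosen only for this example.
func swilkUsageExample(sample []float64) (looksNormal bool, err error) {
	// SWilk copies and sorts the data internally, so the caller need not sort.
	_, pw, err := SWilk(sample)
	if err != nil {
		return false, err // n < 3, n > 5000, or zero range in the data
	}
	// A p-value below the chosen level rejects the hypothesis of normality.
	return pw >= 0.05, nil
}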

/*
 * Constants and polynomial coefficients for swilk(). NOTE: FORTRAN counts the elements of the array x[length] as
 * x[1] through x[length], not x[0] through x[length-1]. To avoid making pervasive, subtle changes to the algorithm
 * (which would inevitably introduce pervasive, subtle bugs) the referenced arrays are padded with an unused 0th
 * element, and the algorithm is ported so as to continue accessing from [1] through [length].
 */
var c1 = []float64{math.NaN(), 0.0E0, 0.221157E0, -0.147981E0, -0.207119E1, 0.4434685E1, -0.2706056E1}
var c2 = []float64{math.NaN(), 0.0E0, 0.42981E-1, -0.293762E0, -0.1752461E1, 0.5682633E1, -0.3582633E1}
var c3 = []float64{math.NaN(), 0.5440E0, -0.39978E0, 0.25054E-1, -0.6714E-3}
var c4 = []float64{math.NaN(), 0.13822E1, -0.77857E0, 0.62767E-1, -0.20322E-2}
var c5 = []float64{math.NaN(), -0.15861E1, -0.31082E0, -0.83751E-1, 0.38915E-2}
var c6 = []float64{math.NaN(), -0.4803E0, -0.82676E-1, 0.30302E-2}
var c7 = []float64{math.NaN(), 0.164E0, 0.533E0}
var c8 = []float64{math.NaN(), 0.1736E0, 0.315E0}
var c9 = []float64{math.NaN(), 0.256E0, -0.635E-2}
var g = []float64{math.NaN(), -0.2273E1, 0.459E0}

const (
	z90   = 0.12816E1
	z95   = 0.16449E1
	z99   = 0.23263E1
	zm    = 0.17509E1
	zss   = 0.56268E0
	bf1   = 0.8378E0
	xx90  = 0.556E0
	xx95  = 0.622E0
	sqrth = 0.70711E0
	th    = 0.375E0
	small = 1E-19
	pi6   = 0.1909859E1
	stqr  = 0.1047198E1
	upper = true
)

type SwilkFault int

func (s SwilkFault) Error() string {
	return "swilk fault " + strconv.Itoa(int(s))
}

/**
 * ALGORITHM AS R94 APPL. STATIST. (1995) VOL.44, NO.4
 *
 * Calculates the Shapiro-Wilk normality test and P-value for sample sizes 3 <= n <= 5000.
 * Corrects AS 181, which was found to be inaccurate for n > 50.
 *
 * As described above with the constants, the data arrays x[] and a[] are referenced with a base element of 1 (like FORTRAN)
 * instead of 0 (like Java) to avoid screwing up the algorithm. To pass in 100 data points, declare x[101] and fill elements
 * x[1] through x[100] with data. x[0] will be ignored.
 *
 * @param x
 *            Input; data set to analyze; 100 points go in the x[101] array, from x[1] through x[100]
 * @param n
 *            Input; number of data points in x
 * @param a
 *            Shapiro-Wilk coefficients. Can be nil, or pre-computed by SwilkCoeffs and passed in.
 */
func swilkHelper(x []float64, n int, a []float64) (w float64, pw float64, err error) {
	if n > 5000 {
		return 0, 0, SwilkFault(2)
	}

	pw = 1.0
	if w >= 0.0 {
		w = 1.0
	}
	an := float64(n)

	if n < 3 {
		return 0, 0, SwilkFault(1)
	}

	if a == nil {
		a = SwilkCoeffs(n)
	}

	if n < 3 {
		return
	}

	// If W input as negative, calculate significance level of -W
	var w1, xx float64
	if w < 0.0 {
		w1 = 1.0 + w
	} else {
		// Check for zero range
		range_ := x[n] - x[1]
		if range_ < small {
			return 0, 0, SwilkFault(6)
		}

		// Check for correct sort order on range - scaled X
		// TODO(dgryski): did the FORTRAN code puke on out-of-order X ? with ifault=7 ?
		xx = x[1] / range_
		sx := xx
		sa := -a[1]
		j := n - 1
		for i := 2; i <= n; i++ {
			xi := x[i] / range_
			// IF (XX-XI .GT. SMALL) PRINT *,' ANYTHING'
			sx += xi
			if i != j {
				sa += float64(sign(1, i-j)) * a[imin(i, j)]
			}
			xx = xi
			j--
		}

		// Calculate W statistic as squared correlation between data and coefficients
		sa /= float64(n)
		sx /= float64(n)
		ssa := 0.0
		ssx := 0.0
		sax := 0.0
		j = n
		var asa float64
		for i := 1; i <= n; i++ {
			if i != j {
				asa = float64(sign(1, i-j))*a[imin(i, j)] - sa
			} else {
				asa = -sa
			}
			xsx := x[i]/range_ - sx
			ssa += asa * asa
			ssx += xsx * xsx
			sax += asa * xsx
			j--
		}

		// W1 equals (1-W) calculated to avoid excessive rounding error
		// for W very near 1 (a potential problem in very large samples)
		ssassx := math.Sqrt(ssa * ssx)
		w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx)
	}

	w = 1.0 - w1

	// Calculate significance level for W (exact for N=3)
	if n == 3 {
		pw = pi6 * (math.Asin(math.Sqrt(w)) - stqr)
		return w, pw, nil
	}

	y := math.Log(w1)
	xx = math.Log(an)
	m := 0.0
	s := 1.0

	if n <= 11 {
		gamma := poly(g, 2, an)
		if y >= gamma {
			pw = small
			return w, pw, nil
		}
		y = -math.Log(gamma - y)
		m = poly(c3, 4, an)
		s = math.Exp(poly(c4, 4, an))
	} else {
		m = poly(c5, 4, xx)
		s = math.Exp(poly(c6, 3, xx))
	}

	pw = alnorm((y-m)/s, upper)
	return w, pw, nil
}

// SwilkCoeffs precomputes the coefficients array a for SWilk.
func SwilkCoeffs(n int) []float64 {
	a := make([]float64, n+1)

	an := float64(n)
	n2 := n / 2

	if n == 3 {
		a[1] = sqrth
	} else {
		an25 := an + 0.25
		summ2 := 0.0
		for i := 1; i <= n2; i++ {
			a[i] = ppnd((float64(i) - th) / an25)
			summ2 += a[i] * a[i]
		}
		summ2 *= 2.0
		ssumm2 := math.Sqrt(summ2)
		rsn := 1.0 / math.Sqrt(an)
		a1 := poly(c1, 6, rsn) - a[1]/ssumm2

		// Normalize coefficients
		var i1 int
		var fac float64
		if n > 5 {
			i1 = 3
			a2 := -a[2]/ssumm2 + poly(c2, 6, rsn)
			fac = math.Sqrt((summ2 - 2.0*a[1]*a[1] - 2.0*a[2]*a[2]) / (1.0 - 2.0*a1*a1 - 2.0*a2*a2))
			a[1] = a1
			a[2] = a2
		} else {
			i1 = 2
			fac = math.Sqrt((summ2 - 2.0*a[1]*a[1]) / (1.0 - 2.0*a1*a1))
			a[1] = a1
		}
		for i := i1; i <= n2; i++ {
			a[i] = -a[i] / fac
		}
	}

	return a
}
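
// swilkBatchExample is an illustrative sketch, not part of the original
// library: it shows how the coefficient array from SwilkCoeffs might be
// computed once and reused across several samples of the same length n,
// mirroring the 1-based padding and sorting that SWilk performs internally.
func swilkBatchExample(samples [][]float64, n int) ([]float64, error) {
	a := SwilkCoeffs(n) // the coefficients depend only on n, so they can be shared
	pvalues := make([]float64, 0, len(samples))
	for _, s := range samples {
		// Build the padded, sorted, 1-based copy that swilkHelper expects.
		data := make([]float64, n+1)
		copy(data[1:], s)
		sort.Float64s(data[1:])
		data[0] = math.NaN()

		_, pw, err := swilkHelper(data, n, a)
		if err != nil {
			return nil, err
		}
		pvalues = append(pvalues, pw)
	}
	return pvalues, nil
}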

/**
 * Constructs an int with the absolute value of x and the sign of y
 *
 * @param x
 *            int to copy absolute value from
 * @param y
 *            int to copy sign from
 * @return int with absolute value of x and sign of y
 */
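// For example, sign(3, -5) returns -3 and sign(-3, 5) returns 3, matching FORTRAN's SIGN transfer.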
func sign(x int, y int) int {
	var result = x
	if x < 0 {
		result = -x
	}
	if y < 0 {
		result = -result
	}
	return result
}

// Constants & polynomial coefficients for ppnd(), slightly renamed to avoid conflicts. Could define
// them inside ppnd(), but static constants are more efficient.
const (
	// Coefficients for P close to 0.5
	a0_p = 3.3871327179E+00
	a1_p = 5.0434271938E+01
	a2_p = 1.5929113202E+02
	a3_p = 5.9109374720E+01
	b1_p = 1.7895169469E+01
	b2_p = 7.8757757664E+01
	b3_p = 6.7187563600E+01

	// Coefficients for P not close to 0, 0.5 or 1 (names changed to avoid conflict with swilk())
	c0_p = 1.4234372777E+00
	c1_p = 2.7568153900E+00
	c2_p = 1.3067284816E+00
	c3_p = 1.7023821103E-01
	d1_p = 7.3700164250E-01
	d2_p = 1.2021132975E-01

	// Coefficients for P near 0 or 1.
	e0_p = 6.6579051150E+00
	e1_p = 3.0812263860E+00
	e2_p = 4.2868294337E-01
	e3_p = 1.7337203997E-02
	f1_p = 2.4197894225E-01
	f2_p = 1.2258202635E-02

	split1 = 0.425
	split2 = 5.0
	const1 = 0.180625
	const2 = 1.6
)

/**
 * ALGORITHM AS 241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477-484.
 *
 * Produces the normal deviate Z corresponding to a given lower tail area of P; Z is accurate to about 1 part in 10**7.
 *
 * @param p
 * @return
 */
func ppnd(p float64) float64 {
	q := p - 0.5

	var r float64

	if math.Abs(q) <= split1 {
		r = const1 - q*q
		return q * (((a3_p*r+a2_p)*r+a1_p)*r + a0_p) / (((b3_p*r+b2_p)*r+b1_p)*r + 1.0)
	} else {
		if q < 0.0 {
			r = p
		} else {
			r = 1.0 - p
		}

		if r <= 0.0 {
			return 0.0
		}
		r = math.Sqrt(-math.Log(r))

		var normal_dev float64
		if r <= split2 {
			r -= const2
			normal_dev = (((c3_p*r+c2_p)*r+c1_p)*r + c0_p) / ((d2_p*r+d1_p)*r + 1.0)
		} else {
			r -= split2
			normal_dev = (((e3_p*r+e2_p)*r+e1_p)*r + e0_p) / ((f2_p*r+f1_p)*r + 1.0)
		}
		if q < 0.0 {
			normal_dev = -normal_dev
		}
		return normal_dev
	}
}
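
// ppndSpotCheck is an illustrative sketch, not part of the original library:
// it checks ppnd at two well-known quantiles of the standard normal. The 1e-5
// tolerance is an assumption based on the "1 part in 10**7" accuracy quoted above.
func ppndSpotCheck() bool {
	ok := math.Abs(ppnd(0.5)) < 1e-6                  // the median of the standard normal is 0
	ok = ok && math.Abs(ppnd(0.975)-1.959964) < 1e-5 // upper 2.5% point is about 1.959964
	return ok
}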

/**
 * Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
 *
 * Calculates the algebraic polynomial of order nord-1 with array of coefficients c. Zero order coefficient is c[1]
 *
 * @param c
 * @param nord
 * @param x
 * @return
 */
func poly(c []float64, nord int, x float64) float64 {
	poly := c[1]
	if nord == 1 {
		return poly
	}

	p := x * c[nord]

	if nord != 2 {
		n2 := nord - 2
		j := n2 + 1
		for i := 1; i <= n2; i++ {
			p = (p + c[j]) * x
			j--
		}
	}

	poly += p
	return poly
}
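
// polyExample is an illustrative sketch, not part of the original library:
// with the FORTRAN-style 1-based coefficient layout used throughout this
// file, poly(c, 3, x) evaluates c[1] + c[2]*x + c[3]*x*x.
func polyExample() float64 {
	c := []float64{math.NaN(), 1.0, 2.0, 3.0} // c[0] is the unused padding slot
	return poly(c, 3, 2.0)                    // 1 + 2*2 + 3*4 = 17
}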

// Constants & polynomial coefficients for alnorm(), slightly renamed to avoid conflicts.
const (
	con_a    = 1.28
	ltone_a  = 7.0
	utzero_a = 18.66
	p_a      = 0.398942280444
	q_a      = 0.39990348504
	r_a      = 0.398942280385
	a1_a     = 5.75885480458
	a2_a     = 2.62433121679
	a3_a     = 5.92885724438
	b1_a     = -29.8213557807
	b2_a     = 48.6959930692
	c1_a     = -3.8052E-8
	c2_a     = 3.98064794E-4
	c3_a     = -0.151679116635
	c4_a     = 4.8385912808
	c5_a     = 0.742380924027
	c6_a     = 3.99019417011
	d1_a     = 1.00000615302
	d2_a     = 1.98615381364
	d3_a     = 5.29330324926
	d4_a     = -15.1508972451
	d5_a     = 30.789933034
)

/**
 * Algorithm AS66 Applied Statistics (1973) vol.22, no.3
 *
 * Evaluates the tail area of the standardised normal curve from x to infinity if upper is true or from minus infinity to x if
 * upper is false.
 */
func alnorm(x float64, upper bool) float64 {
	up := upper
	z := x

	if z < 0.0 {
		up = !up
		z = -z
	}

	var fn_val float64

	if z > ltone_a && (!up || z > utzero_a) {
		fn_val = 0.0
	} else {
		y := 0.5 * z * z
		if z <= con_a {
			fn_val = 0.5 - z*(p_a-q_a*y/(y+a1_a+b1_a/(y+a2_a+b2_a/(y+a3_a))))
		} else {
			fn_val = r_a * math.Exp(-y) / (z + c1_a + d1_a/(z+c2_a+d2_a/(z+c3_a+d3_a/(z+c4_a+d4_a/(z+c5_a+d5_a/(z+c6_a))))))
		}
	}

	if !up {
		fn_val = 1.0 - fn_val
	}

	return fn_val
}
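
// alnormSpotCheck is an illustrative sketch, not part of the original
// library: the upper tail area beyond z95 should be close to 0.05, and for
// any x the upper and lower tails must sum to 1. The tolerances are
// assumptions chosen only for illustration.
func alnormSpotCheck(x float64) bool {
	ok := math.Abs(alnorm(z95, true)-0.05) < 1e-4
	return ok && math.Abs(alnorm(x, true)+alnorm(x, false)-1.0) < 1e-12
}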

func imin(i, j int) int {
	if i < j {
		return i
	}
	return j
}