package scientific.stats.stattest
/* Rank tests and related functions. */
import std.math.*
import std.collection.*
import std.unittest.*
import std.unittest.testmacro.*
import scientific.linear.*
import scientific.numbers.*
import scientific.stats.continuous.chisqCDF
import scientific.stats.random.Random
import scientific.stats.normal.*
public func rankdata<T>(a: Vector<T>, method!:String = "average"): Vector<Float64>
where T <: Real<T> & Comparable<T> & Hashable {
let size = a.size()
let rank_map = HashMap<T, Float64>()
// First, sort the vector
let sorted_a = sorted(a)
// Now, go over sorted_a and obtain the ranks of each entry,
// store them in rank_map.
var id: Int64 = 0
var range_count: Int64 = 0
while (id < size) {
var id2 = id + 1
while (id2 < size && sorted_a[id2] == sorted_a[id]) {
id2++
}
// The current range of equal elements is [id, id2).
var cur_rank: Float64
if (method == "average") {
cur_rank = Float64(id + id2 + 1) / 2.0
} else if (method == "min") {
cur_rank = Float64(id + 1)
} else if (method == "max") {
cur_rank = Float64(id2)
} else if (method == "dense") {
cur_rank = Float64(range_count + 1)
} else if (method == "ordinal") {
cur_rank = Float64(id + 1) // to be adjusted later
} else {
throw IllegalArgumentException("rankdata: unknown method")
}
rank_map[sorted_a[id]] = cur_rank
range_count++
id = id2
}
// Now use rank_map to obtain rank for all elements in a.
let ranks = empty<Float64>(size)
for (i in 0..size) {
ranks[i] = rank_map[a[i]]
if (method == "ordinal") {
rank_map[a[i]] += 1.0
}
}
return ranks
}
public func repeatCounts(a: Vector<Float64>): Vector<Int64> {
let size = a.size()
// First, sort the vector
let sorted_a: Vector<Float64> = sorted<Float64>(a)
// Compute sizes of ranges
var id: Int64 = 0
var ranges = ArrayList<Int64>()
while (id < size) {
var id2 = id + 1
while (id2 < size && sorted_a[id2] == sorted_a[id]) {
id2++
}
if (id2 - id > 1) {
ranges.add(id2 - id)
}
id = id2
}
return vector<Int64>(ranges.toArray())
}
public func tiecorrect(a: Vector<Float64>): Float64 {
let cnt = repeatCounts(a).toFloat64()
let size = a.size()
return 1.0 - sum(power(cnt, 3.0) - cnt) / Float64(size * (size * size - 1))
}
public func kruskal<T>(samples: Array<Vector<T>>): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
let alldata = concat(samples)
let ranked = rankdata(alldata)
let ties: Float64 = tiecorrect(ranked)
var ssbn: Float64 = 0.0
var totaln: Int64 = 0
let num_sample = samples.size
for (i in 0..num_sample) {
var cursum: Float64 = 0.0
let cursize = samples[i].size()
for (j in totaln..totaln+cursize) {
cursum += ranked[j]
}
ssbn += cursum * cursum / Float64(cursize)
totaln += cursize
}
var n: Float64 = Float64(totaln)
var h: Float64 = 12.0 / (n * (n + 1.0)) * ssbn - 3.0 * (n + 1.0)
let df = num_sample - 1
h = h / ties
return (h, 1.0 - chisqCDF(h, Float64(df)))
}
public func ranksums<T>(a: Vector<T>, b: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
let alldata = concat(a, b)
let ranked = rankdata(alldata)
let n1 = a.size()
let n2 = b.size()
var s: Float64 = 0.0
for (i in 0..n1) {
s += ranked[i]
}
let expected = Float64(n1 * (n1 + n2 + 1)) / 2.0
let z = (s - expected) / sqrt(Float64(n1 * n2 * (n1 + n2 + 1)) / 12.0)
let prob = norm_finish(z, alternative)
return (z, prob)
}
public func friedmanchisquare<T>(samples: Matrix<T>): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
let k = samples.getRows()
if (k < 3) {
throw IllegalArgumentException("friedmanchisquare: must have at least 3 samples")
}
let n = samples.getCols()
let samplesT = samples.transpose()
// Collect all ranks in a matrix
var ranks = empty<Float64>(n, k)
for (i in 0..n) {
ranks[i] = rankdata(samplesT[i])
}
// Compute tie correction
var ties: Float64 = 0.0
for (i in 0..n) {
let cnt = repeatCounts(ranks[i]).toFloat64()
ties += sum(power(cnt, 3.0) - cnt)
}
let c = 1.0 - ties / Float64(k * (k * k - 1) * n)
// Compute final result
let ssbn: Float64 = sum(power(sumCol(ranks), 2.0))
let chisq: Float64 = (12.0 / Float64(k * n * (k + 1)) * ssbn - Float64(3 * n * (k + 1))) / c
let prob = 1.0 - chisqCDF(chisq, Float64(k - 1))
return (chisq, prob)
}
/*
* Only the approximate mode is implemented, with wilcox method for
* dealing with zeros.
*/
public func wilcoxon<T>(d: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
// First, remove all zeros from d
let d2 = d.filter({x => x != T.fromInt(0)})
let r = rankdata(abs(d2))
var r_plus: Float64 = 0.0
var r_minus: Float64 = 0.0
let n = d2.size()
for (i in 0..n) {
if (d2[i] > T.fromInt(0)) {
r_plus += r[i]
} else {
r_minus += r[i]
}
}
var T: Float64
if (alternative == "two-sided") {
T = min(r_plus, r_minus)
} else {
T = r_plus
}
let mn = Float64(n * (n + 1)) * 0.25
var se = Float64(n * (n + 1) * (2 * n + 1))
let cnt = repeatCounts(r).toFloat64()
if (cnt.size() > 0) {
se -= 0.5 * sum(power(cnt, 3.0) - cnt)
}
se = sqrt(se / 24.0)
let z = (T - mn) / se
let prob = norm_finish(z, alternative)
return (T, prob)
}
public func wilcoxon<T>(x: Vector<T>, y: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
return wilcoxon(x - y, alternative:alternative)
}
public func mannwhitneyu<T>(x: Vector<T>, y: Vector<T>, alternative!:String = "two-sided",
continuity!:Bool = true): (Float64, Float64)
where T <: Real<T> & Comparable<T> & Hashable {
let xy = concat(x, y)
let ranks = rankdata(xy)
let n1 = x.size()
let n2 = y.size()
var R1: Float64 = 0.0
for (i in 0..n1) {
R1 += ranks[i]
}
let U1: Float64 = R1 - Float64(n1 * (n1 + 1)) / 2.0
let U2: Float64 = Float64(n1 * n2) - U1
var U: Float64
var f: Float64
if (alternative == "greater") {
U = U1
f = 1.0
} else if (alternative == "less") {
U = U2
f = 1.0
} else {
U = max(U1, U2)
f = 2.0
}
let mu = Float64(n1 * n2) / 2.0
let n = Float64(n1 + n2)
let cnt = repeatCounts(ranks).toFloat64()
let tie_term = sum(power(cnt, 3.0) - cnt)
let s = sqrt(mu / 6.0 * ((n + 1.0) - tie_term / (n * (n - 1.0))))
var numerator = U - mu
if (continuity) {
numerator -= 0.5
}
let z = numerator / s
let prob = (1.0 - normalCDF(z)) * f
return (U1, prob)
}
@Test
public class TestRanktest {
@TestCase
func testRankdata(): Unit {
let a1 = vector<Int64>([0, 2, 3, 2])
let r1 = rankdata(a1)
@Assert(approxEqual(r1, vector<Float64>([1.0, 2.5, 4.0, 2.5])))
let r2 = rankdata(a1, method: "min")
@Assert(approxEqual(r2, vector<Float64>([1.0, 2.0, 4.0, 2.0])))
let r3 = rankdata(a1, method: "max")
@Assert(approxEqual(r3, vector<Float64>([1.0, 3.0, 4.0, 3.0])))
let r4 = rankdata(a1, method: "dense")
@Assert(approxEqual(r4, vector<Float64>([1.0, 2.0, 3.0, 2.0])))
let r5 = rankdata(a1, method: "ordinal")
@Assert(approxEqual(r5, vector<Float64>([1.0, 2.0, 4.0, 3.0])))
}
@TestCase
func testTieCorrect(): Unit {
let a1 = vector<Float64>([1.0, 2.5, 2.5, 4.0])
@Assert(approxEqual(tiecorrect(a1), 0.9))
let a2 = vector<Int64>([1, 3, 2, 4, 5, 7, 2, 8, 4])
let r2 = rankdata(a2)
@Assert(approxEqual(tiecorrect(r2), 0.9833333333333333, atol:1e-10))
}
@TestCase
func testKruskal1(): Unit {
let samples = [vector<Int64>([1, 3, 5, 7, 9]),
vector<Int64>([2, 4, 6, 8, 10])]
let (stat, prob) = kruskal(samples)
@Assert(approxEqual(stat, 0.2727272727272734, atol:1e-10))
@Assert(approxEqual(prob, 0.6015081344405895, atol:1e-10))
}
@TestCase
func testKruskal2(): Unit {
let samples = [vector<Int64>([1, 1, 1]),
vector<Int64>([2, 2, 2]),
vector<Int64>([2, 2])]
let (stat, prob) = kruskal(samples)
@Assert(approxEqual(stat, 7.0, atol:1e-10))
@Assert(approxEqual(prob, 0.0301973834223185, atol:1e-10))
}
@TestCase
func testRanksums(): Unit {
let r = Random(0)
let sample1 = rand(r, 200, -1.0, 1.0)
let sample2 = rand(r, 200, -0.5, 1.5) // Shifted distribution
var (z, prob) = ranksums(sample1, sample2)
@Assert(approxEqual(z, -7.162609, atol:1e-5))
@Assert(approxEqual(prob, 0.0, atol:1e-10))
var (z2, prob2) = ranksums(sample1, sample2, alternative:"less")
@Assert(approxEqual(z2, -7.162609, atol:1e-5))
@Assert(approxEqual(prob2, 0.0, atol:1e-10))
var (z3, prob3) = ranksums(sample1, sample2, alternative:"greater")
@Assert(approxEqual(z3, -7.162609, atol:1e-5))
@Assert(approxEqual(prob3, 1.0, atol:1e-10))
}
@TestCase
func testFriedmanChisquare(): Unit {
let samples = matrix<Int64>([
[1, 3, 5, 7, 9], [2, 4, 6, 8, 10], [1, 4, 7, 9, 10]
])
var (chisq, prob) = friedmanchisquare(samples)
@Assert(approxEqual(chisq, 7.176470588235304, atol:1e-10))
@Assert(approxEqual(prob, 0.02764707635775853, atol:1e-10))
}
// @TestCase
// func testWilcoxon(): Unit {
// let d = vector<Int64>([
// 6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75
// ])
// var (T, prob) = wilcoxon(d)
// @Assert(approxEqual(T, 24.0, atol:1e-10))
// @Assert(approxEqual(prob, 0.0408881, atol:1e-6))
// var (T2, prob2) = wilcoxon(d, alternative:"greater")
// @Assert(approxEqual(T2, 96.0, atol:1e-10))
// @Assert(approxEqual(prob2, 0.0204441, atol:1e-6))
// var (T3, prob3) = wilcoxon(d, alternative:"less")
// @Assert(approxEqual(T3, 96.0, atol:1e-10))
// @Assert(approxEqual(prob3, 0.9795559, atol:1e-6))
// }
// @TestCase
// func testWilcoxon2(): Unit {
// let d = vector<Int64>([
// 4, 2, 4, 0, -4, -6, -3, -3, 0, -3, -7
// ])
// var (T, prob) = wilcoxon(d)
// @Assert(approxEqual(T, 13.0, atol:1e-10))
// @Assert(approxEqual(prob, 0.2570274, atol:1e-6))
// var (T2, prob2) = wilcoxon(d, alternative:"greater")
// @Assert(approxEqual(T2, 13.0, atol:1e-10))
// @Assert(approxEqual(prob2, 0.8714863, atol:1e-6))
// var (T3, prob3) = wilcoxon(d, alternative:"less")
// @Assert(approxEqual(T3, 13.0, atol:1e-10))
// @Assert(approxEqual(prob3, 0.1285137, atol:1e-6))
// }
// @TestCase
// func testWilcoxon3(): Unit {
// let x = vector<Int64>([1, 6, 3, 5, 9, 11, 8])
// let y = vector<Int64>([4, 7, 3, 6, 9, 14, 10])
// var (T, prob) = wilcoxon(x, y)
// @Assert(approxEqual(T, 0.0, atol:1e-10))
// @Assert(approxEqual(prob, 0.0412268, atol:1e-6))
// var (T2, prob2) = wilcoxon(x, y, alternative:"greater")
// @Assert(approxEqual(T2, 0.0, atol:1e-10))
// @Assert(approxEqual(prob2, 0.9793866, atol:1e-6))
// var (T3, prob3) = wilcoxon(x, y, alternative:"less")
// @Assert(approxEqual(T3, 0.0, atol:1e-10))
// @Assert(approxEqual(prob3, 0.0206134, atol:1e-6))
// }
@TestCase
func testMannWhitneyU(): Unit {
let males = vector<Int64>([19, 22, 16, 29, 24])
let females = vector<Int64>([20, 11, 17, 12])
var (T, prob) = mannwhitneyu(males, females)
@Assert(approxEqual(T, 17.0, atol:1e-10))
@Assert(approxEqual(prob, 0.1113469, atol:1e-6))
var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater")
@Assert(approxEqual(T2, 17.0, atol:1e-10))
@Assert(approxEqual(prob2, 0.0556734, atol:1e-6))
var (T3, prob3) = mannwhitneyu(males, females, alternative:"less")
@Assert(approxEqual(T3, 17.0, atol:1e-10))
@Assert(approxEqual(prob3, 0.9669037, atol:1e-6))
}
@TestCase
func testMannWhitneyU2(): Unit {
let males = vector<Int64>([19, 22, 16, 29, 24])
let females = vector<Int64>([20, 11, 17, 12])
var (T, prob) = mannwhitneyu(males, females, continuity:false)
@Assert(approxEqual(T, 17.0, atol:1e-10))
@Assert(approxEqual(prob, 0.0864107, atol:1e-6))
var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater", continuity:false)
@Assert(approxEqual(T2, 17.0, atol:1e-10))
@Assert(approxEqual(prob2, 0.0432053, atol:1e-6))
var (T3, prob3) = mannwhitneyu(males, females, alternative:"less", continuity:false)
@Assert(approxEqual(T3, 17.0, atol:1e-10))
@Assert(approxEqual(prob3, 0.9567946, atol:1e-6))
}
@TestCase
func testMannWhitneyU3(): Unit {
let males = vector<Int64>([19, 22, 16, 29, 24])
let females = vector<Int64>([20, 11, 17, 12, 16])
var (T, prob) = mannwhitneyu(males, females)
@Assert(approxEqual(T, 21.5, atol:1e-10))
@Assert(approxEqual(prob, 0.0749129, atol:1e-6))
var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater")
@Assert(approxEqual(T2, 21.5, atol:1e-10))
@Assert(approxEqual(prob2, 0.0374564, atol:1e-6))
var (T3, prob3) = mannwhitneyu(males, females, alternative:"less")
@Assert(approxEqual(T3, 21.5, atol:1e-10))
@Assert(approxEqual(prob3, 0.9767335, atol:1e-6))
}
}