8656feed创建于 2025年10月29日历史提交
package scientific.stats.stattest

/* Rank tests and related functions. */

import std.math.*
import std.collection.*
import std.unittest.*
import std.unittest.testmacro.*

import scientific.linear.*
import scientific.numbers.*
import scientific.stats.continuous.chisqCDF
import scientific.stats.random.Random
import scientific.stats.normal.*

public func rankdata<T>(a: Vector<T>, method!:String = "average"): Vector<Float64>
                        where T <: Real<T> & Comparable<T> & Hashable {
    let size = a.size()

    let rank_map = HashMap<T, Float64>()

    // First, sort the vector
    let sorted_a = sorted(a)

    // Now, go over sorted_a and obtain the ranks of each entry,
    // store them in rank_map.
    var id: Int64 = 0
    var range_count: Int64 = 0
    while (id < size) {
        var id2 = id + 1
        while (id2 < size && sorted_a[id2] == sorted_a[id]) {
            id2++
        }
        // The current range of equal elements is [id, id2).
        var cur_rank: Float64
        if (method == "average") {
            cur_rank = Float64(id + id2 + 1) / 2.0
        } else if (method == "min") {
            cur_rank = Float64(id + 1)
        } else if (method == "max") {
            cur_rank = Float64(id2)
        } else if (method == "dense") {
            cur_rank = Float64(range_count + 1)
        } else if (method == "ordinal") {
            cur_rank = Float64(id + 1)  // to be adjusted later
        } else {
            throw IllegalArgumentException("rankdata: unknown method")
        }
        rank_map[sorted_a[id]] = cur_rank
        range_count++
        id = id2
    }

    // Now use rank_map to obtain rank for all elements in a.
    let ranks = empty<Float64>(size)
    for (i in 0..size) {
        ranks[i] = rank_map[a[i]]
        if (method == "ordinal") {
            rank_map[a[i]] += 1.0
        }
    }
    return ranks
}

public func repeatCounts(a: Vector<Float64>): Vector<Int64> {
    let size = a.size()

    // First, sort the vector
    let sorted_a: Vector<Float64> = sorted<Float64>(a)

    // Compute sizes of ranges
    var id: Int64 = 0
    var ranges = ArrayList<Int64>()
    while (id < size) {
        var id2 = id + 1
        while (id2 < size && sorted_a[id2] == sorted_a[id]) {
            id2++
        }
        if (id2 - id > 1) {
            ranges.add(id2 - id)
        }
        id = id2
    }
    return vector<Int64>(ranges.toArray())
}

public func tiecorrect(a: Vector<Float64>): Float64 {
    let cnt = repeatCounts(a).toFloat64()
    let size = a.size()
    return 1.0 - sum(power(cnt, 3.0) - cnt) / Float64(size * (size * size - 1))
}

public func kruskal<T>(samples: Array<Vector<T>>): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    let alldata = concat(samples)
    let ranked = rankdata(alldata)
    let ties: Float64 = tiecorrect(ranked)
    var ssbn: Float64 = 0.0
    var totaln: Int64 = 0
    let num_sample = samples.size
    for (i in 0..num_sample) {
        var cursum: Float64 = 0.0
        let cursize = samples[i].size()
        for (j in totaln..totaln+cursize) {
            cursum += ranked[j]
        }
        ssbn += cursum * cursum / Float64(cursize)
        totaln += cursize
    }
    var n: Float64 = Float64(totaln)
    var h: Float64 = 12.0 / (n * (n + 1.0)) * ssbn - 3.0 * (n + 1.0)
    let df = num_sample - 1
    h = h / ties

    return (h, 1.0 - chisqCDF(h, Float64(df)))
}

public func ranksums<T>(a: Vector<T>, b: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    let alldata = concat(a, b)
    let ranked = rankdata(alldata)
    let n1 = a.size()
    let n2 = b.size()
    var s: Float64 = 0.0
    for (i in 0..n1) {
        s += ranked[i]
    }
    let expected = Float64(n1 * (n1 + n2 + 1)) / 2.0
    let z = (s - expected) / sqrt(Float64(n1 * n2 * (n1 + n2 + 1)) / 12.0)
    let prob = norm_finish(z, alternative)
    return (z, prob)
}

public func friedmanchisquare<T>(samples: Matrix<T>): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    let k = samples.getRows()
    if (k < 3) {
        throw IllegalArgumentException("friedmanchisquare: must have at least 3 samples")
    }
    let n = samples.getCols()

    let samplesT = samples.transpose()

    // Collect all ranks in a matrix
    var ranks = empty<Float64>(n, k)
    for (i in 0..n) {
        ranks[i] = rankdata(samplesT[i])
    }

    // Compute tie correction
    var ties: Float64 = 0.0
    for (i in 0..n) {
        let cnt = repeatCounts(ranks[i]).toFloat64()
        ties += sum(power(cnt, 3.0) - cnt)
    }
    let c = 1.0 - ties / Float64(k * (k * k - 1) * n)

    // Compute final result
    let ssbn: Float64 = sum(power(sumCol(ranks), 2.0))
    let chisq: Float64 = (12.0 / Float64(k * n * (k + 1)) * ssbn - Float64(3 * n * (k + 1))) / c
    let prob = 1.0 - chisqCDF(chisq, Float64(k - 1))
    return (chisq, prob)
}

/*
 * Only the approximate mode is implemented, with wilcox method for
 * dealing with zeros.
 */
public func wilcoxon<T>(d: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    // First, remove all zeros from d
    let d2 = d.filter({x => x != T.fromInt(0)})
    let r = rankdata(abs(d2))
    var r_plus: Float64 = 0.0
    var r_minus: Float64 = 0.0
    let n = d2.size()
    for (i in 0..n) {
        if (d2[i] > T.fromInt(0)) {
            r_plus += r[i]
        } else {
            r_minus += r[i]
        }
    }
    var T: Float64
    if (alternative == "two-sided") {
        T = min(r_plus, r_minus)
    } else {
        T = r_plus
    }

    let mn = Float64(n * (n + 1)) * 0.25
    var se = Float64(n * (n + 1) * (2 * n + 1))

    let cnt = repeatCounts(r).toFloat64()
    if (cnt.size() > 0) {
        se -= 0.5 * sum(power(cnt, 3.0) - cnt)
    }
    se = sqrt(se / 24.0)
    let z = (T - mn) / se
    let prob = norm_finish(z, alternative)
    return (T, prob)
}

public func wilcoxon<T>(x: Vector<T>, y: Vector<T>, alternative!:String = "two-sided"): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    return wilcoxon(x - y, alternative:alternative)
}

public func mannwhitneyu<T>(x: Vector<T>, y: Vector<T>, alternative!:String = "two-sided",
                            continuity!:Bool = true): (Float64, Float64)
                        where T <: Real<T> & Comparable<T> & Hashable {
    let xy = concat(x, y)
    let ranks = rankdata(xy)
    let n1 = x.size()
    let n2 = y.size()
    var R1: Float64 = 0.0
    for (i in 0..n1) {
        R1 += ranks[i]
    }
    let U1: Float64 = R1 - Float64(n1 * (n1 + 1)) / 2.0
    let U2: Float64 = Float64(n1 * n2) - U1
    var U: Float64
    var f: Float64
    if (alternative == "greater") {
        U = U1
        f = 1.0
    } else if (alternative == "less") {
        U = U2
        f = 1.0
    } else {
        U = max(U1, U2)
        f = 2.0
    }

    let mu = Float64(n1 * n2) / 2.0
    let n = Float64(n1 + n2)
    let cnt = repeatCounts(ranks).toFloat64()
    let tie_term = sum(power(cnt, 3.0) - cnt)
    let s = sqrt(mu / 6.0 * ((n + 1.0) - tie_term / (n * (n - 1.0))))
    var numerator = U - mu
    if (continuity) {
        numerator -= 0.5
    }
    let z = numerator / s
    let prob = (1.0 - normalCDF(z)) * f
    return (U1, prob)
}

@Test
public class TestRanktest {
    @TestCase
    func testRankdata(): Unit {
        let a1 = vector<Int64>([0, 2, 3, 2])
        let r1 = rankdata(a1)
        @Assert(approxEqual(r1, vector<Float64>([1.0, 2.5, 4.0, 2.5])))

        let r2 = rankdata(a1, method: "min")
        @Assert(approxEqual(r2, vector<Float64>([1.0, 2.0, 4.0, 2.0])))

        let r3 = rankdata(a1, method: "max")
        @Assert(approxEqual(r3, vector<Float64>([1.0, 3.0, 4.0, 3.0])))

        let r4 = rankdata(a1, method: "dense")
        @Assert(approxEqual(r4, vector<Float64>([1.0, 2.0, 3.0, 2.0])))

        let r5 = rankdata(a1, method: "ordinal")
        @Assert(approxEqual(r5, vector<Float64>([1.0, 2.0, 4.0, 3.0])))
    }

    @TestCase
    func testTieCorrect(): Unit {
        let a1 = vector<Float64>([1.0, 2.5, 2.5, 4.0])
        @Assert(approxEqual(tiecorrect(a1), 0.9))

        let a2 = vector<Int64>([1, 3, 2, 4, 5, 7, 2, 8, 4])
        let r2 = rankdata(a2)
        @Assert(approxEqual(tiecorrect(r2), 0.9833333333333333, atol:1e-10))
    }

    @TestCase
    func testKruskal1(): Unit {
        let samples = [vector<Int64>([1, 3, 5, 7, 9]),
                       vector<Int64>([2, 4, 6, 8, 10])]
        let (stat, prob) = kruskal(samples)
        @Assert(approxEqual(stat, 0.2727272727272734, atol:1e-10))
        @Assert(approxEqual(prob, 0.6015081344405895, atol:1e-10))
    }

    @TestCase
    func testKruskal2(): Unit {
        let samples = [vector<Int64>([1, 1, 1]),
                       vector<Int64>([2, 2, 2]),
                       vector<Int64>([2, 2])]
        let (stat, prob) = kruskal(samples)
        @Assert(approxEqual(stat, 7.0, atol:1e-10))
        @Assert(approxEqual(prob, 0.0301973834223185, atol:1e-10))
    }

    @TestCase
    func testRanksums(): Unit {
        let r = Random(0)
        let sample1 = rand(r, 200, -1.0, 1.0)
        let sample2 = rand(r, 200, -0.5, 1.5)   // Shifted distribution
        var (z, prob) = ranksums(sample1, sample2)
        @Assert(approxEqual(z, -7.162609, atol:1e-5))
        @Assert(approxEqual(prob, 0.0, atol:1e-10))
        var (z2, prob2) = ranksums(sample1, sample2, alternative:"less")
        @Assert(approxEqual(z2, -7.162609, atol:1e-5))
        @Assert(approxEqual(prob2, 0.0, atol:1e-10))
        var (z3, prob3) = ranksums(sample1, sample2, alternative:"greater")
        @Assert(approxEqual(z3, -7.162609, atol:1e-5))
        @Assert(approxEqual(prob3, 1.0, atol:1e-10))
    }

    @TestCase
    func testFriedmanChisquare(): Unit {
        let samples = matrix<Int64>([
            [1, 3, 5, 7, 9], [2, 4, 6, 8, 10], [1, 4, 7, 9, 10]
        ])
        var (chisq, prob) = friedmanchisquare(samples)
        @Assert(approxEqual(chisq, 7.176470588235304, atol:1e-10))
        @Assert(approxEqual(prob, 0.02764707635775853, atol:1e-10))
    }

    // @TestCase
    // func testWilcoxon(): Unit {
    //     let d = vector<Int64>([
    //         6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75
    //     ])
    //     var (T, prob) = wilcoxon(d)
    //     @Assert(approxEqual(T, 24.0, atol:1e-10))
    //     @Assert(approxEqual(prob, 0.0408881, atol:1e-6))

    //     var (T2, prob2) = wilcoxon(d, alternative:"greater")
    //     @Assert(approxEqual(T2, 96.0, atol:1e-10))
    //     @Assert(approxEqual(prob2, 0.0204441, atol:1e-6))

    //     var (T3, prob3) = wilcoxon(d, alternative:"less")
    //     @Assert(approxEqual(T3, 96.0, atol:1e-10))
    //     @Assert(approxEqual(prob3, 0.9795559, atol:1e-6))
    // }

    // @TestCase
    // func testWilcoxon2(): Unit {
    //     let d = vector<Int64>([
    //         4, 2, 4, 0, -4, -6, -3, -3, 0, -3, -7
    //     ])
    //     var (T, prob) = wilcoxon(d)
    //     @Assert(approxEqual(T, 13.0, atol:1e-10))
    //     @Assert(approxEqual(prob, 0.2570274, atol:1e-6))

    //     var (T2, prob2) = wilcoxon(d, alternative:"greater")
    //     @Assert(approxEqual(T2, 13.0, atol:1e-10))
    //     @Assert(approxEqual(prob2, 0.8714863, atol:1e-6))

    //     var (T3, prob3) = wilcoxon(d, alternative:"less")
    //     @Assert(approxEqual(T3, 13.0, atol:1e-10))
    //     @Assert(approxEqual(prob3, 0.1285137, atol:1e-6))
    // }

    // @TestCase
    // func testWilcoxon3(): Unit {
    //     let x = vector<Int64>([1, 6, 3, 5, 9, 11, 8])
    //     let y = vector<Int64>([4, 7, 3, 6, 9, 14, 10])
    //     var (T, prob) = wilcoxon(x, y)
    //     @Assert(approxEqual(T, 0.0, atol:1e-10))
    //     @Assert(approxEqual(prob, 0.0412268, atol:1e-6))

    //     var (T2, prob2) = wilcoxon(x, y, alternative:"greater")
    //     @Assert(approxEqual(T2, 0.0, atol:1e-10))
    //     @Assert(approxEqual(prob2, 0.9793866, atol:1e-6))

    //     var (T3, prob3) = wilcoxon(x, y, alternative:"less")
    //     @Assert(approxEqual(T3, 0.0, atol:1e-10))
    //     @Assert(approxEqual(prob3, 0.0206134, atol:1e-6))
    // }

    @TestCase
    func testMannWhitneyU(): Unit {
        let males = vector<Int64>([19, 22, 16, 29, 24])
        let females = vector<Int64>([20, 11, 17, 12])
        var (T, prob) = mannwhitneyu(males, females)
        @Assert(approxEqual(T, 17.0, atol:1e-10))
        @Assert(approxEqual(prob, 0.1113469, atol:1e-6))
        var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater")
        @Assert(approxEqual(T2, 17.0, atol:1e-10))
        @Assert(approxEqual(prob2, 0.0556734, atol:1e-6))
        var (T3, prob3) = mannwhitneyu(males, females, alternative:"less")
        @Assert(approxEqual(T3, 17.0, atol:1e-10))
        @Assert(approxEqual(prob3, 0.9669037, atol:1e-6))
    }

    @TestCase
    func testMannWhitneyU2(): Unit {
        let males = vector<Int64>([19, 22, 16, 29, 24])
        let females = vector<Int64>([20, 11, 17, 12])
        var (T, prob) = mannwhitneyu(males, females, continuity:false)
        @Assert(approxEqual(T, 17.0, atol:1e-10))
        @Assert(approxEqual(prob, 0.0864107, atol:1e-6))
        var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater", continuity:false)
        @Assert(approxEqual(T2, 17.0, atol:1e-10))
        @Assert(approxEqual(prob2, 0.0432053, atol:1e-6))
        var (T3, prob3) = mannwhitneyu(males, females, alternative:"less", continuity:false)
        @Assert(approxEqual(T3, 17.0, atol:1e-10))
        @Assert(approxEqual(prob3, 0.9567946, atol:1e-6))
    }

    @TestCase
    func testMannWhitneyU3(): Unit {
        let males = vector<Int64>([19, 22, 16, 29, 24])
        let females = vector<Int64>([20, 11, 17, 12, 16])
        var (T, prob) = mannwhitneyu(males, females)
        @Assert(approxEqual(T, 21.5, atol:1e-10))
        @Assert(approxEqual(prob, 0.0749129, atol:1e-6))
        var (T2, prob2) = mannwhitneyu(males, females, alternative:"greater")
        @Assert(approxEqual(T2, 21.5, atol:1e-10))
        @Assert(approxEqual(prob2, 0.0374564, atol:1e-6))
        var (T3, prob3) = mannwhitneyu(males, females, alternative:"less")
        @Assert(approxEqual(T3, 21.5, atol:1e-10))
        @Assert(approxEqual(prob3, 0.9767335, atol:1e-6))
    }
}