// 910e62b5, created on January 15 (commit history)
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Note: This file refers to the "modifiers" of the ETC1 spec as "selectors".
//       The jargon was inherited from etcpak.

use std::simd::prelude::*;
use std::simd::{Mask, Simd};

use crate::{Reg, Reg32, UReg, SIMD_WIDTH};

/// Selector (modifier) tables from the ETC1 spec.
///
/// Each row is `[small, large]`: the two positive magnitudes of one table.
/// The negative half of every table is the mirror image of the positive half,
/// so it is not stored.
pub const TABLES: [[i16; 2]; 8] = [
    [2, 8],
    [5, 17],
    [9, 29],
    [13, 42],
    [18, 60],
    [24, 80],
    [33, 106],
    [47, 183],
];

/// Swaps the top-right and bottom-left 2x2 quadrants of a 4x4 block in every
/// lane where `flip` is set; lanes where it is clear are returned unchanged.
///
/// The block is indexed as `d[y][x][channel]`. The top-left and bottom-right
/// quadrants never move, and pixels keep their position inside a quadrant.
///
/// Labelling the pixels column by column, a flipped lane goes from:
/// ```text
/// aeim
/// bfjn
/// cgko
/// dhlp
/// ```
/// to:
/// ```text
/// aecg
/// bfdh
/// imko
/// jnlp
/// ```
#[inline]
pub fn flip_pixels(d: &[[[Reg; 3]; 4]; 4], flip: Mask<i16, SIMD_WIDTH>) -> [[[Reg; 3]; 4]; 4] {
    // The diagonal quadrants are passed through untouched, so start from a
    // plain copy and rewrite only the two off-diagonal quadrants.
    let mut out = *d;
    for dy in 0..2 {
        for dx in 0..2 {
            for ch in 0..3 {
                let top_right = d[dy][2 + dx][ch];
                let bottom_left = d[2 + dy][dx][ch];
                // `select(a, b)` yields `a` in lanes where the mask is set.
                out[dy][2 + dx][ch] = flip.select(bottom_left, top_right);
                out[2 + dy][dx][ch] = flip.select(top_right, bottom_left);
            }
        }
    }
    out
}

/// Flip the selector codeword if `flip` for that lane is true.
///
/// See [`flip_pixels`] for a description of the flip operation.
#[inline]
pub fn flip_selectors(x: UReg, flip: Mask<i16, SIMD_WIDTH>) -> UReg {
    let keep = x & Simd::splat(0xCC33);
    let bottom_left = x & Simd::splat(0x00CC);
    let top_right = x & Simd::splat(0x3300);

    let flipped = keep | (bottom_left << 6) | (top_right >> 6);
    flip.select(flipped, x)
}

/// Per-lane result of fitting a selector table to one subblock, as produced by
/// [`search_table_and_selectors_subblock`].
pub struct Fit {
    /// Sum, over the pixels of the subblock, of the squared per-pixel error of
    /// the chosen table.
    pub err: Reg32,
    /// Index into [`TABLES`] of the chosen table.
    pub table_idx: UReg,
    /// One bit per pixel (bit `y + 4 * x`); set when the larger of the table's
    /// two magnitudes was chosen for that pixel.
    pub selector_lo: UReg,
    /// One bit per pixel (bit `y + 4 * x`); set when the negative selector was
    /// chosen for that pixel.
    pub selector_hi: UReg,
}

/// Finds, independently for every lane, the selector table and the per-pixel
/// selectors that best approximate one subblock around `base_color`.
///
/// `data` must contain exactly two rows of four pixels, i.e. the subblock in
/// flipped (4x2) layout; the function panics otherwise.
///
/// The error being minimised is slightly unusual: the absolute value is taken
/// after converting the difference to grayscale, not per channel. The comments
/// in the body explain why.
#[inline]
pub fn search_table_and_selectors_subblock(data: &[[[Reg; 3]; 4]], base_color: [Reg; 3]) -> Fit {
    assert_eq!(data.len(), 2);

    // gray(p) = 19*p.r + 38*p.g + 7*p.b (cf. rec601); the weights add up to 64.
    const RGB_WEIGHT: [i16; 3] = [19, 38, 7];
    const WEIGHT_SUM: i16 = 64;

    // Running per-lane minimum over all tables; essentially a vector version
    // of min_by_key.
    let mut best: Option<Fit> = None;
    for (idx, &[small, large]) in TABLES.iter().enumerate() {
        // A selector offsets all three channels by the same amount, so in
        // grayscale space it moves the error by `selector * WEIGHT_SUM`.
        let small_step = Reg::splat(small * WEIGHT_SUM);
        let large_step = Reg::splat(large * WEIGHT_SUM);

        let mut total_err = Reg32::splat(0);
        let mut lo_bits = UReg::splat(0);
        let mut hi_bits = UReg::splat(0);

        for (y, row) in data.iter().enumerate() {
            for (x, pixel) in row.iter().enumerate() {
                // The candidates for this pixel are [-large, -small, small,
                // large], scored with
                //   abs(gray(q + s - x))
                // where q = quantized average, s = selector and x = the pixel
                // before compression.
                //
                // This is abs(gray(..)), not gray(abs(..)). Taking the absolute
                // value last lets us compute gray(q - x) once and then account
                // for the selector, which is identical for all three channels,
                // with a single subtraction.
                let mut gray_diff = Reg::splat(0);
                for ((&q, &p), w) in base_color.iter().zip(pixel).zip(RGB_WEIGHT) {
                    gray_diff += (q - p) * Reg::splat(w);
                }

                // To shrink the absolute value the selector must have the
                // opposite sign of gray(q - x).
                let wants_negative = gray_diff.simd_gt(Reg::splat(0));

                // With the sign settled, work on magnitudes only and see which
                // of the two steps lands closer to zero.
                let magnitude = gray_diff.abs();
                let err_small = (magnitude - small_step).abs();
                let err_large = (magnitude - large_step).abs();
                let wants_large = err_large.simd_lt(err_small);

                // The per-pixel error can be fairly large (a crude upper bound
                // is 255*64), so widen to 32 bits before squaring and
                // accumulating. This is somewhat expensive.
                let pixel_err = wants_large.select(err_large, err_small).cast::<i32>();
                total_err += pixel_err * pixel_err;

                // Pixel (x, y) owns bit `y + 4 * x` in both selector words.
                let bit = UReg::splat(1u16 << (y + 4 * x));
                let no_bit = UReg::splat(0);
                lo_bits |= wants_large.select(bit, no_bit);
                hi_bits |= wants_negative.select(bit, no_bit);
            }
        }

        let candidate = Fit {
            err: total_err,
            table_idx: UReg::splat(idx as u16),
            selector_lo: lo_bits,
            selector_hi: hi_bits,
        };
        best = Some(match best.take() {
            None => candidate,
            Some(incumbent) => {
                // Strict comparison: on a tie the earlier table is kept.
                let wins_32 = candidate.err.simd_lt(incumbent.err);
                let wins = wins_32.cast::<i16>();
                Fit {
                    err: wins_32.select(candidate.err, incumbent.err),
                    table_idx: wins.select(candidate.table_idx, incumbent.table_idx),
                    selector_lo: wins.select(candidate.selector_lo, incumbent.selector_lo),
                    selector_hi: wins.select(candidate.selector_hi, incumbent.selector_hi),
                }
            }
        });
    }

    // TABLES is non-empty, so at least one candidate has been recorded.
    best.unwrap()
}

/// Picks the best selector table and per-pixel selectors for both subblocks
/// of a 4x4 block.
///
/// Bit 0 of `hdr0` is read as the flip flag. The table index chosen for
/// subblock 0 is OR-ed into `hdr0` at bit 5 and the one for subblock 1 at
/// bit 2; `hdr1` is passed through unchanged.
///
/// Returns: Four 16-bit codewords, in the order
/// `[selector_lo, selector_hi, hdr0, hdr1]`.
#[inline]
pub fn search_table_and_selectors(
    mut hdr0: UReg,
    hdr1: UReg,
    data: &[[[Reg; 3]; 4]; 4],
    base_color: [[Reg; 3]; 2],
) -> [UReg; 4] {
    // Bit position inside `hdr0` of the table index of subblock 0 and 1.
    const TABLE_SHIFT: [u16; 2] = [5, 2];

    // The subblock search always works on the top two rows, then the bottom
    // two rows. In lanes whose flip flag is clear, the pixels are rearranged
    // first so that the first subblock occupies the top half and the second
    // the bottom half; the selectors are moved back at the end.
    let needs_permute = (hdr0 & UReg::splat(1)).simd_eq(UReg::splat(0));
    let permuted = flip_pixels(data, needs_permute);

    let mut lo = UReg::splat(0);
    let mut hi = UReg::splat(0);
    for (subblock, (rows, color)) in permuted.chunks_exact(2).zip(base_color).enumerate() {
        let fit = search_table_and_selectors_subblock(rows, color);
        hdr0 |= fit.table_idx << TABLE_SHIFT[subblock];

        // The subblock search numbers its rows 0 and 1; the second subblock
        // lives in rows 2 and 3, i.e. two bit positions higher.
        let row_shift = 2 * subblock as u16;
        lo |= fit.selector_lo << row_shift;
        hi |= fit.selector_hi << row_shift;
    }

    // Undo the pixel permutation on the selector bits.
    [flip_selectors(lo, needs_permute), flip_selectors(hi, needs_permute), hdr0, hdr1]
}