// Snapshot of commit 910e62b5 (created January 15; taken from the commit history view).
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#![feature(portable_simd)]

// Modules public for testing, don't expect stable API.
mod cxx;
pub mod decoder;
pub mod dither;
pub mod quant;
pub mod selectors;

use std::simd::prelude::*;
use std::simd::Simd;

use bytemuck::cast_slice;

use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;

// We primarily compute with 16-bit integers and a width of 8 fills a 128-bit
// wide lane (SSE, NEON). TODO(b/393494744): When we introduce multiversioning
// and support for AVX2 etc. this should be converted to a template parameter
// that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
// Lane counts for vectors that hold the same number of bits as a
// `SIMD_WIDTH`-lane 16-bit vector, but with 32-bit (`HALF_WIDTH`) and 64-bit
// (`QUARTER_WIDTH`) elements. `interleave_etc1` uses them when widening
// 16-bit lanes into 32-bit and then 64-bit lanes.
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
// Vector aliases with `SIMD_WIDTH` lanes each: signed 16-bit (`Reg`), signed
// 32-bit (`Reg32`) and unsigned 16-bit (`UReg`) elements.
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;

// Size in bytes of one compressed ETC1 block (a single 64-bit codeword).
const ETC1_BLOCK_BYTES: usize = 8;

/// Generates a function `$fn_name` that interleaves the lanes of two vectors
/// and reinterprets the result as vectors whose elements are twice as wide.
///
/// Macro arguments:
/// - `$src_ty` / `$src_width`: element type and lane count of the inputs.
/// - `$dst_ty` / `$dst_width`: element type and lane count of the outputs.
/// - `$src_len`: number of vectors in each input array.
///
/// Both the arguments and the return value are arrays of vectors. An array of
/// `<len>` vectors stands for one conceptual vector of `<width> * <len>`
/// lanes; arrays are used because std::simd types have an upper limit on
/// their width, and this keeps the code portable.
///
/// For every index `i`, `a[i].interleave(b[i])` yields two vectors, which are
/// bit-cast with `bytemuck::cast` and stored at output positions `2 * i` and
/// `2 * i + 1`.
macro_rules! define_interleave {
    ($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
        fn $fn_name(
            a: [Simd<$src_ty, $src_width>; $src_len],
            b: [Simd<$src_ty, $src_width>; $src_len],
        ) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
            // Every slot is overwritten below; the default value is only a
            // placeholder so the array can be filled in place.
            let mut widened: [Simd<$dst_ty, $dst_width>; $src_len * 2] =
                [Default::default(); $src_len * 2];
            for (idx, (lhs, rhs)) in a.iter().zip(b.iter()).enumerate() {
                let (first, second) = lhs.interleave(*rhs);
                widened[2 * idx] = bytemuck::cast(first);
                widened[2 * idx + 1] = bytemuck::cast(second);
            }
            widened
        }
    };
}

/// Packs per-lane ETC1 codewords, supplied as four vectors of 16-bit pieces,
/// into vectors of complete 64-bit codewords.
///
/// `regs` is laid out as `[15..0, 31..16, 47..32, 63..48]`: `regs[k]` holds
/// bits `16 * k .. 16 * k + 15` of every lane's codeword. The result holds
/// the same codewords as interleaved `u64` lanes with the endianness flipped
/// (our internal representation is little endian while ETC1 requires big
/// endian).
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
    // The lane-widening casts below assume a little-endian target.
    #[cfg(target_endian = "big")]
    compile_error!("Big endian is not supported");

    define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
    define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);

    // Byte-swap every 16-bit piece so that each one is big endian.
    let [bits_0, bits_16, bits_32, bits_48] = regs.map(|piece| piece.swap_bytes());
    // Pair neighbouring pieces into 32-bit lanes: [aaaa, bbbb] -> [baba, baba].
    let lower_pairs = conv_16_to_32([bits_16], [bits_0]);
    let upper_pairs = conv_16_to_32([bits_48], [bits_32]);
    // Pair the 32-bit lanes into 64-bit lanes:
    // [baba, baba], [dcdc, dcdc] -> [dcba, dcba], [dcba, dcba].
    conv_32_to_64(upper_pairs, lower_pairs)
}

/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// - `src` holds the pixels, one `u32` per pixel, with channels in the low
///   three bytes (bits 0..7, 8..15 and 16..23). The top byte is ignored.
/// - `width` and `height` are the logical image size in pixels. Both are
///   expected to be non-zero.
/// - `row_width` is the in-memory length of each row of `src`, in pixels.
///
/// Out of bounds pixels are padded with mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`. Mirroring saturates at coordinate 0, so an image that
/// is only one pixel wide or tall repeats that pixel.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
///
/// # Panics
///
/// Panics if a computed pixel index lies outside `src`.
#[inline]
pub fn load_input_block(
    src: &[u32],
    width: u32,
    height: u32,
    row_width: u32,
    base_x: u32,
    base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
    // Maps a coordinate into `0..limit`. In-range coordinates are returned
    // unchanged; out-of-range ones are reflected around the last valid
    // coordinate. If `limit` is 1 the reflection would go below zero, so it
    // saturates.
    let mirror = |coord: u32, limit: u32| -> u32 {
        if coord >= limit {
            (limit - 1).saturating_sub(coord - limit)
        } else {
            coord
        }
    };
    let extract_channel = |x: Simd<u32, SIMD_WIDTH>, shift: u32| {
        (x >> shift).cast::<i16>() & Simd::splat(0xFF)
    };
    // Index arithmetic is done in `usize` so that `row * stride` cannot wrap
    // around at 32 bits on targets with a wider `usize`.
    let stride = row_width as usize;

    let mut data = [[[Reg::default(); 3]; 4]; 4];
    // For now, input load and output store are not vectorized. The main reason is
    // that efficient loading requires shuffling and is poorly supported
    // by std::simd and the wide crate (which we plan to use for
    // supporting stable toolchain). Input load currently accounts for
    // ~20% of the runtime. If shuffle support improves this would be a
    // good candidate for optimization.
    for (i, block_row) in data.iter_mut().enumerate() {
        // The source row depends only on `i`, so resolve it once per row.
        let row_start = mirror(base_y + i as u32, height) as usize * stride;
        for (j, pixel) in block_row.iter_mut().enumerate() {
            // Gather pixel (j, i) of every block; consecutive blocks are 4
            // pixels apart horizontally.
            let mut buf = [0u32; SIMD_WIDTH];
            for (block, slot) in buf.iter_mut().enumerate() {
                let x = base_x + block as u32 * 4 + j as u32;
                *slot = src[row_start + mirror(x, width) as usize];
            }
            let rgbx = Simd::from_array(buf);
            *pixel = [
                extract_channel(rgbx, 0),
                extract_channel(rgbx, 8),
                extract_channel(rgbx, 16),
            ];
        }
    }
    data
}

/// Compress RGB pixels to ETC1.
///
/// - `src` should be in RGBA format (the least significant byte is red).
/// - `dst` will be filled with compressed ETC1 blocks.
/// - `src_width` and `src_height` specifies the logical size of the image in
///   pixels. These does not need to be multiple of 4. The boundary pixels will
///   be padded with unspecified values.
/// - `src_row_width` and `dst_row_width` specifies the in-memory length of each
///   row, in pixels and blocks, respectively.
///
/// Note that `src` takes an aligned 32-bit buffer while `dst` takes a byte
/// buffer, even though each ETC1 codeword is 64-bit. This is due to two
/// reasons:
/// - 32-bit alignment is practical to get even on 32-bit platforms, whereas
///   64-bit values are not aligned to 8 bytes on 32-bit ARM.
/// - We require extensive shuffling when loading inputs, but store to the
///   output straight in the order of blocks. Dealing with unaligned buffers in
///   the latter case is significantly easier.
pub fn compress_etc1(
    src: &[u32],
    dst: &mut [u8],
    src_width: u32,
    src_height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    // Note: We deliberately do not declare the block size (4x4) of ETC1 as a
    //       constant. While magic constants in general are discouraged, the
    //       block size appears way too frequent that naming it would make the
    //       code verbose and less readable.
    let dst_height = src_height.div_ceil(4);
    let dst_width = src_width.div_ceil(4);
    // Aligned staging buffer. Data is copied into the potentially unaligned
    // destination buffer at the end of the each row.
    let mut staging_row = vec![[Simd::splat(0); 4]; (dst_width as usize).div_ceil(SIMD_WIDTH)];
    let copy_len = dst_width as usize * ETC1_BLOCK_BYTES;
    // Note on vectorization scheme:
    //
    // We process one 4x4 block per SIMD lane, instead of the more common practice
    // of processing pixels within the same block in parallel using multiple
    // lanes. The one-block-per-lane scheme, more akin to SPMD programming,
    // allows most of our code to be shuffle-free, and works much better with
    // portable SIMD than schemes that heavily shuffles.
    for dst_y in 0..dst_height {
        for dst_x0 in (0..dst_width).step_by(SIMD_WIDTH) {
            let data =
                load_input_block(src, src_width, src_height, src_row_width, dst_x0 * 4, dst_y * 4);
            let data = dither(&data);
            let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
                quantize_averages(&data);
            let best_fit = search_table_and_selectors(hdr0, hdr1, &data, [ep0, ep1]);
            let codewords = interleave_etc1(best_fit);
            staging_row[dst_x0 as usize / SIMD_WIDTH] = codewords;
        }
        let dst_row = &mut dst[(dst_y * dst_row_width) as usize * ETC1_BLOCK_BYTES..];
        let staging_row_bytes = cast_slice(&*staging_row);
        dst_row[..copy_len].copy_from_slice(&staging_row_bytes[..copy_len]);
    }
}

/// Decompress ETC1 to RGBA.
///
/// - `_src` should be in ETC1. It is currently ignored.
/// - `dst` will be filled with RGBA, one `u32` per pixel.
/// - `dst_width` and `dst_height` should be the dimensions of the image in
///   `dst`, in pixels. If they are not multiples of 4, the edges become
///   partial blocks and the pixels that fall out of bounds are discarded.
/// - `_src_row_width` should be the row width of the ETC1 image (currently
///   ignored); `dst_row_width` should be the in-memory length of each row of
///   `dst`, in pixels.
///
/// This is a stub: no decoding happens yet. Every pixel inside
/// `dst_width` x `dst_height` is set to a placeholder gradient with `0xFF` in
/// bits 24..31, `x % 256` in bits 16..23 and `y % 256` in bits 0..7. Elements
/// of `dst` outside that area are left untouched.
/// TODO: b/393495436 - Implement ETC1 decoding logic.
///
/// # Panics
///
/// Panics if `dst` is too small for the given dimensions and row width.
pub fn decompress_etc1(
    _src: &[u8],
    dst: &mut [u32],
    dst_width: u32,
    dst_height: u32,
    _src_row_width: u32,
    dst_row_width: u32,
) {
    // Index arithmetic is done in `usize` so that `y * stride` cannot wrap
    // around at 32 bits on targets with a wider `usize`.
    let stride = dst_row_width as usize;
    for y in 0..dst_height {
        let low_byte = y % 256;
        for x in 0..dst_width {
            // NOTE(review): the original stub labelled bits 16..23 "Red" and
            // bits 0..7 "Blue", whereas `compress_etc1` documents red as the
            // least significant byte. Confirm the intended channel order when
            // implementing the real decoder.
            let pixel_value: u32 = 0xFF00_0000 | ((x % 256) << 16) | low_byte;
            dst[y as usize * stride + x as usize] = pixel_value;
        }
    }
}