/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */


#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SM3

#include "crypt_arm.h"
CRYPT_AARCH64_ARCH_CRYPTO

// first16: one SM3 compression round for 0 <= j <= 15, where both boolean
// functions are the plain three-way XOR.
//   A..H    working registers holding the current A..H values
//   W1, W2  message words; W1 is added into TT2 and W1 ^ W2 into TT1
//           (the call sites pass Wj and Wj+4)
// On exit D holds TT1, B holds B <<< 9, H holds P0(TT2) and F holds F <<< 19;
// A, C, E and G are left untouched. The caller renames the registers for the
// next round instead of moving values around.
// w13 carries the round constant and is rotated left by one bit on entry.
// Scratch registers overwritten: w9, w10, w11, w12.
.macro first16 A B C D E F G H W1 W2
    ror w13, w13, #31           // Tj <<< 1 for this round
    eor w12, \F, \E             // E ^ F
    ror w10, \A, #20            // A <<< 12
    eor w12, w12, \G            // GG = E ^ F ^ G
    add w9, w13, \E             // E + Tj
    add w12, w12, \H            // GG + H
    add w9, w9, w10             // (A <<< 12) + E + Tj
    eor w11, \B, \A             // A ^ B
    ror w9, w9, #25             // SS1 = (...) <<< 7
    eor w11, w11, \C            // FF = A ^ B ^ C
    add w12, w12, \W1           // GG + H + Wj
    add w11, w11, \D            // FF + D
    eor w10, w9, w10            // SS2 = SS1 ^ (A <<< 12)
    add w12, w12, w9            // TT2 = GG + H + SS1 + Wj
    ror \F, \F, #13             // F <<< 19, becomes the next G
    add w11, w11, w10           // FF + D + SS2
    ror \B, \B, #23             // B <<< 9, becomes the next C
    ror w9, w12, #15            // TT2 <<< 17
    eor \H, w12, w12, ror #23   // TT2 ^ (TT2 <<< 9)
    eor \H, \H, w9              // P0(TT2), becomes the next E
    eor w9, \W2, \W1            // W1 ^ W2
    add \D, w9, w11             // TT1, becomes the next A
    .endm

// second48: one SM3 compression round for 16 <= j <= 63.
//   FF(A, B, C) = (A & (B | C)) | (B & C)      (majority)
//   GG(E, F, G) = ((F ^ G) & E) ^ G            (choose)
//   A..H    working registers holding the current A..H values
//   W1, W2  message words; W1 is added into TT2 and W1 ^ W2 into TT1
//           (the call sites pass Wj and Wj+4)
// On exit D holds TT1, B holds B <<< 9, H holds P0(TT2) and F holds F <<< 19;
// A, C, E and G are left untouched. The caller renames the registers for the
// next round instead of moving values around.
// w13 carries the round constant and is rotated left by one bit on entry.
// Scratch registers overwritten: w9, w10, w11, w12, w14.
.macro second48 A B C D E F G H W1 W2
    ror w13, w13, #31           // Tj <<< 1 for this round
    eor w12, \G, \F             // F ^ G
    ror w10, \A, #20            // A <<< 12
    orr w11, \C, \B             // B | C
    and w12, w12, \E            // (F ^ G) & E
    add w9, w13, \E             // E + Tj
    and w14, w11, \A            // A & (B | C)
    eor w12, w12, \G            // GG
    add w9, w9, w10             // (A <<< 12) + E + Tj
    and w11, \C, \B             // B & C
    add w12, w12, \H            // GG + H
    ror w9, w9, #25             // SS1 = (...) <<< 7
    orr w11, w11, w14           // FF
    add w12, w12, \W1           // GG + H + Wj
    add w11, w11, \D            // FF + D
    eor w10, w9, w10            // SS2 = SS1 ^ (A <<< 12)
    add w12, w12, w9            // TT2 = GG + H + SS1 + Wj
    ror \F, \F, #13             // F <<< 19, becomes the next G
    add w11, w11, w10           // FF + D + SS2
    ror \B, \B, #23             // B <<< 9, becomes the next C
    eor w9, \W2, \W1            // W1 ^ W2
    add \D, w9, w11             // TT1, becomes the next A
    eor \H, w12, w12, ror #23   // TT2 ^ (TT2 <<< 9)
    ror w9, w12, #15            // TT2 <<< 17
    eor \H, \H, w9              // P0(TT2), becomes the next E
    .endm

// void SM3_CompressAsm(uint32_t state[8], const uint8_t *data, uint32_t blockCnt);
//
// Runs the SM3 compression function over blockCnt consecutive 64-byte blocks
// read from data and updates the eight 32-bit chaining words in state in place.
//
// In:    x0 = state, x1 = data, w2 = blockCnt (0 returns with state untouched)
// Regs:  w0-w7         working variables A..H
//        w9-w14        scratch of the round macros (w13 carries the rotated Tj)
//        w15, w19-w21  message words handed to the round macros
//        x22 = state, x23 = data cursor, w24 = blocks still to process
//        x25, x26      two 64-byte stack buffers holding expanded message words
//        v0-v3         W0..W15 of the current block
//        v4-v20        expanded words, three new words per vector (W16..W66)
//        v21-v30       temporaries of the expansion (v21 finally holds W67)
// Stack: 256 bytes. [sp, sp+64) is the x26 buffer, [sp+64, sp+128) is the x25
//        buffer, [sp+128, sp+256) holds the saved x19-x26 and d8-d15.
CRYPT_AARCH64_FUNC_START(SM3_CompressAsm)
AARCH64_PACIASP
    sub sp, sp, 128
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp x25, x26, [sp, #48]
    // v8-v15 are used below; their low 64 bits (d8-d15) are callee-saved
    // under the AArch64 procedure call standard, so preserve them.
    stp d8, d9, [sp, #64]
    stp d10, d11, [sp, #80]
    stp d12, d13, [sp, #96]
    stp d14, d15, [sp, #112]

    // Two 64-byte scratch buffers for expanded message words.
    sub sp, sp, 64
    mov x25, sp
    sub sp, sp, 64
    mov x26, sp

    mov x22, x0 // x22: state
    mov x23, x1 // x23: data
    mov w24, w2 // w24: blockCnt

    // w0-w7: word registers A, B, C, D, E, F, G, H of the SM3 specification
    ldp w0, w1, [x22]
    ldp w2, w3, [x22, #8]
    ldp w4, w5, [x22, #16]
    ldp w6, w7, [x22, #24]

    prfm pldl1keep, [x23, #64]
    blocksloop_1:
    // blockCnt is unsigned: test for zero before decrementing so that counts
    // above 0x80000000 are not mistaken for a negative value.
    cbz w24, end
    sub w24, w24, #1
    // The recurrence for Wj reads Wj-3, so only three new message words can be
    // expanded per vector step. ext is used to line up the required input words.
    // To hide latency the expansion is interleaved with the compression rounds:
    // roughly three rounds are issued for every three new words.

    // v0-v3 message group w0-w15
    ld1 {v0.4s-v3.4s}, [x23]
#ifndef HITLS_BIG_ENDIAN
    rev32 v0.16B, v0.16B
    rev32 v1.16B, v1.16B
    rev32 v2.16B, v2.16B
    rev32 v3.16B, v3.16B
#endif

    // Scalar copies for rounds 0 and 1: W0, W1 and W4, W5
    ldp w15, w20, [x23]
    ldp w19, w21, [x23, #16]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif

    // Group 1: expand W16, W17, W18 into v4
    ext v24.16b, v3.16b, v3.16b, #4   // 13, 14, 15
    ext v25.16b, v0.16b, v1.16b, #12  // 3, 4, 5
    ext v23.16b, v1.16b, v2.16b, #12  // 7, 8, 9
    ext v26.16b, v2.16b, v3.16b, #8   // 10, 11, 12
    eor v27.16b, v0.16b, v23.16b
    // w13: constant Tj for 0 <= j <= 15, stored rotated right by one because
    // first16 rotates it left by one before use
    mov w13, #0x228c
    movk w13, #0xbce6, lsl #16

    // Message expansion inputs: Wj-3 <<< 15 and Wj-13 <<< 7
    shl v21.4s, v24.4s, #15
    shl v22.4s, v25.4s, #7
    sri v21.4s, v24.4s, #17  // 13, 14, 15<<<15
    sri v22.4s, v25.4s, #25  // 3, 4, 5<<<7
    first16 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v26.16b
    first16 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    // permutation function P1: X ^ (X <<< 15) ^ (X <<< 23)
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x23, #8]
    ldp w19, w21, [x23, #24]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    eor v27.16b, v27.16b, v30.16b
    first16 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v4.16b, v27.16b, v28.16b

    // Group 2: expand W19, W20, W21 into v5
    ext v23.16b, v1.16b, v2.16b, #8  // 6, 7, 8
    eor v27.16b, v25.16b, v26.16b
    first16 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v21.4s, v4.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v4.4s, #17   // 16, 17, 18<<<15
    sri v22.4s, v23.4s, #25  // 6, 7, 8<<<7
    ldp w15, w20, [x23, #16]
    ldp w19, w21, [x23, #32]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v24.16b
    first16 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    first16 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v4.s[3], v4.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v5.16b, v27.16b, v28.16b

    // Group 3: expand W22, W23, W24 into v6
    ext v25.16b, v2.16b, v3.16b, #4  // 9, 10, 11
    eor v27.16b, v23.16b, v24.16b
    ldp w15, w20, [x23, #24]
    ldp w19, w21, [x23, #40]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    shl v21.4s, v5.4s, #15
    shl v22.4s, v25.4s, #7
    sri v21.4s, v5.4s, #17   // 19, 20, 21<<<15
    sri v22.4s, v25.4s, #25  // 9, 10, 11<<<7
    first16 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v4.16b
    first16 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x23, #32]
    ldp w19, w21, [x23, #48]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    first16 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v27.16b, v30.16b
    mov v5.s[3], v5.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v6.16b, v27.16b, v28.16b

    // Group 4: expand W25, W26, W27 into v7
    eor v27.16b, v25.16b, v4.16b
    shl v21.4s, v6.4s, #15
    shl v22.4s, v3.4s, #7
    sri v21.4s, v6.4s, #17  // 22, 23, 24<<<15
    sri v22.4s, v3.4s, #25  // 12, 13, 14<<<7
    first16 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v5.16b
    ldp w15, w20, [x23, #40]
    ldp w19, w21, [x23, #56]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    first16 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v27.16b, v29.16b
    first16 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v6.s[3], v6.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v7.16b, v27.16b, v28.16b

    // Group 5: expand W28, W29, W30 into v8
    ext v23.16b, v3.16b, v4.16b, #12  // 15, 16, 17
    eor v27.16b, v3.16b, v5.16b
    st1 {v4.4s-v7.4s}, [x25]  // Spill W16..W27. Only lanes 0-2 of each vector carry new words,
                              // so the scalar loads below skip byte offsets 12, 28, 44 and 60.
    shl v21.4s, v7.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v7.4s, #17   // 25, 26, 27<<<15
    sri v22.4s, v23.4s, #25  // 15, 16, 17<<<7
    ldp w15, w20, [x23, #48]
    ldp w19, w21, [x25]       // W16, W17 come from the spill buffer, no byte swap needed
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w20, w20
#endif
    first16 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v6.16b
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    first16 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    ldp w15, w20, [x23, #56]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w20, w20
#endif
    // All 16 input words of this block have been read; advance to the next block.
    add x23, x23, #64
    prfm pldl1keep, [x23, #64]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    eor v27.16b, v27.16b, v30.16b
    first16 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    mov v7.s[3], v7.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v8.16b, v27.16b, v28.16b

    // Group 6: expand W31, W32, W33 into v9.
    // The last first16 round (j = 15) is issued here; rounds 16..63 use second48.
    ext v24.16b, v4.16b, v5.16b, #12  // 18, 19, 20
    eor v27.16b, v23.16b, v6.16b
    first16 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v21.4s, v8.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v8.4s, #17   // 28, 29, 30<<<15
    sri v22.4s, v24.4s, #25  // 18, 19, 20<<<7
    ldp w15, w20, [x25]
    ldp w19, w21, [x25, #20]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v7.16b
    // w13: constant Tj for 16 <= j <= 63, already rotated for j = 16 and stored
    // rotated right by one because second48 rotates it left by one before use
    mov w13, #0x3d43
    movk w13, #0xcec5, lsl #16
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v8.s[3], v8.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v9.16b, v27.16b, v28.16b

    // Group 7: expand W34, W35, W36 into v10
    ext v23.16b, v5.16b, v6.16b, #12  // 21, 22, 23
    eor v27.16b, v24.16b, v7.16b
    ldr w15, [x25, #8]
    ldr w20, [x25, #16]
    ldp w19, w21, [x25, #32]
    shl v21.4s, v9.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v9.4s, #17   // 31, 32, 33<<<15
    sri v22.4s, v23.4s, #25  // 21, 22, 23<<<7
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v8.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #20]
    ldr w19, [x25, #40]
    ldr w21, [x25, #48]
    eor v27.16b, v27.16b, v30.16b
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    mov v9.s[3], v9.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v10.16b, v27.16b, v28.16b

    // Group 8: expand W37, W38, W39 into v11
    ext v24.16b, v6.16b, v7.16b, #12  // 24, 25, 26
    eor v27.16b, v23.16b, v8.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    shl v21.4s, v10.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v10.4s, #17  // 34, 35, 36<<<15
    sri v22.4s, v24.4s, #25  // 24, 25, 26<<<7
    ldp w15, w20, [x25, #32]
    ldp w19, w21, [x25, #52]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v9.16b
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v10.s[3], v10.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v11.16b, v27.16b, v28.16b

    // Group 9: expand W40, W41, W42 into v12
    ext v23.16b, v7.16b, v8.16b, #12  // 27, 28, 29
    eor v27.16b, v24.16b, v9.16b
    st1 {v8.4s-v11.4s}, [x26]  // Spill W28..W39 to the second buffer
    shl v21.4s, v11.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v11.4s, #17  // 37, 38, 39<<<15
    sri v22.4s, v23.4s, #25  // 27, 28, 29<<<7
    ldr w15, [x25, #40]
    ldr w20, [x25, #48]
    ldp w19, w21, [x26]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v10.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #52]
    ldr w19, [x26, #8]
    ldr w21, [x26, #16]
    eor v27.16b, v27.16b, v30.16b
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    mov v11.s[3], v11.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v12.16b, v27.16b, v28.16b

    // Group 10: expand W43, W44, W45 into v13
    ext v24.16b, v8.16b, v9.16b, #12  // 30, 31, 32
    eor v27.16b, v23.16b, v10.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v21.4s, v12.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v12.4s, #17  // 40, 41, 42<<<15
    sri v22.4s, v24.4s, #25  // 30, 31, 32<<<7
    ldp w15, w20, [x26]
    ldp w19, w21, [x26, #20]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v11.16b
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v12.s[3], v12.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v13.16b, v27.16b, v28.16b

    // Group 11: expand W46, W47, W48 into v14
    ext v23.16b, v9.16b, v10.16b, #12  // 33, 34, 35
    eor v27.16b, v24.16b, v11.16b
    ldr w15, [x26, #8]
    ldr w20, [x26, #16]
    ldp w19, w21, [x26, #32]
    shl v21.4s, v13.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v13.4s, #17  // 43, 44, 45<<<15
    sri v22.4s, v23.4s, #25  // 33, 34, 35<<<7
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v12.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x26, #20]
    ldr w19, [x26, #40]
    ldr w21, [x26, #48]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v27.16b, v30.16b
    mov v13.s[3], v13.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v14.16b, v27.16b, v28.16b

    // Group 12: expand W49, W50, W51 into v15
    ext v24.16b, v10.16b, v11.16b, #12  // 36, 37, 38
    eor v27.16b, v23.16b, v12.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    shl v21.4s, v14.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v14.4s, #17  // 46, 47, 48<<<15
    sri v22.4s, v24.4s, #25  // 36, 37, 38<<<7
    ldp w15, w20, [x26, #32]
    ldp w19, w21, [x26, #52]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v13.16b
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v27.16b, v27.16b, v30.16b
    mov v14.s[3], v14.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v15.16b, v27.16b, v28.16b

    // Group 13: expand W52, W53, W54 into v16
    ext v23.16b, v11.16b, v12.16b, #12  // 39, 40, 41
    eor v27.16b, v24.16b, v13.16b
    st1 {v12.4s-v15.4s}, [x25]  // Reuse the first buffer for W40..W51
    shl v21.4s, v15.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v15.4s, #17  // 49, 50, 51<<<15
    sri v22.4s, v23.4s, #25  // 39, 40, 41<<<7
    ldr w15, [x26, #40]
    ldr w20, [x26, #48]
    ldp w19, w21, [x25]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v14.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x26, #52]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v27.16b, v30.16b
    mov v15.s[3], v15.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v16.16b, v27.16b, v28.16b

    // Group 14: expand W55, W56, W57 into v17
    ext v24.16b, v12.16b, v13.16b, #12  // 42, 43, 44
    eor v27.16b, v23.16b, v14.16b
    shl v21.4s, v16.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v16.4s, #17  // 52, 53, 54<<<15
    sri v22.4s, v24.4s, #25  // 42, 43, 44<<<7
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v15.16b
    ldp w15, w20, [x25]
    ldp w19, w21, [x25, #20]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v27.16b, v30.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    mov v16.s[3], v16.s[2]  // Copy lane 2 into lane 3; a later ext #12 picks it up
    eor v17.16b, v27.16b, v28.16b

    // Group 15: expand W58, W59, W60 into v18
    ext v23.16b, v13.16b, v14.16b, #12  // 45, 46, 47
    eor v27.16b, v24.16b, v15.16b
    shl v21.4s, v17.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v17.4s, #17  // 55, 56, 57<<<15
    sri v22.4s, v23.4s, #25  // 45, 46, 47<<<7
    ldr w15, [x25, #8]
    ldr w20, [x25, #16]
    ldp w19, w21, [x25, #32]
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v16.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #20]
    ldr w19, [x25, #40]
    ldr w21, [x25, #48]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v27.16b, v30.16b
    eor v18.16b, v27.16b, v28.16b

    // Group 16: expand W61, W62, W63 into v19
    ext v24.16b, v14.16b, v15.16b, #12  // 48, 49, 50
    eor v27.16b, v23.16b, v16.16b
    shl v21.4s, v18.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v18.4s, #17  // 58, 59, 60<<<15
    sri v22.4s, v24.4s, #25  // 48, 49, 50<<<7
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v17.16b
    ldp w15, w20, [x25, #32]
    ldp w19, w21, [x25, #52]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v27.16b, v27.16b, v30.16b
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v19.16b, v27.16b, v28.16b

    // Group 17: expand W64, W65, W66 into v20
    ext v23.16b, v15.16b, v16.16b, #12  // 51, 52, 53
    eor v27.16b, v24.16b, v17.16b
    st1 {v16.4s-v19.4s}, [x26]  // Reuse the second buffer for W52..W63
    shl v21.4s, v19.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v19.4s, #17  // 61, 62, 63<<<15
    sri v22.4s, v23.4s, #25  // 51, 52, 53<<<7
    ldr w15, [x25, #40]
    ldr w20, [x25, #48]
    ldp w19, w21, [x26]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v18.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #52]
    ldr w19, [x26, #8]
    ldr w21, [x26, #16]
    eor v27.16b, v27.16b, v30.16b
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    eor v20.16b, v27.16b, v28.16b

    // Group 18: expand into v21; only lane 0 (W67) is read by the rounds below
    ext v24.16b, v16.16b, v17.16b, #12  // 54, 55, 56
    eor v27.16b, v23.16b, v18.16b
    shl v21.4s, v20.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v20.4s, #17  // 64, 65, 66<<<15
    sri v22.4s, v24.4s, #25  // 54, 55, 56<<<7
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v19.16b
    ldp w15, w20, [x26]
    ldp w19, w21, [x26, #20]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    eor v27.16b, v27.16b, v30.16b
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    eor v21.16b, v27.16b, v28.16b

    // Expansion finished; the remaining rounds only read words from the buffers.
    ldr w15, [x26, #8]
    ldr w20, [x26, #16]
    ldp w19, w21, [x26, #32]
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    st1 {v20.4s-v21.4s}, [x25]  // Spill W64..W67 for the last four rounds
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    ldp w15, w20, [x26, #20]
    ldr w19, [x26, #40]
    ldr w21, [x26, #48]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19

    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    ldp w15, w20, [x26, #32]
    ldp w19, w21, [x26, #52]
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    ldr w15, [x26, #40]
    ldr w20, [x26, #48]
    ldp w19, w21, [x25]
    second48 w0, w1, w2, w3, w4, w5, w6, w7, w15, w19
    second48 w3, w0, w1, w2, w7, w4, w5, w6, w20, w21
    ldp w15, w20, [x26, #52]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    second48 w2, w3, w0, w1, w6, w7, w4, w5, w15, w19
    second48 w1, w2, w3, w0, w5, w6, w7, w4, w20, w21
    ldp w9, w10, [x22]  // XOR with the previous hash result
    ldp w11, w12, [x22, #8]
    ldp w13, w14, [x22, #16]
    ldp w15, w19, [x22, #24]
    eor w0, w0, w9
    eor w1, w1, w10
    eor w2, w2, w11
    eor w3, w3, w12
    eor w4, w4, w13
    eor w5, w5, w14
    eor w6, w6, w15
    eor w7, w7, w19
    stp w0, w1, [x22]  // Result saving; w0-w7 stay live as input to the next block
    stp w2, w3, [x22, #8]
    stp w4, w5, [x22, #16]
    stp w6, w7, [x22, #24]
    b blocksloop_1
    end:

    // Release the two 64-byte message buffers.
    add sp, sp, 128

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp x25, x26, [sp, #48]
    ldp d8, d9, [sp, #64]
    ldp d10, d11, [sp, #80]
    ldp d12, d13, [sp, #96]
    ldp d14, d15, [sp, #112]
    add sp, sp, 128

AARCH64_AUTIASP
    ret
CRYPT_AARCH64_FUNC_END(SM3_CompressAsm)

#endif