/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_GCM)

/*
 * GCM_ENC128_LOOP - one iteration of the interleaved AES-CTR encrypt + GHASH loop
 * (four 16-byte blocks per expansion).
 *
 * All register aliases (CTRn, OUTn, KEYn, HASHn, IV_W, COUNT, INPUT, OUT00, KEND0/KEND1)
 * and the ROUND macro are defined outside this macro.
 * NOTE(review): ROUND is presumably one "aese + aesmc" AES round - confirm at its definition.
 *
 * What one expansion does, as visible in the body:
 *   - Loads 4 x 16 bytes from [INPUT] (post-incremented) into the GPR pairs
 *     x6/x7, x19/x20, x21/x22, x23/x24. With HITLS_BIG_ENDIAN each half is byte-reversed.
 *   - XORs every loaded pair with KEND0 (low) / KEND1 (high), moves it into OUT0..OUT3 and
 *     XORs it with the matching counter block CTR0..CTR3 after that block has gone through
 *     nine ROUNDs (KEY0..KEY8) and a final aese with KEY9.
 *     NOTE(review): KEND0/KEND1 presumably hold the last AES-128 round key - confirm.
 *   - Stores OUT0..OUT3 to [OUT00] (post-incremented, 64 bytes in total).
 *   - Folds the values that OUT0..OUT3 held on entry into HASH0: multiply by HASH4, HASH3,
 *     HASH2, HASH1 respectively (v16/v17 supply the middle-term operands), then reduce with
 *     the constant 0xc2 << 56. When the macro is looped, the entry values are the results
 *     written by the previous expansion.
 *   - Builds the next counter blocks: low 64 bits from x10, high 64 bits from
 *     x11 | (byte-reversed IV_W << 32). IV_W is incremented four times.
 *   - Decrements COUNT with subs. No instruction visible after it writes the flags
 *     (the body of ROUND is not visible here).
 *
 * Registers written besides the aliases: x6, x7, x9, x19-x24, v8, v9, v10, v28-v31.
 * Raw names v4-v7 / d0-d7 are mixed with the aliases; the fmov pairs below suggest
 * v0-v3 = CTR0-CTR3 and v4-v7 = OUT0-OUT3 - TODO confirm against the alias definitions.
 * NOTE(review): x19-x24 and v8-v10 are callee-saved under AAPCS64; the enclosing function
 * is expected to save and restore them.
 */
.macro GCM_ENC128_LOOP
    ldp x6, x7, [INPUT], #16                // AES[0] - load plaintext
    rev64 OUT0.16b, OUT0.16b                // GHASH block[0]
    rev64 OUT2.16b, OUT2.16b                // GHASH block[2]
    ROUND CTR2.16b, KEY0.16b
#ifdef HITLS_BIG_ENDIAN
    rev x6, x6
    rev x7, x7
#endif

    fmov d3, x10                            // CTR[3] - low half
    ext HASH0.16b, HASH0.16b, HASH0.16b, #8 // PRE 0 - swap the 64-bit halves of the running hash
    rev64 OUT1.16b, OUT1.16b                // GHASH block[1]
    ROUND CTR1.16b, KEY0.16b

    add IV_W, IV_W, #1                        // CTR3++
    fmov CTR3.d[1], x9                      // CTR[3] - high half (x9 prepared by the previous expansion)
    ROUND CTR0.16b, KEY0.16b
    mov d31, OUT2.d[1]                      // GHASH block[2.1]

    ROUND CTR2.16b, KEY1.16b
    mov d30, OUT1.d[1]                      // GHASH block[1.1]
    ROUND CTR1.16b, KEY1.16b
    eor v4.16b, OUT0.16b, HASH0.16b         // PRE 1 - fold the running hash into block[0]

    ROUND CTR3.16b, KEY0.16b
    eor x7, x7, KEND1                       // AES[0] - round 10 high
    pmull2 v28.1q, OUT1.2d, HASH3.2d        // GHASH block 4k+1 - high
    eor v31.8b, v31.8b, OUT2.8b             // GHASH[2] - mid

    ldp x19, x20, [INPUT], #16              // AES[1] - load plaintext
    ROUND CTR0.16b, KEY1.16b
    rev w9, IV_W                             // CTR0--Start
    eor v30.8b, v30.8b, OUT1.8b             // GHASH block 4k+1 - mid
#ifdef HITLS_BIG_ENDIAN
    rev x19, x19
    rev x20, x20
#endif

    mov d8, v4.d[1]                         // GHASH block 4k - mid
    orr x9, x11, x9, lsl #32                // CTR0 block 4k+8
    pmull2 v9.1q, v4.2d, HASH4.2d           // GHASH block 4k - high
    add IV_W, IV_W, #1                        // CTR0++

    mov d10, v17.d[1]                       // GHASH block 4k - mid
    ROUND CTR0.16b, KEY2.16b
    pmull HASH0.1q, v4.1d, HASH4.1d         // GHASH block 4k - low
    eor v8.8b, v8.8b, v4.8b                 // GHASH block 4k - mid

    ROUND CTR1.16b, KEY2.16b
    ROUND CTR0.16b, KEY3.16b
    eor v9.16b, v9.16b, v28.16b             // GHASH block 4k+1 - high
    pmull v28.1q, OUT2.1d, HASH2.1d         // GHASH[2] - low

    pmull v10.1q, v8.1d, v10.1d             // GHASH block 4k - mid
    rev64 OUT3.16b, OUT3.16b                // GHASH block[3] (t0, t1, t2 and t3 free)
    pmull v30.1q, v30.1d, v17.1d            // GHASH block 4k+1 - mid
    pmull v29.1q, OUT1.1d, HASH3.1d         // GHASH block 4k+1 - low

    ins v31.d[1], v31.d[0]                  // GHASH[2] - mid
    pmull2 v8.1q, OUT2.2d, HASH2.2d         // GHASH[2] - high
    eor x20, x20, KEND1                     // AES[1] - round 10 high
    eor v10.16b, v10.16b, v30.16b           // GHASH block 4k+1 - mid

    mov d30, OUT3.d[1]                      // GHASH[3] - mid
    ROUND CTR3.16b, v19.16b                 // CTR3 round with v19 (presumably KEY1 - confirm)
    eor HASH0.16b, HASH0.16b, v29.16b       // GHASH block 4k+1 - low
    ROUND CTR2.16b, KEY2.16b

    eor x6, x6, KEND0                       // AES[0] - round 10 low
    ROUND CTR1.16b, KEY3.16b
    eor v30.8b, v30.8b, OUT3.8b             // GHASH[3] - mid
    pmull2 v4.1q, OUT3.2d, HASH1.2d         // GHASH[3] - high

    ROUND CTR2.16b, KEY3.16b
    eor v9.16b, v9.16b, v8.16b              // GHASH[2] - high
    pmull2 v31.1q, v31.2d, v16.2d           // GHASH[2] - mid
    pmull v29.1q, OUT3.1d, HASH1.1d         // GHASH[3] - low

    movi v8.8b, #0xc2                       // every byte = 0xc2; the shl below leaves 0xc2 << 56
    pmull v30.1q, v30.1d, v16.1d            // GHASH[3] - mid
    eor HASH0.16b, HASH0.16b, v28.16b       // GHASH[2] - low
    ROUND CTR1.16b, KEY4.16b

    ROUND CTR3.16b, v20.16b                 // CTR3 round with v20 (presumably KEY2 - confirm)
    shl d8, d8, #56                         // mod_constant
    ROUND CTR0.16b, KEY4.16b
    eor v9.16b, v9.16b, v4.16b              // GHASH[3] - high

    ROUND CTR1.16b, KEY5.16b
    ldp x21, x22, [INPUT], #16              // AES[2] - load plaintext
    ROUND CTR3.16b, v21.16b                 // CTR3 round with v21 (presumably KEY3 - confirm)
    eor v10.16b, v10.16b, v31.16b           // GHASH[2] - mid
#ifdef HITLS_BIG_ENDIAN
    rev x21, x21
    rev x22, x22
#endif

    ROUND CTR0.16b, KEY5.16b
    ldp x23, x24, [INPUT], #16              // AES[3] - load plaintext
    pmull v31.1q, v9.1d, v8.1d              // MODULO - top 64b align with mid
    eor HASH0.16b, HASH0.16b, v29.16b       // GHASH[3] - low
#ifdef HITLS_BIG_ENDIAN
    rev x23, x23
    rev x24, x24
#endif

    ROUND CTR2.16b, KEY4.16b
    eor x19, x19, KEND0                     // AES[1] - round 10 low
    ROUND CTR3.16b, v22.16b                 // CTR3 round with v22 (presumably KEY4 - confirm)
    eor v10.16b, v10.16b, v30.16b           // GHASH[3] - mid

    ROUND CTR1.16b, KEY6.16b
    eor x23, x23, KEND0                     // AES[3] - round 10 low
    ROUND CTR2.16b, KEY5.16b
    eor v30.16b, HASH0.16b, v9.16b          // MODULO - karatsuba tidy up

    fmov d4, x6                             // AES[0] - mov low
    ROUND CTR0.16b, KEY6.16b
    fmov OUT0.d[1], x7                      // AES[0] - mov high
    fmov d7, x23                            // AES[3] - mov low

    ext v9.16b, v9.16b, v9.16b, #8          // MODULO - other top alignment
    ROUND CTR3.16b, v23.16b                 // CTR3 round with v23 (presumably KEY5 - confirm)
    fmov d5, x19                            // AES[1] - mov low
    ROUND CTR0.16b, KEY7.16b

    eor v10.16b, v10.16b, v30.16b           // MODULO - karatsuba tidy up
    ROUND CTR2.16b, KEY6.16b
    eor x24, x24, KEND1                     // AES[3] - round 10 high
    ROUND CTR1.16b, KEY7.16b

    fmov OUT1.d[1], x20                     // AES[1] - mov high
    ROUND CTR0.16b, KEY8.16b
    fmov OUT3.d[1], x24                     // AES[3] - mov high
    ROUND CTR3.16b, v24.16b                 // CTR3 round with v24 (presumably KEY6 - confirm)

    subs COUNT, COUNT, #1                          // count-- (sets flags; presumably tested by the caller after the macro)
    ROUND CTR1.16b, KEY8.16b
    eor v10.16b, v10.16b, v31.16b           // MODULO - fold into mid
    aese CTR0.16b, KEY9.16b                 // AES[0] - round 9

    eor x21, x21, KEND0                     // AES[2] - round 10 low
    eor x22, x22, KEND1                     // AES[2] - round 10 high
    ROUND CTR3.16b, v25.16b                 // CTR3 round with v25 (presumably KEY7 - confirm)
    fmov d6, x21                            // AES[2] - mov low

    aese CTR1.16b, KEY9.16b                 // AES[1] - round 9
    fmov OUT2.d[1], x22                     // AES[2] - mov high
    ROUND CTR2.16b, KEY7.16b
    eor OUT0.16b, OUT0.16b, CTR0.16b        // AES[0] - result

    fmov d0, x10                            // CTR0-0
    ROUND CTR3.16b, KEY8.16b
    fmov CTR0.d[1], x9                      // CTR0-1--OK
    rev w9, IV_W                            // CTR1--start

    eor v10.16b, v10.16b, v9.16b            // MODULO - fold into mid
    ROUND CTR2.16b, KEY8.16b
    eor OUT1.16b, OUT1.16b, CTR1.16b        // AES[1] - result
    add IV_W, IV_W, #1                      // CTR1++

    orr x9, x11, x9, lsl #32                // CTR1 block 4k+9
    fmov d1, x10                            // CTR1-0
    pmull v9.1q, v10.1d, v8.1d              // MODULO - mid 64b align with low
    fmov CTR1.d[1], x9                      // CTR1-1--OK

    rev w9, IV_W                            // CTR2--Start
    aese CTR2.16b, KEY9.16b                 // AES[2] - round 9
    st1 {OUT0.16b}, [OUT00], #16            // Write back - OUT0
    eor OUT2.16b, OUT2.16b, CTR2.16b        // AES[2]-result

    orr x9, x11, x9, lsl #32                // CTR2 block 4k+10
    aese CTR3.16b, KEY9.16b                 // AES[3] - round 9
    add IV_W, IV_W, #1                      // CTR2++
    ext v10.16b, v10.16b, v10.16b, #8       // MODULO - other mid alignment
    fmov d2, x10                            // CTR2-0

    eor HASH0.16b, HASH0.16b, v9.16b        // MODULO - fold into low
    st1 {OUT1.16b}, [OUT00], #16            // Write back - OUT1
    fmov CTR2.d[1], x9                      // CTR2-1--OK
    st1 {OUT2.16b}, [OUT00], #16            // Write back - OUT2

    rev w9, IV_W                            // CTR3--start
    eor OUT3.16b, OUT3.16b, CTR3.16b        // AES[3]-result
    orr x9, x11, x9, lsl #32                // CTR3 block 4k+11 (consumed by the next expansion)
    eor HASH0.16b, HASH0.16b, v10.16b       // MODULO - fold into low
    st1 {OUT3.16b}, [OUT00], #16            // Write back - OUT3
.endm

/*
 * GCM_DEC128_LOOP - one iteration of the interleaved AES-CTR decrypt + GHASH loop
 * (four 16-byte blocks per expansion, software-pipelined across expansions).
 *
 * All register aliases (CTRn, OUTn, KEYn, HASHn, IV_W, COUNT, INPUT, OUT00, KEND0/KEND1)
 * and the ROUND macro are defined outside this macro.
 * NOTE(review): ROUND is presumably one "aese + aesmc" AES round - confirm at its definition.
 *
 * What one expansion does, as visible in the body:
 *   - Top of the macro: finishes blocks 2 and 3 of the PREVIOUS group. CTR2 already holds
 *     data ^ keystream on entry and CTR3 is XORed with OUT3 here. Both are moved to
 *     x21/x22 and x23/x24, XORed with KEND0/KEND1 and stored to [OUT00].
 *     With HITLS_BIG_ENDIAN each 64-bit half is byte-reversed before the KEND XOR.
 *   - Refills v2/v3 with fresh counter blocks and runs nine ROUNDs (KEY0..KEY8) plus a final
 *     aese with KEY9 on CTR0..CTR3.
 *   - Loads 4 x 16 bytes from [INPUT] (post-incremented) into OUT0..OUT3.
 *   - Blocks 0 and 1 of the current group are completed and stored before the macro ends;
 *     blocks 2 and 3 are left in CTR2 (already XORed) and CTR3 (keystream only) for the
 *     next expansion or the caller's tail code.
 *   - Folds v4..v7 as they were on entry into HASH0 (multiplied by HASH4, HASH3, HASH2,
 *     HASH1; v16/v17 supply the middle-term operands), then reduces with 0xc2 << 56.
 *     rev64 is applied to OUT0/OUT1 at the end of the macro and to OUT2/v7 at the top.
 *   - Counter blocks: low 64 bits from x10, high 64 bits from
 *     x11 | (byte-reversed IV_W << 32). IV_W is incremented four times.
 *   - Decrements COUNT with subs. No instruction visible after it writes the flags
 *     (the body of ROUND is not visible here).
 *
 * Registers written besides the aliases: x6, x7, x9, x19-x24, v8, v9, v10, v28-v31.
 * Raw names v1-v7 are mixed with the aliases; usage suggests v0-v3 = CTR0-CTR3 and
 * v4-v7 = OUT0-OUT3 - TODO confirm against the alias definitions. The "[n]" block labels
 * in the comments below rely on that mapping.
 * NOTE(review): x19-x24 and v8-v10 are callee-saved under AAPCS64; the enclosing function
 * is expected to save and restore them.
 */
.macro GCM_DEC128_LOOP
    eor CTR3.16b, OUT3.16b, CTR3.16b            // AES[3] - result (block 3 of the previous group)
    ext HASH0.16b, HASH0.16b, HASH0.16b, #8     // PRE 0 - swap the 64-bit halves of the running hash
    mov x21, CTR2.d[0]                          // AES[2] - mov low
    pmull2 v28.1q, v5.2d, HASH3.2d              // GHASH block 4k+1 - high
    mov x22, CTR2.d[1]                          // AES[2] - mov high
    ROUND CTR1.16b, KEY0.16b
    fmov d2, x10                                // CTR[2] - low half (v2 is free after the moves above)
#ifdef HITLS_BIG_ENDIAN
    rev x21, x21
    rev x22, x22
#endif
    rev64 OUT2.16b, OUT2.16b                    // GHASH[2]
    fmov v2.d[1], x9                            // CTR[2] - high half (x9 prepared by the previous expansion)
    rev w9, IV_W                                // CTR[3]
    mov x23, CTR3.d[0]                          // AES[3] - mov low
    eor v4.16b, v4.16b, HASH0.16b               // PRE 1 - fold the running hash into block[0]
    mov d30, v5.d[1]                            // GHASH block 4k+1 - mid
    ROUND CTR1.16b, KEY1.16b
    rev64 v7.16b, v7.16b                        // GHASH[3]
    pmull v29.1q, v5.1d, HASH3.1d               // GHASH block 4k+1 - low
    mov x24, CTR3.d[1]                          // AES[3] - mov high
    orr x9, x11, x9, lsl #32                    // CTR[3]
    pmull HASH0.1q, v4.1d, HASH4.1d             // GHASH block 4k - low
#ifdef HITLS_BIG_ENDIAN
    rev x23, x23
    rev x24, x24
#endif
    fmov d3, x10                                // CTR[3] - low half
    eor v30.8b, v30.8b, v5.8b                   // GHASH block 4k+1 - mid
    ROUND CTR1.16b, KEY2.16b
    fmov v3.d[1], x9                            // CTR[3] - high half
    ROUND CTR2.16b, KEY0.16b
    mov d10, v17.d[1]                           // GHASH block 4k - mid
    pmull2 v9.1q, v4.2d, HASH4.2d               // GHASH block 4k - high
    eor HASH0.16b, HASH0.16b, v29.16b           // GHASH block 4k+1 - low
    pmull v29.1q, v7.1d, HASH1.1d               // GHASH[3] - low
    ROUND CTR1.16b, KEY3.16b
    mov d8, v4.d[1]                             // GHASH block 4k - mid
    ROUND CTR3.16b, KEY0.16b
    eor v9.16b, v9.16b, v28.16b                 // GHASH block 4k+1 - high
    ROUND CTR0.16b, KEY0.16b
    pmull v28.1q, v6.1d, HASH2.1d               // GHASH[2] - low
    eor v8.8b, v8.8b, v4.8b                     // GHASH block 4k - mid
    ROUND CTR3.16b, KEY1.16b
    eor x23, x23, KEND0                         // AES[3] - round 10 low
    pmull v30.1q, v30.1d, v17.1d                // GHASH block 4k+1 - mid
    eor x22, x22, KEND1                         // AES[2] - round 10 high
    mov d31, v6.d[1]                            // GHASH[2] - mid
    ROUND CTR0.16b, KEY1.16b
    eor HASH0.16b, HASH0.16b, v28.16b           // GHASH[2] - low
    pmull v10.1q, v8.1d, v10.1d                 // GHASH block 4k - mid
    ROUND CTR3.16b, KEY2.16b
    eor v31.8b, v31.8b, v6.8b                   // GHASH[2] - mid
    ROUND CTR0.16b, KEY2.16b
    ROUND CTR1.16b, KEY4.16b
    eor v10.16b, v10.16b, v30.16b               // GHASH block 4k+1 - mid
    pmull2 v8.1q, v6.2d, HASH2.2d               // GHASH[2] - high
    ROUND CTR0.16b, KEY3.16b
    ins v31.d[1], v31.d[0]                      // GHASH[2] - mid
    pmull2 v4.1q, v7.2d, HASH1.2d               // GHASH[3] - high
    ROUND CTR2.16b, KEY1.16b
    mov d30, v7.d[1]                            // GHASH[3] - mid
    ROUND CTR0.16b, KEY4.16b
    eor v9.16b, v9.16b, v8.16b                  // GHASH[2] - high
    pmull2 v31.1q, v31.2d, v16.2d               // GHASH[2] - mid
    eor x24, x24, KEND1                         // AES[3] - round 10 high
    ROUND CTR2.16b, KEY2.16b
    eor v30.8b, v30.8b, v7.8b                   // GHASH[3] - mid
    ROUND CTR1.16b, KEY5.16b
    eor x21, x21, KEND0                         // AES[2] - round 10 low
    ROUND CTR0.16b, KEY5.16b
    movi v8.8b, #0xc2                           // every byte = 0xc2; the shl below leaves 0xc2 << 56
    ROUND CTR2.16b, KEY3.16b
    eor HASH0.16b, HASH0.16b, v29.16b           // GHASH[3] - low
    ROUND CTR1.16b, KEY6.16b
    ROUND CTR0.16b, KEY6.16b
    eor v10.16b, v10.16b, v31.16b               // GHASH[2] - mid
    ROUND CTR2.16b, KEY4.16b
    stp x21, x22, [OUT00], #16                  // AES[2] - store result (previous group)
    pmull v30.1q, v30.1d, v16.1d                // GHASH[3] - mid
    eor v9.16b, v9.16b, v4.16b                  // GHASH[3] - high

    ld1 {OUT0.16b}, [INPUT], #16                // AES[0] - load ciphertext

    ROUND CTR1.16b, KEY7.16b
    add IV_W, IV_W, #1                            // CTR++
    ROUND CTR0.16b, KEY7.16b
    shl d8, d8, #56                             // mod_constant
    ROUND CTR2.16b, KEY5.16b
    eor v10.16b, v10.16b, v30.16b               // GHASH[3] - mid
    ROUND CTR1.16b, KEY8.16b
    stp x23, x24, [OUT00], #16                  // AES[3] - store result (previous group)
    ROUND CTR0.16b, KEY8.16b
    eor v30.16b, HASH0.16b, v9.16b              // MODULO - karatsuba tidy up
    ROUND CTR3.16b, KEY3.16b
    rev w9, IV_W                                 // CTR block 4k+8
    pmull v31.1q, v9.1d, v8.1d                  // MODULO - top 64b align with mid
    ld1 {OUT1.16b}, [INPUT], #16                // AES[1] - load
    ext v9.16b, v9.16b, v9.16b, #8              // MODULO - other top alignment
    aese CTR0.16b, KEY9.16b                     // AES[0] - round 9
    orr x9, x11, x9, lsl #32                    // CTR block 4k+8
    ROUND CTR3.16b, KEY4.16b
    eor v10.16b, v10.16b, v30.16b               // MODULO - karatsuba tidy up
    aese CTR1.16b, KEY9.16b                     // AES[1] - round 9
    ROUND CTR2.16b, KEY6.16b

    eor CTR0.16b, OUT0.16b, CTR0.16b            // AES[0] - result
    ROUND CTR3.16b, KEY5.16b
    ld1 {OUT2.16b}, [INPUT], #16                // AES[2] - load
    add IV_W, IV_W, #1                            // CTR++
    eor v10.16b, v10.16b, v31.16b               // MODULO - fold into mid
    eor CTR1.16b, OUT1.16b, CTR1.16b            // AES[1] - result
    ROUND CTR2.16b, KEY7.16b
    ld1 {OUT3.16b}, [INPUT], #16                // AES[3] - load
    ROUND CTR3.16b, KEY6.16b

    rev64 OUT1.16b, OUT1.16b                    // GHASH block[1]
    eor v10.16b, v10.16b, v9.16b                // MODULO - fold into mid
    mov x7, CTR0.d[1]                           // AES[0] - mov high
    ROUND CTR2.16b, KEY8.16b
    mov x6, CTR0.d[0]                           // AES[0] - mov low

    ROUND CTR3.16b, KEY7.16b
    fmov d0, x10                                // CTR[0]
#ifdef HITLS_BIG_ENDIAN
    rev x7, x7
    rev x6, x6
#endif
    pmull v8.1q, v10.1d, v8.1d                  // MODULO - mid 64b align with low
    fmov CTR0.d[1], x9                            // CTR[0] - OK
    rev w9, IV_W                                 // CTR block 4k+9
    aese CTR2.16b, KEY9.16b                     // AES[2] - round 9
    orr x9, x11, x9, lsl #32                    // CTR block 4k+9

    ext v10.16b, v10.16b, v10.16b, #8           // MODULO - other mid alignment

    ROUND CTR3.16b, KEY8.16b

    eor x7, x7, KEND1                           // AES[0] - round 10 high
    eor HASH0.16b, HASH0.16b, v8.16b            // MODULO - fold into low
    mov x20, CTR1.d[1]                          // AES[1] - mov high
    eor x6, x6, KEND0                           // AES[0] - round 10 low
    eor CTR2.16b, OUT2.16b, CTR2.16b            // AES[2] - result (stored by the next expansion)
    mov x19, CTR1.d[0]                          // AES[1] - mov low
    add IV_W, IV_W, #1                            // CTR++
    aese CTR3.16b, KEY9.16b                     // AES[3] - round 9 (XOR with OUT3 happens at the top of the macro)
    fmov d1, x10                                // CTR[1]
#ifdef HITLS_BIG_ENDIAN
    rev x20, x20
    rev x19, x19
#endif
    subs COUNT, COUNT, #1                       // COUNT-- (sets flags; presumably tested by the caller after the macro)
    rev64 OUT0.16b, OUT0.16b                    // GHASH block[0]
    eor HASH0.16b, HASH0.16b, v10.16b           // MODULO - fold into low

    fmov v1.d[1], x9                            // CTR[1] - OK
    rev w9, IV_W                                 // CTR block 4k+10
    add IV_W, IV_W, #1                            // CTR block 4k+10

    eor x20, x20, KEND1                         // AES[1] - round 10 high
    stp x6, x7, [OUT00], #16                    // AES[0] - store result
    eor x19, x19, KEND0                         // AES[1] - round 10 low
    stp x19, x20, [OUT00], #16                  // AES[1] - store result
    orr x9, x11, x9, lsl #32                    // CTR block 4k+10 (consumed by the next expansion)
.endm

#endif