/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_GCM)

/*
 * GCM_ENC256_LOOP - one pass of the interleaved AES-CTR + GHASH encrypt loop
 * (14-round schedule: ROUND with KEY0..KEY12, a bare aese with KEY13, and the
 * last round key applied through KEND0/KEND1).
 *
 * The three instruction streams below (AES rounds, GHASH multiply/reduce and
 * counter generation) are deliberately interleaved; statement order matters.
 *
 * What one expansion does, as far as this macro shows:
 *   - Runs ROUND over CTR0..CTR3 for KEY0..KEY12, then aese with KEY13.
 *   - Loads 4 x 16 bytes from [INPUT] (post-incremented) into general
 *     registers, XORs them with KEND0/KEND1, moves them into OUT0..OUT3,
 *     XORs with the AES state and stores 4 x 16 bytes to [OUT00]
 *     (post-incremented).
 *   - Multiplies the values found in v4..v7 on entry by HASH4..HASH1
 *     (Karatsuba middle terms use v17/v16), reduces with the constant
 *     0xc2 << 56 and leaves the result in HASH0.
 *   - Rebuilds the counter blocks: low 64 bits come from x10, high 64 bits
 *     are x11 | (byte-swapped IV_W << 32). IV_W is incremented four times.
 *   - Executes subs COUNT, COUNT, #1, so the flags on exit reflect COUNT.
 *
 * Labels used in the comments: AES[n] is the block processed through
 * CTRn/OUTn; "GHASH block 4k", "4k+1", "[2]" and "[3]" are the blocks held
 * in v4, v5, v6 and v7 on entry.
 *
 * NOTE(review): the register aliases and the ROUND macro are defined outside
 * this view. The comments assume CTRn = vn and OUTn = v(4+n), which matches
 * the mixed use of aliases and raw register names below (e.g. fmov d4 next to
 * fmov OUT0.d[1]) - confirm against the alias definitions. ROUND is presumably
 * one full AES round (aese + aesmc) - confirm.
 *
 * Reads:    x9 (high half of CTR[3], prepared before entry), x10, x11,
 *           v16, v17, KEYn, KENDn, HASH1..HASH4.
 * Writes:   x6, x7, x9, x19-x24, IV_W, COUNT, INPUT, OUT00, v0-v10, HASH0,
 *           NZCV flags.
 * x19-x24 and v8-v10 are callee-saved under AAPCS64; saving them is the
 * responsibility of the code that expands this macro (not visible here).
 */
.macro GCM_ENC256_LOOP
    ROUND CTR0.16b, KEY0.16b
    rev64 v4.16b, v4.16b                            // GHASH block 4k - byte-reverse each 64-bit half (only t0 is free)
    ROUND CTR1.16b, KEY0.16b
    fmov d3, x10                                    // CTR[3] - low 64 bits from x10
    ROUND CTR2.16b, KEY0.16b

    ext HASH0.16b, HASH0.16b, HASH0.16b, #8         // PRE 0 - swap the 64-bit halves of the running hash
    ROUND CTR0.16b, KEY1.16b
    fmov CTR3.d[1], x9                              // CTR[3] - high 64 bits from x9, block complete

    ROUND CTR1.16b, KEY1.16b
    ldp x6, x7, [INPUT], #16                        // AES[0] - load plaintext
    ROUND CTR2.16b, KEY1.16b
    ldp x19, x20, [INPUT], #16                      // AES[1] - load plaintext
    ROUND CTR0.16b, KEY2.16b
    // Big-endian builds byte-swap each 64-bit word right after it is loaded.
#ifdef HITLS_BIG_ENDIAN
    rev x6, x6
    rev x7, x7
    rev x19, x19
    rev x20, x20
#endif
    eor v4.16b, v4.16b, HASH0.16b                   // PRE 1 - fold running hash into GHASH block 4k
    ROUND CTR1.16b, KEY2.16b
    ROUND CTR3.16b, KEY0.16b
    eor x6, x6, KEND0                               // AES[0] - round 14 low

    ROUND CTR0.16b, KEY3.16b
    mov d10, v17.d[1]                               // GHASH block 4k - mid (multiplicand from high half of v17)
    pmull2 v9.1q, v4.2d, HASH4.2d                   // GHASH block 4k - high
    eor x7, x7, KEND1                               // AES[0] - round 14 high
    mov d8, v4.d[1]                                 // GHASH block 4k - mid
    ROUND CTR3.16b, KEY1.16b
    rev64 v5.16b, v5.16b                            // GHASH block 4k+1 (t0 and t1 free)
    ROUND CTR0.16b, KEY4.16b
    pmull HASH0.1q, v4.1d, HASH4.1d                 // GHASH block 4k - low
    eor v8.8b, v8.8b, v4.8b                         // GHASH block 4k - mid
    ROUND CTR2.16b, KEY2.16b
    ROUND CTR0.16b, KEY5.16b
    rev64 v7.16b, v7.16b                            // GHASH[3] (t0, t1, t2 and t3 free)

    pmull2 v4.1q, v5.2d, HASH3.2d                   // GHASH block 4k+1 - high
    pmull v10.1q, v8.1d, v10.1d                     // GHASH block 4k - mid
    rev64 v6.16b, v6.16b                            // GHASH[2] (t0, t1, and t2 free)
    pmull v8.1q, v5.1d, HASH3.1d                    // GHASH block 4k+1 - low
    eor v9.16b, v9.16b, v4.16b                      // GHASH block 4k+1 - high
    mov d4, v5.d[1]                                 // GHASH block 4k+1 - mid
    ROUND CTR1.16b, KEY3.16b
    ROUND CTR3.16b, KEY2.16b
    eor HASH0.16b, HASH0.16b, v8.16b                // GHASH block 4k+1 - low
    ROUND CTR2.16b, KEY3.16b
    ROUND CTR1.16b, KEY4.16b
    mov d8, v6.d[1]                                 // GHASH[2] - mid
    ROUND CTR3.16b, KEY3.16b
    eor v4.8b, v4.8b, v5.8b                         // GHASH block 4k+1 - mid
    ROUND CTR2.16b, KEY4.16b
    ROUND CTR0.16b, KEY6.16b
    eor v8.8b, v8.8b, v6.8b                         // GHASH[2] - mid
    ROUND CTR3.16b, KEY4.16b
    pmull v4.1q, v4.1d, v17.1d                      // GHASH block 4k+1 - mid
    ROUND CTR0.16b, KEY7.16b
    ROUND CTR3.16b, KEY5.16b
    ins v8.d[1], v8.d[0]                            // GHASH[2] - mid (copy to high lane for the pmull2 below)
    ROUND CTR1.16b, KEY5.16b
    ROUND CTR0.16b, KEY8.16b
    ROUND CTR2.16b, KEY5.16b
    ROUND CTR1.16b, KEY6.16b
    eor v10.16b, v10.16b, v4.16b                    // GHASH block 4k+1 - mid
    pmull2 v4.1q, v6.2d, HASH2.2d                   // GHASH[2] - high
    pmull v5.1q, v6.1d, HASH2.1d                    // GHASH[2] - low
    ROUND CTR1.16b, KEY7.16b
    pmull v6.1q, v7.1d, HASH1.1d                    // GHASH[3] - low
    eor v9.16b, v9.16b, v4.16b                      // GHASH[2] - high
    ROUND CTR3.16b, KEY6.16b

    ldp x21, x22, [INPUT], #16                      // AES[2] - load plaintext
    ROUND CTR1.16b, KEY8.16b
    mov d4, v7.d[1]                                 // GHASH[3] - mid
#ifdef HITLS_BIG_ENDIAN
    rev x21, x21
    rev x22, x22
#endif
    ROUND CTR2.16b, KEY6.16b
    eor HASH0.16b, HASH0.16b, v5.16b                // GHASH[2] - low
    pmull2 v8.1q, v8.2d, v16.2d                     // GHASH[2] - mid
    pmull2 v5.1q, v7.2d, HASH1.2d                   // GHASH[3] - high
    eor v4.8b, v4.8b, v7.8b                         // GHASH[3] - mid
    ROUND CTR2.16b, KEY7.16b
    eor x19, x19, KEND0                             // AES[1] - round 14 low
    ROUND CTR1.16b, KEY9.16b
    eor v10.16b, v10.16b, v8.16b                    // GHASH[2] - mid
    ROUND CTR3.16b, KEY7.16b
    eor x21, x21, KEND0                             // AES[2] - round 14 low
    ROUND CTR0.16b, KEY9.16b
    movi v8.8b, #0xc2                               // mod_constant - 0xc2 in every byte of d8 (shifted below)
    pmull v4.1q, v4.1d, v16.1d                      // GHASH[3] - mid
    eor v9.16b, v9.16b, v5.16b                      // GHASH[3] - high
    fmov d5, x19                                    // AES[1] - mov low

    ROUND CTR2.16b, KEY8.16b
    ldp x23, x24, [INPUT], #16                      // AES[3] - load plaintext
    ROUND CTR0.16b, KEY10.16b
    shl d8, d8, #56                                 // mod_constant - d8 = 0xc2 << 56
#ifdef HITLS_BIG_ENDIAN
    rev x23, x23
    rev x24, x24
#endif
    ROUND CTR3.16b, KEY8.16b
    eor HASH0.16b, HASH0.16b, v6.16b                // GHASH[3] - low
    ROUND CTR2.16b, KEY9.16b
    ROUND CTR1.16b, KEY10.16b
    eor v10.16b, v10.16b, v4.16b                    // GHASH[3] - mid
    ROUND CTR3.16b, KEY9.16b
    add IV_W, IV_W, #1                                // CTR++
    ROUND CTR0.16b, KEY11.16b
    eor v4.16b, HASH0.16b, v9.16b                   // MODULO - karatsuba tidy up
    ROUND CTR1.16b, KEY11.16b

    pmull v7.1q, v9.1d, v8.1d                       // MODULO - top 64b align with mid
    rev w9, IV_W                                     // next CTR[0] - byte-swap the 32-bit counter
    ext v9.16b, v9.16b, v9.16b, #8                  // MODULO - other top alignment
    ROUND CTR2.16b, KEY10.16b
    eor x23, x23, KEND0                             // AES[3] - round 14 low
    ROUND CTR1.16b, KEY12.16b
    eor v10.16b, v10.16b, v4.16b                    // MODULO - karatsuba tidy up
    ROUND CTR3.16b, KEY10.16b
    eor x20, x20, KEND1                             // AES[1] - round 14 high

    fmov d4, x6                                     // AES[0] - mov low
    orr x9, x11, x9, lsl #32                        // next CTR[0] - high half = x11 | (counter << 32)
    eor v7.16b, v9.16b, v7.16b                      // MODULO - fold into mid
    ROUND CTR0.16b, KEY12.16b
    eor x22, x22, KEND1                             // AES[2] - round 14 high
    ROUND CTR2.16b, KEY11.16b
    eor x24, x24, KEND1                             // AES[3] - round 14 high

    ROUND CTR3.16b, KEY11.16b
    add IV_W, IV_W, #1                                // CTR++
    aese CTR0.16b, KEY13.16b                        // AES[0] - round 13
    fmov OUT0.d[1], x7                              // AES[0] - mov high
    eor v10.16b, v10.16b, v7.16b                    // MODULO - fold into mid
    ROUND CTR2.16b, KEY12.16b
    fmov d7, x23                                    // AES[3] - mov low
    aese CTR1.16b, KEY13.16b                        // AES[1] - round 13
    fmov OUT1.d[1], x20                             // AES[1] - mov high
    fmov d6, x21                                    // AES[2] - mov low

    subs COUNT, COUNT, #1                           // COUNT-- (sets the flags; no later instruction in this macro changes them)
    fmov OUT2.d[1], x22                             // AES[2] - mov high
    pmull v9.1q, v10.1d, v8.1d                      // MODULO - mid 64b align with low
    eor OUT0.16b, OUT0.16b, CTR0.16b                // AES[0] - result
    fmov d0, x10                                    // next CTR[0] - low half
    fmov CTR0.d[1], x9                                // next CTR[0] - high half, block complete
    rev w9, IV_W                                     // next CTR[1] - byte-swap the 32-bit counter
    add IV_W, IV_W, #1                                // CTR++

    eor OUT1.16b, OUT1.16b, CTR1.16b                // AES[1] - result
    fmov d1, x10                                    // next CTR[1] - low half
    orr x9, x11, x9, lsl #32                        // next CTR[1] - high half
    ROUND CTR3.16b, KEY12.16b
    fmov v1.d[1], x9                                // next CTR[1] - block complete
    aese CTR2.16b, KEY13.16b                        // AES[2] - round 13
    rev w9, IV_W                                     // next CTR[2] - byte-swap the 32-bit counter

    st1 {OUT0.16b}, [OUT00], #16                    // AES[0] - store result
    orr x9, x11, x9, lsl #32                        // next CTR[2] - high half
    eor HASH0.16b, HASH0.16b, v9.16b                // MODULO - fold into low
    fmov OUT3.d[1], x24                             // AES[3] - mov high
    ext v10.16b, v10.16b, v10.16b, #8               // MODULO - other mid alignment
    st1 {OUT1.16b}, [OUT00], #16                    // AES[1] - store result
    add IV_W, IV_W, #1                                // CTR++
    aese CTR3.16b, KEY13.16b                        // AES[3] - round 13

    eor OUT2.16b, OUT2.16b, CTR2.16b                // AES[2] - result
    fmov d2, x10                                    // next CTR[2] - low half
    st1 {OUT2.16b}, [OUT00], #16                    // AES[2] - store result
    fmov v2.d[1], x9                                // next CTR[2] - block complete
    rev w9, IV_W                                     // next CTR[3] - byte-swap the 32-bit counter

    eor OUT3.16b, OUT3.16b, CTR3.16b                // AES[3] - result
    eor HASH0.16b, HASH0.16b, v10.16b               // MODULO - fold into low
    orr x9, x11, x9, lsl #32                        // next CTR[3] - high half stays in x9; the top of this macro moves it into CTR3
    st1 {OUT3.16b}, [OUT00], #16                    // AES[3] - store result
.endm

/*
 * GCM_DEC256_LOOP - one pass of the interleaved AES-CTR + GHASH decrypt loop
 * (14-round schedule: ROUND with KEY0..KEY12, a bare aese with KEY13, and the
 * last round key applied through KEND0/KEND1 in general registers).
 *
 * The AES, GHASH and counter streams are deliberately interleaved; statement
 * order matters.
 *
 * The macro is software-pipelined across expansions:
 *   - On entry CTR2 already holds a result for block 2 and CTR3 holds an AES
 *     state that is XORed with OUT3 as the first step. Both are moved to
 *     general registers, XORed with KEND0/KEND1 and stored to [OUT00] first.
 *   - CTR0/CTR1 are then finished in this pass and stored after them, so the
 *     store order within one expansion is [2], [3], [0], [1].
 *   - CTR2 (XORed with OUT2) and CTR3 (after aese with KEY13) are left in
 *     registers on exit.
 *   - The values found in v4..v7 on entry are multiplied by HASH4..HASH1
 *     (Karatsuba middle terms use v17/v16), reduced with 0xc2 << 56 and
 *     accumulated into HASH0. v4 and v5 are used as found; v6 and v7 are
 *     byte-reversed per 64-bit half here first. The end of the macro
 *     byte-reverses the freshly loaded OUT0 and v5.
 *   - Four 16-byte blocks are loaded from [INPUT] (post-incremented) into
 *     OUT0..OUT3.
 *   - Counter blocks are rebuilt with low 64 bits from x10 and high 64 bits
 *     x11 | (byte-swapped IV_W << 32); IV_W is incremented four times.
 *   - subs COUNT, COUNT, #1 sets the flags; nothing after it in the macro
 *     changes them.
 *
 * Labels used in the comments: AES[n] is the block processed through
 * CTRn/OUTn; "GHASH block 4k", "4k+1", "[2]" and "[3]" are the blocks held
 * in v4, v5, v6 and v7 on entry.
 *
 * NOTE(review): the register aliases and the ROUND macro are defined outside
 * this view. The comments assume CTRn = vn and OUTn = v(4+n), which matches
 * the mixed use of aliases and raw register names below (e.g. mov x20, v1.d[1]
 * next to mov x19, CTR1.d[0]) - confirm against the alias definitions. ROUND
 * is presumably one full AES round (aese + aesmc) - confirm.
 *
 * Reads:    x9 (high half of CTR[2], prepared before entry), x10, x11,
 *           v16, v17, KEYn, KENDn, HASH1..HASH4.
 * Writes:   x6, x7, x9, x19-x24, IV_W, COUNT, INPUT, OUT00, v0-v10, HASH0,
 *           NZCV flags.
 * x19-x24 and v8-v10 are callee-saved under AAPCS64; saving them is the
 * responsibility of the code that expands this macro (not visible here).
 */
.macro GCM_DEC256_LOOP
    mov x21, CTR2.d[0]                      // AES[2] block - mov low
    ext HASH0.16b, HASH0.16b, HASH0.16b, #8 // PRE 0 - swap the 64-bit halves of the running hash
    eor CTR3.16b, OUT3.16b, CTR3.16b        // AES[3] block - result
    ROUND CTR0.16b, KEY0.16b
    mov x22, CTR2.d[1]                      // AES[2] block - mov high

    ROUND CTR1.16b, KEY0.16b
    fmov d2, x10                            // CTR[2] - low half
    fmov v2.d[1], x9                        // CTR[2] - high half from x9, block complete
    eor v4.16b, v4.16b, HASH0.16b           // PRE 1 - fold running hash into GHASH block 4k
    // Big-endian builds byte-swap each 64-bit word moved out of a vector register.
#ifdef HITLS_BIG_ENDIAN
    rev x21, x21
    rev x22, x22
#endif
    rev w9, IV_W                             // CTR[3] - byte-swap the 32-bit counter
    ROUND CTR0.16b, KEY1.16b
    mov x24, CTR3.d[1]                      // AES[3] block - mov high
    ROUND CTR1.16b, KEY1.16b
    mov x23, CTR3.d[0]                      // AES[3] block - mov low

    pmull2 v9.1q, v4.2d, HASH4.2d           // GHASH block 4k - high
    mov d8, v4.d[1]                         // GHASH block 4k - mid
    fmov d3, x10                            // CTR[3] - low half (v3 was read out just above)
#ifdef HITLS_BIG_ENDIAN
    rev x23, x23
    rev x24, x24
#endif
    ROUND CTR0.16b, KEY2.16b
    orr x9, x11, x9, lsl #32                // CTR[3] - high half = x11 | (counter << 32)
    ROUND CTR2.16b, KEY0.16b
    fmov v3.d[1], x9                        // CTR[3] - block complete
    ROUND CTR1.16b, KEY2.16b
    eor v8.8b, v8.8b, v4.8b                 // GHASH block 4k - mid
    ROUND CTR0.16b, KEY3.16b
    eor x22, x22, KEND1                     // AES[2] - round 14 high
    ROUND CTR2.16b, KEY1.16b
    mov d10, v17.d[1]                       // GHASH block 4k - mid (multiplicand from high half of v17)
    ROUND CTR1.16b, KEY3.16b
    rev64 v6.16b, v6.16b                    // GHASH[2]
    ROUND CTR3.16b, KEY0.16b
    eor x21, x21, KEND0                     // AES[2] - round 14 low
    ROUND CTR2.16b, KEY2.16b
    stp x21, x22, [OUT00], #16              // AES[2] - store result
    pmull HASH0.1q, v4.1d, HASH4.1d         // GHASH block 4k - low
    pmull2 v4.1q, v5.2d, HASH3.2d           // GHASH block 4k+1 - high
    ROUND CTR2.16b, KEY3.16b
    rev64 v7.16b, v7.16b                    // GHASH[3]

    pmull v10.1q, v8.1d, v10.1d             // GHASH block 4k - mid
    eor x23, x23, KEND0                     // AES[3] - round 14 low
    pmull v8.1q, v5.1d, HASH3.1d            // GHASH block 4k+1 - low
    eor x24, x24, KEND1                     // AES[3] - round 14 high
    eor v9.16b, v9.16b, v4.16b              // GHASH block 4k+1 - high
    ROUND CTR2.16b, KEY4.16b
    ROUND CTR3.16b, KEY1.16b
    mov d4, v5.d[1]                         // GHASH block 4k+1 - mid
    ROUND CTR0.16b, KEY4.16b
    eor HASH0.16b, HASH0.16b, v8.16b        // GHASH block 4k+1 - low
    ROUND CTR2.16b, KEY5.16b
    add IV_W, IV_W, #1                        // CTR++
    ROUND CTR3.16b, KEY2.16b
    mov d8, v6.d[1]                         // GHASH[2] - mid
    ROUND CTR1.16b, KEY4.16b
    eor v4.8b, v4.8b, v5.8b                 // GHASH block 4k+1 - mid

    pmull v5.1q, v6.1d, HASH2.1d            // GHASH[2] - low
    ROUND CTR3.16b, KEY3.16b
    eor v8.8b, v8.8b, v6.8b                 // GHASH[2] - mid
    ROUND CTR1.16b, KEY5.16b
    ROUND CTR0.16b, KEY5.16b
    eor HASH0.16b, HASH0.16b, v5.16b        // GHASH[2] - low

    pmull v4.1q, v4.1d, v17.1d              // GHASH block 4k+1 - mid
    rev w9, IV_W                             // next CTR[0] - byte-swap the 32-bit counter
    ROUND CTR1.16b, KEY6.16b
    ins v8.d[1], v8.d[0]                    // GHASH[2] - mid (copy to high lane for the pmull2 below)
    ROUND CTR0.16b, KEY6.16b
    add IV_W, IV_W, #1                        // CTR++
    ROUND CTR3.16b, KEY4.16b
    ROUND CTR1.16b, KEY7.16b
    eor v10.16b, v10.16b, v4.16b            // GHASH block 4k+1 - mid
    ROUND CTR0.16b, KEY7.16b

    pmull2 v4.1q, v6.2d, HASH2.2d           // GHASH[2] - high
    mov d6, v7.d[1]                         // GHASH[3] - mid
    ROUND CTR3.16b, KEY5.16b

    pmull2 v8.1q, v8.2d, v16.2d             // GHASH[2] - mid
    ROUND CTR0.16b, KEY8.16b
    eor v9.16b, v9.16b, v4.16b              // GHASH[2] - high
    ROUND CTR3.16b, KEY6.16b

    pmull v4.1q, v7.1d, HASH1.1d            // GHASH[3] - low
    orr x9, x11, x9, lsl #32                // next CTR[0] - high half
    eor v10.16b, v10.16b, v8.16b            // GHASH[2] - mid

    pmull2 v5.1q, v7.2d, HASH1.2d           // GHASH[3] - high
    ROUND CTR0.16b, KEY9.16b
    eor v6.8b, v6.8b, v7.8b                 // GHASH[3] - mid
    ROUND CTR1.16b, KEY8.16b
    ROUND CTR2.16b, KEY6.16b
    eor v9.16b, v9.16b, v5.16b              // GHASH[3] - high
    ROUND CTR0.16b, KEY10.16b
    pmull v6.1q, v6.1d, v16.1d              // GHASH[3] - mid
    movi v8.8b, #0xc2                       // mod_constant - 0xc2 in every byte of d8 (shifted below)
    ROUND CTR2.16b, KEY7.16b
    eor HASH0.16b, HASH0.16b, v4.16b        // GHASH[3] - low
    ROUND CTR0.16b, KEY11.16b
    ROUND CTR3.16b, KEY7.16b
    shl d8, d8, #56                         // mod_constant - d8 = 0xc2 << 56
    ROUND CTR2.16b, KEY8.16b
    eor v10.16b, v10.16b, v6.16b            // GHASH[3] - mid
    ROUND CTR0.16b, KEY12.16b
    pmull v7.1q, v9.1d, v8.1d               // MODULO - top 64b align with mid
    eor v6.16b, HASH0.16b, v9.16b           // MODULO - karatsuba tidy up
    ROUND CTR1.16b, KEY9.16b

    ld1 {OUT0.16b}, [INPUT], #16            // AES load[0] ciphertext
    aese CTR0.16b, KEY13.16b                // AES[0] - round 13
    ext v9.16b, v9.16b, v9.16b, #8          // MODULO - other top alignment
    ROUND CTR1.16b, KEY10.16b
    eor v10.16b, v10.16b, v6.16b            // MODULO - karatsuba tidy up
    ROUND CTR2.16b, KEY9.16b

    ld1 {OUT1.16b}, [INPUT], #16            // AES load[1] ciphertext
    ROUND CTR3.16b, KEY8.16b
    eor CTR0.16b, OUT0.16b, CTR0.16b        // AES[0] block - result
    ROUND CTR1.16b, KEY11.16b

    stp x23, x24, [OUT00], #16              // AES[3] block - store result
    ROUND CTR2.16b, KEY10.16b
    eor v10.16b, v10.16b, v7.16b            // MODULO - fold into mid
    ROUND CTR3.16b, KEY9.16b
    ld1 {OUT2.16b}, [INPUT], #16            // AES load[2] ciphertext

    ROUND CTR1.16b, KEY12.16b
    ld1 {OUT3.16b}, [INPUT], #16            // AES load[3] ciphertext
    ROUND CTR2.16b, KEY11.16b
    mov x7, CTR0.d[1]                       // AES[0] block - mov high
    ROUND CTR3.16b, KEY10.16b
    eor v10.16b, v10.16b, v9.16b            // MODULO - fold into mid
    aese CTR1.16b, KEY13.16b                // AES[1] - round 13
    mov x6, CTR0.d[0]                       // AES[0] block - mov low
    ROUND CTR2.16b, KEY12.16b
    fmov d0, x10                            // next CTR[0] - low half
    ROUND CTR3.16b, KEY11.16b
#ifdef HITLS_BIG_ENDIAN
    rev x6, x6
    rev x7, x7
#endif
    fmov CTR0.d[1], x9                      // next CTR[0] - high half, block complete
    pmull v8.1q, v10.1d, v8.1d              // MODULO - mid 64b align with low
    eor CTR1.16b, OUT1.16b, CTR1.16b        // AES[1] block - result
    rev w9, IV_W                             // next CTR[1] - byte-swap the 32-bit counter
    aese CTR2.16b, KEY13.16b                // AES[2] - round 13
    orr x9, x11, x9, lsl #32                // next CTR[1] - high half

    subs COUNT, COUNT, #1                   // COUNT-- (sets the flags; no later instruction in this macro changes them)
    add IV_W, IV_W, #1                        // CTR++
    eor x6, x6, KEND0                       // AES[0] block - round 14 low
    eor x7, x7, KEND1                       // AES[0] block - round 14 high

    mov x20, v1.d[1]                        // AES[1] block - mov high
    eor CTR2.16b, OUT2.16b, CTR2.16b        // AES[2] block - result (moved out and stored at the top of this macro)
    eor HASH0.16b, HASH0.16b, v8.16b        // MODULO - fold into low
    ROUND CTR3.16b, KEY12.16b
    mov x19, CTR1.d[0]                      // AES[1] block - mov low
    fmov d1, x10                            // next CTR[1] - low half
    ext v10.16b, v10.16b, v10.16b, #8       // MODULO - other mid alignment
#ifdef HITLS_BIG_ENDIAN
    rev x20, x20
    rev x19, x19
#endif
    fmov CTR1.d[1], x9                      // next CTR[1] - high half, block complete
    rev w9, IV_W                             // next CTR[2] - byte-swap the 32-bit counter
    add IV_W, IV_W, #1                        // CTR++

    aese CTR3.16b, KEY13.16b                // AES[3] - round 13 (XOR with OUT3 happens at the top of this macro)
    orr x9, x11, x9, lsl #32                // next CTR[2] - high half stays in x9; the top of this macro moves it into v2
    rev64 v5.16b, v5.16b                    // GHASH block 4k+1 of the following pass - byte-reverse each 64-bit half
    eor x20, x20, KEND1                     // AES[1] block - round 14 high
    stp x6, x7, [OUT00], #16                // AES[0] block - store result
    eor x19, x19, KEND0                     // AES[1] block - round 14 low
    stp x19, x20, [OUT00], #16              // AES[1] block - store result
    rev64 OUT0.16b, OUT0.16b                // GHASH block 4k of the following pass - byte-reverse each 64-bit half
    eor HASH0.16b, HASH0.16b, v10.16b       // MODULO - fold into low
.endm

#endif