/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && (defined(HITLS_CRYPTO_CTR) || defined(HITLS_CRYPTO_GCM))

#include "crypt_arm.h"
#include "crypt_aes_macro_armv8.s"

CRYPT_AARCH64_ARCH_CRYPTO
.text

/*
 * Arguments of CRYPT_AES_CTR_Encrypt, in prototype order (ctx, in, out, len, iv).
 * LEN is declared as uint32_t in the prototype, so only the low 32 bits of x3
 * carry the argument.
 */
KEY     .req    x0
IN      .req    x1
OUT     .req    x2
LEN     .req    x3
IV      .req    x4

/*
 * LTMP: working copy of LEN, the number of bytes still to be processed.
 * CTMP: copy of the counter block read from IV. It is the template from which
 *       the per-block counters are cloned, and it is written back to IV at the end.
 */
LTMP    .req    x12
CTMP    .req    v27

/* Data blocks: loaded from IN, XORed with the matching CTRn register, stored to OUT. */
BLK0    .req    v0
BLK1    .req    v1
BLK2    .req    v2
BLK3    .req    v3
BLK4    .req    v4
BLK5    .req    v5
BLK6    .req    v6
BLK7    .req    v7

/*
 * Counter blocks. They are handed to the AES_ENC_n_BLK(S) macros and then XORed
 * into the data blocks (presumably the macros encrypt them in place; confirm in
 * crypt_aes_macro_armv8.s). All eight are zeroed before the function returns.
 */
CTR0	.req    v19
CTR1	.req    v20
CTR2	.req    v21
CTR3	.req    v22
CTR4	.req    v23
CTR5	.req    v24
CTR6	.req    v25
CTR7	.req    v26

/*
 * Scratch operands passed to the AES_ENC_n_BLK(S) macros: presumably two round-key
 * registers and the round counter (TODO confirm in crypt_aes_macro_armv8.s).
 * RDK0 and RDK1 are zeroed before the function returns.
 */
RDK0    .req    v17
RDK1    .req    v18
ROUNDS  .req    w6

/*
 * ADDCTR dst
 *
 * Advances the running 32-bit block counter held in w11 by one and writes the
 * new value into the vector lane named by dst (for example CTR1.s[3]) in the
 * byte order used inside the counter block.
 *
 * Default build:         w11 keeps the counter as a plain integer; the
 *                        byte-reversed copy is produced in w9 (clobbered).
 * HITLS_BIG_ENDIAN build: w11 keeps the counter in lane byte order, so it is
 *                        reversed, incremented and reversed back in place;
 *                        w9 is not touched.
 *
 * The addition is 32 bits wide, so the counter wraps modulo 2^32.
 */
.macro ADDCTR dst
#ifdef HITLS_BIG_ENDIAN
    rev w11, w11
    add w11, w11, #1
    rev w11, w11
    mov \dst, w11
#else
    add w11, w11, #1
    rev w9, w11
    mov \dst, w9
#endif
.endm

/*
 * Vn      -  V0 ~ V31
 * 8bytes  -  Vn.8B  Vn.4H  Vn.2S  Vn.1D
 * 16bytes -  Vn.16B Vn.8H  Vn.4S  Vn.2D
 */

/*
 * int32_t CRYPT_AES_CTR_Encrypt(const CRYPT_AES_Key *ctx,
 *                              const uint8_t *in,
 *                              uint8_t *out,
 *                              uint32_t len,
 *                              uint8_t *iv);
 *
 * XORs the input with the keystream obtained by running the counter blocks
 * through the AES_ENC_n_BLK(S) macros, and advances the counter stored in iv.
 *
 * In:   x0 = ctx   key context handed to the AES_ENC macros
 *       x1 = in    input data
 *       x2 = out   output data
 *       w3 = len   length in bytes (32-bit argument, zero-extended on entry)
 *       x4 = iv    16-byte counter block, read on entry
 * Out:  x0 = 0 (always)
 *
 * Counter:
 *   Only the last 32-bit word of the counter block (bytes 12..15, handled as a
 *   big-endian integer) is incremented, once per block, wrapping modulo 2^32.
 *   The other 12 bytes are copied unchanged. When len is not zero the next
 *   unused counter block is written back to iv; when len is zero iv is left
 *   untouched.
 *
 * Length handling:
 *   While at least 128 bytes remain, eight blocks are processed per iteration.
 *   The remainder is dispatched on the thresholds 32, 48, 64, 80, 96 and 112
 *   to a one- to seven-block tail, each block being a full 16-byte load and
 *   store. A non-zero remainder below 32 is therefore handled as exactly one
 *   16-byte block. NOTE(review): len is presumably always a multiple of 16;
 *   confirm against the callers.
 *
 * Registers written here: w9 (little-endian build only), w11, x12, x14, x1, x2,
 * v0-v7, v19-v27, flags. w6, v17 and v18 are passed to the AES_ENC macros as
 * scratch; x0 and x14 are passed as the key pointer (the macros may modify
 * them, see crypt_aes_macro_armv8.s). v17-v26 are zeroed before returning.
 */

CRYPT_AARCH64_FUNC_START(CRYPT_AES_CTR_Encrypt)
AARCH64_PACIASP
    ld1 {CTR0.16b}, [IV]              // Reads the IV.
    mov CTMP.16b, CTR0.16b            // CTMP is the template for all later counters.
    mov w11, CTR0.s[3]                // w11 = running 32-bit block counter.
#ifndef HITLS_BIG_ENDIAN
    rev w11, w11
#endif
    /*
     * len is a 32-bit argument, so bits 63:32 of x3 are unspecified on entry.
     * Writing w12 clears the upper half of LTMP (x12), giving a clean
     * zero-extended byte count for the 64-bit compares below.
     */
    mov w12, w3

    /* Dispatch on the number of bytes left (unsigned compares). */
.Lctr_aesenc_start:
    cmp LTMP, #64
    b.hs .Lctr_enc_above_equal_4_blks
    cmp LTMP, #32
    b.hs .Lctr_enc_above_equal_2_blks
    cbz LTMP, .Lctr_len_zero          // Nothing to do: skip the IV update as well.
    b .Lctr_enc_proc_1_blk

.Lctr_enc_above_equal_2_blks:
    cmp LTMP, #48
    b.lo .Lctr_enc_proc_2_blks
    b .Lctr_enc_proc_3_blks

.Lctr_enc_above_equal_4_blks:
    cmp LTMP, #96
    b.hs .Lctr_enc_above_equal_6_blks
    cmp LTMP, #80
    b.lo .Lctr_enc_proc_4_blks
    b .Lctr_enc_proc_5_blks

.Lctr_enc_above_equal_6_blks:
    cmp LTMP, #112
    b.lo .Lctr_enc_proc_6_blks
    cmp LTMP, #128
    b.lo .Lctr_enc_proc_7_blks

.Lctr_enc_proc_8_blks:

/* When the length is greater than or equal to 128, eight blocks loop is used. */
.Lctr_aesenc_8_blks_loop:

    /* Calculate eight CTRs: CTR0 is already set, CTR1..CTR7 are template + 1..7. */
    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b
    mov CTR3.16b, CTMP.16b
    mov CTR4.16b, CTMP.16b
    mov CTR5.16b, CTMP.16b
    mov CTR6.16b, CTMP.16b
    mov CTR7.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]
    ADDCTR CTR3.s[3]
    ADDCTR CTR4.s[3]
    ADDCTR CTR5.s[3]
    ADDCTR CTR6.s[3]
    ADDCTR CTR7.s[3]

    mov x14, KEY                      // Work on a copy so KEY stays valid for later iterations.
    AES_ENC_8_BLKS  x14, CTR0.16b, CTR1.16b, CTR2.16b, CTR3.16b, CTR4.16b, \
                    CTR5.16b, CTR6.16b, CTR7.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [IN], #64
    ld1 {BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [IN], #64

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b
    eor BLK3.16b, BLK3.16b, CTR3.16b
    eor BLK4.16b, BLK4.16b, CTR4.16b
    eor BLK5.16b, BLK5.16b, CTR5.16b
    eor BLK6.16b, BLK6.16b, CTR6.16b
    eor BLK7.16b, BLK7.16b, CTR7.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [OUT], #64
    st1 {BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [OUT], #64

    subs LTMP, LTMP, #128
    b.eq .Lctr_aesenc_finish          // All input consumed.

    /* More data follows: step the template and make it the next CTR0. */
    ADDCTR CTMP.s[3]
    mov CTR0.16b, CTMP.16b

    cmp LTMP, #128
    b.lo .Lctr_aesenc_start           // Fewer than eight blocks left: pick a tail.
    b .Lctr_aesenc_8_blks_loop

    /*
     * Tails. Each one finishes the call, so KEY may be passed to the macro
     * directly and IN/OUT do not need to be advanced after the last access.
     */
.Lctr_enc_proc_1_blk:

    AES_ENC_1_BLK KEY, CTR0.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS
    ld1 {BLK0.16b}, [IN]
    eor BLK0.16b, CTR0.16b, BLK0.16b
    st1 {BLK0.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_2_blks:

    mov CTR1.16b, CTMP.16b
    ADDCTR CTR1.s[3]

    AES_ENC_2_BLKS KEY, CTR0.16b, CTR1.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b}, [IN]

    eor BLK0.16b, CTR0.16b, BLK0.16b
    eor BLK1.16b, CTR1.16b, BLK1.16b

    st1 {BLK0.16b, BLK1.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_3_blks:

    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]

    AES_ENC_3_BLKS KEY, CTR0.16b, CTR1.16b, CTR2.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b}, [IN]

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_4_blks:

    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b
    mov CTR3.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]
    ADDCTR CTR3.s[3]

    AES_ENC_4_BLKS KEY, CTR0.16b, CTR1.16b, CTR2.16b, CTR3.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [IN]

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b
    eor BLK3.16b, BLK3.16b, CTR3.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_5_blks:

    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b
    mov CTR3.16b, CTMP.16b
    mov CTR4.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]
    ADDCTR CTR3.s[3]
    ADDCTR CTR4.s[3]

    AES_ENC_5_BLKS KEY, CTR0.16b, CTR1.16b, CTR2.16b, CTR3.16b, CTR4.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [IN], #64
    ld1 {BLK4.16b}, [IN]

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b
    eor BLK3.16b, BLK3.16b, CTR3.16b
    eor BLK4.16b, BLK4.16b, CTR4.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [OUT], #64
    st1 {BLK4.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_6_blks:

    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b
    mov CTR3.16b, CTMP.16b
    mov CTR4.16b, CTMP.16b
    mov CTR5.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]
    ADDCTR CTR3.s[3]
    ADDCTR CTR4.s[3]
    ADDCTR CTR5.s[3]

    AES_ENC_6_BLKS  KEY, CTR0.16b, CTR1.16b, CTR2.16b, CTR3.16b, CTR4.16b, \
                    CTR5.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [IN], #64
    ld1 {BLK4.16b, BLK5.16b}, [IN]

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b
    eor BLK3.16b, BLK3.16b, CTR3.16b
    eor BLK4.16b, BLK4.16b, CTR4.16b
    eor BLK5.16b, BLK5.16b, CTR5.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [OUT], #64
    st1 {BLK4.16b, BLK5.16b}, [OUT]
    b .Lctr_aesenc_finish

.Lctr_enc_proc_7_blks:

    mov CTR1.16b, CTMP.16b
    mov CTR2.16b, CTMP.16b
    mov CTR3.16b, CTMP.16b
    mov CTR4.16b, CTMP.16b
    mov CTR5.16b, CTMP.16b
    mov CTR6.16b, CTMP.16b

    ADDCTR CTR1.s[3]
    ADDCTR CTR2.s[3]
    ADDCTR CTR3.s[3]
    ADDCTR CTR4.s[3]
    ADDCTR CTR5.s[3]
    ADDCTR CTR6.s[3]

    AES_ENC_7_BLKS  KEY, CTR0.16b, CTR1.16b, CTR2.16b, CTR3.16b, CTR4.16b, \
                    CTR5.16b, CTR6.16b, RDK0.4s, RDK1.4s, RDK0.16b, RDK1.16b, ROUNDS

    ld1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [IN], #64
    ld1 {BLK4.16b, BLK5.16b, BLK6.16b}, [IN]

    eor BLK0.16b, BLK0.16b, CTR0.16b
    eor BLK1.16b, BLK1.16b, CTR1.16b
    eor BLK2.16b, BLK2.16b, CTR2.16b
    eor BLK3.16b, BLK3.16b, CTR3.16b
    eor BLK4.16b, BLK4.16b, CTR4.16b
    eor BLK5.16b, BLK5.16b, CTR5.16b
    eor BLK6.16b, BLK6.16b, CTR6.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [OUT], #64
    st1 {BLK4.16b, BLK5.16b, BLK6.16b}, [OUT]
    /* Falls through to the common exit. */

.Lctr_aesenc_finish:
    ADDCTR CTMP.s[3]                     // Next unused counter, saved for the following call.
    st1 {CTMP.16b}, [IV]

.Lctr_len_zero:
    mov x0, #0                           // Return value: always 0.
    /* Wipe the counter/keystream registers and the macro scratch registers. */
    eor CTR0.16b, CTR0.16b, CTR0.16b
    eor CTR1.16b, CTR1.16b, CTR1.16b
    eor CTR2.16b, CTR2.16b, CTR2.16b
    eor CTR3.16b, CTR3.16b, CTR3.16b
    eor CTR4.16b, CTR4.16b, CTR4.16b
    eor CTR5.16b, CTR5.16b, CTR5.16b
    eor CTR6.16b, CTR6.16b, CTR6.16b
    eor CTR7.16b, CTR7.16b, CTR7.16b
    eor RDK0.16b, RDK0.16b, RDK0.16b
    eor RDK1.16b, RDK1.16b, RDK1.16b

AARCH64_AUTIASP
    ret
CRYPT_AARCH64_FUNC_END(CRYPT_AES_CTR_Encrypt)

#endif