/*
* This file is part of the openHiTLS project.
*
* openHiTLS is licensed under the Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_CBC)
#include "crypt_aes_macro_x86_64.s"
.text
/*
 * Symbolic register names shared by CRYPT_AES_CBC_Encrypt and
 * CRYPT_AES_CBC_Decrypt. Several names deliberately map to the same
 * physical register; the notes below record which ones overlap.
 */

/* Integer argument registers in System V AMD64 order. ARG4 is the 32-bit
 * view (%ecx) because the length parameter is a uint32_t. ARG1 and ARG6 are
 * not referenced by name in this file: KEY and KTMP alias the same registers. */
.set ARG1, %rdi
.set ARG2, %rsi
.set ARG3, %rdx
.set ARG4, %ecx
.set ARG5, %r8
.set ARG6, %r9
/* RDK: round-key register handed to the AES_DEC_* macros (Decrypt).
 * Same physical register (xmm3) as KEYTEMP below. */
.set RDK, %xmm3
/* KEY: key-structure pointer (first argument). KTMP: copy of KEY used in the
 * 8-block decrypt loop; %r9 is free because the functions take five arguments. */
.set KEY, %rdi
.set KTMP, %r9
/* ROUNDS and RET are both %eax: the round count loaded from 240(KEY) is no
 * longer needed by the time the return value is written. */
.set ROUNDS, %eax
.set RET, %eax
/* Data block registers. Encrypt uses only BLK0; Decrypt uses BLK0..BLK7.
 * BLK1..BLK7 occupy the same registers as KEY1..KEY7. */
.set BLK0, %xmm1
.set BLK1, %xmm4
.set BLK2, %xmm5
.set BLK3, %xmm6
.set BLK4, %xmm10
.set BLK5, %xmm11
.set BLK6, %xmm12
.set BLK7, %xmm13
/* IV0 carries the CBC chaining value. IV1..IV3 hold saved ciphertext blocks
 * in Decrypt and occupy the same registers as KEY11..KEY13. */
.set IV0, %xmm0
.set IV1, %xmm7
.set IV2, %xmm8
.set IV3, %xmm9
/* Round keys cached in registers by Encrypt: KEYn is loaded from offset
 * 16*(n-1) of the key structure. */
.set KEY1, %xmm4
.set KEY2, %xmm5
.set KEY3, %xmm6
.set KEY4, %xmm10
.set KEY5, %xmm11
.set KEY6, %xmm12
.set KEY7, %xmm13
.set KEY8, %xmm14
.set KEY9, %xmm15
.set KEY10, %xmm2
.set KEY11, %xmm7
.set KEY12, %xmm8
.set KEY13, %xmm9
/* Scratch for the round keys at offsets 208 and 224, reloaded on every
 * iteration of the AES-256 encrypt loop. Same register as RDK. */
.set KEYTEMP, %xmm3
/**
 * Function description: AES encryption in CBC mode, accelerated with AES-NI.
 * Function prototype: int32_t CRYPT_AES_CBC_Encrypt(const CRYPT_AES_Key *ctx,
 *                                                    const uint8_t *in,
 *                                                    uint8_t *out,
 *                                                    uint32_t len,
 *                                                    uint8_t *iv);
 * Input register:
 *     rdi: pointer to the key structure (round keys start at offset 0,
 *          the round count is read from offset 240)
 *     rsi: pointer to the input data
 *     rdx: pointer to the output data
 *     ecx: length of the input data in bytes; expected to be a multiple of 16.
 *          Only complete 16-byte blocks are processed. A length below 16
 *          returns immediately without touching out or iv.
 *     r8:  pointer to the 16-byte CBC initialization vector; on return it
 *          holds the last ciphertext block produced
 * Change register: xmm0-xmm15, eax, ecx, rsi, rdx
 * Output register: eax (always 0)
 * Function/Macro Call: None
 */
.align 16
.globl CRYPT_AES_CBC_Encrypt
.type CRYPT_AES_CBC_Encrypt, @function
CRYPT_AES_CBC_Encrypt:
.cfi_startproc
    /* Nothing to do unless at least one full block is present. */
    cmpl $16, ARG4
    jb .Laescbcend_end

    /* Cache the chaining value and the eleven round keys common to all key sizes. */
    movl 240(KEY), ROUNDS
    vmovdqu (ARG5), IV0
    vmovdqu (KEY), KEY1
    vmovdqu 16(KEY), KEY2
    vmovdqu 32(KEY), KEY3
    vmovdqu 48(KEY), KEY4
    vmovdqu 64(KEY), KEY5
    vmovdqu 80(KEY), KEY6
    vmovdqu 96(KEY), KEY7
    vmovdqu 112(KEY), KEY8
    vmovdqu 128(KEY), KEY9
    vmovdqu 144(KEY), KEY10
    vmovdqu 160(KEY), KEY11

    /* Round count below 12 selects the AES-128 path, exactly 12 the AES-192
     * path, anything larger falls through to the AES-256 path. */
    cmpl $12, ROUNDS
    jb .Laes_128_cbc_start
    je .Laes_192_cbc_start

.align 16
.Laes_256_cbc_start:
    vmovdqu 176(KEY), KEY12
    vmovdqu 192(KEY), KEY13
.Laes_256_cbc_loop:
    /* BLK0 = plaintext ^ chaining value ^ round key 0 */
    vpxor (ARG2), IV0, BLK0
    /* All sixteen xmm registers are in use, so the last two round keys
     * share KEYTEMP and are reloaded on every iteration. */
    vmovdqu 208(KEY), KEYTEMP
    vpxor BLK0, KEY1, BLK0
    aesenc KEY2, BLK0
    aesenc KEY3, BLK0
    aesenc KEY4, BLK0
    aesenc KEY5, BLK0
    aesenc KEY6, BLK0
    aesenc KEY7, BLK0
    aesenc KEY8, BLK0
    aesenc KEY9, BLK0
    aesenc KEY10, BLK0
    aesenc KEY11, BLK0
    aesenc KEY12, BLK0
    aesenc KEY13, BLK0
    aesenc KEYTEMP, BLK0
    vmovdqu 224(KEY), KEYTEMP
    aesenclast KEYTEMP, BLK0
    leaq 16(ARG2), ARG2
    vmovdqu BLK0, (ARG3)
    /* The ciphertext just produced chains into the next block. */
    movdqa BLK0, IV0
    leaq 16(ARG3), ARG3
    subl $16, ARG4
    /* Continue only while a complete block remains. */
    cmpl $16, ARG4
    jae .Laes_256_cbc_loop
    /* Scrub the round keys that only this path loaded. */
    vpxor KEY12, KEY12, KEY12
    vpxor KEY13, KEY13, KEY13
    vpxor KEYTEMP, KEYTEMP, KEYTEMP
    jmp .Laescbcenc_finish

.align 16
.Laes_192_cbc_start:
    vmovdqu 176(KEY), KEY12
    vmovdqu 192(KEY), KEY13
.Laes_192_cbc_loop:
    /* BLK0 = plaintext ^ chaining value ^ round key 0 */
    vpxor (ARG2), IV0, BLK0
    vpxor BLK0, KEY1, BLK0
    aesenc KEY2, BLK0
    aesenc KEY3, BLK0
    aesenc KEY4, BLK0
    aesenc KEY5, BLK0
    aesenc KEY6, BLK0
    aesenc KEY7, BLK0
    aesenc KEY8, BLK0
    aesenc KEY9, BLK0
    aesenc KEY10, BLK0
    aesenc KEY11, BLK0
    aesenc KEY12, BLK0
    aesenclast KEY13, BLK0
    leaq 16(ARG2), ARG2
    vmovdqu BLK0, (ARG3)
    movdqa BLK0, IV0
    leaq 16(ARG3), ARG3
    subl $16, ARG4
    /* Same exit test as the AES-256 loop: a length that is not a multiple
     * of 16 must stop here instead of wrapping the counter and running
     * past the end of the buffers. */
    cmpl $16, ARG4
    jae .Laes_192_cbc_loop
    /* Scrub the round keys that only this path loaded. */
    vpxor KEY12, KEY12, KEY12
    vpxor KEY13, KEY13, KEY13
    jmp .Laescbcenc_finish

.align 16
.Laes_128_cbc_start:
    /* BLK0 = plaintext ^ chaining value ^ round key 0 */
    vpxor (ARG2), IV0, BLK0
    vpxor BLK0, KEY1, BLK0
    aesenc KEY2, BLK0
    aesenc KEY3, BLK0
    aesenc KEY4, BLK0
    aesenc KEY5, BLK0
    aesenc KEY6, BLK0
    aesenc KEY7, BLK0
    aesenc KEY8, BLK0
    aesenc KEY9, BLK0
    aesenc KEY10, BLK0
    aesenclast KEY11, BLK0
    leaq 16(ARG2), ARG2
    vmovdqu BLK0, (ARG3)
    movdqa BLK0, IV0
    leaq 16(ARG3), ARG3
    subl $16, ARG4
    /* Same exit test as the AES-256 loop (see the AES-192 loop). */
    cmpl $16, ARG4
    jae .Laes_128_cbc_start
    /* Falls through to the common epilogue. */

.Laescbcenc_finish:
    /* Hand the last ciphertext block back as the next chaining value. */
    vmovdqu BLK0, (ARG5)
    /* Scrub the cached round keys from the vector registers. */
    vpxor KEY1, KEY1, KEY1
    vpxor KEY2, KEY2, KEY2
    vpxor KEY3, KEY3, KEY3
    vpxor KEY4, KEY4, KEY4
    vpxor KEY5, KEY5, KEY5
    vpxor KEY6, KEY6, KEY6
    vpxor KEY7, KEY7, KEY7
    vpxor KEY8, KEY8, KEY8
    vpxor KEY9, KEY9, KEY9
    vpxor KEY10, KEY10, KEY10
    vpxor KEY11, KEY11, KEY11
.Laescbcend_end:
    xorl RET, RET
    ret
.cfi_endproc
.size CRYPT_AES_CBC_Encrypt, .-CRYPT_AES_CBC_Encrypt
/**
 * Function description: AES decryption in CBC mode, accelerated with AES-NI.
 * Function prototype: int32_t CRYPT_AES_CBC_Decrypt(const CRYPT_AES_Key *ctx,
 *                                                    const uint8_t *in,
 *                                                    uint8_t *out,
 *                                                    uint32_t len,
 *                                                    uint8_t *iv);
 * Input register:
 *     rdi: pointer to the key structure (first round key at offset 0,
 *          the round count is read from offset 240)
 *     rsi: pointer to the input data
 *     rdx: pointer to the output data
 *     ecx: length of the input data in bytes; expected to be a multiple of 16.
 *          Only complete 16-byte blocks are processed; fewer than 16
 *          remaining bytes are ignored.
 *     r8:  pointer to the 16-byte CBC initialization vector; on return it
 *          holds the last ciphertext block consumed (unchanged when no
 *          block was processed)
 * Change register: xmm0-xmm13, eax, ecx, rsi, rdx, r9
 *     NOTE(review): the AES_DEC_* macros receive KEY/KTMP, ROUNDS and RDK and
 *     may modify them or use further registers - confirm against the macro file.
 * Output register: eax (always 0)
 * Function/Macro Call: AES_DEC_1_BLK, AES_DEC_2_BLKS ... AES_DEC_8_BLKS
 *     (not defined in this file; presumably provided by crypt_aes_macro_x86_64.s)
 *
 * Input is consumed in batches of eight blocks while at least 128 bytes
 * remain; the remainder is handled by one dedicated 1..7 block routine.
 * Every routine reads all ciphertext it needs for chaining before storing
 * any plaintext.
 */
.align 16
.globl CRYPT_AES_CBC_Decrypt
.type CRYPT_AES_CBC_Decrypt, @function
CRYPT_AES_CBC_Decrypt:
.cfi_startproc
    vmovdqu (ARG5), IV0

    /* Dispatch on the number of bytes left (binary search over 16..128). */
.Laes_cbc_dec_start:
    cmpl $64, ARG4
    jae .Labove_equal_4_blks
    cmpl $32, ARG4
    jae .Labove_equal_2_blks
    /* Fewer than 16 bytes left: there is no complete block. Matches the
     * guard in CRYPT_AES_CBC_Encrypt and keeps a short length from reading
     * or writing a full 16 bytes. */
    cmpl $16, ARG4
    jb .Laes_cbc_dec_finish
    jmp .Lproc_1_blk
.Labove_equal_2_blks:
    cmpl $48, ARG4
    jb .Lproc_2_blks
    jmp .Lproc_3_blks
.Labove_equal_4_blks:
    cmpl $96, ARG4
    jae .Labove_equal_6_blks
    cmpl $80, ARG4
    jb .Lproc_4_blks
    jmp .Lproc_5_blks
.Labove_equal_6_blks:
    cmpl $112, ARG4
    jb .Lproc_6_blks
    cmpl $128, ARG4
    jb .Lproc_7_blks

.align 16
.Lproc_8_blks:
.Laescbcdec_8_blks_loop:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    vmovdqu 32(ARG2), BLK2
    /* Keep ciphertext blocks 0..2 in registers: they are the chaining
     * values for plaintext blocks 1..3. */
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    movdqa BLK2, IV3
    /* The macro is given a copy of the key pointer so KEY stays valid for
     * the next pass (presumably the macro advances its pointer argument). */
    movq KEY, KTMP
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    /* Initial whitening with the first round key. */
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor BLK2, RDK, BLK2
    vpxor 48(ARG2), RDK, BLK3
    vpxor 64(ARG2), RDK, BLK4
    vpxor 80(ARG2), RDK, BLK5
    vpxor 96(ARG2), RDK, BLK6
    vpxor 112(ARG2), RDK, BLK7
    /* The macro is handed the round count minus one. */
    decl ROUNDS
    AES_DEC_8_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4 BLK5 BLK6 BLK7
    /* CBC unchaining: plaintext[i] = D(ciphertext[i]) ^ ciphertext[i-1]. */
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vpxor BLK3, IV3, BLK3
    vpxor 48(ARG2), BLK4, BLK4
    vpxor 64(ARG2), BLK5, BLK5
    vpxor 80(ARG2), BLK6, BLK6
    vpxor 96(ARG2), BLK7, BLK7
    /* The last ciphertext block of this batch chains into the next one. */
    vmovdqu 112(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    vmovdqu BLK3, 48(ARG3)
    vmovdqu BLK4, 64(ARG3)
    vmovdqu BLK5, 80(ARG3)
    vmovdqu BLK6, 96(ARG3)
    vmovdqu BLK7, 112(ARG3)
    subl $128, ARG4
    leaq 128(ARG2), ARG2
    leaq 128(ARG3), ARG3
    /* Another full batch available? Otherwise re-dispatch on the remainder. */
    cmpl $128, ARG4
    jae .Laescbcdec_8_blks_loop
    jmp .Laes_cbc_dec_start

.align 16
.Lproc_1_blk:
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    vpxor (ARG2), RDK, BLK0
    decl ROUNDS
    AES_DEC_1_BLK KEY ROUNDS RDK BLK0
    vpxor BLK0, IV0, BLK0
    /* Fetch the next chaining value before the store. */
    vmovdqu (ARG2), IV0
    vmovdqu BLK0, (ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_2_blks:
    vmovdqu (ARG2), BLK0
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    vpxor BLK0, RDK, BLK0
    vpxor 16(ARG2), RDK, BLK1
    decl ROUNDS
    AES_DEC_2_BLKS KEY ROUNDS RDK BLK0 BLK1
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vmovdqu 16(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_3_blks:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor 32(ARG2), RDK, BLK2
    decl ROUNDS
    AES_DEC_3_BLKS KEY ROUNDS RDK BLK0 BLK1 BLK2
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vmovdqu 32(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_4_blks:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    vmovdqu 32(ARG2), BLK2
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    movdqa BLK2, IV3
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor BLK2, RDK, BLK2
    vpxor 48(ARG2), RDK, BLK3
    decl ROUNDS
    AES_DEC_4_BLKS KEY ROUNDS RDK BLK0 BLK1 BLK2 BLK3
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vpxor BLK3, IV3, BLK3
    vmovdqu 48(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    vmovdqu BLK3, 48(ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_5_blks:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    vmovdqu 32(ARG2), BLK2
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    movdqa BLK2, IV3
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor BLK2, RDK, BLK2
    vpxor 48(ARG2), RDK, BLK3
    vpxor 64(ARG2), RDK, BLK4
    decl ROUNDS
    AES_DEC_5_BLKS KEY ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vpxor BLK3, IV3, BLK3
    /* From block 4 on, the previous ciphertext is read back from the input. */
    vpxor 48(ARG2), BLK4, BLK4
    vmovdqu 64(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    vmovdqu BLK3, 48(ARG3)
    vmovdqu BLK4, 64(ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_6_blks:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    vmovdqu 32(ARG2), BLK2
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    movdqa BLK2, IV3
    /* Blocks 0..2 are already in registers; whiten them in place as the
     * other routines do instead of loading them a second time. */
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor BLK2, RDK, BLK2
    vpxor 48(ARG2), RDK, BLK3
    vpxor 64(ARG2), RDK, BLK4
    vpxor 80(ARG2), RDK, BLK5
    decl ROUNDS
    AES_DEC_6_BLKS KEY ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4 BLK5
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vpxor BLK3, IV3, BLK3
    vpxor 48(ARG2), BLK4, BLK4
    vpxor 64(ARG2), BLK5, BLK5
    vmovdqu 80(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    vmovdqu BLK3, 48(ARG3)
    vmovdqu BLK4, 64(ARG3)
    vmovdqu BLK5, 80(ARG3)
    jmp .Laes_cbc_dec_finish

.align 16
.Lproc_7_blks:
    vmovdqu (ARG2), BLK0
    vmovdqu 16(ARG2), BLK1
    vmovdqu 32(ARG2), BLK2
    movl 240(KEY), ROUNDS
    vmovdqu (KEY), RDK
    movdqa BLK0, IV1
    movdqa BLK1, IV2
    movdqa BLK2, IV3
    /* Blocks 0..2 are already in registers; whiten them in place. */
    vpxor BLK0, RDK, BLK0
    vpxor BLK1, RDK, BLK1
    vpxor BLK2, RDK, BLK2
    vpxor 48(ARG2), RDK, BLK3
    vpxor 64(ARG2), RDK, BLK4
    vpxor 80(ARG2), RDK, BLK5
    vpxor 96(ARG2), RDK, BLK6
    decl ROUNDS
    AES_DEC_7_BLKS KEY ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4 BLK5 BLK6
    vpxor BLK0, IV0, BLK0
    vpxor BLK1, IV1, BLK1
    vpxor BLK2, IV2, BLK2
    vpxor BLK3, IV3, BLK3
    vpxor 48(ARG2), BLK4, BLK4
    vpxor 64(ARG2), BLK5, BLK5
    vpxor 80(ARG2), BLK6, BLK6
    vmovdqu 96(ARG2), IV0
    vmovdqu BLK0, (ARG3)
    vmovdqu BLK1, 16(ARG3)
    vmovdqu BLK2, 32(ARG3)
    vmovdqu BLK3, 48(ARG3)
    vmovdqu BLK4, 64(ARG3)
    vmovdqu BLK5, 80(ARG3)
    vmovdqu BLK6, 96(ARG3)
    /* Falls through to the common epilogue. */

.align 16
.Laes_cbc_dec_finish:
    /* Hand the chaining value back to the caller. */
    vmovdqu IV0, (ARG5)
    /* Scrub the recovered plaintext and the round key from the vector registers. */
    vpxor BLK0, BLK0, BLK0
    vpxor BLK1, BLK1, BLK1
    vpxor BLK2, BLK2, BLK2
    vpxor BLK3, BLK3, BLK3
    vpxor BLK4, BLK4, BLK4
    vpxor BLK5, BLK5, BLK5
    vpxor BLK6, BLK6, BLK6
    vpxor BLK7, BLK7, BLK7
    vpxor RDK, RDK, RDK
    xorl RET, RET
    ret
.cfi_endproc
.size CRYPT_AES_CBC_Decrypt, .-CRYPT_AES_CBC_Decrypt
#endif