/*
* This file is part of the openHiTLS project.
*
* openHiTLS is licensed under the Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_XTS)
#include "crypt_arm.h"
#include "crypt_aes_macro_armv8.s"
CRYPT_AARCH64_ARCH_CRYPTO
.text
// ---------------------------------------------------------------------------
// Register aliases shared by CRYPT_AES_XTS_Encrypt and CRYPT_AES_XTS_Decrypt.
// Several aliases intentionally name the same physical register.
// ---------------------------------------------------------------------------

// Incoming arguments (ctx, in, out, len, tweak).
KEY         .req    x0
IN          .req    x1
OUT         .req    x2
LEN         .req    x3
TWEAK       .req    x4

// General-purpose working registers.
KTMP        .req    x5      // running round-key pointer
LTMP        .req    x6      // remaining whole-block byte count
ROUNDS      .req    w7      // value loaded from KEY + 240, minus 6
XROUNDS     .req    x7      // 64-bit view of ROUNDS
TAILNUM     .req    x8      // len % 16
WTMP0       .req    w9      // holds the constant 0x87 used by NextTweak
WTMP1       .req    w10     // NextTweak scratch
XTMP1       .req    x10     // 64-bit view of WTMP1
WTMP2       .req    w11     // NextTweak scratch
XTMP2       .req    x11     // 64-bit view of WTMP2
WP          .req    w11     // tail loop: byte taken from IN (same register as WTMP2)
WTMP3       .req    w12     // not referenced in the visible code
WC          .req    w12     // tail loop: byte taken from the previous output block
TWX0        .req    x13     // current tweak, low 64 bits
TWX1        .req    x14     // current tweak, high 64 bits
TWW1        .req    w14     // 32-bit view of TWX1; not referenced in the visible code
TROUNDS     .req    w15     // round-loop counter
POS         .req    x16     // input pointer adjustment in the 5-block loop
TMPOUT      .req    x17     // OUT - 16 while handling the tail

// Data blocks being processed.
BLK0        .req    v0
BLK1        .req    v1
BLK2        .req    v2
BLK3        .req    v3
BLK4        .req    v4

// Blocks pre-loaded for the next 5-block iteration.
IN0         .req    v5
IN1         .req    v6
IN2         .req    v7
IN3         .req    v30
IN4         .req    v31

// Tweaks: full vector, low doubleword (dN) and high doubleword lane (vN.d[1]).
TWK0        .req    v8
TWK1        .req    v9
TWK2        .req    v10
TWK3        .req    v11
TWK4        .req    v12
TWKD00      .req    d8
TWKD10      .req    d9
TWKD20      .req    d10
TWKD30      .req    d11
TWKD40      .req    d12
#define TWKD01 v8.d[1]
#define TWKD11 v9.d[1]
#define TWKD21 v10.d[1]
#define TWKD31 v11.d[1]
#define TWKD41 v12.d[1]

// Round keys: RDK0/RDK1 are reloaded while iterating, RDK2-RDK8 are loaded once.
RDK0        .req    v16
RDK1        .req    v17
RDK2        .req    v18
RDK3        .req    v19
RDK4        .req    v20
RDK5        .req    v21
RDK6        .req    v22
RDK7        .req    v23
RDK8        .req    v24

// Vector temporaries (outputs of the 5-block loop).
TMP0        .req    v25
TMP1        .req    v26
TMP2        .req    v27
TMP3        .req    v28
TMP4        .req    v29
/*
 * MOV_REG_TO_VEC SRC0, SRC1, DES0, DES1
 * Copies SRC0 into DES0 and then SRC1 into DES1 with two fmov instructions,
 * in that order.
 */
.macro MOV_REG_TO_VEC SRC0, SRC1, DES0, DES1
    fmov    \DES0, \SRC0
    fmov    \DES1, \SRC1
.endm
/*
 * NextTweak twkl, twkh, twkd0, twkd1
 * Advances the 128-bit tweak held in twkh:twkl by one step: the value is
 * shifted left by one bit and, when bit 127 was set before the shift, WTMP0
 * is XORed into the low word (both functions in this file load 0x87 into
 * WTMP0). The result is also copied to the vector halves twkd0 (low 64 bits)
 * and twkd1 (high 64 bits).
 * Scratch: XTMP1/WTMP1 and XTMP2/WTMP2. Flags are not modified.
 */
.macro NextTweak twkl, twkh, twkd0, twkd1
    asr     XTMP2, \twkh, #63               // all ones when bit 127 is set, else zero
    and     WTMP1, WTMP0, WTMP2             // WTMP0 or zero; the w-write clears the upper half of XTMP1
    extr    \twkh, \twkh, \twkl, #63        // high half: shift left 1, pull in bit 63 of the low half
    eor     \twkl, XTMP1, \twkl, lsl#1      // low half: shift left 1, fold in the masked constant
    fmov    \twkd0, \twkl                   // write the d view first: it zeroes the upper 64 bits
    fmov    \twkd1, \twkh                   // then fill lane d[1]
.endm
/*
 * AesCrypt1x en, mc, d0, rk
 * Emits the instruction pair aes<en> d0, rk followed by aes<mc> d0, d0 on the
 * 16-byte vectors d0 and rk. en and mc are mnemonic suffixes supplied by the
 * wrappers (e / mc for encryption, d / imc for decryption).
 */
.macro AesCrypt1x en, mc, d0, rk
    aes\en\()   \d0\().16b, \rk\().16b
    aes\mc\()   \d0\().16b, \d0\().16b
.endm
/*
 * AesEncrypt1x d0, rk
 * One AES encryption step on block d0 with round key rk: aese then aesmc.
 */
.macro AesEncrypt1x d0, rk
    aese    \d0\().16b, \rk\().16b
    aesmc   \d0\().16b, \d0\().16b
.endm
/*
 * AesDecrypt1x d0, rk
 * One AES decryption step on block d0 with round key rk: aesd then aesimc.
 */
.macro AesDecrypt1x d0, rk
    aesd    \d0\().16b, \rk\().16b
    aesimc  \d0\().16b, \d0\().16b
.endm
/**
 * int32_t CRYPT_AES_XTS_Encrypt(const CRYPT_AES_Key *ctx, const uint8_t *in, uint8_t *out, uint32_t len, const uint8_t *tweak);
 *
 * AES-XTS encryption of len bytes from in to out.
 *
 * In:   x0 = ctx    round keys; the 32-bit word at ctx + 240 is used as the round count
 *       x1 = in     input data
 *       x2 = out    output data
 *       w3 = len    length in bytes; only the low 32 bits of x3 are used
 *       x4 = tweak  16-byte tweak, read on entry and overwritten on exit
 *                   (the store happens despite the const in the prototype)
 * Out:  x0 = 0
 *
 * Whole 16-byte blocks are processed five at a time while at least 80 bytes
 * remain, then by the 3-, 2- and 1-block paths. When len is not a multiple of
 * 16, the trailing len % 16 bytes are exchanged with the last output block and
 * that block is encrypted once more.
 *
 * NOTE(review): the tail path reads and writes the 16 bytes located before the
 * current out position, so len is presumably required to be >= 16; confirm at
 * the callers.
 *
 * Scratch: x5-x17, v0-v7, v16-v31 and the condition flags. d8-d15 are saved on
 * the stack and restored before returning.
 */
CRYPT_AARCH64_FUNC_START(CRYPT_AES_XTS_Encrypt)
    AARCH64_PACIASP
    stp x29, x30, [sp,#-80]!            // 80-byte frame: x29/x30 plus d8-d15
    add x29, sp, #0
    stp d8, d9, [sp,#16]
    stp d10, d11, [sp,#32]
    stp d12, d13, [sp,#48]
    stp d14, d15, [sp,#64]
    ld1 {TWK0.16b}, [TWEAK]             // TWK0 = incoming tweak
    and TAILNUM, LEN, #0xF              // TAILNUM = len % 16
    // len is a 32-bit argument, so bits 63:32 of x3 are unspecified on entry.
    // The mask keeps bits 31:4 only: LTMP = len rounded down to a multiple of 16.
    and LTMP, LEN, #0xfffffff0
    mov WTMP0,0x87                      // constant consumed by NextTweak
    ldr ROUNDS,[KEY,#240]
    fmov TWX0,TWKD00                    // TWX1:TWX0 = integer copy of the tweak
    fmov TWX1,TWKD01
    sub ROUNDS,ROUNDS,#6                // the last 7 round keys are preloaded below
    add KTMP,KEY,XROUNDS,lsl#4          // KTMP = KEY + 16 * ROUNDS
    ld1 {RDK2.4s,RDK3.4s},[KTMP],#32
    ld1 {RDK4.4s,RDK5.4s},[KTMP],#32
    ld1 {RDK6.4s,RDK7.4s},[KTMP],#32
    ld1 {RDK8.4s},[KTMP]

    // Dispatch on the number of whole-block bytes still to process.
.Lxts_aesenc_start:
    cmp LTMP, #80
    b.ge .Lxts_enc_proc_5_blks
    cmp LTMP, #48
    b.ge .Lxts_enc_proc_3_blks
    cmp LTMP, #32
    b.eq .Lxts_enc_proc_2_blks
    cmp LTMP, #16
    b.eq .Lxts_enc_proc_1blk

    // No whole block left: finish, or handle the partial tail.
.Lxtx_tail_blk:
    fmov TWX0,TWKD00                    // resynchronise TWX1:TWX0 with TWK0
    fmov TWX1,TWKD01
    cbz TAILNUM,.Lxts_aesenc_finish
    // Prepare the tail block: swap TAILNUM bytes between the input tail and
    // the previously written output block.
    sub TMPOUT,OUT,#16                  // TMPOUT = previous output block
.Lxtx_tail_blk_loop:
    subs TAILNUM,TAILNUM,1
    ldrb WC,[TMPOUT,TAILNUM]            // byte of the previous output block
    ldrb WP,[IN,TAILNUM]                // byte of the input tail
    strb WC,[OUT,TAILNUM]               // output byte moves to the final partial block
    strb WP,[TMPOUT,TAILNUM]            // input byte takes its place
    b.gt .Lxtx_tail_blk_loop            // flags from subs; ldrb/strb leave them alone
    ld1 {BLK0.16b}, [TMPOUT]            // reload the merged block
    mov LTMP,#16                        // makes the subs/b.hs after the 1-block path return to start
    mov OUT,TMPOUT                      // result overwrites the previous output block
    b .Lxts_enc_proc_1blk_loaded

    // ---- one block ----
.Lxts_enc_proc_1blk:
    ld1 {BLK0.16b},[IN],#16
.Lxts_enc_proc_1blk_loaded:
    eor BLK0.16b,BLK0.16b,TWK0.16b      // pre-whitening with the tweak
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.Lxts_rounds_1blks:
    AesEncrypt1x BLK0,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesEncrypt1x BLK0,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_1blks
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK0,RDK1
    // last 7 round keys (preloaded)
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK0,RDK6
    aese BLK0.16b,RDK7.16b              // final round
    eor BLK0.16b,BLK0.16b,RDK8.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b      // post-whitening with the tweak
    st1 {BLK0.16b}, [OUT], #16
    NextTweak TWX0,TWX1,TWKD00,TWKD01
    subs LTMP,LTMP,#16
    b.hs .Lxts_aesenc_start             // LTMP was 16 on every path reaching here, so this is taken

    // ---- two blocks ----
.Lxts_enc_proc_2_blks:
    ld1 {BLK0.16b, BLK1.16b}, [IN], #32
    mov KTMP, KEY
    NextTweak TWX0,TWX1,TWKD10,TWKD11   // TWK1 = tweak for the second block
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
    eor BLK0.16b, BLK0.16b, TWK0.16b
    eor BLK1.16b, BLK1.16b, TWK1.16b
.Lxts_rounds_2blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_2blks
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    // last 7 round keys (preloaded)
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2
    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3
    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4
    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5
    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6
    eor TWK0.16b,TWK0.16b,RDK8.16b      // fold the last round key into the tweaks
    eor TWK1.16b,TWK1.16b,RDK8.16b
    aese BLK0.16b,RDK7.16b              // final round
    aese BLK1.16b,RDK7.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    st1 {BLK0.16b, BLK1.16b}, [OUT], #32
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // TWK0 = tweak for the next block
    subs LTMP,LTMP,#32
    b.hs .Lxts_aesenc_start             // LTMP was 32 on every path reaching here, so this is taken

    // ---- three blocks ----
.Lxts_enc_proc_3_blks:
    ld1 {BLK0.16b}, [IN], #16           // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b
    ld1 {BLK1.16b}, [IN], #16           // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    ld1 {BLK2.16b}, [IN], #16           // third block
    eor BLK2.16b,BLK2.16b,TWK2.16b
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.Lxts_rounds_3blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_3blks
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    // last 7 round keys (preloaded)
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2
    AesEncrypt1x BLK2,RDK2
    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3
    AesEncrypt1x BLK2,RDK3
    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4
    AesEncrypt1x BLK2,RDK4
    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5
    AesEncrypt1x BLK2,RDK5
    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6
    AesEncrypt1x BLK2,RDK6
    eor TWK0.16b,TWK0.16b,RDK8.16b      // fold the last round key into the tweaks
    eor TWK1.16b,TWK1.16b,RDK8.16b
    eor TWK2.16b,TWK2.16b,RDK8.16b
    aese BLK0.16b,RDK7.16b              // final round
    aese BLK1.16b,RDK7.16b
    aese BLK2.16b,RDK7.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    eor BLK2.16b,BLK2.16b,TWK2.16b
    st1 {BLK0.16b, BLK1.16b, BLK2.16b}, [OUT], #48
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // TWK0 = tweak for the next block
    subs LTMP,LTMP,#48
    b.hs .Lxts_aesenc_start             // LTMP was 48 or 64 on every path reaching here, so this is taken

    // ---- five blocks per iteration ----
.p2align 4
.Lxts_enc_proc_5_blks:
    ld1 {BLK0.16b}, [IN], #16           // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b
    ld1 {BLK1.16b}, [IN], #16           // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    sub LTMP,LTMP,#32
    ld1 {BLK2.16b}, [IN], #16           // third block
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor BLK2.16b,BLK2.16b,TWK2.16b
    ld1 {BLK3.16b}, [IN], #16           // fourth block
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    eor BLK3.16b,BLK3.16b,TWK3.16b
    sub LTMP,LTMP,#32
    ld1 {BLK4.16b}, [IN], #16           // fifth block
    eor BLK4.16b, BLK4.16b, TWK4.16b
    sub LTMP,LTMP,#16                   // LTMP now excludes the five blocks just loaded
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.p2align 4
.Lxts_rounds_5blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    AesEncrypt1x BLK3,RDK0
    AesEncrypt1x BLK4,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    AesEncrypt1x BLK3,RDK1
    AesEncrypt1x BLK4,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_5blks
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    AesEncrypt1x BLK3,RDK0
    AesEncrypt1x BLK4,RDK0
    // Account for the next group of five in advance. The flags set here are
    // consumed by the csel below and by the b.hs at the end of this stage;
    // no instruction in between modifies them.
    subs LTMP,LTMP,#80
    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    AesEncrypt1x BLK3,RDK1
    AesEncrypt1x BLK4,RDK1
    // last 7 round keys (preloaded)
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2
    AesEncrypt1x BLK2,RDK2
    AesEncrypt1x BLK3,RDK2
    AesEncrypt1x BLK4,RDK2
    csel POS,xzr,LTMP,gt                // POS = 0 if LTMP > 0, else LTMP (zero or negative)
    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3
    AesEncrypt1x BLK2,RDK3
    AesEncrypt1x BLK3,RDK3
    AesEncrypt1x BLK4,RDK3
    add IN,IN,POS                       // step IN back so the five loads below end at the last whole block
    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4
    AesEncrypt1x BLK2,RDK4
    AesEncrypt1x BLK3,RDK4
    AesEncrypt1x BLK4,RDK4
    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5
    AesEncrypt1x BLK2,RDK5
    AesEncrypt1x BLK3,RDK5
    AesEncrypt1x BLK4,RDK5
    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6
    AesEncrypt1x BLK2,RDK6
    AesEncrypt1x BLK3,RDK6
    AesEncrypt1x BLK4,RDK6
    // Final round, interleaved with the tweaks and loads of the next group.
    // Each TMPn captures (TWKn ^ RDK8) before TWKn is advanced.
    eor TMP0.16b,TWK0.16b,RDK8.16b
    aese BLK0.16b,RDK7.16b              // final round
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // perform operations of the next 5 blocks in advance
    eor TMP1.16b,TWK1.16b,RDK8.16b
    ld1 {IN0.16b}, [IN], #16
    aese BLK1.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor TMP2.16b,TWK2.16b,RDK8.16b
    ld1 {IN1.16b}, [IN], #16
    aese BLK2.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor TMP3.16b,TWK3.16b,RDK8.16b
    ld1 {IN2.16b}, [IN], #16
    aese BLK3.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor TMP4.16b,TWK4.16b,RDK8.16b
    ld1 {IN3.16b}, [IN], #16
    aese BLK4.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    ld1 {IN4.16b}, [IN], #16
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    eor TMP0.16b,TMP0.16b,BLK0.16b      // finished output block 0
    eor BLK0.16b,IN0.16b,TWK0.16b       // blk0 = in0 ^ twk0
    eor TMP1.16b,TMP1.16b,BLK1.16b
    eor BLK1.16b,IN1.16b,TWK1.16b
    st1 {TMP0.16b}, [OUT], #16
    eor TMP2.16b,TMP2.16b,BLK2.16b
    eor BLK2.16b,IN2.16b,TWK2.16b
    eor TMP3.16b,TMP3.16b,BLK3.16b
    eor BLK3.16b,IN3.16b,TWK3.16b
    st1 {TMP1.16b}, [OUT], #16
    eor TMP4.16b,TMP4.16b,BLK4.16b
    eor BLK4.16b,IN4.16b,TWK4.16b
    st1 {TMP2.16b}, [OUT], #16
    sub TROUNDS,ROUNDS,#2
    st1 {TMP3.16b,TMP4.16b}, [OUT], #32
    b.hs .Lxts_rounds_5blks             // flags from "subs LTMP,LTMP,#80": another full group exists

    // Fewer than five whole blocks remain. IN0-IN4 hold the last five whole
    // input blocks and BLKn = INn ^ TWKn; each path below re-pairs the blocks
    // that are still unprocessed with TWK0, TWK1, ... by XORing out the stale
    // input and XORing in the wanted one.
    add LTMP,LTMP,#80                   // add 5 blocks length back if LTMP < 0
    cbz LTMP,.Lxtx_tail_blk
    cmp LTMP, #16
    b.eq .Lxts_pre_last_1blks
    cmp LTMP,#32
    b.eq .Lxts_pre_last_2blks
    cmp LTMP,#48
    b.eq .Lxts_pre_last_3blks
    cmp LTMP,#64
    b.eq .Lxts_pre_last_4blks
.Lxts_pre_last_1blks:
    eor IN0.16b,IN0.16b,IN4.16b         // in0 = in0 ^ in4
    eor BLK0.16b,BLK0.16b,IN0.16b       // blk0 = in0 ^ twk0 ^ in0 ^ in4 = in4 ^ twk0
    fmov TWX0,TWKD00                    // reset already computed tweak
    fmov TWX1,TWKD01
    b .Lxts_rounds_1blks
.Lxts_pre_last_2blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK0.16b,BLK0.16b,IN3.16b       // in3 -> blk0
    eor BLK1.16b,BLK1.16b,IN4.16b       // in4 -> blk1
    fmov TWX0,TWKD10                    // reset already computed tweak
    fmov TWX1,TWKD11
    b .Lxts_rounds_2blks
.Lxts_pre_last_3blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK0.16b,BLK0.16b,IN2.16b       // in2 -> blk0
    eor BLK1.16b,BLK1.16b,IN3.16b       // in3 -> blk1
    eor BLK2.16b,BLK2.16b,IN4.16b       // in4 -> blk2
    fmov TWX0,TWKD20                    // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_rounds_3blks
.Lxts_pre_last_4blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK3.16b,BLK3.16b,IN3.16b
    sub IN,IN,#16                       // have loaded 4 blocks, using 3 to process, so step back 1 block here
    eor BLK0.16b,BLK0.16b,IN1.16b       // in1 -> blk0
    eor BLK1.16b,BLK1.16b,IN2.16b       // in2 -> blk1
    eor BLK2.16b,BLK2.16b,IN3.16b       // in3 -> blk2
    eor BLK3.16b,BLK3.16b,IN4.16b       // in4 -> blk3
    fmov TWX0,TWKD20                    // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_rounds_3blks

.Lxts_aesenc_finish:
    MOV_REG_TO_VEC TWX0,TWX1,TWKD00,TWKD01
    st1 {TWK0.16b}, [TWEAK]             // write the updated tweak back to the caller buffer
    mov x0, #0                          // return value: always 0
    ldp d14, d15, [sp,#64]
    ldp d12, d13, [sp, #48]
    ldp d10, d11, [sp, #32]
    ldp d8, d9, [sp, #16]
    ldp x29, x30, [sp], #80
    AARCH64_AUTIASP
    ret
CRYPT_AARCH64_FUNC_END(CRYPT_AES_XTS_Encrypt)
/**
 * int32_t CRYPT_AES_XTS_Decrypt(const CRYPT_AES_Key *ctx, const uint8_t *in, uint8_t *out, uint32_t len, const uint8_t *t);
 *
 * AES-XTS decryption of len bytes from in to out.
 *
 * In:   x0 = ctx    round keys, applied in memory order with aesd/aesimc;
 *                   the 32-bit word at ctx + 240 is used as the round count
 *       x1 = in     input data
 *       x2 = out    output data
 *       w3 = len    length in bytes; only the low 32 bits of x3 are used
 *       x4 = t      16-byte tweak, read on entry and overwritten on exit
 *                   (the store happens despite the const in the prototype)
 * Out:  x0 = 0
 *
 * Whole 16-byte blocks are processed five at a time while at least 80 bytes
 * remain, then by the 3-, 2- and 1-block paths. When len is not a multiple of
 * 16, the last whole block is held back: it is decrypted with the tweak that
 * follows the one it would normally use, the trailing len % 16 bytes are
 * exchanged with its output, and the merged block is decrypted with the
 * tweak that was skipped (kept in TWK2 meanwhile).
 *
 * NOTE(review): the tail path reads and writes the 16 bytes located before the
 * current out position, so len is presumably required to be >= 16; confirm at
 * the callers.
 *
 * Scratch: x5-x17, v0-v7, v16-v31 and the condition flags. d8-d15 are saved on
 * the stack and restored before returning.
 */
CRYPT_AARCH64_FUNC_START(CRYPT_AES_XTS_Decrypt)
    AARCH64_PACIASP
    stp x29, x30, [sp,#-80]!            // 80-byte frame: x29/x30 plus d8-d15
    add x29, sp, #0
    stp d8, d9, [sp,#16]
    stp d10, d11, [sp,#32]
    stp d12, d13, [sp,#48]
    stp d14, d15, [sp,#64]
    ld1 {TWK0.16b}, [TWEAK]             // TWK0 = incoming tweak
    // len is a 32-bit argument, so bits 63:32 of x3 are unspecified on entry.
    // The mask keeps bits 31:4 only: LTMP = len rounded down to a multiple of 16.
    and LTMP, LEN, #0xfffffff0
    ands TAILNUM, LEN, #0xF             // TAILNUM = len % 16, sets Z when there is no tail
    sub XTMP1,LTMP,#16                  // preserve last and tail block
    csel LTMP,XTMP1,LTMP,ne             // if tailnum != 0, len -= 16
    mov WTMP0,0x87                      // constant consumed by NextTweak
    ldr ROUNDS,[KEY,#240]
    fmov TWX0,TWKD00                    // TWX1:TWX0 = integer copy of the tweak
    fmov TWX1,TWKD01
    sub ROUNDS,ROUNDS,#6                // the last 7 round keys are preloaded below
    add KTMP,KEY,XROUNDS,lsl#4          // KTMP = KEY + 16 * ROUNDS
    ld1 {RDK2.4s,RDK3.4s},[KTMP],#32
    ld1 {RDK4.4s,RDK5.4s},[KTMP],#32
    ld1 {RDK6.4s,RDK7.4s},[KTMP],#32
    ld1 {RDK8.4s},[KTMP]

    // Dispatch on the number of ordinary whole-block bytes still to process.
.Lxts_aesdec_start:
    cmp LTMP, #80
    b.ge .Lxts_dec_proc_5_blks
    cmp LTMP, #48
    b.ge .Lxts_dec_proc_3_blks
    cmp LTMP, #32
    b.eq .Lxts_dec_proc_2_blks
    cmp LTMP, #16
    b.eq .Lxts_dec_proc_1blk
    cmp LTMP, #0
    b.eq .Lxts_dec_last_secondblk       // ordinary blocks done: held-back block or finish

    // Reached with LTMP < 0: finish, or merge and decrypt the partial tail.
.Lxtx_dec_tail_blk:
    fmov TWX0,TWKD00                    // resynchronise TWX1:TWX0 with TWK0
    fmov TWX1,TWKD01
    cbz TAILNUM,.Lxts_aesdec_finish
    // Prepare the decrypt tail block: swap TAILNUM bytes between the input
    // tail and the previously written output block.
    sub TMPOUT,OUT,#16                  // TMPOUT = previous output block
.Lxtx_dec_tail_blk_loop:
    subs TAILNUM,TAILNUM,1
    ldrb WC,[TMPOUT,TAILNUM]            // byte of the previous output block
    ldrb WP,[IN,TAILNUM]                // byte of the input tail
    strb WC,[OUT,TAILNUM]               // output byte moves to the final partial block
    strb WP,[TMPOUT,TAILNUM]            // input byte takes its place
    b.gt .Lxtx_dec_tail_blk_loop        // flags from subs; ldrb/strb leave them alone
    ld1 {BLK0.16b}, [TMPOUT]            // reload the merged block
    mov OUT,TMPOUT                      // result overwrites the previous output block
    mov TWK0.16b,TWK2.16b               // load pre-tweak back
    b .Lxts_dec_proc_1blk_loaded

    // All ordinary blocks are done. With a tail, the held-back block is
    // decrypted next using the tweak after the current one.
.Lxts_dec_last_secondblk:
    cbz TAILNUM,.Lxts_aesdec_finish
    mov TWK2.16b,TWK0.16b               // save last second tweak
    NextTweak TWX0,TWX1,TWKD00,TWKD01

    // ---- one block ----
.Lxts_dec_proc_1blk:
    ld1 {BLK0.16b}, [IN],#16
.Lxts_dec_proc_1blk_loaded:
    mov KTMP, KEY
    eor BLK0.16b,BLK0.16b,TWK0.16b      // pre-whitening with the tweak
    ld1 {RDK0.4s},[KTMP],#16
    sub TROUNDS,ROUNDS,#2
    ld1 {RDK1.4s},[KTMP],#16
.Lxts_dec_rounds_1blks:
    AesDecrypt1x BLK0,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesDecrypt1x BLK0,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_1blks
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK0,RDK1
    // last 7 round keys (preloaded)
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK0,RDK6
    aesd BLK0.16b,RDK7.16b              // final round
    eor BLK0.16b,BLK0.16b,RDK8.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b      // post-whitening with the tweak
    st1 {BLK0.16b}, [OUT], #16
    NextTweak TWX0,TWX1,TWKD00,TWKD01
    subs LTMP,LTMP,#16
    b.lt .Lxtx_dec_tail_blk             // LTMP went negative: held-back or tail block was just done
    b.hs .Lxts_aesdec_start             // otherwise LTMP was >= 16, so this is taken

    // ---- two blocks ----
.Lxts_dec_proc_2_blks:
    ld1 {BLK0.16b, BLK1.16b}, [IN], #32
    mov KTMP, KEY
    NextTweak TWX0,TWX1,TWKD10,TWKD11   // TWK1 = tweak for the second block
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
    eor BLK0.16b, BLK0.16b, TWK0.16b
    eor BLK1.16b, BLK1.16b, TWK1.16b
.Lxts_dec_rounds_2blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_2blks
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    // last 7 round keys (preloaded)
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2
    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3
    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4
    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5
    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6
    eor TWK0.16b,TWK0.16b,RDK8.16b      // fold the last round key into the tweaks
    eor TWK1.16b,TWK1.16b,RDK8.16b
    aesd BLK0.16b,RDK7.16b              // final round
    aesd BLK1.16b,RDK7.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    st1 {BLK0.16b, BLK1.16b}, [OUT], #32
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // TWK0 = tweak for the next block
    subs LTMP,LTMP,#32
    b.hs .Lxts_aesdec_start             // LTMP was 32 on every path reaching here, so this is taken

    // ---- three blocks ----
.Lxts_dec_proc_3_blks:
    ld1 {BLK0.16b}, [IN], #16           // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b
    ld1 {BLK1.16b}, [IN], #16           // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    ld1 {BLK2.16b}, [IN], #16           // third block
    eor BLK2.16b,BLK2.16b,TWK2.16b
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.Lxts_dec_rounds_3blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_3blks
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    // last 7 round keys (preloaded)
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2
    AesDecrypt1x BLK2,RDK2
    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3
    AesDecrypt1x BLK2,RDK3
    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4
    AesDecrypt1x BLK2,RDK4
    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5
    AesDecrypt1x BLK2,RDK5
    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6
    AesDecrypt1x BLK2,RDK6
    eor TWK0.16b,TWK0.16b,RDK8.16b      // fold the last round key into the tweaks
    eor TWK1.16b,TWK1.16b,RDK8.16b
    eor TWK2.16b,TWK2.16b,RDK8.16b
    aesd BLK0.16b,RDK7.16b              // final round
    aesd BLK1.16b,RDK7.16b
    aesd BLK2.16b,RDK7.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    eor BLK2.16b,BLK2.16b,TWK2.16b
    st1 {BLK0.16b, BLK1.16b, BLK2.16b}, [OUT], #48
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // TWK0 = tweak for the next block
    subs LTMP,LTMP,#48
    b.hs .Lxts_aesdec_start             // LTMP was 48 or 64 on every path reaching here, so this is taken

    // ---- five blocks per iteration ----
.p2align 4
.Lxts_dec_proc_5_blks:
    ld1 {BLK0.16b}, [IN], #16           // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b
    ld1 {BLK1.16b}, [IN], #16           // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    sub LTMP,LTMP,#32
    ld1 {BLK2.16b}, [IN], #16           // third block
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor BLK2.16b,BLK2.16b,TWK2.16b
    ld1 {BLK3.16b}, [IN], #16           // fourth block
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    eor BLK3.16b,BLK3.16b,TWK3.16b
    sub LTMP,LTMP,#32
    ld1 {BLK4.16b}, [IN], #16           // fifth block
    eor BLK4.16b, BLK4.16b, TWK4.16b
    sub LTMP,LTMP,#16                   // LTMP now excludes the five blocks just loaded
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.p2align 4
.Lxts_dec_rounds_5blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    AesDecrypt1x BLK3,RDK0
    AesDecrypt1x BLK4,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    AesDecrypt1x BLK3,RDK1
    AesDecrypt1x BLK4,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_5blks
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    AesDecrypt1x BLK3,RDK0
    AesDecrypt1x BLK4,RDK0
    // Account for the next group of five in advance. The flags set here are
    // consumed by the csel below and by the b.hs at the end of this stage;
    // no instruction in between modifies them.
    subs LTMP,LTMP,#80
    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    AesDecrypt1x BLK3,RDK1
    AesDecrypt1x BLK4,RDK1
    // last 7 round keys (preloaded)
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2
    AesDecrypt1x BLK2,RDK2
    AesDecrypt1x BLK3,RDK2
    AesDecrypt1x BLK4,RDK2
    csel POS,xzr,LTMP,gt                // POS = 0 if LTMP > 0, else LTMP (zero or negative)
    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3
    AesDecrypt1x BLK2,RDK3
    AesDecrypt1x BLK3,RDK3
    AesDecrypt1x BLK4,RDK3
    add IN,IN,POS                       // step IN back so the five loads below end at the last ordinary block
    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4
    AesDecrypt1x BLK2,RDK4
    AesDecrypt1x BLK3,RDK4
    AesDecrypt1x BLK4,RDK4
    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5
    AesDecrypt1x BLK2,RDK5
    AesDecrypt1x BLK3,RDK5
    AesDecrypt1x BLK4,RDK5
    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6
    AesDecrypt1x BLK2,RDK6
    AesDecrypt1x BLK3,RDK6
    AesDecrypt1x BLK4,RDK6
    // Final round, interleaved with the tweaks and loads of the next group.
    // Each TMPn captures (TWKn ^ RDK8) before TWKn is advanced.
    eor TMP0.16b,TWK0.16b,RDK8.16b
    aesd BLK0.16b,RDK7.16b              // final round
    NextTweak TWX0,TWX1,TWKD00,TWKD01   // perform operations of the next 5 blocks in advance
    eor TMP1.16b,TWK1.16b,RDK8.16b
    ld1 {IN0.16b}, [IN], #16
    aesd BLK1.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor TMP2.16b,TWK2.16b,RDK8.16b
    ld1 {IN1.16b}, [IN], #16
    aesd BLK2.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor TMP3.16b,TWK3.16b,RDK8.16b
    ld1 {IN2.16b}, [IN], #16
    aesd BLK3.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor TMP4.16b,TWK4.16b,RDK8.16b
    ld1 {IN3.16b}, [IN], #16
    aesd BLK4.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    ld1 {IN4.16b}, [IN], #16
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    eor TMP0.16b,TMP0.16b,BLK0.16b      // finished output block 0
    eor BLK0.16b,IN0.16b,TWK0.16b       // blk0 = in0 ^ twk0
    eor TMP1.16b,TMP1.16b,BLK1.16b
    eor BLK1.16b,IN1.16b,TWK1.16b
    st1 {TMP0.16b}, [OUT], #16
    eor TMP2.16b,TMP2.16b,BLK2.16b
    eor BLK2.16b,IN2.16b,TWK2.16b
    eor TMP3.16b,TMP3.16b,BLK3.16b
    eor BLK3.16b,IN3.16b,TWK3.16b
    st1 {TMP1.16b}, [OUT], #16
    eor TMP4.16b,TMP4.16b,BLK4.16b
    eor BLK4.16b,IN4.16b,TWK4.16b
    st1 {TMP2.16b}, [OUT], #16
    sub TROUNDS,ROUNDS,#2
    st1 {TMP3.16b,TMP4.16b}, [OUT], #32
    b.hs .Lxts_dec_rounds_5blks         // flags from "subs LTMP,LTMP,#80": another full group exists

    // Fewer than five ordinary blocks remain. IN0-IN4 hold the last five
    // ordinary input blocks and BLKn = INn ^ TWKn; each path below re-pairs
    // the blocks that are still unprocessed with TWK0, TWK1, ... by XORing
    // out the stale input and XORing in the wanted one.
    add LTMP,LTMP,#80                   // add 5 blocks length back if LTMP < 0
    cbz LTMP, .Lxts_dec_check_tail
    cmp LTMP, #16
    b.eq .Lxts_dec_pre_last_1blks
    cmp LTMP,#32
    b.eq .Lxts_dec_pre_last_2blks
    cmp LTMP,#48
    b.eq .Lxts_dec_pre_last_3blks
    cmp LTMP,#64
    b.eq .Lxts_dec_pre_last_4blks
    // Only the held-back block and the tail remain: decrypt the held-back
    // block with TWK1 and keep TWK0 in TWK2 for the merged tail block.
.Lxts_dec_pre_last_secondblks:
    fmov TWX0,TWKD10                    // reset already computed tweak
    fmov TWX1,TWKD11
    mov TWK2.16b, TWK0.16b              // save the last second tweak
    mov TWK0.16b, TWK1.16b              // use the last tweak
    b .Lxts_dec_proc_1blk
.Lxts_dec_check_tail:
    cbnz TAILNUM, .Lxts_dec_pre_last_secondblks
    fmov TWX0,TWKD00                    // reset already computed tweak
    fmov TWX1,TWKD01
    b .Lxts_aesdec_finish
.Lxts_dec_pre_last_1blks:
    eor IN0.16b,IN0.16b,IN4.16b         // in0 = in0 ^ in4
    eor BLK0.16b,BLK0.16b,IN0.16b       // blk0 = in0 ^ twk0 ^ in0 ^ in4 = in4 ^ twk0
    fmov TWX0,TWKD00                    // reset already computed tweak
    fmov TWX1,TWKD01
    b .Lxts_dec_rounds_1blks
.Lxts_dec_pre_last_2blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK0.16b,BLK0.16b,IN3.16b       // in3 -> blk0
    eor BLK1.16b,BLK1.16b,IN4.16b       // in4 -> blk1
    fmov TWX0,TWKD10                    // reset already computed tweak
    fmov TWX1,TWKD11
    b .Lxts_dec_rounds_2blks
.Lxts_dec_pre_last_3blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK0.16b,BLK0.16b,IN2.16b       // in2 -> blk0
    eor BLK1.16b,BLK1.16b,IN3.16b       // in3 -> blk1
    eor BLK2.16b,BLK2.16b,IN4.16b       // in4 -> blk2
    fmov TWX0,TWKD20                    // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_dec_rounds_3blks
.Lxts_dec_pre_last_4blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK3.16b,BLK3.16b,IN3.16b
    sub IN,IN,#16                       // have loaded 4 blocks, using 3 to process, so step back 1 block here
    eor BLK0.16b,BLK0.16b,IN1.16b       // in1 -> blk0
    eor BLK1.16b,BLK1.16b,IN2.16b       // in2 -> blk1
    eor BLK2.16b,BLK2.16b,IN3.16b       // in3 -> blk2
    eor BLK3.16b,BLK3.16b,IN4.16b       // in4 -> blk3
    fmov TWX0,TWKD20                    // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_dec_rounds_3blks

.Lxts_aesdec_finish:
    MOV_REG_TO_VEC TWX0,TWX1,TWKD00,TWKD01
    st1 {TWK0.16b}, [TWEAK]             // write the updated tweak back to the caller buffer
    mov x0, #0                          // return value: always 0
    ldp d14, d15, [sp,#64]
    ldp d12, d13, [sp, #48]
    ldp d10, d11, [sp, #32]
    ldp d8, d9, [sp, #16]
    ldp x29, x30, [sp], #80
    AARCH64_AUTIASP
    ret
CRYPT_AARCH64_FUNC_END(CRYPT_AES_XTS_Decrypt)
#endif