# Command line: [flavour] [output-file]
#
# The last argument is taken as the output file when it ends in a ".ext"
# style suffix; the first remaining argument is taken as the flavour when it
# contains no dot.  Either may be absent, in which case it is left undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path (including the trailing separator),
    # or "" when the script was invoked without one.  $1 is consulted only
    # after a successful match so a stale capture can never leak into $dir.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script, then in ../../perlasm/.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # From here on everything printed to STDOUT is piped through the
    # translator, which receives the flavour and the output file name.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the output file.  The open
    # is checked so that an unwritable path is reported instead of silently
    # sending the generated code to the inherited STDOUT.
    open(STDOUT, ">", $output)
        or die "can't open $output: $!";
}
# Register names substituted into the 4-bit (table driven) templates below.
# r0-r3 carry Xi, Htbl, inp and len in the routines emitted further down.
($Xi,$Htbl,$inp,$len) = map("r$_",(0..3));
# Four-word accumulator Z and the four-word table entry T loaded next to it.
($Zll,$Zlh,$Zhl,$Zhh) = map("r$_",(4..7));
($Tll,$Tlh,$Thl,$Thh) = map("r$_",(8..11));
# Scratch registers holding the two table indices derived from a byte.
($nlo,$nhi) = ("r12","r14");
# Aliases: the rem_4bit table pointer shares the register named by $inp and
# the loop counter shares the register named by $len.
$rem_4bit = $inp;
$cnt = $len;
# Zsmash(@insns)
#
# Appends to $code, for each of the four accumulator registers
# ($Zll, $Zlh, $Zhl, $Zhh, in that order), the template that stores the
# register through $Xi, followed by one caller-supplied line.
#
#   @insns - up to four strings; the n-th one is emitted, tab-indented, right
#            after the n-th register's store sequence so the caller can
#            interleave its own instructions.  When fewer than four are
#            given the remaining slots become an empty tab-indented line.
#
# Declared without a prototype: the sub takes arguments, and the former
# empty "()" prototype only went unnoticed because every caller uses the
# "&Zsmash(...)" form, which bypasses prototype checking.
sub Zsmash {
    # Steps 12, 8, 4, 0 across the four registers.
    # NOTE(review): the template below never interpolates $i, and its
    # "[$Xi," / "lsr" operands end abruptly -- the text looks truncated
    # (presumably offsets and shift counts are missing); confirm before use.
    my $i=12;
    my @args=@_;
    for ($Zll,$Zlh,$Zhl,$Zhh) {
        $code.=<<___;
rev $_,$_
str $_,[$Xi,
str $_,[$Xi,
mov $Tlh,$_,lsr
strb $_,[$Xi,
mov $Thl,$_,lsr
strb $Tlh,[$Xi,
mov $Thh,$_,lsr
strb $Thl,[$Xi,
strb $Thh,[$Xi,
___
        # Caller-supplied instruction for this slot (empty when exhausted).
        $code.="\t".shift(@args)."\n";
        $i-=4;
    }
}
# Start the generated assembly.  This template emits, in order:
#   * assembler mode directives and the .text section;
#   * rem_4bit, a 16-entry table of 16-bit constants;
#   * rem_4bit_get, which puts the table address in $rem_4bit and branches
#     to .Lrem_4bit_got (a label emitted later, inside gcm_gmult_4bit);
#   * the entry and loops of gcm_ghash_4bit: $len is turned into an end
#     pointer, r3-r11/lr are saved, the rem_4bit table is copied to the
#     stack so it can be indexed off sp, and .Louter/.Linner fold table
#     entries from $Htbl into the accumulator $Zll..$Zhh.
# The template stops before the accumulator is written back; the Zsmash
# call that follows appends that part.
# NOTE(review): many operands here end at a comma or at "lsl"/"lsr" with no
# value, rem_4bit_get contains an "#endif" with no visible "#if", and
# ".thumb" is immediately followed by ".code 32".  The text looks truncated;
# confirm against a pristine copy before assembling the output.
$code=<<___;
.syntax unified
.thumb
.code 32
.text
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
adr $rem_4bit,rem_4bit
sub $rem_4bit,pc,#8+32 @ &rem_4bit
#endif
b .Lrem_4bit_got
nop
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
.align 4
gcm_ghash_4bit:
#if defined(__thumb2__)
adr r12,rem_4bit
#else
sub r12,pc,#8+48 @ &rem_4bit
#endif
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,
ldrb $nhi,[$Xi,
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,
and $nlo,$nlo,
mov $cnt,
add $Zhh,$Htbl,$nlo,lsl
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,
and $nhi,$Zll,
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl
ldrb $nhi,[$Xi,
eor $Zlh,$Tlh,$Zlh,lsr
eor $Zlh,$Zlh,$Zhl,lsl
eor $Zhl,$Thl,$Zhl,lsr
eor $Zhl,$Zhl,$Zhh,lsl
eor $Zhh,$Thh,$Zhh,lsr
eor $nlo,$nlo,$nhi
and $nhi,$nlo,
and $nlo,$nlo,
eor $Zhh,$Zhh,$Tll,lsl
.Linner:
add $Thh,$Htbl,$nlo,lsl
and $nlo,$Zll,
subs $cnt,$cnt,
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr
eor $Zll,$Zll,$Zlh,lsl
eor $Zlh,$Tlh,$Zlh,lsr
eor $Zlh,$Zlh,$Zhl,lsl
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr
it pl
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl
eor $Zhh,$Thh,$Zhh,lsr
add $Thh,$Htbl,$nhi
and $nhi,$Zll,
eor $Zhh,$Zhh,$Tll,lsl
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr
it pl
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl
eor $Zlh,$Tlh,$Zlh,lsr
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl
eor $Zhl,$Thl,$Zhl,lsr
eor $Zhl,$Zhl,$Zhh,lsl
it pl
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr
itt pl
andpl $nhi,$nlo,
andpl $nlo,$nlo,
eor $Zhh,$Zhh,$Tlh,lsl
bpl .Linner
ldr $len,[sp,
add $inp,$inp,
mov $nhi,$Zll
___
# Write the accumulator back through $Xi.  Two caller lines are interleaved
# with the stores: a compare of $inp against the end pointer after the first
# register, and (after the second) a conditional byte load that fetches the
# next block's last byte when more input remains, wrapped in an IT block
# when assembling for Thumb-2.  The leading "&" is deliberate: it keeps the
# call independent of any prototype on Zsmash.
&Zsmash(
    "cmp\t$inp,$len",
    join("\n",
        "",
        "#ifdef __thumb2__",
        " it ne",
        "#endif",
        " ldrneb $nlo,[$inp,#15]"),
);
# Finish gcm_ghash_4bit and emit gcm_gmult_4bit up to the end of its loop:
#   * ghash loops back to .Louter while the preceding compare found more
#     input, then releases its stack area and returns;
#   * gmult saves r4-r11/lr, loads a byte from $Xi and branches to
#     rem_4bit_get, which returns control at .Lrem_4bit_got with the table
#     address in $rem_4bit; .Loop then folds table entries from $Htbl into
#     the accumulator, indexing rem_4bit through $rem_4bit rather than sp.
# The write-back of gmult's result is appended by the Zsmash call that
# follows this template.
# NOTE(review): two alternative return sequences ("ldmia ...,pc" and
# "ldmia ...,lr" / tst / moveq / bx) are emitted back to back with no
# conditional around them, and many operands end at a comma or shift
# keyword.  The text looks truncated; confirm against a pristine copy.
$code.=<<___;
bne .Louter
add sp,sp,
ldmia sp!,{r4-r11,pc}
ldmia sp!,{r4-r11,lr}
tst lr,
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,
and $nlo,$nlo,
mov $cnt,
add $Zhh,$Htbl,$nlo,lsl
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,
add $Thh,$Htbl,$nhi
and $nhi,$Zll,
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl
eor $Zlh,$Tlh,$Zlh,lsr
eor $Zlh,$Zlh,$Zhl,lsl
eor $Zhl,$Thl,$Zhl,lsr
eor $Zhl,$Zhl,$Zhh,lsl
eor $Zhh,$Thh,$Zhh,lsr
and $nhi,$nlo,
eor $Zhh,$Zhh,$Tll,lsl
and $nlo,$nlo,
.Loop:
add $Thh,$Htbl,$nlo,lsl
and $nlo,$Zll,
subs $cnt,$cnt,
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr
eor $Zll,$Zll,$Zlh,lsl
eor $Zlh,$Tlh,$Zlh,lsr
eor $Zlh,$Zlh,$Zhl,lsl
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr
it pl
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl
eor $Zhh,$Thh,$Zhh,lsr
add $Thh,$Htbl,$nhi
and $nhi,$Zll,
eor $Zhh,$Zhh,$Tll,lsl
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr
eor $Zll,$Zll,$Zlh,lsl
eor $Zlh,$Tlh,$Zlh,lsr
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl
eor $Zhl,$Thl,$Zhl,lsr
eor $Zhl,$Zhl,$Zhh,lsl
eor $Zhh,$Thh,$Zhh,lsr
itt pl
andpl $nhi,$nlo,
andpl $nlo,$nlo,
eor $Zhh,$Zhh,$Tll,lsl
bpl .Loop
___
# Write gcm_gmult_4bit's accumulator back through $Xi.  No interleaved
# instructions are passed, so each register's store sequence is followed by
# an empty tab-indented line.
&Zsmash();
# Restore the saved registers, return, and close the gcm_gmult_4bit symbol.
# NOTE(review): as in gcm_ghash_4bit, both the "pop into pc" return and the
# "pop into lr, then moveq/bx" return are emitted unconditionally and the
# "tst lr," operand is incomplete -- presumably a conditional directive and
# an immediate are missing; confirm.
$code.=<<___;
ldmia sp!,{r4-r11,pc}
ldmia sp!,{r4-r11,lr}
tst lr,
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
# Register names substituted into the NEON templates of this scope.
# $Xl/$Xm/$Xh receive the three products formed per block and $IN holds the
# input block.
my ($Xl,$Xm,$Xh,$IN)=("q0","q1","q2","q3");
# Temporaries used inside clmul64x64 and the reduction.
my ($t0,$t1,$t2,$t3)=("q8","q9","q10","q11");
# $Hlo/$Hhi are loaded from $Htbl, $Hhl is their XOR, and $k48/$k32/$k16
# are constants set up with vmov.i64 and used as masks by clmul64x64.
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=("d26","d27","d28","d29","d30","d31");
# clmul64x64($r,$a,$b)
#
# Appends to $code a NEON sequence built from vext.8 byte rotations and
# vmull.p8 polynomial multiplies whose partial results are XORed into $r;
# the final multiply is annotated "D = A*B" and takes $a and $b directly.
#
#   $r - name of the destination register
#   $a - name of the first operand register
#   $b - name of the second operand register
#
# Uses the enclosing scope's $t0..$t3 as temporaries.
# NOTE(review): judging by the name this forms a 64x64-bit carry-less
# product -- confirm.  Many operand lists in the template stop after the
# first register (e.g. "vext.8 $t0", "veor $t0") and $b is referenced only
# once, so the text looks truncated; verify before relying on the output.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0
vmull.p8 $t0, $t0
vext.8 $r
vmull.p8 $r, $a, $r
vext.8 $t1
vmull.p8 $t1, $t1
vext.8 $t3
vmull.p8 $t3, $a, $t3
vext.8 $t2
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2
vext.8 $r
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r
veor $t0
vand $t0
vext.8 $t3
veor $t1
vand $t1
vmull.p8 $t3, $a, $t3
veor $t2, $t2, $r @ N = I + J
veor $t0
veor $t1
veor $t2
vand $t2
vext.8 $t0, $t0, $t0,
veor $t3
vmov.i64 $t3
vext.8 $t1, $t1, $t1,
veor $t2
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3,
vext.8 $t2, $t2, $t2,
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
# Emit the NEON entry points up to the shared multiply label:
#   * gcm_init_neon derives the "twisted H" value from its input and stores
#     it through r0;
#   * gcm_gmult_neon loads its block into $IN, loads the twisted H halves
#     from $Htbl, sets up the $k48/$k32/$k16 constants and $Hhl = $Hlo^$Hhi,
#     then branches to .Lgmult_neon inside gcm_ghash_neon;
#   * gcm_ghash_neon does the same set-up with the running value in $Xl and
#     starts .Loop_neon, which loads a block into $IN and XORs $Xl into it.
# "ret" lines are rewritten to "bx lr" when $code is post-processed.
# NOTE(review): most vld1.64 / vmov / shift operand lists end after the
# first register or at a comma, so addresses and immediates are not visible
# here -- the text looks truncated; confirm against a pristine copy.
$code.=<<___;
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN
vmov.i8 $t0,
vld1.64 $IN
vshl.i64 $t0
vshr.u64 $t0
vdup.8 $t1,$IN
vshr.u64 $Hlo,$IN
vshr.s8 $t1,
vshl.i64 $IN,$IN,
vand $t0,$t0,$t1
vorr $IN
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN
vld1.64 $IN
vmov.i64 $k48,
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,
vrev64.8 $IN,$IN
vmov.i64 $k16,
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl
vld1.64 $Xl
vmov.i64 $k48,
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,
vrev64.8 $Xl,$Xl
vmov.i64 $k16,
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN
vld1.64 $IN
vrev64.8 $IN,$IN
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
# Three multiplies per block, matching the Karatsuba pre/post-processing
# noted in the surrounding templates:
#   $Xl = $Hlo * low half of $IN
&clmul64x64 ($Xl,$Hlo,"$IN#lo");
# NOTE(review): this veor has only its first operand visible; presumably it
# folds the two halves of $IN together before the middle product -- confirm.
$code.=<<___;
veor $IN
___
#   $Xm = $Hhl * low half of $IN (as modified by the veor above)
&clmul64x64 ($Xm,$Hhl,"$IN#lo");
#   $Xh = $Hhi * high half of $IN
&clmul64x64 ($Xh,$Hhi,"$IN#hi");
# Finish the NEON loop body and close gcm_ghash_neon:
#   * Karatsuba post-processing combines $Xm with $Xl and $Xh;
#   * a shift-and-XOR reduction folds the result back into $Xl;
#   * $len is decremented and the loop repeats at .Loop_neon until it
#     reaches zero;
#   * $Xl is byte-reversed and written out through $Xi, high half first,
#     after $Xi is moved back by 16.
# "ret" is rewritten to "bx lr" when $code is post-processed.
# NOTE(review): the shift counts and several veor operand lists are not
# visible, and the closing "#endif" has no matching "#if" in the NEON
# templates shown -- the text looks truncated; confirm against a pristine
# copy.
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl
veor $Xh
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,
vshl.i64 $t2,$Xl,
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,
veor $t2, $t2, $t1 @
veor $Xl
veor $Xh
vshr.u64 $t2,$Xl,
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,
vshr.u64 $Xl,$Xl,
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,
bne .Loop_neon
vrev64.8 $Xl,$Xl
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
# Trailer: embed an identification string in the output and realign.
$code.=join("\n",
    '.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"',
    '.align 2',
    '');
# Post-process the accumulated template one line at a time and print it.
for (split(/\n/,$code)) {
    # Replace every `...` span with the result of evaluating its contents
    # as Perl.
    s/\`([^\`]*)\`/eval $1/ge;
    # At most one of the following rewrites is applied to a line, tried in
    # this order; the first that changes anything wins:
    #   qN#lo / qN#hi  ->  d(2N) / d(2N+1)
    #   ret            ->  bx lr
    #   bx lr          ->  a literal .word 0xe12fff1e
    if (!s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge) {
        if (!s/\bret\b/bx lr/g) {
            s/\bbx\s+lr\b/.word\t0xe12fff1e/g;
        }
    }
    print $_,"\n";
}
# Errors from buffered writes only surface when the handle is closed.
close(STDOUT) or die "error closing STDOUT: $!";