# Command-line handling and output redirection.
#
# Usage: <script> [flavour] [output-file]
#   - The last argument is taken as the output file if it ends in ".ext".
#   - The first remaining argument is taken as the flavour if it contains
#     no dot.
# With a flavour other than "void", STDOUT is piped through arm-xlate.pl
# (looked up next to this script, then under ../../perlasm/), which receives
# the flavour and the output file name. Otherwise STDOUT is redirected
# straight to the output file when one was given.
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of the script path, including the trailing separator.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Report the OS error ($!), not a stale regex capture, when the
    # translator cannot be started.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} elsif ($output) {
    # Fail loudly instead of silently writing the assembly to the
    # original STDOUT when the output file cannot be created.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
# Start the assembly text accumulated in $code: assembler syntax/mode
# directives followed by the switch to the .text section. $code is only
# printed by the post-processing loop at the end of this script.
# NOTE(review): .thumb and .code 32 are both emitted unconditionally here;
# presumably they were meant to be alternatives selected by a conditional
# that is no longer present -- confirm against the upstream source.
$code=<<___;
.syntax unified
.thumb
.code 32
.text
___
# Register aliases used by the integer code path.
# $a and $b name the two multiplicands handed to mul_1x1_ialu; $mask is the
# register holding the mask applied to form table offsets.
($a,$b,$mask)=("r1","r0","r12");

# r4..r9 are used twice: first under the $a* names while the lookup table
# is being built, then under the result/temporary/index names while it is
# being read.
($hi,$lo,$t0,$t1,$i0,$i1)=map { "r$_" } 4..9;
($a0,$a1,$a2,$a12,$a4,$a14)=($hi,$lo,$t0,$t1,$i0,$i1);
# mul_1x1_ialu: integer-only helper reached with `bl` from bn_GF2m_mul_2x2.
# It reads $a, $b and $mask and leaves its result in $lo/$hi.
#
# Going by the inline comments, the first part stores eight words at [sp]
# (a zero-initialised $a0, a1, a1<<1, a1<<2 and their XOR combinations);
# the second part masks successive slices of $b with $mask, uses them as
# offsets into that table, and XORs shifted copies of the loaded entries
# into $lo/$hi. Two `tst $a,...` / `itt ne` groups conditionally fold
# shifted copies of $b into the result. The routine returns with
# `mov pc,lr`.
#
# NOTE(review): many instructions below end in a dangling operand (for
# example "mov $a0,", "str $a0,[sp,", "and $i0,$mask,$b,lsl"). It looks as
# if the immediate operands introduced by '#' were lost from this text;
# as written the output will not assemble -- confirm against the upstream
# source before use.
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,
bic $a1,$a,
str $a0,[sp,
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,
mov $a4,$a1,lsl
str $a12,[sp,
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,
and $i0,$mask,$b,lsl
str $a12,[sp,
and $i1,$mask,$b,lsr
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl
mov $hi,$t1,lsr
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr
eor $lo,$lo,$t0,lsl
eor $hi,$hi,$t0,lsr
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr
eor $lo,$lo,$t1,lsl
eor $hi,$hi,$t1,lsr
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr
eor $lo,$lo,$t0,lsl
eor $hi,$hi,$t0,lsr
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr
eor $lo,$lo,$t1,lsl
eor $hi,$hi,$t1,lsr
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr
eor $lo,$lo,$t0,lsl
eor $hi,$hi,$t0,lsr
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr
eor $lo,$lo,$t1,lsl
eor $hi,$hi,$t1,lsr
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,
and $i0,$mask,$b,lsr
eor $lo,$lo,$t0,lsl
eor $hi,$hi,$t0,lsr
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
itt ne
eorne $lo,$lo,$b,lsl
eorne $hi,$hi,$b,lsr
tst $a,
eor $lo,$lo,$t1,lsl
eor $hi,$hi,$t1,lsr
itt ne
eorne $lo,$lo,$b,lsl
eorne $hi,$hi,$b,lsr
eor $lo,$lo,$t0,lsl
eor $hi,$hi,$t0,lsr
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
# bn_GF2m_mul_2x2, entry point and integer code path.
#
# Entry: saves r10/lr, loads a capability word through the
# .LOPENSSL_armcap literal, tests it, and on a non-zero result restores r10
# and branches to .LNEON. Otherwise the integer path below runs.
#
# Integer path: r0 (result pointer) is kept in $ret, a scratch area is
# carved out below sp for the lookup table (the old sp is saved at
# [r7,#32]), and mul_1x1_ialu is called three times -- a1·b1, a0·b0 and
# (a1+a0)·(b1+b0) per the inline comments, i.e. a Karatsuba-style
# decomposition. The three partial results are then XOR-combined and
# stored through $ret before the saved registers are restored.
#
# NOTE(review): several instructions end in a dangling operand ("tst r12,",
# "ldrne r10,[sp],", "ldr sp,[sp,", "str $hi,[$ret,", "tst lr,"), and the
# two consecutive register pushes in the prologue as well as the two
# return sequences in the epilogue look like mutually exclusive
# alternatives whose selecting conditionals are missing -- confirm against
# the upstream source.
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap
adr r10,.LOPENSSL_armcap
ldr r12,[r12,r10]
ldr r12,[r12]
tst r12,
itt ne
ldrne r10,[sp],
bne .LNEON
stmdb sp!,{r4-r9}
stmdb sp!,{r4-r10,lr}
___
# Register that keeps the first argument (result pointer) across the
# calls to mul_1x1_ialu.
$ret="r10";
$code.=<<___;
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
sub r7,sp,#36
mov r8,sp
and r7,r7,#-32
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
mov sp,r7 @ allocate tab[8]
str r8,[r7,#32]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
# r6..r9 receive the four result words already stored through $ret so they
# can be folded into the third product.
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
ldr sp,[sp,
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,
eor $lo,$lo,$hi
str $lo,[$ret,
ldmia sp!,{r4-r10,pc}
ldmia sp!,{r4-r10,lr}
tst lr,
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
___
}
# NEON code path of bn_GF2m_mul_2x2, entered at .LNEON from the capability
# test in the function prologue.
#
# The lexical aliases declared here shadow the global $a/$b/$t0/... only
# inside this brace block:
#   $r=q0, $t0=q1, $t1=q2, $t2=q3, $t3=q8   (surplus names from the map are
#                                            simply discarded)
#   $a=d26, $b=d27, $k48=d28, $k32=d29, $k16=d30
#
# The code loads the 5th argument from the stack, moves the operand words
# into $a and $b, forms partial products with vmull.p8 (named D..N in the
# inline comments), combines them with veor/vand/vext.8 and stores $r
# through r0. The trailing `ret` is rewritten by the output loop at the
# end of this script.
#
# NOTE(review): most vext.8/veor/vand/vmov.i64 lines end right after their
# first operand (for example "vext.8 $t0", "vmov.i64 $k48,"). It looks as
# if everything from a '#' onward -- immediates and presumably "#lo"/"#hi"
# half-register suffixes, which the output loop knows how to translate --
# was lost; as written the output will not assemble. Confirm against the
# upstream source.
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov $a, r2, r1
vmov $b, r12, r3
vmov.i64 $k48,
vmov.i64 $k32,
vmov.i64 $k16,
vext.8 $t0
vmull.p8 $t0, $t0
vext.8 $r
vmull.p8 $r, $a, $r
vext.8 $t1
vmull.p8 $t1, $t1
vext.8 $t3
vmull.p8 $t3, $a, $t3
vext.8 $t2
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2
vext.8 $r
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r
veor $t0
vand $t0
vext.8 $t3
veor $t1
vand $t1
vmull.p8 $t3, $a, $t3
veor $t2, $t2, $r @ N = I + J
veor $t0
veor $t1
veor $t2
vand $t2
vext.8 $t0, $t0, $t0,
veor $t3
vmov.i64 $t3
vext.8 $t1, $t1, $t1,
veor $t2
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3,
vext.8 $t2, $t2, $t2,
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
___
}
# Close bn_GF2m_mul_2x2 and emit its trailing data: the .LOPENSSL_armcap
# literal referenced by the function prologue, an identification string,
# and a common-symbol declaration for OPENSSL_armcap_P.
# NOTE(review): two different .word forms follow the .LOPENSSL_armcap label
# (absolute address and label-relative offset); presumably only one was
# meant to be emitted for a given build -- confirm against the upstream
# source.
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P
.word OPENSSL_armcap_P-.
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
.comm OPENSSL_armcap_P,4,4
___
# Post-process the accumulated assembly one line at a time and write it to
# STDOUT (which may be a file or a pipe to the translator).
for my $line (split /\n/, $code) {
    # Replace every `expr` with the value of the Perl expression.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # At most one kind of rewrite is applied per line, tried in this order.
    if ($line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge) {
        # qN#lo / qN#hi were mapped to the corresponding d register.
    }
    elsif ($line =~ s/\bret\b/bx lr/g) {
        # `ret` pseudo-instruction was spelled out as `bx lr`.
    }
    else {
        # Literal `bx lr` is emitted as its raw instruction word.
        $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;
    }

    print $line, "\n";
}

# Closing the handle surfaces buffered write errors and, for a pipe, a
# failing child process.
close STDOUT or die "error closing STDOUT: $!";