#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. For the time being it is a fairly mechanical,
# low-hanging port from C... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine
# runs in ~45 cycles on a dual-issue core such as Cortex A8, which is
# ~50% faster than compiler-generated code. For ECDH and ECDSA verify
# (but not for ECDSA sign) it means a 25%-45% improvement depending on
# key length, more for longer keys. Even though the NEON 1x1
# multiplication runs in even fewer cycles, ~30, the improvement is
# measurable only on longer keys. One has to optimize code elsewhere
# to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using the algorithm from the
# paper referenced below, which improves ECDH and ECDSA verify
# benchmarks by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# Scan the command line for the output file name: arguments are consumed
# until one looks like "name.ext" (word characters and dashes, one dot,
# an extension) or until the argument list is exhausted.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file and stop immediately if it cannot
# be created, instead of silently sending the assembly somewhere else.
# When no name was supplied the open is skipped and the script keeps
# writing to the STDOUT it inherited.
if (defined($output) && length($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Preamble of the generated assembly source. $code accumulates the whole
# template; it is printed line by line at the end of the script.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon
#endif
___
################
# private interface to mul_1x1_ialu
#
# Register contract of the generated subroutine:
#   in:   $a (r1), $b (r0)  - the two 32-bit polynomial operands
#         $mask (r12)       - must hold 7<<2; bn_GF2m_mul_2x2 loads it
#                             before the first call
#         sp                - points at an 8-word scratch table tab[8]
#                             allocated by the caller
#   out:  $lo (r5), $hi (r4) - low and high word of the 64-bit product
#   uses: r4-r9 as scratch; returns with "mov pc,lr"
#
# The routine fills tab[0..7] with the eight GF(2)[x] multiples of
# a1 = a & 0x3fffffff, then walks $b three bits at a time, folding the
# selected table entry into $hi:$lo at the matching shift. The two top
# bits of $a that were masked out of the table are accounted for by the
# conditional "eorne" pairs near the end.
$a="r1";
$b="r0";

# Two sets of names for the same six registers r4..r9: the first set is
# used while the table is built, the second while the product is
# accumulated. The list produced by map() has a seventh element (r12)
# that no variable receives; r12 is named explicitly as $mask below.
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";

$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]

	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
#
# Entry point and NEON code path. On __ARM_ARCH__>=7 builds the generated
# code loads the OPENSSL_armcap_P word pc-relatively through the
# .LOPENSSL_armcap literal and tests bit 0: if it is clear, control
# branches to .Lialu (the integer path that follows this template);
# otherwise the product is computed with vmull.p8 and stored to r[0..3]
# with a single vst1.32.
#
# Register names are lexical to this bare block, so the $a/$b used by the
# integer path keep their own values:
#   $r        = q0        result accumulator
#   $t0..$t2  = q1..q3    partial products
#   $t3       = q8        partial product
#   $a, $b    = d26, d27  64-bit operands a1:a0 and b1:b0
#   $k48/$k32/$k16 = d28/d29/d30  masks keeping the low 48/32/16 bits
# The map() lists yield more registers than there are variables (q9..q12
# and d31); the surplus elements are simply dropped.
#
# "qN#lo" / "qN#hi" in the template are not assembler syntax: the
# post-processing loop at the end of the script rewrites them to the D
# registers d(2N) / d(2N+1).
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));

$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	beq	.Lialu

	ldr		r12, [sp]		@ 5th argument
	vmov.32		$a, r2, r1
	vmov.32		$b, r12, r3
	vmov.i64	$k48, #0x0000ffffffffffff
	vmov.i64	$k32, #0x00000000ffffffff
	vmov.i64	$k16, #0x000000000000ffff

	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2

	vst1.32		{$r}, [r0]
	ret		@ bx lr
.align	4
.Lialu:
#endif
___
}
# Integer code path of bn_GF2m_mul_2x2 (reached through .Lialu, or directly
# on builds where the NEON prologue is compiled out).
#
# r4-r10 and lr are pushed first (8 words), which is why the fifth
# argument b0 is then found at [sp,#32]; a further 32 bytes are reserved
# as the tab[8] scratch area used by mul_1x1_ialu. The result pointer is
# moved from r0 to $ret (r10) because r0 serves as $b for mul_1x1_ialu.
#
# The 2x2 product is assembled from three 1x1 products (Karatsuba-style):
# a1·b1 is stored to r[2..3], a0·b0 to r[0..1] after the operand pairs
# have been swapped with eor triplets, and (a1+a0)·(b1+b0) is folded into
# the middle words r[1..2] by the second template below.
$ret="r10";	# reassigned 1st argument
$code.=<<___;
	stmdb	sp!,{r4-r10,lr}
	mov	$ret,r0			@ reassign 1st argument
	mov	$b,r3			@ $b=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	$mask,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]

	bl	mul_1x1_ialu		@ a1·b1
	str	$lo,[$ret,#8]
	str	$hi,[$ret,#12]

	eor	$b,$b,r3		@ flip b0 and b1
	 eor	$a,$a,r2		@ flip a0 and a1
	eor	r3,r3,$b
	 eor	r2,r2,$a
	eor	$b,$b,r3
	 eor	$a,$a,r2
	bl	mul_1x1_ialu		@ a0·b0
	str	$lo,[$ret]
	str	$hi,[$ret,#4]

	eor	$a,$a,r2
	eor	$b,$b,r3
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
___
# @r names the registers that receive the four result words stored so far
# (r[0..3]) for the final recombination. The epilogue restores the saved
# registers and returns; for __ARM_ARCH__<5 it returns via "bx lr" when
# bit 0 of lr is set (a Thumb caller) and via "mov pc,lr" otherwise. The
# template ends with the pc-relative OPENSSL_armcap_P literal used by
# the NEON prologue, the identification string and the common-symbol
# declaration of OPENSSL_armcap_P.
@r=map("r$_",(6..9));
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	add	sp,sp,#32		@ destroy tab[8]
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

.comm	OPENSSL_armcap_P,4,4
___
# Post-process the accumulated template one line at a time and print it:
#   1. every `...` span is replaced by the value of the Perl expression
#      it encloses;
#   2. qN#lo / qN#hi are rewritten to the D registers d(2N) / d(2N+1);
#   3. "ret" is rewritten to "bx lr";
#   4. a literal "bx lr" is rewritten to its opcode word.
# Rewrites 2-4 are chained with "or", so only the first one that matches
# is applied to a given line; in particular a "bx lr" produced from "ret"
# is left as an instruction and not turned into the opcode word.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

	print $_,"\n";
}
# Closing flushes the buffered output; a failure here (e.g. a full disk)
# means the generated file is incomplete, so report it instead of
# exiting successfully.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush