#include "arm_arch.h"

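// GHASH for ARMv8 Crypto Extensions: carry-less multiplication over GF(2^128)
// using pmull/pmull2, with Karatsuba splitting and a two-phase reduction via
// the 0xc2...01 constant.  The CRYPTOGAMS attribution string is encoded in the
// .byte directive at the end of this file.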
.text
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
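	// Per the OpenSSL/BoringSSL gcm128 interface (an assumption about the
	// caller, not encoded here): x0 = Htable (output; three 16-byte entries
	// are written below), x1 = H, the raw hash key.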
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
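	//Htable layout at this point: [0] = twisted H, [1] = the xored halves
	//of H and of H^2 packed into one vector (Karatsuba pre-processing),
	//[2] = twisted H^2.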

	ret
.size	gcm_init_v8,.-gcm_init_v8
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
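	// Per the OpenSSL/BoringSSL gcm128 interface (assumption): x0 = Xi, the
	// 16-byte hash value, updated in place; x1 = Htable from gcm_init_v8.
	// Computes Xi = Xi * H in GF(2^128).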
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
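	// Per the OpenSSL/BoringSSL gcm128 interface (assumption): x0 = Xi
	// (updated in place), x1 = Htable, x2 = input, x3 = length in bytes
	// (a multiple of 16).  Folds the input into Xi two blocks per loop
	// iteration using H^2, with a single-block tail.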
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
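						//e.g. with x3==32, x12 is
						//zeroed by the csel below
						//before the I[1] load, so no
						//load ever reaches past
						//inp[len]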
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
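	//Each iteration folds two input blocks: (Xi^I[i]) is multiplied by H^2
	//while I[i+1] has already been multiplied by H, and the two products
	//are accumulated before a single two-phase reduction.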
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2