#include "arm_arch.h"

.text
.code	32

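@ rem_4bit: reduction constants for the 4-bit, table-driven GHASH.
@ Entry i is the value folded back into the top of Z when four bits
@ are shifted out, derived from the GCM polynomial x^128+x^7+x^2+x+1.
@ gcm_ghash_4bit copies this table to the stack; gcm_gmult_4bit
@ addresses it PC-relative via rem_4bit_get.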
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

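@ gcm_ghash_4bit(Xi, Htable, inp, len)
@   r0 = Xi      16-byte hash value, updated in place
@   r1 = Htable  4-bit lookup table: 16 entries of 16 bytes each
@   r2 = inp     input blocks
@   r3 = len     byte length, a multiple of 16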
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

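@ gcm_gmult_4bit(Xi, Htable)
@   r0 = Xi      16-byte hash value, multiplied by H in place
@   r1 = Htable  same 4-bit lookup table as gcm_ghash_4bit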
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7
.fpu	neon

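@ gcm_init_neon(Htable, H)
@   r0 = Htable  receives a single pre-processed ("twisted") copy of H
@   r1 = H       the 16-byte hash subkey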
.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		d7,[r1,:64]!	@ load H
	vmov.i8		q8,#0xe1
	vld1.64		d6,[r1,:64]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8		q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8		q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand		q8,q8,q9
	vorr		d7,d26		@ H<<<=1
	veor		q3,q3,q8		@ twisted H
	vstmia		r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

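@ gcm_gmult_neon(Xi, Htable)
@   r0 = Xi      16-byte hash value, multiplied by H in place
@   r1 = Htable  twisted H as written by gcm_init_neon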
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		d7,[r0,:64]!	@ load Xi
	vld1.64		d6,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing
	mov		r3,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

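@ gcm_ghash_neon(Xi, Htable, inp, len)
@   r0 = Xi, r1 = twisted H, r2 = inp, r3 = len (multiple of 16);
@   each 16-byte block is xor-ed into Xi and multiplied by H.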
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d1,[r0,:64]!	@ load Xi
	vld1.64		d0,[r0,:64]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia		r1,{d26-d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor		d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		d7,[r2]!		@ load inp
	vld1.64		d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor		q3,q0			@ inp^=Xi
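	@ .Lgmult_neon: 128x128-bit carry-less multiply, Karatsuba style:
	@ three 64x64 products (lo*lo, (lo^hi)*(lo^hi), hi*hi), each built
	@ from vmull.p8 8-bit polynomial multiplies plus shift/mask fix-ups,
	@ followed by reduction modulo the GCM polynomial.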
.Lgmult_neon:
	vext.8		d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8		d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8		d20, d26, d26, #3	@ A3
	veor		q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d0, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor		q10, q10, q0		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q0, q0, q8
	veor		q0, q0, q10
	veor		d6,d6,d7	@ Karatsuba pre-processing
	vext.8		d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8		d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8		d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8		d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8		d20, d28, d28, #3	@ A3
	veor		q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8		d2, d6, d6, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d6, d6, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor		q10, q10, q1		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q1, q1, q8
	veor		q1, q1, q10
	vext.8		d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8		d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8		d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8		d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8		d20, d27, d27, #3	@ A3
	veor		q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8		d4, d7, d7, #3	@ B3
	veor		q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor		d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand		d17, d17, d29
	vext.8		d22, d7, d7, #4	@ B4
	veor		d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand		d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor		q10, q10, q2		@ N = I + J
	veor		d16, d16, d17
	veor		d18, d18, d19
	veor		d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand		d21, d21, d31
	vext.8		q8, q8, q8, #15
	veor		d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8		q9, q9, q9, #14
	veor		d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8		q11, q11, q11, #12
	vext.8		q10, q10, q10, #13
	veor		q8, q8, q9
	veor		q10, q10, q11
	veor		q2, q2, q8
	veor		q2, q2, q10
	veor		q1,q1,q0		@ Karatsuba post-processing
	veor		q1,q1,q2
	veor		d1,d1,d2
	veor		d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor		q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor		q10, q10, q9		@
	veor		d1,d1,d20	@
	veor		d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor		q2,q2,q0
	veor		q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor		q0,q0,q2		@
	veor		q0,q0,q10		@

	subs		r3,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub		r0,#16
	vst1.64		d1,[r0,:64]!	@ write out Xi
	vst1.64		d0,[r0,:64]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2