#if defined(__x86_64__)
.text
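
/*
 * GHASH for x86-64, Mach-O flavour of the CRYPTOGAMS ghash-x86_64 module
 * (see the "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" marker at
 * the bottom of the file). Three implementations share the constant pool
 * at the end: a 4-bit-table fallback, a PCLMULQDQ ("clmul") path, and an
 * AVX path. Prototypes given in the comments below are assumed from the
 * standard OpenSSL convention for this module.
 */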


.globl	_gcm_gmult_4bit
.private_extern _gcm_gmult_4bit

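/*
 * gcm_gmult_4bit: one GHASH multiplication, Xi <- Xi * H, using the 4-bit
 * lookup tables and the L$rem_4bit reduction constants. Assumed prototype
 * (standard OpenSSL convention):
 *     void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
 * %rdi = Xi and %rsi = Htable per the SysV AMD64 ABI.
 */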
.p2align	4
_gcm_gmult_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
L$gmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	L$rem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	L$oop1

.p2align	4
L$oop1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	L$break1

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	L$oop1

.p2align	4
L$break1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$gmult_epilogue:
	.byte	0xf3,0xc3

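/*
 * gcm_ghash_4bit: Xi <- (Xi ^ inp[i]) * H over len/16 input blocks. The
 * prologue spreads Htable into a shifted copy on the stack so the main
 * loop can consume a byte at a time against the L$rem_8bit table. Assumed
 * prototype (standard OpenSSL convention):
 *     void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
 *                         const uint8_t *inp, size_t len);
 * %rdi = Xi, %rsi = Htable, %rdx = inp (saved in %r14), %rcx = len
 * (%r15 becomes the end-of-input pointer).
 */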
.globl	_gcm_ghash_4bit
.private_extern _gcm_ghash_4bit

.p2align	4
_gcm_ghash_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
L$ghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	L$rem_8bit(%rip),%r11
	jmp	L$outer_loop
.p2align	4
L$outer_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	L$outer_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	0(%rsi),%rsp
L$ghash_epilogue:
	.byte	0xf3,0xc3

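/*
 * gcm_init_clmul: expand the hash key for the carry-less-multiply paths.
 * Computes H, H^2, H^3 and H^4 together with their Karatsuba xor-halves
 * and stores them at 16-byte offsets from %rdi. The .byte sequences below
 * hand-encode PCLMULQDQ (102,15,58,68,...), PALIGNR (102,15,58,15,...)
 * and PSHUFB (102,15,56,0,...) for assemblers without those mnemonics.
 * Assumed prototype (standard OpenSSL convention):
 *     void gcm_init_clmul(u128 Htable[16], const uint64_t H[2]);
 */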
.globl	_gcm_init_clmul
.private_extern _gcm_init_clmul

.p2align	4
_gcm_init_clmul:
L$_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	L$0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3

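/*
 * gcm_gmult_clmul: single-block Xi <- Xi * H via PCLMULQDQ (one Karatsuba
 * multiply plus the two-phase reduction modulo the GHASH polynomial
 * x^128 + x^7 + x^2 + x + 1). Assumed prototype:
 *     void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
 */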
.globl	_gcm_gmult_clmul
.private_extern _gcm_gmult_clmul

.p2align	4
_gcm_gmult_clmul:
L$_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	L$bswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3

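/*
 * gcm_ghash_clmul: bulk GHASH with PCLMULQDQ. Hashes %rcx bytes from %rdx
 * into Xi (%rdi), four blocks per iteration in L$mod4_loop when the
 * _OPENSSL_ia32cap_P feature test below permits, otherwise two blocks per
 * iteration in L$mod_loop, with one- and two-block tails. Assumed
 * prototype:
 *     void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16],
 *                          const uint8_t *inp, size_t len);
 */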
.globl	_gcm_ghash_clmul
.private_extern _gcm_ghash_clmul

.p2align	5
_gcm_ghash_clmul:
L$_ghash_clmul:
	movdqa	L$bswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	L$odd_tail

	movdqu	16(%rsi),%xmm6
	movl	_OPENSSL_ia32cap_P+4(%rip),%eax
	cmpq	$0x30,%rcx
	jb	L$skip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	L$skip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	L$tail4x

	jmp	L$mod4_loop
.p2align	5
L$mod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	L$7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	L$mod4_loop

L$tail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	L$done
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$odd_tail
L$skip4x:




	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	L$even_tail
	nop
	jmp	L$mod_loop

.p2align	5
L$mod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	L$mod_loop

L$even_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	L$done

L$odd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
L$done:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3

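/*
 * gcm_init_avx: AVX flavour of the key expansion; L$init_loop_avx runs
 * four times and emits two powers of H plus a Karatsuba half per 48-byte
 * group, giving H^1..H^8. Assumed prototype matches gcm_init_clmul.
 */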
.globl	_gcm_init_avx
.private_extern _gcm_init_avx

.p2align	5
_gcm_init_avx:
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	L$init_start_avx
.p2align	5
L$init_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
L$init_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	L$init_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3

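/*
 * gcm_gmult_avx: a single block gains nothing from AVX, so this simply
 * tail-calls the CLMUL routine above.
 */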
.globl	_gcm_gmult_avx
.private_extern _gcm_gmult_avx

.p2align	5
_gcm_gmult_avx:
	jmp	L$_gmult_clmul

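/*
 * gcm_ghash_avx: bulk GHASH using AVX encodings of PCLMULQDQ, eight
 * blocks per L$oop8x_avx iteration with a deferred reduction against the
 * constant at (%r10); inputs shorter than 0x80 bytes are consumed from
 * the end of the buffer through L$short_avx. Assumed prototype matches
 * gcm_ghash_clmul.
 */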
.globl	_gcm_ghash_avx
.private_extern _gcm_ghash_avx

.p2align	5
_gcm_ghash_avx:
	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	L$0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	L$bswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	L$short_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	L$tail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	L$oop8x_avx

.p2align	5
L$oop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	L$oop8x_avx

	addq	$0x80,%rcx
	jmp	L$tail_no_xor_avx

.p2align	5
L$short_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	L$tail_avx

.p2align	5
L$tail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
L$tail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	L$short_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3

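/*
 * Constant pool: byte-swap mask for loading big-endian blocks, the GHASH
 * reduction constant (the 0x1c2 polynomial), PCLMULQDQ helper masks, and
 * the 4-bit and 8-bit remainder tables for the table-driven functions.
 */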
.p2align	6
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long	7,0,7,0
L$7_mask_poly:
.long	7,0,450,0
.p2align	6

L$rem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160

L$rem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6
#endif