#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

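/*
 * gcm_gmult_4bit(Xi, Htable): one GHASH multiplication, Xi <- Xi * H, using
 * the 4-bit table-driven method with the .Lrem_4bit reduction table.
 * SysV args: %rdi = Xi (16-byte hash value), %rsi = Htable.
 */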
.globl	gcm_gmult_4bit
.hidden gcm_gmult_4bit
.type	gcm_gmult_4bit,@function
.align	16
gcm_gmult_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
.Lgmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	.Lrem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	.Loop1

.align	16
.Loop1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	.Lbreak1

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	.Loop1

.align	16
.Lbreak1:
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lgmult_epilogue:
	.byte	0xf3,0xc3
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
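/*
 * gcm_ghash_4bit(Xi, Htable, inp, len): hash len bytes (a multiple of 16)
 * into Xi. The prologue rearranges Htable into flatter stack-local tables,
 * then the outer loop consumes one 16-byte block per iteration using the
 * 8-bit .Lrem_8bit reduction table. SysV args: %rdi = Xi, %rsi = Htable,
 * %rdx = inp, %rcx = len.
 */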
.globl	gcm_ghash_4bit
.hidden gcm_ghash_4bit
.type	gcm_ghash_4bit,@function
.align	16
gcm_ghash_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
.Lghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	.Lrem_8bit(%rip),%r11
	jmp	.Louter_loop
.align	16
.Louter_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	.Louter_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280+48(%rsp),%rsi
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	0(%rsi),%rsp
.Lghash_epilogue:
	.byte	0xf3,0xc3
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
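/*
 * gcm_init_clmul(Htable, H): precompute H^1..H^4 plus their Karatsuba
 * constants for the PCLMULQDQ code paths. %rdi = Htable, %rsi = H.
 * The .byte 102,15,58,68,... sequences are hand-encoded pclmulqdq
 * (and 102,15,58,15 palignr) instructions for older assemblers.
 */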
.globl	gcm_init_clmul
.hidden gcm_init_clmul
.type	gcm_init_clmul,@function
.align	16
gcm_init_clmul:
.L_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	.L0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3
.size	gcm_init_clmul,.-gcm_init_clmul
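/*
 * gcm_gmult_clmul(Xi, Htable): single-block GHASH multiply via PCLMULQDQ:
 * one Karatsuba multiplication followed by reduction modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1 (see .L0x1c2_polynomial).
 */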
.globl	gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type	gcm_gmult_clmul,@function
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
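/*
 * gcm_ghash_clmul(Xi, Htable, inp, len): PCLMULQDQ GHASH. Processes four
 * blocks per iteration where profitable; the OPENSSL_ia32cap_P feature test
 * below steers CPUs that prefer narrower multiplies to the 2x/1x paths
 * (.Lskip4x, .Lodd_tail).
 */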
.globl	gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type	gcm_ghash_clmul,@function
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
	movdqa	.Lbswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	.Lodd_tail

	movdqu	16(%rsi),%xmm6
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movl	4(%rax),%eax
	cmpq	$0x30,%rcx
	jb	.Lskip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	.Lskip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	.L7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	.Lmod4_loop

.Ltail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	.Ldone
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Lodd_tail
.Lskip4x:




	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	.Leven_tail
	nop
	jmp	.Lmod_loop

.align	32
.Lmod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	.Lmod_loop

.Leven_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	.Ldone

.Lodd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.Ldone:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
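/*
 * gcm_init_avx(Htable, H): AVX flavour of the table setup; precomputes
 * powers of H (through H^8) interleaved with Karatsuba constants for
 * gcm_ghash_avx's eight-blocks-per-iteration main loop.
 */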
.globl	gcm_init_avx
.hidden gcm_init_avx
.type	gcm_init_avx,@function
.align	32
gcm_init_avx:
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	.Linit_start_avx
.align	32
.Linit_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	.Linit_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3
.size	gcm_init_avx,.-gcm_init_avx
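/*
 * gcm_gmult_avx: a single multiply gains nothing from AVX, so this simply
 * tail-jumps into the CLMUL implementation.
 */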
.globl	gcm_gmult_avx
.hidden gcm_gmult_avx
.type	gcm_gmult_avx,@function
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
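/*
 * gcm_ghash_avx(Xi, Htable, inp, len): AVX/PCLMULQDQ GHASH. The main loop
 * (.Loop8x_avx) hashes eight blocks per iteration with deferred reduction;
 * .Lshort_avx handles the remaining one to seven blocks.
 */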
.globl	gcm_ghash_avx
.hidden gcm_ghash_avx
.type	gcm_ghash_avx,@function
.align	32
gcm_ghash_avx:
	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	.L0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	.Lbswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	.Lshort_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	.Ltail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	.Loop8x_avx

	addq	$0x80,%rcx
	jmp	.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	.Ltail_avx

.align	32
.Ltail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	.Lshort_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3
.size	gcm_ghash_avx,.-gcm_ghash_avx
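/*
 * Constants: byte-swap mask, the GHASH reduction polynomial (0xc2...01),
 * helper masks for the CLMUL reduction, and the 4-bit/8-bit remainder
 * tables used by the table-driven implementations above.
 */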
.align	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long	7,0,7,0
.L7_mask_poly:
.long	7,0,450,0
.align	64
.type	.Lrem_4bit,@object
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
.type	.Lrem_8bit,@object
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64
#endif
