1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7EXTERN	OPENSSL_ia32cap_P
8
;-----------------------------------------------------------------------
; gcm_gmult_4bit - Win64 entry point (arguments arrive in rcx, rdx).
;   rcx -> 16-byte block; read one byte at a time (index 15 down to 0)
;          and overwritten with the 16-byte result.
;   rdx -> table of 16-byte entries (two qwords each), indexed with a
;          nibble value scaled by 16.
; Going by the symbol name this is presumably the table-driven 4-bit
; GHASH multiply - confirm against the generator script.  What the code
; does per nibble: shift the 128-bit accumulator r9:r8 right by 4, XOR
; in one table entry, and XOR $L$rem_4bit[nibble shifted out] into r9.
; $L$rem_4bit (8-byte entries) is defined outside the lines shown here.
;
; Register roles after the prologue:
;   rdi = block pointer            rsi = table pointer
;   r8  = accumulator low qword    r9  = accumulator high qword
;   al  = low nibble << 4          bl  = high nibble (masked with 0xf0)
;   rcx = index of next byte       rdx = nibble shifted out of r8
;   r10 = bits carried r9 -> r8    r11 = &$L$rem_4bit
;-----------------------------------------------------------------------
9global	gcm_gmult_4bit
10
11ALIGN	16
12gcm_gmult_4bit:
; Park rdi/rsi (callee-saved on Win64) in the caller-provided slots at
; [rsp+8]/[rsp+16], then move the two arguments into rdi/rsi.
13	mov	QWORD[8+rsp],rdi	;WIN64 prologue
14	mov	QWORD[16+rsp],rsi
15	mov	rax,rsp
16$L$SEH_begin_gcm_gmult_4bit:
17	mov	rdi,rcx
18	mov	rsi,rdx
19
20
; Same six pushes and 280-byte frame as gcm_ghash_4bit.  This routine
; itself never addresses the frame and only modifies rbx among the
; pushed registers.
21	push	rbx
22	push	rbp
23	push	r12
24	push	r13
25	push	r14
26	push	r15
27	sub	rsp,280
28$L$gmult_prologue:
29
; Seed the accumulator from the table entry selected by the low nibble
; of byte 15; keep the high nibble (already scaled by 16) in bl.
30	movzx	r8,BYTE[15+rdi]
31	lea	r11,[$L$rem_4bit]
32	xor	rax,rax
33	xor	rbx,rbx
34	mov	al,r8b
35	mov	bl,r8b
36	shl	al,4
37	mov	rcx,14
38	mov	r8,QWORD[8+rax*1+rsi]
39	mov	r9,QWORD[rax*1+rsi]
40	and	bl,0xf0
41	mov	rdx,r8
42	jmp	NEAR $L$oop1
43
44ALIGN	16
; Each pass handles two nibbles: first the pending high nibble (rbx) of
; the previous byte while fetching byte [rdi+rcx], then the low nibble
; (rax) of the byte just fetched.
45$L$oop1:
46	shr	r8,4
47	and	rdx,0xf
48	mov	r10,r9
49	mov	al,BYTE[rcx*1+rdi]
50	shr	r9,4
51	xor	r8,QWORD[8+rbx*1+rsi]
52	shl	r10,60
53	xor	r9,QWORD[rbx*1+rsi]
54	mov	bl,al
55	xor	r9,QWORD[rdx*8+r11]
56	mov	rdx,r8
57	shl	al,4
58	xor	r8,r10
; rcx goes negative once byte 0 has been fetched; its two nibbles are
; then finished at $L$break1.
59	dec	rcx
60	js	NEAR $L$break1
61
62	shr	r8,4
63	and	rdx,0xf
64	mov	r10,r9
65	shr	r9,4
66	xor	r8,QWORD[8+rax*1+rsi]
67	shl	r10,60
68	xor	r9,QWORD[rax*1+rsi]
69	and	bl,0xf0
70	xor	r9,QWORD[rdx*8+r11]
71	mov	rdx,r8
72	xor	r8,r10
73	jmp	NEAR $L$oop1
74
75ALIGN	16
; Final two steps: low nibble (rax) then high nibble (rbx) of byte 0.
76$L$break1:
77	shr	r8,4
78	and	rdx,0xf
79	mov	r10,r9
80	shr	r9,4
81	xor	r8,QWORD[8+rax*1+rsi]
82	shl	r10,60
83	xor	r9,QWORD[rax*1+rsi]
84	and	bl,0xf0
85	xor	r9,QWORD[rdx*8+r11]
86	mov	rdx,r8
87	xor	r8,r10
88
89	shr	r8,4
90	and	rdx,0xf
91	mov	r10,r9
92	shr	r9,4
93	xor	r8,QWORD[8+rbx*1+rsi]
94	shl	r10,60
95	xor	r9,QWORD[rbx*1+rsi]
96	xor	r8,r10
97	xor	r9,QWORD[rdx*8+r11]
98
; Byte-reverse both halves and write the result over the input block.
99	bswap	r8
100	bswap	r9
101	mov	QWORD[8+rdi],r8
102	mov	QWORD[rdi],r9
103
; rsi = rsp value before the six pushes (280 + 6*8).  Only rbx is
; reloaded; the other pushed registers were not written by this routine.
104	lea	rsi,[((280+48))+rsp]
105	mov	rbx,QWORD[((-8))+rsi]
106	lea	rsp,[rsi]
107$L$gmult_epilogue:
108	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
109	mov	rsi,QWORD[16+rsp]
110	DB	0F3h,0C3h		;repret
111$L$SEH_end_gcm_gmult_4bit:
;-----------------------------------------------------------------------
; gcm_ghash_4bit - Win64 entry point (arguments in rcx, rdx, r8, r9).
;   rcx -> 16-byte accumulator block (read, updated, written back)
;   rdx -> table of sixteen 16-byte entries (256 bytes are read)
;   r8  -> input bytes, consumed 16 per outer-loop iteration
;   r9  =  byte count; added to the input pointer to form the end
;          pointer.  The loop body runs before the first end check, so
;          one 16-byte block is always processed.
;          NOTE(review): callers presumably pass a non-zero multiple of
;          16 - confirm.
; $L$rem_8bit (2-byte entries) is defined outside the lines shown here.
;
; Stack frame built below (rbp = rsp+144, i = 0..15):
;   [rsp+i]          (low byte of entry i's second qword) << 4
;   [rbp-128+8*i]    (entry i second qword >> 4) | (first qword << 60)
;   [rbp+8*i]        entry i first qword >> 4
; i.e. a copy of the table shifted right by four bits, plus the four
; bits that fell off the bottom.
;-----------------------------------------------------------------------
112global	gcm_ghash_4bit
113
114ALIGN	16
115gcm_ghash_4bit:
116	mov	QWORD[8+rsp],rdi	;WIN64 prologue
117	mov	QWORD[16+rsp],rsi
118	mov	rax,rsp
119$L$SEH_begin_gcm_ghash_4bit:
120	mov	rdi,rcx
121	mov	rsi,rdx
122	mov	rdx,r8
123	mov	rcx,r9
124
125
126	push	rbx
127	push	rbp
128	push	r12
129	push	r13
130	push	r14
131	push	r15
132	sub	rsp,280
133$L$ghash_prologue:
; r14 = input pointer, r15 = length (turned into an end pointer later).
; rsi is biased by +128 while the shifted table is built and every
; displacement compensates with -128 (presumably to keep displacements
; in the signed 8-bit range); the bias is removed again before the loop.
134	mov	r14,rdx
135	mov	r15,rcx
136	sub	rsi,-128
137	lea	rbp,[((16+128))+rsp]
138	xor	edx,edx
; Unrolled table build: sixteen interleaved copies of the same step,
; alternating r8/rax and r9/rbx as the (first, second) qword pair of
; the entry being shifted.
139	mov	r8,QWORD[((0+0-128))+rsi]
140	mov	rax,QWORD[((0+8-128))+rsi]
141	mov	dl,al
142	shr	rax,4
143	mov	r10,r8
144	shr	r8,4
145	mov	r9,QWORD[((16+0-128))+rsi]
146	shl	dl,4
147	mov	rbx,QWORD[((16+8-128))+rsi]
148	shl	r10,60
149	mov	BYTE[rsp],dl
150	or	rax,r10
151	mov	dl,bl
152	shr	rbx,4
153	mov	r10,r9
154	shr	r9,4
155	mov	QWORD[rbp],r8
156	mov	r8,QWORD[((32+0-128))+rsi]
157	shl	dl,4
158	mov	QWORD[((0-128))+rbp],rax
159	mov	rax,QWORD[((32+8-128))+rsi]
160	shl	r10,60
161	mov	BYTE[1+rsp],dl
162	or	rbx,r10
163	mov	dl,al
164	shr	rax,4
165	mov	r10,r8
166	shr	r8,4
167	mov	QWORD[8+rbp],r9
168	mov	r9,QWORD[((48+0-128))+rsi]
169	shl	dl,4
170	mov	QWORD[((8-128))+rbp],rbx
171	mov	rbx,QWORD[((48+8-128))+rsi]
172	shl	r10,60
173	mov	BYTE[2+rsp],dl
174	or	rax,r10
175	mov	dl,bl
176	shr	rbx,4
177	mov	r10,r9
178	shr	r9,4
179	mov	QWORD[16+rbp],r8
180	mov	r8,QWORD[((64+0-128))+rsi]
181	shl	dl,4
182	mov	QWORD[((16-128))+rbp],rax
183	mov	rax,QWORD[((64+8-128))+rsi]
184	shl	r10,60
185	mov	BYTE[3+rsp],dl
186	or	rbx,r10
187	mov	dl,al
188	shr	rax,4
189	mov	r10,r8
190	shr	r8,4
191	mov	QWORD[24+rbp],r9
192	mov	r9,QWORD[((80+0-128))+rsi]
193	shl	dl,4
194	mov	QWORD[((24-128))+rbp],rbx
195	mov	rbx,QWORD[((80+8-128))+rsi]
196	shl	r10,60
197	mov	BYTE[4+rsp],dl
198	or	rax,r10
199	mov	dl,bl
200	shr	rbx,4
201	mov	r10,r9
202	shr	r9,4
203	mov	QWORD[32+rbp],r8
204	mov	r8,QWORD[((96+0-128))+rsi]
205	shl	dl,4
206	mov	QWORD[((32-128))+rbp],rax
207	mov	rax,QWORD[((96+8-128))+rsi]
208	shl	r10,60
209	mov	BYTE[5+rsp],dl
210	or	rbx,r10
211	mov	dl,al
212	shr	rax,4
213	mov	r10,r8
214	shr	r8,4
215	mov	QWORD[40+rbp],r9
216	mov	r9,QWORD[((112+0-128))+rsi]
217	shl	dl,4
218	mov	QWORD[((40-128))+rbp],rbx
219	mov	rbx,QWORD[((112+8-128))+rsi]
220	shl	r10,60
221	mov	BYTE[6+rsp],dl
222	or	rax,r10
223	mov	dl,bl
224	shr	rbx,4
225	mov	r10,r9
226	shr	r9,4
227	mov	QWORD[48+rbp],r8
228	mov	r8,QWORD[((128+0-128))+rsi]
229	shl	dl,4
230	mov	QWORD[((48-128))+rbp],rax
231	mov	rax,QWORD[((128+8-128))+rsi]
232	shl	r10,60
233	mov	BYTE[7+rsp],dl
234	or	rbx,r10
235	mov	dl,al
236	shr	rax,4
237	mov	r10,r8
238	shr	r8,4
239	mov	QWORD[56+rbp],r9
240	mov	r9,QWORD[((144+0-128))+rsi]
241	shl	dl,4
242	mov	QWORD[((56-128))+rbp],rbx
243	mov	rbx,QWORD[((144+8-128))+rsi]
244	shl	r10,60
245	mov	BYTE[8+rsp],dl
246	or	rax,r10
247	mov	dl,bl
248	shr	rbx,4
249	mov	r10,r9
250	shr	r9,4
251	mov	QWORD[64+rbp],r8
252	mov	r8,QWORD[((160+0-128))+rsi]
253	shl	dl,4
254	mov	QWORD[((64-128))+rbp],rax
255	mov	rax,QWORD[((160+8-128))+rsi]
256	shl	r10,60
257	mov	BYTE[9+rsp],dl
258	or	rbx,r10
259	mov	dl,al
260	shr	rax,4
261	mov	r10,r8
262	shr	r8,4
263	mov	QWORD[72+rbp],r9
264	mov	r9,QWORD[((176+0-128))+rsi]
265	shl	dl,4
266	mov	QWORD[((72-128))+rbp],rbx
267	mov	rbx,QWORD[((176+8-128))+rsi]
268	shl	r10,60
269	mov	BYTE[10+rsp],dl
270	or	rax,r10
271	mov	dl,bl
272	shr	rbx,4
273	mov	r10,r9
274	shr	r9,4
275	mov	QWORD[80+rbp],r8
276	mov	r8,QWORD[((192+0-128))+rsi]
277	shl	dl,4
278	mov	QWORD[((80-128))+rbp],rax
279	mov	rax,QWORD[((192+8-128))+rsi]
280	shl	r10,60
281	mov	BYTE[11+rsp],dl
282	or	rbx,r10
283	mov	dl,al
284	shr	rax,4
285	mov	r10,r8
286	shr	r8,4
287	mov	QWORD[88+rbp],r9
288	mov	r9,QWORD[((208+0-128))+rsi]
289	shl	dl,4
290	mov	QWORD[((88-128))+rbp],rbx
291	mov	rbx,QWORD[((208+8-128))+rsi]
292	shl	r10,60
293	mov	BYTE[12+rsp],dl
294	or	rax,r10
295	mov	dl,bl
296	shr	rbx,4
297	mov	r10,r9
298	shr	r9,4
299	mov	QWORD[96+rbp],r8
300	mov	r8,QWORD[((224+0-128))+rsi]
301	shl	dl,4
302	mov	QWORD[((96-128))+rbp],rax
303	mov	rax,QWORD[((224+8-128))+rsi]
304	shl	r10,60
305	mov	BYTE[13+rsp],dl
306	or	rbx,r10
307	mov	dl,al
308	shr	rax,4
309	mov	r10,r8
310	shr	r8,4
311	mov	QWORD[104+rbp],r9
312	mov	r9,QWORD[((240+0-128))+rsi]
313	shl	dl,4
314	mov	QWORD[((104-128))+rbp],rbx
315	mov	rbx,QWORD[((240+8-128))+rsi]
316	shl	r10,60
317	mov	BYTE[14+rsp],dl
318	or	rax,r10
319	mov	dl,bl
320	shr	rbx,4
321	mov	r10,r9
322	shr	r9,4
323	mov	QWORD[112+rbp],r8
324	shl	dl,4
325	mov	QWORD[((112-128))+rbp],rax
326	shl	r10,60
327	mov	BYTE[15+rsp],dl
328	or	rbx,r10
329	mov	QWORD[120+rbp],r9
330	mov	QWORD[((120-128))+rbp],rbx
; Undo the +128 bias, load the current accumulator (r9 = [rdi],
; r8 = [rdi+8]) and turn r15 into the end-of-input pointer.
331	add	rsi,-128
332	mov	r8,QWORD[8+rdi]
333	mov	r9,QWORD[rdi]
334	add	r15,r14
335	lea	r11,[$L$rem_8bit]
336	jmp	NEAR $L$outer_loop
337ALIGN	16
; One iteration per 16 input bytes.  The input block is XORed into the
; accumulator and the result is stored back to [rdi] so that later
; steps can reload it four bytes at a time into edx.
;
; Each unrolled step consumes one byte (rol edx,8 rotates the next byte
; into dl):
;   al       = low nibble << 4  -> indexes the original table at rsi
;   ebx/ecx  = high nibble      -> indexes the byte table at rsp and the
;                                  shifted table at rbp (scaled by 8)
;   r9:r8 is shifted right by 8; the byte that falls out, XORed with
;   the byte-table value, selects a $L$rem_8bit word that is shifted
;   left by 48 and XORed into r9.
; r12/r13 and rbx/rcx alternate between consecutive steps.
338$L$outer_loop:
339	xor	r9,QWORD[r14]
340	mov	rdx,QWORD[8+r14]
341	lea	r14,[16+r14]
342	xor	rdx,r8
343	mov	QWORD[rdi],r9
344	mov	QWORD[8+rdi],rdx
; edx = bytes 12..15 of the block just stored.
345	shr	rdx,32
346	xor	rax,rax
347	rol	edx,8
348	mov	al,dl
349	movzx	ebx,dl
350	shl	al,4
351	shr	ebx,4
352	rol	edx,8
353	mov	r8,QWORD[8+rax*1+rsi]
354	mov	r9,QWORD[rax*1+rsi]
355	mov	al,dl
356	movzx	ecx,dl
357	shl	al,4
358	movzx	r12,BYTE[rbx*1+rsp]
359	shr	ecx,4
360	xor	r12,r8
361	mov	r10,r9
362	shr	r8,8
363	movzx	r12,r12b
364	shr	r9,8
365	xor	r8,QWORD[((-128))+rbx*8+rbp]
366	shl	r10,56
367	xor	r9,QWORD[rbx*8+rbp]
368	rol	edx,8
369	xor	r8,QWORD[8+rax*1+rsi]
370	xor	r9,QWORD[rax*1+rsi]
371	mov	al,dl
372	xor	r8,r10
373	movzx	r12,WORD[r12*2+r11]
374	movzx	ebx,dl
375	shl	al,4
376	movzx	r13,BYTE[rcx*1+rsp]
377	shr	ebx,4
378	shl	r12,48
379	xor	r13,r8
380	mov	r10,r9
381	xor	r9,r12
382	shr	r8,8
383	movzx	r13,r13b
384	shr	r9,8
385	xor	r8,QWORD[((-128))+rcx*8+rbp]
386	shl	r10,56
387	xor	r9,QWORD[rcx*8+rbp]
388	rol	edx,8
389	xor	r8,QWORD[8+rax*1+rsi]
390	xor	r9,QWORD[rax*1+rsi]
391	mov	al,dl
392	xor	r8,r10
393	movzx	r13,WORD[r13*2+r11]
394	movzx	ecx,dl
395	shl	al,4
396	movzx	r12,BYTE[rbx*1+rsp]
397	shr	ecx,4
398	shl	r13,48
399	xor	r12,r8
400	mov	r10,r9
401	xor	r9,r13
402	shr	r8,8
403	movzx	r12,r12b
; Reload edx with bytes 8..11 of the stored block.
404	mov	edx,DWORD[8+rdi]
405	shr	r9,8
406	xor	r8,QWORD[((-128))+rbx*8+rbp]
407	shl	r10,56
408	xor	r9,QWORD[rbx*8+rbp]
409	rol	edx,8
410	xor	r8,QWORD[8+rax*1+rsi]
411	xor	r9,QWORD[rax*1+rsi]
412	mov	al,dl
413	xor	r8,r10
414	movzx	r12,WORD[r12*2+r11]
415	movzx	ebx,dl
416	shl	al,4
417	movzx	r13,BYTE[rcx*1+rsp]
418	shr	ebx,4
419	shl	r12,48
420	xor	r13,r8
421	mov	r10,r9
422	xor	r9,r12
423	shr	r8,8
424	movzx	r13,r13b
425	shr	r9,8
426	xor	r8,QWORD[((-128))+rcx*8+rbp]
427	shl	r10,56
428	xor	r9,QWORD[rcx*8+rbp]
429	rol	edx,8
430	xor	r8,QWORD[8+rax*1+rsi]
431	xor	r9,QWORD[rax*1+rsi]
432	mov	al,dl
433	xor	r8,r10
434	movzx	r13,WORD[r13*2+r11]
435	movzx	ecx,dl
436	shl	al,4
437	movzx	r12,BYTE[rbx*1+rsp]
438	shr	ecx,4
439	shl	r13,48
440	xor	r12,r8
441	mov	r10,r9
442	xor	r9,r13
443	shr	r8,8
444	movzx	r12,r12b
445	shr	r9,8
446	xor	r8,QWORD[((-128))+rbx*8+rbp]
447	shl	r10,56
448	xor	r9,QWORD[rbx*8+rbp]
449	rol	edx,8
450	xor	r8,QWORD[8+rax*1+rsi]
451	xor	r9,QWORD[rax*1+rsi]
452	mov	al,dl
453	xor	r8,r10
454	movzx	r12,WORD[r12*2+r11]
455	movzx	ebx,dl
456	shl	al,4
457	movzx	r13,BYTE[rcx*1+rsp]
458	shr	ebx,4
459	shl	r12,48
460	xor	r13,r8
461	mov	r10,r9
462	xor	r9,r12
463	shr	r8,8
464	movzx	r13,r13b
465	shr	r9,8
466	xor	r8,QWORD[((-128))+rcx*8+rbp]
467	shl	r10,56
468	xor	r9,QWORD[rcx*8+rbp]
469	rol	edx,8
470	xor	r8,QWORD[8+rax*1+rsi]
471	xor	r9,QWORD[rax*1+rsi]
472	mov	al,dl
473	xor	r8,r10
474	movzx	r13,WORD[r13*2+r11]
475	movzx	ecx,dl
476	shl	al,4
477	movzx	r12,BYTE[rbx*1+rsp]
478	shr	ecx,4
479	shl	r13,48
480	xor	r12,r8
481	mov	r10,r9
482	xor	r9,r13
483	shr	r8,8
484	movzx	r12,r12b
; Reload edx with bytes 4..7 of the stored block.
485	mov	edx,DWORD[4+rdi]
486	shr	r9,8
487	xor	r8,QWORD[((-128))+rbx*8+rbp]
488	shl	r10,56
489	xor	r9,QWORD[rbx*8+rbp]
490	rol	edx,8
491	xor	r8,QWORD[8+rax*1+rsi]
492	xor	r9,QWORD[rax*1+rsi]
493	mov	al,dl
494	xor	r8,r10
495	movzx	r12,WORD[r12*2+r11]
496	movzx	ebx,dl
497	shl	al,4
498	movzx	r13,BYTE[rcx*1+rsp]
499	shr	ebx,4
500	shl	r12,48
501	xor	r13,r8
502	mov	r10,r9
503	xor	r9,r12
504	shr	r8,8
505	movzx	r13,r13b
506	shr	r9,8
507	xor	r8,QWORD[((-128))+rcx*8+rbp]
508	shl	r10,56
509	xor	r9,QWORD[rcx*8+rbp]
510	rol	edx,8
511	xor	r8,QWORD[8+rax*1+rsi]
512	xor	r9,QWORD[rax*1+rsi]
513	mov	al,dl
514	xor	r8,r10
515	movzx	r13,WORD[r13*2+r11]
516	movzx	ecx,dl
517	shl	al,4
518	movzx	r12,BYTE[rbx*1+rsp]
519	shr	ecx,4
520	shl	r13,48
521	xor	r12,r8
522	mov	r10,r9
523	xor	r9,r13
524	shr	r8,8
525	movzx	r12,r12b
526	shr	r9,8
527	xor	r8,QWORD[((-128))+rbx*8+rbp]
528	shl	r10,56
529	xor	r9,QWORD[rbx*8+rbp]
530	rol	edx,8
531	xor	r8,QWORD[8+rax*1+rsi]
532	xor	r9,QWORD[rax*1+rsi]
533	mov	al,dl
534	xor	r8,r10
535	movzx	r12,WORD[r12*2+r11]
536	movzx	ebx,dl
537	shl	al,4
538	movzx	r13,BYTE[rcx*1+rsp]
539	shr	ebx,4
540	shl	r12,48
541	xor	r13,r8
542	mov	r10,r9
543	xor	r9,r12
544	shr	r8,8
545	movzx	r13,r13b
546	shr	r9,8
547	xor	r8,QWORD[((-128))+rcx*8+rbp]
548	shl	r10,56
549	xor	r9,QWORD[rcx*8+rbp]
550	rol	edx,8
551	xor	r8,QWORD[8+rax*1+rsi]
552	xor	r9,QWORD[rax*1+rsi]
553	mov	al,dl
554	xor	r8,r10
555	movzx	r13,WORD[r13*2+r11]
556	movzx	ecx,dl
557	shl	al,4
558	movzx	r12,BYTE[rbx*1+rsp]
559	shr	ecx,4
560	shl	r13,48
561	xor	r12,r8
562	mov	r10,r9
563	xor	r9,r13
564	shr	r8,8
565	movzx	r12,r12b
; Reload edx with bytes 0..3 of the stored block.
566	mov	edx,DWORD[rdi]
567	shr	r9,8
568	xor	r8,QWORD[((-128))+rbx*8+rbp]
569	shl	r10,56
570	xor	r9,QWORD[rbx*8+rbp]
571	rol	edx,8
572	xor	r8,QWORD[8+rax*1+rsi]
573	xor	r9,QWORD[rax*1+rsi]
574	mov	al,dl
575	xor	r8,r10
576	movzx	r12,WORD[r12*2+r11]
577	movzx	ebx,dl
578	shl	al,4
579	movzx	r13,BYTE[rcx*1+rsp]
580	shr	ebx,4
581	shl	r12,48
582	xor	r13,r8
583	mov	r10,r9
584	xor	r9,r12
585	shr	r8,8
586	movzx	r13,r13b
587	shr	r9,8
588	xor	r8,QWORD[((-128))+rcx*8+rbp]
589	shl	r10,56
590	xor	r9,QWORD[rcx*8+rbp]
591	rol	edx,8
592	xor	r8,QWORD[8+rax*1+rsi]
593	xor	r9,QWORD[rax*1+rsi]
594	mov	al,dl
595	xor	r8,r10
596	movzx	r13,WORD[r13*2+r11]
597	movzx	ecx,dl
598	shl	al,4
599	movzx	r12,BYTE[rbx*1+rsp]
600	shr	ecx,4
601	shl	r13,48
602	xor	r12,r8
603	mov	r10,r9
604	xor	r9,r13
605	shr	r8,8
606	movzx	r12,r12b
607	shr	r9,8
608	xor	r8,QWORD[((-128))+rbx*8+rbp]
609	shl	r10,56
610	xor	r9,QWORD[rbx*8+rbp]
611	rol	edx,8
612	xor	r8,QWORD[8+rax*1+rsi]
613	xor	r9,QWORD[rax*1+rsi]
614	mov	al,dl
615	xor	r8,r10
616	movzx	r12,WORD[r12*2+r11]
617	movzx	ebx,dl
618	shl	al,4
619	movzx	r13,BYTE[rcx*1+rsp]
620	shr	ebx,4
621	shl	r12,48
622	xor	r13,r8
623	mov	r10,r9
624	xor	r9,r12
625	shr	r8,8
626	movzx	r13,r13b
627	shr	r9,8
628	xor	r8,QWORD[((-128))+rcx*8+rbp]
629	shl	r10,56
630	xor	r9,QWORD[rcx*8+rbp]
631	rol	edx,8
632	xor	r8,QWORD[8+rax*1+rsi]
633	xor	r9,QWORD[rax*1+rsi]
634	mov	al,dl
635	xor	r8,r10
636	movzx	r13,WORD[r13*2+r11]
637	movzx	ecx,dl
638	shl	al,4
639	movzx	r12,BYTE[rbx*1+rsp]
; Last byte: the high nibble is kept in place (and 240 = 0xf0) instead
; of being shifted down, because the closing step below indexes the
; original table at rsi with it and shifts by 4 rather than 8.
640	and	ecx,240
641	shl	r13,48
642	xor	r12,r8
643	mov	r10,r9
644	xor	r9,r13
645	shr	r8,8
646	movzx	r12,r12b
; NOTE(review): this reads four bytes below rdi.  The loaded value is
; never consumed: edx is not read again before rdx is overwritten at
; the top of the loop, nor after the loop exits.
647	mov	edx,DWORD[((-4))+rdi]
648	shr	r9,8
649	xor	r8,QWORD[((-128))+rbx*8+rbp]
650	shl	r10,56
651	xor	r9,QWORD[rbx*8+rbp]
652	movzx	r12,WORD[r12*2+r11]
653	xor	r8,QWORD[8+rax*1+rsi]
654	xor	r9,QWORD[rax*1+rsi]
655	shl	r12,48
656	xor	r8,r10
657	xor	r9,r12
; Closing 4-bit step for the high nibble of the last byte.
658	movzx	r13,r8b
659	shr	r8,4
660	mov	r10,r9
661	shl	r13b,4
662	shr	r9,4
663	xor	r8,QWORD[8+rcx*1+rsi]
664	movzx	r13,WORD[r13*2+r11]
665	shl	r10,60
666	xor	r9,QWORD[rcx*1+rsi]
667	xor	r8,r10
668	shl	r13,48
669	bswap	r8
670	xor	r9,r13
671	bswap	r9
; Unsigned compare of the input pointer against the end pointer.
672	cmp	r14,r15
673	jb	NEAR $L$outer_loop
674	mov	QWORD[8+rdi],r8
675	mov	QWORD[rdi],r9
676
; rsi = rsp value before the six pushes; reload all six saved registers.
677	lea	rsi,[((280+48))+rsp]
678	mov	r15,QWORD[((-48))+rsi]
679	mov	r14,QWORD[((-40))+rsi]
680	mov	r13,QWORD[((-32))+rsi]
681	mov	r12,QWORD[((-24))+rsi]
682	mov	rbp,QWORD[((-16))+rsi]
683	mov	rbx,QWORD[((-8))+rsi]
684	lea	rsp,[rsi]
685$L$ghash_epilogue:
686	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
687	mov	rsi,QWORD[16+rsp]
688	DB	0F3h,0C3h		;repret
689$L$SEH_end_gcm_ghash_4bit:
;-----------------------------------------------------------------------
; gcm_init_clmul - Win64 leaf routine (arguments in rcx, rdx).
;   rcx -> output; six 16-byte values are stored at offsets 0..80
;   rdx -> 16-byte input, loaded unaligned
; Output layout, with K = the pre-processed input kept in xmm2 and
; "*" = the carry-less multiply + reduction sequence used below:
;   [rcx+ 0] K          [rcx+16] K*K
;   [rcx+32] low qword = K.hi^K.lo,      high qword = (K*K).hi^(K*K).lo
;   [rcx+48] K*K*K      [rcx+64] K*K*K*K
;   [rcx+80] same hi^lo pairing for the values at +48 and +64
; Presumably K is the GHASH key and these are its powers for the
; Karatsuba-style multiplies in the clmul routines - confirm against
; the generator script.
;
; Instructions emitted as raw bytes:
;   DB 102,15,58,68,194,0   pclmulqdq xmm0,xmm2,0x00
;   DB 102,15,58,68,202,17  pclmulqdq xmm1,xmm2,0x11
;   DB 102,15,58,68,222,0   pclmulqdq xmm3,xmm6,0x00
;   DB 102,15,58,15,227,8   palignr   xmm4,xmm3,8
;-----------------------------------------------------------------------
690global	gcm_init_clmul
691
692ALIGN	16
693gcm_init_clmul:
694$L$_init_clmul:
695$L$SEH_begin_gcm_init_clmul:
696
; sub rsp,0x18 / movaps [rsp],xmm6 - xmm6 is callee-saved on Win64.
697DB	0x48,0x83,0xec,0x18
698DB	0x0f,0x29,0x34,0x24
699	movdqu	xmm2,XMMWORD[rdx]
; Swap the two qwords of the input.
700	pshufd	xmm2,xmm2,78
701
702
; Shift the 128-bit value left by one bit (the bit leaving the low
; qword is carried into the high qword).  xmm5 becomes an all-ones mask
; when the top dword was negative, i.e. when a bit was shifted out of
; the top; in that case $L$0x1c2_polynomial is XORed in.
703	pshufd	xmm4,xmm2,255
704	movdqa	xmm3,xmm2
705	psllq	xmm2,1
706	pxor	xmm5,xmm5
707	psrlq	xmm3,63
708	pcmpgtd	xmm5,xmm4
709	pslldq	xmm3,8
710	por	xmm2,xmm3
711
712
713	pand	xmm5,XMMWORD[$L$0x1c2_polynomial]
714	pxor	xmm2,xmm5
715
716
; xmm6 = K.hi^K.lo in both qwords (middle-term operand for every
; multiply below).  First multiply: xmm0 = K*K.
717	pshufd	xmm6,xmm2,78
718	movdqa	xmm0,xmm2
719	pxor	xmm6,xmm2
720	movdqa	xmm1,xmm0
721	pshufd	xmm3,xmm0,78
722	pxor	xmm3,xmm0
723DB	102,15,58,68,194,0
724DB	102,15,58,68,202,17
725DB	102,15,58,68,222,0
726	pxor	xmm3,xmm0
727	pxor	xmm3,xmm1
728
; Fold the middle product into the low (xmm0) and high (xmm1) halves of
; the 256-bit product.
729	movdqa	xmm4,xmm3
730	psrldq	xmm3,8
731	pslldq	xmm4,8
732	pxor	xmm1,xmm3
733	pxor	xmm0,xmm4
734
; Reduction, first phase (left shifts by 5, 1 and 57 combined).
735	movdqa	xmm4,xmm0
736	movdqa	xmm3,xmm0
737	psllq	xmm0,5
738	pxor	xmm3,xmm0
739	psllq	xmm0,1
740	pxor	xmm0,xmm3
741	psllq	xmm0,57
742	movdqa	xmm3,xmm0
743	pslldq	xmm0,8
744	psrldq	xmm3,8
745	pxor	xmm0,xmm4
746	pxor	xmm1,xmm3
747
748
; Reduction, second phase (right shifts by 1, 5 and 1); result in xmm0.
749	movdqa	xmm4,xmm0
750	psrlq	xmm0,1
751	pxor	xmm1,xmm4
752	pxor	xmm4,xmm0
753	psrlq	xmm0,5
754	pxor	xmm0,xmm4
755	psrlq	xmm0,1
756	pxor	xmm0,xmm1
; Store K, K*K and their packed hi^lo halves.
757	pshufd	xmm3,xmm2,78
758	pshufd	xmm4,xmm0,78
759	pxor	xmm3,xmm2
760	movdqu	XMMWORD[rcx],xmm2
761	pxor	xmm4,xmm0
762	movdqu	XMMWORD[16+rcx],xmm0
763DB	102,15,58,15,227,8
764	movdqu	XMMWORD[32+rcx],xmm4
; Second multiply: xmm0 = (K*K)*K.
765	movdqa	xmm1,xmm0
766	pshufd	xmm3,xmm0,78
767	pxor	xmm3,xmm0
768DB	102,15,58,68,194,0
769DB	102,15,58,68,202,17
770DB	102,15,58,68,222,0
771	pxor	xmm3,xmm0
772	pxor	xmm3,xmm1
773
774	movdqa	xmm4,xmm3
775	psrldq	xmm3,8
776	pslldq	xmm4,8
777	pxor	xmm1,xmm3
778	pxor	xmm0,xmm4
779
780	movdqa	xmm4,xmm0
781	movdqa	xmm3,xmm0
782	psllq	xmm0,5
783	pxor	xmm3,xmm0
784	psllq	xmm0,1
785	pxor	xmm0,xmm3
786	psllq	xmm0,57
787	movdqa	xmm3,xmm0
788	pslldq	xmm0,8
789	psrldq	xmm3,8
790	pxor	xmm0,xmm4
791	pxor	xmm1,xmm3
792
793
794	movdqa	xmm4,xmm0
795	psrlq	xmm0,1
796	pxor	xmm1,xmm4
797	pxor	xmm4,xmm0
798	psrlq	xmm0,5
799	pxor	xmm0,xmm4
800	psrlq	xmm0,1
801	pxor	xmm0,xmm1
; Keep the third power in xmm5, then multiply by K once more.
802	movdqa	xmm5,xmm0
803	movdqa	xmm1,xmm0
804	pshufd	xmm3,xmm0,78
805	pxor	xmm3,xmm0
806DB	102,15,58,68,194,0
807DB	102,15,58,68,202,17
808DB	102,15,58,68,222,0
809	pxor	xmm3,xmm0
810	pxor	xmm3,xmm1
811
812	movdqa	xmm4,xmm3
813	psrldq	xmm3,8
814	pslldq	xmm4,8
815	pxor	xmm1,xmm3
816	pxor	xmm0,xmm4
817
818	movdqa	xmm4,xmm0
819	movdqa	xmm3,xmm0
820	psllq	xmm0,5
821	pxor	xmm3,xmm0
822	psllq	xmm0,1
823	pxor	xmm0,xmm3
824	psllq	xmm0,57
825	movdqa	xmm3,xmm0
826	pslldq	xmm0,8
827	psrldq	xmm3,8
828	pxor	xmm0,xmm4
829	pxor	xmm1,xmm3
830
831
832	movdqa	xmm4,xmm0
833	psrlq	xmm0,1
834	pxor	xmm1,xmm4
835	pxor	xmm4,xmm0
836	psrlq	xmm0,5
837	pxor	xmm0,xmm4
838	psrlq	xmm0,1
839	pxor	xmm0,xmm1
; Store the third and fourth powers and their packed hi^lo halves.
840	pshufd	xmm3,xmm5,78
841	pshufd	xmm4,xmm0,78
842	pxor	xmm3,xmm5
843	movdqu	XMMWORD[48+rcx],xmm5
844	pxor	xmm4,xmm0
845	movdqu	XMMWORD[64+rcx],xmm0
846DB	102,15,58,15,227,8
847	movdqu	XMMWORD[80+rcx],xmm4
; Restore xmm6 and release the 24-byte frame.
848	movaps	xmm6,XMMWORD[rsp]
849	lea	rsp,[24+rsp]
850$L$SEH_end_gcm_init_clmul:
851	DB	0F3h,0C3h		;repret
852
;-----------------------------------------------------------------------
; gcm_gmult_clmul - Win64 leaf routine (arguments in rcx, rdx).
;   rcx -> 16-byte block, loaded unaligned and overwritten with the
;          result
;   rdx -> table; the 16-byte values at offsets 0 and 32 are used
;          (presumably the layout written by gcm_init_clmul - confirm)
; One carry-less multiply of the byte-swapped block by the value at
; [rdx], followed by the same two-phase reduction used in
; gcm_init_clmul.  No stack frame; only xmm0..xmm5 are touched, which
; are volatile on Win64.
; $L$_gmult_clmul is also the jump target of gcm_gmult_avx.
; $L$bswap_mask is defined outside the lines shown here.
;
; Instructions emitted as raw bytes:
;   DB 102,15,56,0,197      pshufb    xmm0,xmm5
;   DB 102,15,58,68,194,0   pclmulqdq xmm0,xmm2,0x00
;   DB 102,15,58,68,202,17  pclmulqdq xmm1,xmm2,0x11
;   DB 102,15,58,68,220,0   pclmulqdq xmm3,xmm4,0x00
;-----------------------------------------------------------------------
853global	gcm_gmult_clmul
854
855ALIGN	16
856gcm_gmult_clmul:
857$L$_gmult_clmul:
858	movdqu	xmm0,XMMWORD[rcx]
859	movdqa	xmm5,XMMWORD[$L$bswap_mask]
860	movdqu	xmm2,XMMWORD[rdx]
861	movdqu	xmm4,XMMWORD[32+rdx]
; Shuffle the block through the mask in xmm5 on the way in (and again
; on the way out, below).
862DB	102,15,56,0,197
; Three multiplies: low halves, high halves, and (hi^lo) by the low
; qword of the value loaded from [rdx+32].
863	movdqa	xmm1,xmm0
864	pshufd	xmm3,xmm0,78
865	pxor	xmm3,xmm0
866DB	102,15,58,68,194,0
867DB	102,15,58,68,202,17
868DB	102,15,58,68,220,0
869	pxor	xmm3,xmm0
870	pxor	xmm3,xmm1
871
872	movdqa	xmm4,xmm3
873	psrldq	xmm3,8
874	pslldq	xmm4,8
875	pxor	xmm1,xmm3
876	pxor	xmm0,xmm4
877
; Reduction, first phase.
878	movdqa	xmm4,xmm0
879	movdqa	xmm3,xmm0
880	psllq	xmm0,5
881	pxor	xmm3,xmm0
882	psllq	xmm0,1
883	pxor	xmm0,xmm3
884	psllq	xmm0,57
885	movdqa	xmm3,xmm0
886	pslldq	xmm0,8
887	psrldq	xmm3,8
888	pxor	xmm0,xmm4
889	pxor	xmm1,xmm3
890
891
; Reduction, second phase.
892	movdqa	xmm4,xmm0
893	psrlq	xmm0,1
894	pxor	xmm1,xmm4
895	pxor	xmm4,xmm0
896	psrlq	xmm0,5
897	pxor	xmm0,xmm4
898	psrlq	xmm0,1
899	pxor	xmm0,xmm1
900DB	102,15,56,0,197
901	movdqu	XMMWORD[rcx],xmm0
902	DB	0F3h,0C3h		;repret
903
;-----------------------------------------------------------------------
; gcm_ghash_clmul - Win64 routine (arguments in rcx, rdx, r8, r9).
;   rcx -> 16-byte accumulator, loaded unaligned and written back at
;          $L$done
;   rdx -> table; 16-byte values at offsets 0,16,32 are always used and
;          48,64,80 in the four-block path (presumably the layout
;          written by gcm_init_clmul - confirm)
;   r8  -> input, loaded unaligned in 16-byte blocks
;   r9  =  byte count.  NOTE(review): the branch logic below only makes
;          sense for a non-zero multiple of 16 - confirm with callers.
;
; Length dispatch (r9 is kept biased, see the individual comments):
;   exactly 16 bytes                 -> $L$odd_tail
;   >= 64 bytes and capability test  -> four blocks per $L$mod4_loop
;   otherwise                        -> two blocks per $L$mod_loop,
;                                       then $L$even_tail / $L$odd_tail
;
; Fixed registers: xmm10 = $L$bswap_mask, xmm0 = running accumulator,
; xmm2/xmm6/xmm14/xmm15 = table values at +0/+16/+48/+64,
; xmm7 = table value at +32 (temporarily +80 in the four-block path).
; $L$bswap_mask and $L$7_mask are defined outside the lines shown here.
;
; Instructions emitted as raw bytes:
;   102,[rex],15,56,0,modrm        pshufb    reg,rm
;   102,[rex],15,58,68,modrm,imm   pclmulqdq reg,rm,imm
;   102,76,15,110,200              movq      xmm9,rax
;   modrm = 0xC0 | reg<<3 | rm; an optional rex byte of 65 adds 8 to
;   rm, 68 adds 8 to reg, 69 adds 8 to both.
;-----------------------------------------------------------------------
904global	gcm_ghash_clmul
905
906ALIGN	32
907gcm_ghash_clmul:
908$L$_ghash_clmul:
909	lea	rax,[((-136))+rsp]
910$L$SEH_begin_gcm_ghash_clmul:
911
; Raw-byte prologue: lea rsp,[rax-0x20] (a 168-byte frame), then movaps
; stores of xmm6..xmm15 (callee-saved on Win64) at [rax-0x20],
; [rax-0x10], [rax], [rax+0x10] ... [rax+0x70], i.e. [rsp+0..144].
912DB	0x48,0x8d,0x60,0xe0
913DB	0x0f,0x29,0x70,0xe0
914DB	0x0f,0x29,0x78,0xf0
915DB	0x44,0x0f,0x29,0x00
916DB	0x44,0x0f,0x29,0x48,0x10
917DB	0x44,0x0f,0x29,0x50,0x20
918DB	0x44,0x0f,0x29,0x58,0x30
919DB	0x44,0x0f,0x29,0x60,0x40
920DB	0x44,0x0f,0x29,0x68,0x50
921DB	0x44,0x0f,0x29,0x70,0x60
922DB	0x44,0x0f,0x29,0x78,0x70
923	movdqa	xmm10,XMMWORD[$L$bswap_mask]
924
925	movdqu	xmm0,XMMWORD[rcx]
926	movdqu	xmm2,XMMWORD[rdx]
927	movdqu	xmm7,XMMWORD[32+rdx]
; pshufb xmm0,xmm10
928DB	102,65,15,56,0,194
929
; r9 = length - 16; zero means there is exactly one block.
930	sub	r9,0x10
931	jz	NEAR $L$odd_tail
932
933	movdqu	xmm6,XMMWORD[16+rdx]
934	lea	rax,[OPENSSL_ia32cap_P]
935	mov	eax,DWORD[4+rax]
; Fewer than 64 bytes in total: use the two-block path.
936	cmp	r9,0x30
937	jb	NEAR $L$skip4x
938
; 71303168 = 0x04400000 (bits 22 and 26), 4194304 = 0x00400000: the
; four-block path is skipped when bit 22 is set and bit 26 is clear in
; the second dword of OPENSSL_ia32cap_P.
; NOTE(review): presumably these are the CPUID MOVBE and XSAVE bits -
; confirm against the OPENSSL_ia32cap_P layout.
939	and	eax,71303168
940	cmp	eax,4194304
941	je	NEAR $L$skip4x
942
; r9 = length - 64.  rax holds an 8-byte constant that is moved into
; xmm9 inside the loop and used there as a pshufb lookup source.
943	sub	r9,0x30
944	mov	rax,0xA040608020C0E000
945	movdqu	xmm14,XMMWORD[48+rdx]
946	movdqu	xmm15,XMMWORD[64+rdx]
947
948
949
950
; Blocks 3 and 2 of the first group of four: byte-shuffle, then start
; their multiplies by the values at [rdx] and [rdx+16].
951	movdqu	xmm3,XMMWORD[48+r8]
952	movdqu	xmm11,XMMWORD[32+r8]
953DB	102,65,15,56,0,218
954DB	102,69,15,56,0,218
955	movdqa	xmm5,xmm3
956	pshufd	xmm4,xmm3,78
957	pxor	xmm4,xmm3
958DB	102,15,58,68,218,0
959DB	102,15,58,68,234,17
960DB	102,15,58,68,231,0
961
962	movdqa	xmm13,xmm11
963	pshufd	xmm12,xmm11,78
964	pxor	xmm12,xmm11
965DB	102,68,15,58,68,222,0
966DB	102,68,15,58,68,238,17
967DB	102,68,15,58,68,231,16
968	xorps	xmm3,xmm11
969	xorps	xmm5,xmm13
970	movups	xmm7,XMMWORD[80+rdx]
971	xorps	xmm4,xmm12
972
; Blocks 1 and 0; block 0 is XORed into the accumulator first.
973	movdqu	xmm11,XMMWORD[16+r8]
974	movdqu	xmm8,XMMWORD[r8]
975DB	102,69,15,56,0,218
976DB	102,69,15,56,0,194
977	movdqa	xmm13,xmm11
978	pshufd	xmm12,xmm11,78
979	pxor	xmm0,xmm8
980	pxor	xmm12,xmm11
981DB	102,69,15,58,68,222,0
982	movdqa	xmm1,xmm0
983	pshufd	xmm8,xmm0,78
984	pxor	xmm8,xmm0
985DB	102,69,15,58,68,238,17
986DB	102,68,15,58,68,231,0
987	xorps	xmm3,xmm11
988	xorps	xmm5,xmm13
989
; Borrow means fewer than 64 further bytes remain: finish at $L$tail4x.
990	lea	r8,[64+r8]
991	sub	r9,0x40
992	jc	NEAR $L$tail4x
993
994	jmp	NEAR $L$mod4_loop
995ALIGN	32
; Steady state: finishes the multiply/reduction for the previous four
; blocks while loading, shuffling and multiplying the next four.  The
; instruction order is hand-interleaved; do not reorder.
996$L$mod4_loop:
997DB	102,65,15,58,68,199,0
998	xorps	xmm4,xmm12
999	movdqu	xmm11,XMMWORD[48+r8]
1000DB	102,69,15,56,0,218
1001DB	102,65,15,58,68,207,17
1002	xorps	xmm0,xmm3
1003	movdqu	xmm3,XMMWORD[32+r8]
1004	movdqa	xmm13,xmm11
1005DB	102,68,15,58,68,199,16
1006	pshufd	xmm12,xmm11,78
1007	xorps	xmm1,xmm5
1008	pxor	xmm12,xmm11
1009DB	102,65,15,56,0,218
1010	movups	xmm7,XMMWORD[32+rdx]
1011	xorps	xmm8,xmm4
1012DB	102,68,15,58,68,218,0
1013	pshufd	xmm4,xmm3,78
1014
1015	pxor	xmm8,xmm0
1016	movdqa	xmm5,xmm3
1017	pxor	xmm8,xmm1
1018	pxor	xmm4,xmm3
1019	movdqa	xmm9,xmm8
1020DB	102,68,15,58,68,234,17
1021	pslldq	xmm8,8
1022	psrldq	xmm9,8
1023	pxor	xmm0,xmm8
1024	movdqa	xmm8,XMMWORD[$L$7_mask]
1025	pxor	xmm1,xmm9
; movq xmm9,rax - the constant loaded before the loop.
1026DB	102,76,15,110,200
1027
; First reduction phase done by table lookup here: xmm0 masked with
; $L$7_mask selects bytes of the constant via pshufb xmm9,xmm8.
1028	pand	xmm8,xmm0
1029DB	102,69,15,56,0,200
1030	pxor	xmm9,xmm0
1031DB	102,68,15,58,68,231,0
1032	psllq	xmm9,57
1033	movdqa	xmm8,xmm9
1034	pslldq	xmm9,8
1035DB	102,15,58,68,222,0
1036	psrldq	xmm8,8
1037	pxor	xmm0,xmm9
1038	pxor	xmm1,xmm8
1039	movdqu	xmm8,XMMWORD[r8]
1040
1041	movdqa	xmm9,xmm0
1042	psrlq	xmm0,1
1043DB	102,15,58,68,238,17
1044	xorps	xmm3,xmm11
1045	movdqu	xmm11,XMMWORD[16+r8]
1046DB	102,69,15,56,0,218
1047DB	102,15,58,68,231,16
1048	xorps	xmm5,xmm13
1049	movups	xmm7,XMMWORD[80+rdx]
1050DB	102,69,15,56,0,194
1051	pxor	xmm1,xmm9
1052	pxor	xmm9,xmm0
1053	psrlq	xmm0,5
1054
1055	movdqa	xmm13,xmm11
1056	pxor	xmm4,xmm12
1057	pshufd	xmm12,xmm11,78
1058	pxor	xmm0,xmm9
1059	pxor	xmm1,xmm8
1060	pxor	xmm12,xmm11
1061DB	102,69,15,58,68,222,0
1062	psrlq	xmm0,1
1063	pxor	xmm0,xmm1
1064	movdqa	xmm1,xmm0
1065DB	102,69,15,58,68,238,17
1066	xorps	xmm3,xmm11
1067	pshufd	xmm8,xmm0,78
1068	pxor	xmm8,xmm0
1069
1070DB	102,68,15,58,68,231,0
1071	xorps	xmm5,xmm13
1072
1073	lea	r8,[64+r8]
1074	sub	r9,0x40
1075	jnc	NEAR $L$mod4_loop
1076
; Finish the last group of four without loading more input.
1077$L$tail4x:
1078DB	102,65,15,58,68,199,0
1079DB	102,65,15,58,68,207,17
1080DB	102,68,15,58,68,199,16
1081	xorps	xmm4,xmm12
1082	xorps	xmm0,xmm3
1083	xorps	xmm1,xmm5
1084	pxor	xmm1,xmm0
1085	pxor	xmm8,xmm4
1086
1087	pxor	xmm8,xmm1
1088	pxor	xmm1,xmm0
1089
1090	movdqa	xmm9,xmm8
1091	psrldq	xmm8,8
1092	pslldq	xmm9,8
1093	pxor	xmm1,xmm8
1094	pxor	xmm0,xmm9
1095
1096	movdqa	xmm4,xmm0
1097	movdqa	xmm3,xmm0
1098	psllq	xmm0,5
1099	pxor	xmm3,xmm0
1100	psllq	xmm0,1
1101	pxor	xmm0,xmm3
1102	psllq	xmm0,57
1103	movdqa	xmm3,xmm0
1104	pslldq	xmm0,8
1105	psrldq	xmm3,8
1106	pxor	xmm0,xmm4
1107	pxor	xmm1,xmm3
1108
1109
1110	movdqa	xmm4,xmm0
1111	psrlq	xmm0,1
1112	pxor	xmm1,xmm4
1113	pxor	xmm4,xmm0
1114	psrlq	xmm0,5
1115	pxor	xmm0,xmm4
1116	psrlq	xmm0,1
1117	pxor	xmm0,xmm1
; Undo the last subtraction: r9 = bytes still unprocessed.  xmm7 is
; reloaded with the value at +32 for the one- and two-block code, and
; r9 is re-biased by -16 as at function entry.
1118	add	r9,0x40
1119	jz	NEAR $L$done
1120	movdqu	xmm7,XMMWORD[32+rdx]
1121	sub	r9,0x10
1122	jz	NEAR $L$odd_tail
1123$L$skip4x:
1124
1125
1126
1127
1128
; Two-block path: block 0 is XORed into the accumulator, block 1 is
; multiplied by the value at [rdx] right away.
1129	movdqu	xmm8,XMMWORD[r8]
1130	movdqu	xmm3,XMMWORD[16+r8]
1131DB	102,69,15,56,0,194
1132DB	102,65,15,56,0,218
1133	pxor	xmm0,xmm8
1134
1135	movdqa	xmm5,xmm3
1136	pshufd	xmm4,xmm3,78
1137	pxor	xmm4,xmm3
1138DB	102,15,58,68,218,0
1139DB	102,15,58,68,234,17
1140DB	102,15,58,68,231,0
1141
; Unsigned test on the biased count: stay in the loop only while more
; than one further block follows the current pair.
1142	lea	r8,[32+r8]
1143	nop
1144	sub	r9,0x20
1145	jbe	NEAR $L$even_tail
1146	nop
1147	jmp	NEAR $L$mod_loop
1148
1149ALIGN	32
; Multiplies the accumulator by the value at [rdx+16], combines it with
; the pending product of the previous second block, reduces, and starts
; on the next pair - all interleaved.
1150$L$mod_loop:
1151	movdqa	xmm1,xmm0
1152	movdqa	xmm8,xmm4
1153	pshufd	xmm4,xmm0,78
1154	pxor	xmm4,xmm0
1155
1156DB	102,15,58,68,198,0
1157DB	102,15,58,68,206,17
1158DB	102,15,58,68,231,16
1159
1160	pxor	xmm0,xmm3
1161	pxor	xmm1,xmm5
1162	movdqu	xmm9,XMMWORD[r8]
1163	pxor	xmm8,xmm0
1164DB	102,69,15,56,0,202
1165	movdqu	xmm3,XMMWORD[16+r8]
1166
1167	pxor	xmm8,xmm1
1168	pxor	xmm1,xmm9
1169	pxor	xmm4,xmm8
1170DB	102,65,15,56,0,218
1171	movdqa	xmm8,xmm4
1172	psrldq	xmm8,8
1173	pslldq	xmm4,8
1174	pxor	xmm1,xmm8
1175	pxor	xmm0,xmm4
1176
1177	movdqa	xmm5,xmm3
1178
1179	movdqa	xmm9,xmm0
1180	movdqa	xmm8,xmm0
1181	psllq	xmm0,5
1182	pxor	xmm8,xmm0
1183DB	102,15,58,68,218,0
1184	psllq	xmm0,1
1185	pxor	xmm0,xmm8
1186	psllq	xmm0,57
1187	movdqa	xmm8,xmm0
1188	pslldq	xmm0,8
1189	psrldq	xmm8,8
1190	pxor	xmm0,xmm9
1191	pshufd	xmm4,xmm5,78
1192	pxor	xmm1,xmm8
1193	pxor	xmm4,xmm5
1194
1195	movdqa	xmm9,xmm0
1196	psrlq	xmm0,1
1197DB	102,15,58,68,234,17
1198	pxor	xmm1,xmm9
1199	pxor	xmm9,xmm0
1200	psrlq	xmm0,5
1201	pxor	xmm0,xmm9
1202	lea	r8,[32+r8]
1203	psrlq	xmm0,1
1204DB	102,15,58,68,231,0
1205	pxor	xmm0,xmm1
1206
1207	sub	r9,0x20
1208	ja	NEAR $L$mod_loop
1209
; Finish the pending pair without loading more input.
1210$L$even_tail:
1211	movdqa	xmm1,xmm0
1212	movdqa	xmm8,xmm4
1213	pshufd	xmm4,xmm0,78
1214	pxor	xmm4,xmm0
1215
1216DB	102,15,58,68,198,0
1217DB	102,15,58,68,206,17
1218DB	102,15,58,68,231,16
1219
1220	pxor	xmm0,xmm3
1221	pxor	xmm1,xmm5
1222	pxor	xmm8,xmm0
1223	pxor	xmm8,xmm1
1224	pxor	xmm4,xmm8
1225	movdqa	xmm8,xmm4
1226	psrldq	xmm8,8
1227	pslldq	xmm4,8
1228	pxor	xmm1,xmm8
1229	pxor	xmm0,xmm4
1230
1231	movdqa	xmm4,xmm0
1232	movdqa	xmm3,xmm0
1233	psllq	xmm0,5
1234	pxor	xmm3,xmm0
1235	psllq	xmm0,1
1236	pxor	xmm0,xmm3
1237	psllq	xmm0,57
1238	movdqa	xmm3,xmm0
1239	pslldq	xmm0,8
1240	psrldq	xmm3,8
1241	pxor	xmm0,xmm4
1242	pxor	xmm1,xmm3
1243
1244
1245	movdqa	xmm4,xmm0
1246	psrlq	xmm0,1
1247	pxor	xmm1,xmm4
1248	pxor	xmm4,xmm0
1249	psrlq	xmm0,5
1250	pxor	xmm0,xmm4
1251	psrlq	xmm0,1
1252	pxor	xmm0,xmm1
; With the bias in place r9 is zero exactly when one more block is
; left; any other value means the input is exhausted.
1253	test	r9,r9
1254	jnz	NEAR $L$done
1255
; Single trailing block: XOR into the accumulator, multiply by the
; value at [rdx], reduce.
1256$L$odd_tail:
1257	movdqu	xmm8,XMMWORD[r8]
1258DB	102,69,15,56,0,194
1259	pxor	xmm0,xmm8
1260	movdqa	xmm1,xmm0
1261	pshufd	xmm3,xmm0,78
1262	pxor	xmm3,xmm0
1263DB	102,15,58,68,194,0
1264DB	102,15,58,68,202,17
1265DB	102,15,58,68,223,0
1266	pxor	xmm3,xmm0
1267	pxor	xmm3,xmm1
1268
1269	movdqa	xmm4,xmm3
1270	psrldq	xmm3,8
1271	pslldq	xmm4,8
1272	pxor	xmm1,xmm3
1273	pxor	xmm0,xmm4
1274
1275	movdqa	xmm4,xmm0
1276	movdqa	xmm3,xmm0
1277	psllq	xmm0,5
1278	pxor	xmm3,xmm0
1279	psllq	xmm0,1
1280	pxor	xmm0,xmm3
1281	psllq	xmm0,57
1282	movdqa	xmm3,xmm0
1283	pslldq	xmm0,8
1284	psrldq	xmm3,8
1285	pxor	xmm0,xmm4
1286	pxor	xmm1,xmm3
1287
1288
1289	movdqa	xmm4,xmm0
1290	psrlq	xmm0,1
1291	pxor	xmm1,xmm4
1292	pxor	xmm4,xmm0
1293	psrlq	xmm0,5
1294	pxor	xmm0,xmm4
1295	psrlq	xmm0,1
1296	pxor	xmm0,xmm1
; Shuffle the accumulator back through xmm10, store it, restore
; xmm6..xmm15 and release the 168-byte frame.
1297$L$done:
1298DB	102,65,15,56,0,194
1299	movdqu	XMMWORD[rcx],xmm0
1300	movaps	xmm6,XMMWORD[rsp]
1301	movaps	xmm7,XMMWORD[16+rsp]
1302	movaps	xmm8,XMMWORD[32+rsp]
1303	movaps	xmm9,XMMWORD[48+rsp]
1304	movaps	xmm10,XMMWORD[64+rsp]
1305	movaps	xmm11,XMMWORD[80+rsp]
1306	movaps	xmm12,XMMWORD[96+rsp]
1307	movaps	xmm13,XMMWORD[112+rsp]
1308	movaps	xmm14,XMMWORD[128+rsp]
1309	movaps	xmm15,XMMWORD[144+rsp]
1310	lea	rsp,[168+rsp]
1311$L$SEH_end_gcm_ghash_clmul:
1312	DB	0F3h,0C3h		;repret
1313
;-----------------------------------------------------------------------
; gcm_init_avx - Win64 leaf routine (arguments in rcx, rdx).
;   rcx -> output; advanced by 48 per iteration, four iterations, so
;          192 bytes are written in total
;   rdx -> 16-byte input, loaded unaligned
; Same pre-processing of the input as gcm_init_clmul (qword swap,
; 128-bit shift left by one, conditional XOR of $L$0x1c2_polynomial),
; giving K in xmm2.  Each iteration then stores, relative to the rcx
; value at the start of that iteration:
;   [+0]  the current product (K on the first pass)
;   [+16] that product multiplied by K
;   [+32] the packed hi^lo halves of the two, written one step late
;         through [rcx-16] after rcx has been advanced
; xmm6 (callee-saved on Win64) holds K.hi^K.lo and is saved in a
; 24-byte frame.
;-----------------------------------------------------------------------
1314global	gcm_init_avx
1315
1316ALIGN	32
1317gcm_init_avx:
1318$L$SEH_begin_gcm_init_avx:
1319
; sub rsp,0x18 / movaps [rsp],xmm6
1320DB	0x48,0x83,0xec,0x18
1321DB	0x0f,0x29,0x34,0x24
1322	vzeroupper
1323
1324	vmovdqu	xmm2,XMMWORD[rdx]
1325	vpshufd	xmm2,xmm2,78
1326
1327
1328	vpshufd	xmm4,xmm2,255
1329	vpsrlq	xmm3,xmm2,63
1330	vpsllq	xmm2,xmm2,1
1331	vpxor	xmm5,xmm5,xmm5
1332	vpcmpgtd	xmm5,xmm5,xmm4
1333	vpslldq	xmm3,xmm3,8
1334	vpor	xmm2,xmm2,xmm3
1335
1336
1337	vpand	xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
1338	vpxor	xmm2,xmm2,xmm5
1339
1340	vpunpckhqdq	xmm6,xmm2,xmm2
1341	vmovdqa	xmm0,xmm2
1342	vpxor	xmm6,xmm6,xmm2
; Four iterations; the first one enters at $L$init_start_avx so that K
; itself is stored before any multiply.
1343	mov	r10,4
1344	jmp	NEAR $L$init_start_avx
1345ALIGN	32
1346$L$init_loop_avx:
; Store the packed hi^lo halves belonging to the previous iteration.
1347	vpalignr	xmm5,xmm4,xmm3,8
1348	vmovdqu	XMMWORD[(-16)+rcx],xmm5
; xmm0 = xmm0 * K (three carry-less multiplies, then reduction).
1349	vpunpckhqdq	xmm3,xmm0,xmm0
1350	vpxor	xmm3,xmm3,xmm0
1351	vpclmulqdq	xmm1,xmm0,xmm2,0x11
1352	vpclmulqdq	xmm0,xmm0,xmm2,0x00
1353	vpclmulqdq	xmm3,xmm3,xmm6,0x00
1354	vpxor	xmm4,xmm1,xmm0
1355	vpxor	xmm3,xmm3,xmm4
1356
1357	vpslldq	xmm4,xmm3,8
1358	vpsrldq	xmm3,xmm3,8
1359	vpxor	xmm0,xmm0,xmm4
1360	vpxor	xmm1,xmm1,xmm3
; Reduction, first phase (left shifts by 57, 62 and 63).
1361	vpsllq	xmm3,xmm0,57
1362	vpsllq	xmm4,xmm0,62
1363	vpxor	xmm4,xmm4,xmm3
1364	vpsllq	xmm3,xmm0,63
1365	vpxor	xmm4,xmm4,xmm3
1366	vpslldq	xmm3,xmm4,8
1367	vpsrldq	xmm4,xmm4,8
1368	vpxor	xmm0,xmm0,xmm3
1369	vpxor	xmm1,xmm1,xmm4
1370
; Reduction, second phase (right shifts by 1, 5 and 1).
1371	vpsrlq	xmm4,xmm0,1
1372	vpxor	xmm1,xmm1,xmm0
1373	vpxor	xmm0,xmm0,xmm4
1374	vpsrlq	xmm4,xmm4,5
1375	vpxor	xmm0,xmm0,xmm4
1376	vpsrlq	xmm0,xmm0,1
1377	vpxor	xmm0,xmm0,xmm1
1378$L$init_start_avx:
; Keep the current product in xmm5, then multiply by K once more.
1379	vmovdqa	xmm5,xmm0
1380	vpunpckhqdq	xmm3,xmm0,xmm0
1381	vpxor	xmm3,xmm3,xmm0
1382	vpclmulqdq	xmm1,xmm0,xmm2,0x11
1383	vpclmulqdq	xmm0,xmm0,xmm2,0x00
1384	vpclmulqdq	xmm3,xmm3,xmm6,0x00
1385	vpxor	xmm4,xmm1,xmm0
1386	vpxor	xmm3,xmm3,xmm4
1387
1388	vpslldq	xmm4,xmm3,8
1389	vpsrldq	xmm3,xmm3,8
1390	vpxor	xmm0,xmm0,xmm4
1391	vpxor	xmm1,xmm1,xmm3
1392	vpsllq	xmm3,xmm0,57
1393	vpsllq	xmm4,xmm0,62
1394	vpxor	xmm4,xmm4,xmm3
1395	vpsllq	xmm3,xmm0,63
1396	vpxor	xmm4,xmm4,xmm3
1397	vpslldq	xmm3,xmm4,8
1398	vpsrldq	xmm4,xmm4,8
1399	vpxor	xmm0,xmm0,xmm3
1400	vpxor	xmm1,xmm1,xmm4
1401
1402	vpsrlq	xmm4,xmm0,1
1403	vpxor	xmm1,xmm1,xmm0
1404	vpxor	xmm0,xmm0,xmm4
1405	vpsrlq	xmm4,xmm4,5
1406	vpxor	xmm0,xmm0,xmm4
1407	vpsrlq	xmm0,xmm0,1
1408	vpxor	xmm0,xmm0,xmm1
; Store both products; xmm3/xmm4 keep their hi^lo halves for the
; vpalignr at the top of the next iteration (or after the loop).
1409	vpshufd	xmm3,xmm5,78
1410	vpshufd	xmm4,xmm0,78
1411	vpxor	xmm3,xmm3,xmm5
1412	vmovdqu	XMMWORD[rcx],xmm5
1413	vpxor	xmm4,xmm4,xmm0
1414	vmovdqu	XMMWORD[16+rcx],xmm0
1415	lea	rcx,[48+rcx]
1416	sub	r10,1
1417	jnz	NEAR $L$init_loop_avx
1418
; Final packed entry.  The two source operands are in the opposite
; order from the in-loop vpalignr, so this last entry has its two
; qwords the other way round.
1419	vpalignr	xmm5,xmm3,xmm4,8
1420	vmovdqu	XMMWORD[(-16)+rcx],xmm5
1421
1422	vzeroupper
1423	movaps	xmm6,XMMWORD[rsp]
1424	lea	rsp,[24+rsp]
1425$L$SEH_end_gcm_init_avx:
1426	DB	0F3h,0C3h		;repret
1427
;-----------------------------------------------------------------------
; gcm_gmult_avx - no AVX-specific body: tail-jumps to $L$_gmult_clmul,
; the entry label of gcm_gmult_clmul, with the argument registers
; untouched, so it takes the same arguments and returns directly to
; this routine's caller.
;-----------------------------------------------------------------------
1428global	gcm_gmult_avx
1429
1430ALIGN	32
1431gcm_gmult_avx:
1432	jmp	NEAR $L$_gmult_clmul
1433
1434global	gcm_ghash_avx
1435
1436ALIGN	32
1437gcm_ghash_avx:
1438	lea	rax,[((-136))+rsp]
1439$L$SEH_begin_gcm_ghash_avx:
1440
1441DB	0x48,0x8d,0x60,0xe0
1442DB	0x0f,0x29,0x70,0xe0
1443DB	0x0f,0x29,0x78,0xf0
1444DB	0x44,0x0f,0x29,0x00
1445DB	0x44,0x0f,0x29,0x48,0x10
1446DB	0x44,0x0f,0x29,0x50,0x20
1447DB	0x44,0x0f,0x29,0x58,0x30
1448DB	0x44,0x0f,0x29,0x60,0x40
1449DB	0x44,0x0f,0x29,0x68,0x50
1450DB	0x44,0x0f,0x29,0x70,0x60
1451DB	0x44,0x0f,0x29,0x78,0x70
1452	vzeroupper
1453
1454	vmovdqu	xmm10,XMMWORD[rcx]
1455	lea	r10,[$L$0x1c2_polynomial]
1456	lea	rdx,[64+rdx]
1457	vmovdqu	xmm13,XMMWORD[$L$bswap_mask]
1458	vpshufb	xmm10,xmm10,xmm13
1459	cmp	r9,0x80
1460	jb	NEAR $L$short_avx
1461	sub	r9,0x80
1462
1463	vmovdqu	xmm14,XMMWORD[112+r8]
1464	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
1465	vpshufb	xmm14,xmm14,xmm13
1466	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
1467
1468	vpunpckhqdq	xmm9,xmm14,xmm14
1469	vmovdqu	xmm15,XMMWORD[96+r8]
1470	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1471	vpxor	xmm9,xmm9,xmm14
1472	vpshufb	xmm15,xmm15,xmm13
1473	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1474	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
1475	vpunpckhqdq	xmm8,xmm15,xmm15
1476	vmovdqu	xmm14,XMMWORD[80+r8]
1477	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1478	vpxor	xmm8,xmm8,xmm15
1479
1480	vpshufb	xmm14,xmm14,xmm13
1481	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1482	vpunpckhqdq	xmm9,xmm14,xmm14
1483	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1484	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
1485	vpxor	xmm9,xmm9,xmm14
1486	vmovdqu	xmm15,XMMWORD[64+r8]
1487	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1488	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
1489
1490	vpshufb	xmm15,xmm15,xmm13
1491	vpxor	xmm3,xmm3,xmm0
1492	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1493	vpxor	xmm4,xmm4,xmm1
1494	vpunpckhqdq	xmm8,xmm15,xmm15
1495	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1496	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
1497	vpxor	xmm5,xmm5,xmm2
1498	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1499	vpxor	xmm8,xmm8,xmm15
1500
1501	vmovdqu	xmm14,XMMWORD[48+r8]
1502	vpxor	xmm0,xmm0,xmm3
1503	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1504	vpxor	xmm1,xmm1,xmm4
1505	vpshufb	xmm14,xmm14,xmm13
1506	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1507	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
1508	vpxor	xmm2,xmm2,xmm5
1509	vpunpckhqdq	xmm9,xmm14,xmm14
1510	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1511	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
1512	vpxor	xmm9,xmm9,xmm14
1513
1514	vmovdqu	xmm15,XMMWORD[32+r8]
1515	vpxor	xmm3,xmm3,xmm0
1516	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1517	vpxor	xmm4,xmm4,xmm1
1518	vpshufb	xmm15,xmm15,xmm13
1519	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1520	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
1521	vpxor	xmm5,xmm5,xmm2
1522	vpunpckhqdq	xmm8,xmm15,xmm15
1523	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1524	vpxor	xmm8,xmm8,xmm15
1525
1526	vmovdqu	xmm14,XMMWORD[16+r8]
1527	vpxor	xmm0,xmm0,xmm3
1528	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1529	vpxor	xmm1,xmm1,xmm4
1530	vpshufb	xmm14,xmm14,xmm13
1531	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1532	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
1533	vpxor	xmm2,xmm2,xmm5
1534	vpunpckhqdq	xmm9,xmm14,xmm14
1535	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1536	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
1537	vpxor	xmm9,xmm9,xmm14
1538
1539	vmovdqu	xmm15,XMMWORD[r8]
1540	vpxor	xmm3,xmm3,xmm0
1541	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1542	vpxor	xmm4,xmm4,xmm1
1543	vpshufb	xmm15,xmm15,xmm13
1544	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1545	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
1546	vpxor	xmm5,xmm5,xmm2
1547	vpclmulqdq	xmm2,xmm9,xmm7,0x10
1548
1549	lea	r8,[128+r8]
1550	cmp	r9,0x80
1551	jb	NEAR $L$tail_avx
1552
1553	vpxor	xmm15,xmm15,xmm10
1554	sub	r9,0x80
1555	jmp	NEAR $L$oop8x_avx
1556
1557ALIGN	32
1558$L$oop8x_avx:
1559	vpunpckhqdq	xmm8,xmm15,xmm15
1560	vmovdqu	xmm14,XMMWORD[112+r8]
1561	vpxor	xmm3,xmm3,xmm0
1562	vpxor	xmm8,xmm8,xmm15
1563	vpclmulqdq	xmm10,xmm15,xmm6,0x00
1564	vpshufb	xmm14,xmm14,xmm13
1565	vpxor	xmm4,xmm4,xmm1
1566	vpclmulqdq	xmm11,xmm15,xmm6,0x11
1567	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
1568	vpunpckhqdq	xmm9,xmm14,xmm14
1569	vpxor	xmm5,xmm5,xmm2
1570	vpclmulqdq	xmm12,xmm8,xmm7,0x00
1571	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
1572	vpxor	xmm9,xmm9,xmm14
1573
1574	vmovdqu	xmm15,XMMWORD[96+r8]
1575	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1576	vpxor	xmm10,xmm10,xmm3
1577	vpshufb	xmm15,xmm15,xmm13
1578	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1579	vxorps	xmm11,xmm11,xmm4
1580	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
1581	vpunpckhqdq	xmm8,xmm15,xmm15
1582	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1583	vpxor	xmm12,xmm12,xmm5
1584	vxorps	xmm8,xmm8,xmm15
1585
1586	vmovdqu	xmm14,XMMWORD[80+r8]
1587	vpxor	xmm12,xmm12,xmm10
1588	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1589	vpxor	xmm12,xmm12,xmm11
1590	vpslldq	xmm9,xmm12,8
1591	vpxor	xmm3,xmm3,xmm0
1592	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1593	vpsrldq	xmm12,xmm12,8
1594	vpxor	xmm10,xmm10,xmm9
1595	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
1596	vpshufb	xmm14,xmm14,xmm13
1597	vxorps	xmm11,xmm11,xmm12
1598	vpxor	xmm4,xmm4,xmm1
1599	vpunpckhqdq	xmm9,xmm14,xmm14
1600	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1601	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
1602	vpxor	xmm9,xmm9,xmm14
1603	vpxor	xmm5,xmm5,xmm2
1604
1605	vmovdqu	xmm15,XMMWORD[64+r8]
1606	vpalignr	xmm12,xmm10,xmm10,8
1607	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1608	vpshufb	xmm15,xmm15,xmm13
1609	vpxor	xmm0,xmm0,xmm3
1610	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1611	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
1612	vpunpckhqdq	xmm8,xmm15,xmm15
1613	vpxor	xmm1,xmm1,xmm4
1614	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1615	vxorps	xmm8,xmm8,xmm15
1616	vpxor	xmm2,xmm2,xmm5
1617
1618	vmovdqu	xmm14,XMMWORD[48+r8]
1619	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
1620	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1621	vpshufb	xmm14,xmm14,xmm13
1622	vpxor	xmm3,xmm3,xmm0
1623	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1624	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
1625	vpunpckhqdq	xmm9,xmm14,xmm14
1626	vpxor	xmm4,xmm4,xmm1
1627	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1628	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
1629	vpxor	xmm9,xmm9,xmm14
1630	vpxor	xmm5,xmm5,xmm2
1631
1632	vmovdqu	xmm15,XMMWORD[32+r8]
1633	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1634	vpshufb	xmm15,xmm15,xmm13
1635	vpxor	xmm0,xmm0,xmm3
1636	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1637	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
1638	vpunpckhqdq	xmm8,xmm15,xmm15
1639	vpxor	xmm1,xmm1,xmm4
1640	vpclmulqdq	xmm2,xmm9,xmm7,0x00
1641	vpxor	xmm8,xmm8,xmm15
1642	vpxor	xmm2,xmm2,xmm5
1643	vxorps	xmm10,xmm10,xmm12
1644
1645	vmovdqu	xmm14,XMMWORD[16+r8]
1646	vpalignr	xmm12,xmm10,xmm10,8
1647	vpclmulqdq	xmm3,xmm15,xmm6,0x00
1648	vpshufb	xmm14,xmm14,xmm13
1649	vpxor	xmm3,xmm3,xmm0
1650	vpclmulqdq	xmm4,xmm15,xmm6,0x11
1651	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
1652	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
1653	vxorps	xmm12,xmm12,xmm11
1654	vpunpckhqdq	xmm9,xmm14,xmm14
1655	vpxor	xmm4,xmm4,xmm1
1656	vpclmulqdq	xmm5,xmm8,xmm7,0x10
1657	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
1658	vpxor	xmm9,xmm9,xmm14
1659	vpxor	xmm5,xmm5,xmm2
1660
1661	vmovdqu	xmm15,XMMWORD[r8]
1662	vpclmulqdq	xmm0,xmm14,xmm6,0x00
1663	vpshufb	xmm15,xmm15,xmm13
1664	vpclmulqdq	xmm1,xmm14,xmm6,0x11
1665	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
1666	vpxor	xmm15,xmm15,xmm12
1667	vpclmulqdq	xmm2,xmm9,xmm7,0x10
1668	vpxor	xmm15,xmm15,xmm10
1669
1670	lea	r8,[128+r8]
1671	sub	r9,0x80
1672	jnc	NEAR $L$oop8x_avx
1673
1674	add	r9,0x80
1675	jmp	NEAR $L$tail_no_xor_avx
1676
1677ALIGN	32
1678$L$short_avx:
1679	vmovdqu	xmm14,XMMWORD[((-16))+r9*1+r8]
1680	lea	r8,[r9*1+r8]
1681	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
1682	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
1683	vpshufb	xmm15,xmm14,xmm13
1684
1685	vmovdqa	xmm3,xmm0
1686	vmovdqa	xmm4,xmm1
1687	vmovdqa	xmm5,xmm2
1688	sub	r9,0x10
1689	jz	NEAR $L$tail_avx
1690
1691	vpunpckhqdq	xmm8,xmm15,xmm15
1692	vpxor	xmm3,xmm3,xmm0
1693	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1694	vpxor	xmm8,xmm8,xmm15
1695	vmovdqu	xmm14,XMMWORD[((-32))+r8]
1696	vpxor	xmm4,xmm4,xmm1
1697	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1698	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
1699	vpshufb	xmm15,xmm14,xmm13
1700	vpxor	xmm5,xmm5,xmm2
1701	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1702	vpsrldq	xmm7,xmm7,8
1703	sub	r9,0x10
1704	jz	NEAR $L$tail_avx
1705
1706	vpunpckhqdq	xmm8,xmm15,xmm15
1707	vpxor	xmm3,xmm3,xmm0
1708	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1709	vpxor	xmm8,xmm8,xmm15
1710	vmovdqu	xmm14,XMMWORD[((-48))+r8]
1711	vpxor	xmm4,xmm4,xmm1
1712	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1713	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
1714	vpshufb	xmm15,xmm14,xmm13
1715	vpxor	xmm5,xmm5,xmm2
1716	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1717	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
1718	sub	r9,0x10
1719	jz	NEAR $L$tail_avx
1720
1721	vpunpckhqdq	xmm8,xmm15,xmm15
1722	vpxor	xmm3,xmm3,xmm0
1723	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1724	vpxor	xmm8,xmm8,xmm15
1725	vmovdqu	xmm14,XMMWORD[((-64))+r8]
1726	vpxor	xmm4,xmm4,xmm1
1727	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1728	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
1729	vpshufb	xmm15,xmm14,xmm13
1730	vpxor	xmm5,xmm5,xmm2
1731	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1732	vpsrldq	xmm7,xmm7,8
1733	sub	r9,0x10
1734	jz	NEAR $L$tail_avx
1735
1736	vpunpckhqdq	xmm8,xmm15,xmm15
1737	vpxor	xmm3,xmm3,xmm0
1738	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1739	vpxor	xmm8,xmm8,xmm15
1740	vmovdqu	xmm14,XMMWORD[((-80))+r8]
1741	vpxor	xmm4,xmm4,xmm1
1742	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1743	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
1744	vpshufb	xmm15,xmm14,xmm13
1745	vpxor	xmm5,xmm5,xmm2
1746	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1747	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
1748	sub	r9,0x10
1749	jz	NEAR $L$tail_avx
1750
1751	vpunpckhqdq	xmm8,xmm15,xmm15
1752	vpxor	xmm3,xmm3,xmm0
1753	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1754	vpxor	xmm8,xmm8,xmm15
1755	vmovdqu	xmm14,XMMWORD[((-96))+r8]
1756	vpxor	xmm4,xmm4,xmm1
1757	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1758	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
1759	vpshufb	xmm15,xmm14,xmm13
1760	vpxor	xmm5,xmm5,xmm2
1761	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1762	vpsrldq	xmm7,xmm7,8
1763	sub	r9,0x10
1764	jz	NEAR $L$tail_avx
1765
1766	vpunpckhqdq	xmm8,xmm15,xmm15
1767	vpxor	xmm3,xmm3,xmm0
1768	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1769	vpxor	xmm8,xmm8,xmm15
1770	vmovdqu	xmm14,XMMWORD[((-112))+r8]
1771	vpxor	xmm4,xmm4,xmm1
1772	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1773	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
1774	vpshufb	xmm15,xmm14,xmm13
1775	vpxor	xmm5,xmm5,xmm2
1776	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1777	vmovq	xmm7,QWORD[((184-64))+rdx]
1778	sub	r9,0x10
1779	jmp	NEAR $L$tail_avx
1780
1781ALIGN	32
1782$L$tail_avx:
1783	vpxor	xmm15,xmm15,xmm10
1784$L$tail_no_xor_avx:
1785	vpunpckhqdq	xmm8,xmm15,xmm15
1786	vpxor	xmm3,xmm3,xmm0
1787	vpclmulqdq	xmm0,xmm15,xmm6,0x00
1788	vpxor	xmm8,xmm8,xmm15
1789	vpxor	xmm4,xmm4,xmm1
1790	vpclmulqdq	xmm1,xmm15,xmm6,0x11
1791	vpxor	xmm5,xmm5,xmm2
1792	vpclmulqdq	xmm2,xmm8,xmm7,0x00
1793
1794	vmovdqu	xmm12,XMMWORD[r10]
1795
1796	vpxor	xmm10,xmm3,xmm0
1797	vpxor	xmm11,xmm4,xmm1
1798	vpxor	xmm5,xmm5,xmm2
1799
1800	vpxor	xmm5,xmm5,xmm10
1801	vpxor	xmm5,xmm5,xmm11
1802	vpslldq	xmm9,xmm5,8
1803	vpsrldq	xmm5,xmm5,8
1804	vpxor	xmm10,xmm10,xmm9
1805	vpxor	xmm11,xmm11,xmm5
1806
1807	vpclmulqdq	xmm9,xmm10,xmm12,0x10
1808	vpalignr	xmm10,xmm10,xmm10,8
1809	vpxor	xmm10,xmm10,xmm9
1810
1811	vpclmulqdq	xmm9,xmm10,xmm12,0x10
1812	vpalignr	xmm10,xmm10,xmm10,8
1813	vpxor	xmm10,xmm10,xmm11
1814	vpxor	xmm10,xmm10,xmm9
1815
1816	cmp	r9,0
1817	jne	NEAR $L$short_avx
1818
1819	vpshufb	xmm10,xmm10,xmm13
1820	vmovdqu	XMMWORD[rcx],xmm10
1821	vzeroupper
1822	movaps	xmm6,XMMWORD[rsp]
1823	movaps	xmm7,XMMWORD[16+rsp]
1824	movaps	xmm8,XMMWORD[32+rsp]
1825	movaps	xmm9,XMMWORD[48+rsp]
1826	movaps	xmm10,XMMWORD[64+rsp]
1827	movaps	xmm11,XMMWORD[80+rsp]
1828	movaps	xmm12,XMMWORD[96+rsp]
1829	movaps	xmm13,XMMWORD[112+rsp]
1830	movaps	xmm14,XMMWORD[128+rsp]
1831	movaps	xmm15,XMMWORD[144+rsp]
1832	lea	rsp,[168+rsp]
1833$L$SEH_end_gcm_ghash_avx:
1834	DB	0F3h,0C3h		;repret
1835
; Four 16-byte constants, laid out back to back starting on a 64-byte boundary.
1836ALIGN	64
; Bytes 15,14,...,0.  Used as a (v)pshufb control, this pattern reverses the
; byte order of a 128-bit register (destination byte i = source byte 15-i).
1837$L$bswap_mask:
1838DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; Read as a little-endian 128-bit value: low qword = 1,
; high qword = 0xc200000000000000.
; NOTE(review): presumably the GHASH reduction constant; its users are not
; in the visible part of the file -- confirm.
1839$L$0x1c2_polynomial:
1840DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
; Two qwords, each equal to 7 (dword pairs 7,0).
1841$L$7_mask:
1842	DD	7,0,7,0
; Two qwords: 7 and 450 (450 = 0x1c2).
1843$L$7_mask_poly:
1844	DD	7,0,450,0
1845ALIGN	64
1846
; 16 qword entries, each emitted as two DDs (low dword first, high dword
; second); the low dword of every entry is 0.
; gcm_gmult_4bit loads this table's address into r11, masks an index with
; 0xf, and XORs QWORD[index*8+r11] into r9, so the entry order and the
; 8-byte stride must not change.
; entry[1] = 471859200 << 32 = 0x1C20 << 48; entry[2] is that value shifted
; left by one bit, and entry[3] = entry[1] ^ entry[2].
1847$L$rem_4bit:
1848	DD	0,0,0,471859200,0,943718400,0,610271232
1849	DD	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1850	DD	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1851	DD	0,2441084928,0,2376073216,0,2847932416,0,3051356160
1852
; 256 entries of 16 bits each (32 rows of 8 words).
; entry[1] = 0x01C2, entry[2] = 0x0384 (entry[1] << 1),
; entry[3] = 0x0246 (entry[1] ^ entry[2]).
; NOTE(review): no reference to this label appears in the visible part of
; the file; presumably it is indexed by a byte value in gcm_ghash_4bit --
; confirm against that routine before changing the layout.
1853$L$rem_8bit:
1854	DW	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1855	DW	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1856	DW	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1857	DW	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1858	DW	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1859	DW	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1860	DW	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1861	DW	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1862	DW	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1863	DW	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1864	DW	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1865	DW	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1866	DW	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1867	DW	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1868	DW	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1869	DW	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1870	DW	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1871	DW	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1872	DW	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1873	DW	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1874	DW	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1875	DW	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1876	DW	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1877	DW	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1878	DW	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1879	DW	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1880	DW	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1881	DW	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1882	DW	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1883	DW	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1884	DW	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1885	DW	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1886
; Unlabelled, NUL-terminated ASCII identification string embedded in the
; object.  The bytes decode to:
;   "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>"
1887DB	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
1888DB	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1889DB	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1890DB	114,103,62,0
1891ALIGN	64
1892EXTERN	__imp_RtlVirtualUnwind
1893
1894ALIGN	16
;-----------------------------------------------------------------------
; se_handler -- Win64 language-specific exception handler.  It is named as
; the handler in the .xdata records $L$SEH_info_gcm_gmult_4bit and
; $L$SEH_info_gcm_ghash_4bit.
;
; ABI:  Microsoft x64.
; In:   r8 = CONTEXT *context, r9 = DISPATCHER_CONTEXT *disp
;       (rcx and rdx are overwritten before they are ever read).
; Out:  eax = 1 (ExceptionContinueSearch).
;
; Structure field names in the comments below follow the Win64 CONTEXT and
; DISPATCHER_CONTEXT layouts (winnt.h); the code itself uses raw offsets.
;
; What it does: rebuilds, inside *context, the register state the faulting
; function's caller had (rsp, rsi, rdi, and -- when the fault is in the
; function body -- rbx, rbp, r12-r15), copies that CONTEXT into
; disp->ContextRecord, then calls RtlVirtualUnwind so the search can
; continue in the caller's frame.
;
; HandlerData (the two DDs that follow the handler RVA in .xdata) holds
; the RVAs of the function's end-of-prologue label and its epilogue label.
;
; Stack: 8 pushes + pushfq + 64 bytes = 136 bytes; together with the return
; address that is 144, so rsp is 16-byte aligned at the call if it was
; aligned at the caller's call site.
; The 64 bytes are the 32-byte shadow area plus four stack arguments.
;-----------------------------------------------------------------------
1895se_handler:
1896	push	rsi
1897	push	rdi
1898	push	rbx
1899	push	rbp
1900	push	r12
1901	push	r13
1902	push	r14
1903	push	r15
1904	pushfq
1905	sub	rsp,64
1906
1907	mov	rax,QWORD[120+r8]	; rax = context->Rax (the 4-bit routines do `mov rax,rsp` on entry)
1908	mov	rbx,QWORD[248+r8]	; rbx = context->Rip
1909
1910	mov	rsi,QWORD[8+r9]	; rsi = disp->ImageBase
1911	mov	r11,QWORD[56+r9]	; r11 = disp->HandlerData
1912
1913	mov	r10d,DWORD[r11]	; HandlerData[0]: RVA of the end-of-prologue label
1914	lea	r10,[r10*1+rsi]	; ... converted to an absolute address
1915	cmp	rbx,r10
1916	jb	NEAR $L$in_prologue	; Rip before that label: nothing to restore, rax = entry rsp
1917
1918	mov	rax,QWORD[152+r8]	; rax = context->Rsp
1919
1920	mov	r10d,DWORD[4+r11]	; HandlerData[1]: RVA of the epilogue label
1921	lea	r10,[r10*1+rsi]
1922	cmp	rbx,r10
1923	jae	NEAR $L$in_prologue	; Rip at/after the epilogue label: rsp is already the entry rsp
1924
; Fault inside the function body.  48 = six 8-byte pushes, 280 = the local
; area; this matches gcm_gmult_4bit's prologue (push rbx,rbp,r12-r15 then
; sub rsp,280).
; NOTE(review): gcm_ghash_4bit shares this handler, so it must use the same
; frame layout -- its prologue is not in the visible part of the file.
1925	lea	rax,[((48+280))+rax]	; rax = rsp value at function entry
1926
1927	mov	rbx,QWORD[((-8))+rax]	; reload the six pushed registers from their slots
1928	mov	rbp,QWORD[((-16))+rax]
1929	mov	r12,QWORD[((-24))+rax]
1930	mov	r13,QWORD[((-32))+rax]
1931	mov	r14,QWORD[((-40))+rax]
1932	mov	r15,QWORD[((-48))+rax]
1933	mov	QWORD[144+r8],rbx	; context->Rbx
1934	mov	QWORD[160+r8],rbp	; context->Rbp
1935	mov	QWORD[216+r8],r12	; context->R12
1936	mov	QWORD[224+r8],r13	; context->R13
1937	mov	QWORD[232+r8],r14	; context->R14
1938	mov	QWORD[240+r8],r15	; context->R15
1939
; Common tail: rax holds the function's entry rsp on every path that gets here.
1940$L$in_prologue:
1941	mov	rdi,QWORD[8+rax]	; caller's rdi, stored at [8+rsp] by the WIN64 prologue
1942	mov	rsi,QWORD[16+rax]	; caller's rsi, stored at [16+rsp] by the WIN64 prologue
1943	mov	QWORD[152+r8],rax	; context->Rsp = entry rsp
1944	mov	QWORD[168+r8],rsi	; context->Rsi
1945	mov	QWORD[176+r8],rdi	; context->Rdi
1946
1947	mov	rdi,QWORD[40+r9]	; destination = disp->ContextRecord
1948	mov	rsi,r8	; source = context
1949	mov	ecx,154	; 154 qwords = 1232 bytes
1950	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq
1951
; RtlVirtualUnwind(HandlerType = 0, ImageBase, ControlPc, FunctionEntry,
;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
1952	mov	rsi,r9	; rsi = disp (r9 is about to become an argument)
1953	xor	rcx,rcx	; arg1 = 0; also the NULL stored as arg8 below
1954	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
1955	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
1956	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
1957	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
1958	lea	r11,[56+rsi]	; &disp->HandlerData
1959	lea	r12,[24+rsi]	; &disp->EstablisherFrame
1960	mov	QWORD[32+rsp],r10	; arg5 (first slot above the 32-byte shadow area)
1961	mov	QWORD[40+rsp],r11	; arg6
1962	mov	QWORD[48+rsp],r12	; arg7
1963	mov	QWORD[56+rsp],rcx	; arg8 = NULL
1964	call	QWORD[__imp_RtlVirtualUnwind]
1965
1966	mov	eax,1	; return ExceptionContinueSearch
1967	add	rsp,64
1968	popfq
1969	pop	r15
1970	pop	r14
1971	pop	r13
1972	pop	r12
1973	pop	rbp
1974	pop	rbx
1975	pop	rdi
1976	pop	rsi
1977	DB	0F3h,0C3h		;repret (F3 C3 = rep ret)
1978
1979
; .pdata: the Win64 function table.  Each record is three image-relative
; dwords (`wrt ..imagebase`): start of the covered code range, end of the
; range, and the address of the function's unwind information in .xdata.
1980section	.pdata rdata align=4
1981ALIGN	4
1982	DD	$L$SEH_begin_gcm_gmult_4bit wrt ..imagebase
1983	DD	$L$SEH_end_gcm_gmult_4bit wrt ..imagebase
1984	DD	$L$SEH_info_gcm_gmult_4bit wrt ..imagebase
1985
1986	DD	$L$SEH_begin_gcm_ghash_4bit wrt ..imagebase
1987	DD	$L$SEH_end_gcm_ghash_4bit wrt ..imagebase
1988	DD	$L$SEH_info_gcm_ghash_4bit wrt ..imagebase
1989
1990	DD	$L$SEH_begin_gcm_init_clmul wrt ..imagebase
1991	DD	$L$SEH_end_gcm_init_clmul wrt ..imagebase
1992	DD	$L$SEH_info_gcm_init_clmul wrt ..imagebase
1993
1994	DD	$L$SEH_begin_gcm_ghash_clmul wrt ..imagebase
1995	DD	$L$SEH_end_gcm_ghash_clmul wrt ..imagebase
1996	DD	$L$SEH_info_gcm_ghash_clmul wrt ..imagebase
1997	DD	$L$SEH_begin_gcm_init_avx wrt ..imagebase
1998	DD	$L$SEH_end_gcm_init_avx wrt ..imagebase
1999	DD	$L$SEH_info_gcm_init_clmul wrt ..imagebase	; gcm_init_avx reuses the gcm_init_clmul unwind record
2000
2001	DD	$L$SEH_begin_gcm_ghash_avx wrt ..imagebase
2002	DD	$L$SEH_end_gcm_ghash_avx wrt ..imagebase
2003	DD	$L$SEH_info_gcm_ghash_clmul wrt ..imagebase	; gcm_ghash_avx reuses the gcm_ghash_clmul unwind record
2004section	.xdata rdata align=8
2005ALIGN	8
; Unwind information for gcm_gmult_4bit (Win64 UNWIND_INFO layout).
; Header byte 9 = version 1 in the low 3 bits, flags 1 (UNW_FLAG_EHANDLER)
; in the high 5 bits.  Prolog size, unwind-code count and frame register
; are all 0: there are no unwind codes, and the handler named below
; repairs the register context itself.
2006$L$SEH_info_gcm_gmult_4bit:
2007DB	9,0,0,0
2008	DD	se_handler wrt ..imagebase	; image-relative address of the handler
2009	DD	$L$gmult_prologue wrt ..imagebase,$L$gmult_epilogue wrt ..imagebase	; handler data: [0] end-of-prologue label, [1] epilogue label
; Unwind information for gcm_ghash_4bit (Win64 UNWIND_INFO layout).
; Header byte 9 = version 1, flags 1 (UNW_FLAG_EHANDLER); prolog size,
; unwind-code count and frame register are 0.
2010$L$SEH_info_gcm_ghash_4bit:
2011DB	9,0,0,0
2012	DD	se_handler wrt ..imagebase	; image-relative address of the handler
2013	DD	$L$ghash_prologue wrt ..imagebase,$L$ghash_epilogue wrt ..imagebase	; handler data: [0] end-of-prologue label, [1] epilogue label
; Unwind information referenced by the .pdata records of gcm_init_clmul and
; gcm_init_avx.  Decoded per the Win64 UNWIND_INFO / UNWIND_CODE format;
; each code slot is 2 bytes: prolog offset, then (op info << 4) | op code.
2014$L$SEH_info_gcm_init_clmul:
2015DB	0x01,0x08,0x03,0x00	; version 1, no flags; prolog size 8; 3 code slots; no frame register
2016DB	0x08,0x68,0x00,0x00	; offset 8: UWOP_SAVE_XMM128 (op 8) xmm6; next slot 0 -> saved at [rsp+0]
2017DB	0x04,0x22,0x00,0x00	; offset 4: UWOP_ALLOC_SMALL (op 2), info 2 -> 2*8+8 = 24 bytes; final 0,0 pads to an even slot count
; Unwind information referenced by the .pdata records of gcm_ghash_clmul and
; gcm_ghash_avx.  Decoded per the Win64 UNWIND_INFO / UNWIND_CODE format;
; each code slot is 2 bytes: prolog offset, then (op info << 4) | op code.
; UWOP_SAVE_XMM128 (op 8) takes one extra slot holding the save offset
; divided by 16.  The offsets 0x00..0x90 and the 168-byte allocation agree
; with gcm_ghash_avx's epilogue (movaps xmm6..xmm15 from [rsp]..[144+rsp],
; then lea rsp,[168+rsp]).
2018$L$SEH_info_gcm_ghash_clmul:
2019DB	0x01,0x33,0x16,0x00	; version 1, no flags; prolog size 0x33; 0x16 = 22 code slots; no frame register
2020DB	0x33,0xf8,0x09,0x00	; offset 0x33: save xmm15 at [rsp+0x90]
2021DB	0x2e,0xe8,0x08,0x00	; offset 0x2e: save xmm14 at [rsp+0x80]
2022DB	0x29,0xd8,0x07,0x00	; offset 0x29: save xmm13 at [rsp+0x70]
2023DB	0x24,0xc8,0x06,0x00	; offset 0x24: save xmm12 at [rsp+0x60]
2024DB	0x1f,0xb8,0x05,0x00	; offset 0x1f: save xmm11 at [rsp+0x50]
2025DB	0x1a,0xa8,0x04,0x00	; offset 0x1a: save xmm10 at [rsp+0x40]
2026DB	0x15,0x98,0x03,0x00	; offset 0x15: save xmm9 at [rsp+0x30]
2027DB	0x10,0x88,0x02,0x00	; offset 0x10: save xmm8 at [rsp+0x20]
2028DB	0x0c,0x78,0x01,0x00	; offset 0x0c: save xmm7 at [rsp+0x10]
2029DB	0x08,0x68,0x00,0x00	; offset 0x08: save xmm6 at [rsp+0x00]
2030DB	0x04,0x01,0x15,0x00	; offset 0x04: UWOP_ALLOC_LARGE (op 1, info 0); next slot 0x0015 * 8 = 168 bytes
2031