1#if defined(__x86_64__)
2.text
3
// _aesni_encrypt -- encrypt a single 16-byte block.
//   %rdi : address of the 16-byte input block
//   %rsi : address the 16-byte result is stored to
//   %rdx : key schedule; round keys are read 16 bytes apart starting at
//          offset 0, and the dword at offset 240 is used as the loop count
// Modifies %eax, %rdx and flags; %xmm0-%xmm2 are zeroed before returning.
4.globl	_aesni_encrypt
5.private_extern _aesni_encrypt
6
7.p2align	4
8_aesni_encrypt:
9	movups	(%rdi),%xmm2
10	movl	240(%rdx),%eax
11	movups	(%rdx),%xmm0
12	movups	16(%rdx),%xmm1
13	leaq	32(%rdx),%rdx
// Initial whitening: block ^= first round key.
14	xorps	%xmm0,%xmm2
15L$oop_enc1_1:
// aesenc %xmm1,%xmm2
16.byte	102,15,56,220,209
// decl sets ZF for the jnz below; the movups/leaq in between leave flags alone.
17	decl	%eax
18	movups	(%rdx),%xmm1
19	leaq	16(%rdx),%rdx
20	jnz	L$oop_enc1_1
// aesenclast %xmm1,%xmm2, using the key fetched by the final loop pass.
21.byte	102,15,56,221,209
// Clear the round keys and (after storing it) the result from the registers.
22	pxor	%xmm0,%xmm0
23	pxor	%xmm1,%xmm1
24	movups	%xmm2,(%rsi)
25	pxor	%xmm2,%xmm2
// rep ret
26	.byte	0xf3,0xc3
27
28
// _aesni_decrypt -- decrypt a single 16-byte block.
//   %rdi : address of the 16-byte input block
//   %rsi : address the 16-byte result is stored to
//   %rdx : key schedule; round keys are read 16 bytes apart starting at
//          offset 0, and the dword at offset 240 is used as the loop count
// Modifies %eax, %rdx and flags; %xmm0-%xmm2 are zeroed before returning.
29.globl	_aesni_decrypt
30.private_extern _aesni_decrypt
31
32.p2align	4
33_aesni_decrypt:
34	movups	(%rdi),%xmm2
35	movl	240(%rdx),%eax
36	movups	(%rdx),%xmm0
37	movups	16(%rdx),%xmm1
38	leaq	32(%rdx),%rdx
// Initial whitening: block ^= first round key.
39	xorps	%xmm0,%xmm2
40L$oop_dec1_2:
// aesdec %xmm1,%xmm2
41.byte	102,15,56,222,209
// decl sets ZF for the jnz below; the movups/leaq in between leave flags alone.
42	decl	%eax
43	movups	(%rdx),%xmm1
44	leaq	16(%rdx),%rdx
45	jnz	L$oop_dec1_2
// aesdeclast %xmm1,%xmm2, using the key fetched by the final loop pass.
46.byte	102,15,56,223,209
// Clear the round keys and (after storing it) the result from the registers.
47	pxor	%xmm0,%xmm0
48	pxor	%xmm1,%xmm1
49	movups	%xmm2,(%rsi)
50	pxor	%xmm2,%xmm2
// rep ret
51	.byte	0xf3,0xc3
52
53
// _aesni_encrypt2 -- file-local helper (no .globl): encrypt two blocks at once.
//   %xmm2,%xmm3 : the two blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
54.p2align	4
55_aesni_encrypt2:
56	movups	(%rcx),%xmm0
57	shll	$4,%eax
58	movups	16(%rcx),%xmm1
// Whitening with the first round key.
59	xorps	%xmm0,%xmm2
60	xorps	%xmm0,%xmm3
61	movups	32(%rcx),%xmm0
// %rcx now points 32 + 16*count bytes into the schedule; %rax becomes a
// negative offset from there (16 - 16*count) that climbs towards zero.
62	leaq	32(%rcx,%rax,1),%rcx
63	negq	%rax
64	addq	$16,%rax
65
66L$enc_loop2:
// Two rounds per pass, alternating the keys held in %xmm1 and %xmm0.
// aesenc %xmm1,{%xmm2,%xmm3}
67.byte	102,15,56,220,209
68.byte	102,15,56,220,217
69	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; the following .byte ops and movups keep flags.
70	addq	$32,%rax
// aesenc %xmm0,{%xmm2,%xmm3}
71.byte	102,15,56,220,208
72.byte	102,15,56,220,216
73	movups	-16(%rcx,%rax,1),%xmm0
74	jnz	L$enc_loop2
75
// Final pair: aesenc with %xmm1, then aesenclast with %xmm0.
76.byte	102,15,56,220,209
77.byte	102,15,56,220,217
78.byte	102,15,56,221,208
79.byte	102,15,56,221,216
// rep ret
80	.byte	0xf3,0xc3
81
82
// _aesni_decrypt2 -- file-local helper (no .globl): decrypt two blocks at once.
//   %xmm2,%xmm3 : the two blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
83.p2align	4
84_aesni_decrypt2:
85	movups	(%rcx),%xmm0
86	shll	$4,%eax
87	movups	16(%rcx),%xmm1
// Whitening with the first round key.
88	xorps	%xmm0,%xmm2
89	xorps	%xmm0,%xmm3
90	movups	32(%rcx),%xmm0
// %rcx now points 32 + 16*count bytes into the schedule; %rax becomes a
// negative offset from there (16 - 16*count) that climbs towards zero.
91	leaq	32(%rcx,%rax,1),%rcx
92	negq	%rax
93	addq	$16,%rax
94
95L$dec_loop2:
// Two rounds per pass, alternating the keys held in %xmm1 and %xmm0.
// aesdec %xmm1,{%xmm2,%xmm3}
96.byte	102,15,56,222,209
97.byte	102,15,56,222,217
98	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; the following .byte ops and movups keep flags.
99	addq	$32,%rax
// aesdec %xmm0,{%xmm2,%xmm3}
100.byte	102,15,56,222,208
101.byte	102,15,56,222,216
102	movups	-16(%rcx,%rax,1),%xmm0
103	jnz	L$dec_loop2
104
// Final pair: aesdec with %xmm1, then aesdeclast with %xmm0.
105.byte	102,15,56,222,209
106.byte	102,15,56,222,217
107.byte	102,15,56,223,208
108.byte	102,15,56,223,216
// rep ret
109	.byte	0xf3,0xc3
110
111
// _aesni_encrypt3 -- file-local helper (no .globl): encrypt three blocks at once.
//   %xmm2-%xmm4 : the three blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
112.p2align	4
113_aesni_encrypt3:
114	movups	(%rcx),%xmm0
115	shll	$4,%eax
116	movups	16(%rcx),%xmm1
// Whitening with the first round key.
117	xorps	%xmm0,%xmm2
118	xorps	%xmm0,%xmm3
119	xorps	%xmm0,%xmm4
120	movups	32(%rcx),%xmm0
// %rcx -> 32 + 16*count bytes into the schedule; %rax = 16 - 16*count,
// a negative offset that climbs towards zero.
121	leaq	32(%rcx,%rax,1),%rcx
122	negq	%rax
123	addq	$16,%rax
124
125L$enc_loop3:
// aesenc %xmm1,{%xmm2,%xmm3,%xmm4}
126.byte	102,15,56,220,209
127.byte	102,15,56,220,217
128.byte	102,15,56,220,225
129	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
130	addq	$32,%rax
// aesenc %xmm0,{%xmm2,%xmm3,%xmm4}
131.byte	102,15,56,220,208
132.byte	102,15,56,220,216
133.byte	102,15,56,220,224
134	movups	-16(%rcx,%rax,1),%xmm0
135	jnz	L$enc_loop3
136
// Final pair: aesenc with %xmm1, then aesenclast with %xmm0.
137.byte	102,15,56,220,209
138.byte	102,15,56,220,217
139.byte	102,15,56,220,225
140.byte	102,15,56,221,208
141.byte	102,15,56,221,216
142.byte	102,15,56,221,224
// rep ret
143	.byte	0xf3,0xc3
144
145
// _aesni_decrypt3 -- file-local helper (no .globl): decrypt three blocks at once.
//   %xmm2-%xmm4 : the three blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
146.p2align	4
147_aesni_decrypt3:
148	movups	(%rcx),%xmm0
149	shll	$4,%eax
150	movups	16(%rcx),%xmm1
// Whitening with the first round key.
151	xorps	%xmm0,%xmm2
152	xorps	%xmm0,%xmm3
153	xorps	%xmm0,%xmm4
154	movups	32(%rcx),%xmm0
// %rcx -> 32 + 16*count bytes into the schedule; %rax = 16 - 16*count,
// a negative offset that climbs towards zero.
155	leaq	32(%rcx,%rax,1),%rcx
156	negq	%rax
157	addq	$16,%rax
158
159L$dec_loop3:
// aesdec %xmm1,{%xmm2,%xmm3,%xmm4}
160.byte	102,15,56,222,209
161.byte	102,15,56,222,217
162.byte	102,15,56,222,225
163	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
164	addq	$32,%rax
// aesdec %xmm0,{%xmm2,%xmm3,%xmm4}
165.byte	102,15,56,222,208
166.byte	102,15,56,222,216
167.byte	102,15,56,222,224
168	movups	-16(%rcx,%rax,1),%xmm0
169	jnz	L$dec_loop3
170
// Final pair: aesdec with %xmm1, then aesdeclast with %xmm0.
171.byte	102,15,56,222,209
172.byte	102,15,56,222,217
173.byte	102,15,56,222,225
174.byte	102,15,56,223,208
175.byte	102,15,56,223,216
176.byte	102,15,56,223,224
// rep ret
177	.byte	0xf3,0xc3
178
179
// _aesni_encrypt4 -- file-local helper (no .globl): encrypt four blocks at once.
//   %xmm2-%xmm5 : the four blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
180.p2align	4
181_aesni_encrypt4:
182	movups	(%rcx),%xmm0
183	shll	$4,%eax
184	movups	16(%rcx),%xmm1
// Whitening with the first round key.
185	xorps	%xmm0,%xmm2
186	xorps	%xmm0,%xmm3
187	xorps	%xmm0,%xmm4
188	xorps	%xmm0,%xmm5
189	movups	32(%rcx),%xmm0
// %rcx -> 32 + 16*count bytes into the schedule; %rax = 16 - 16*count.
190	leaq	32(%rcx,%rax,1),%rcx
191	negq	%rax
// 3-byte NOP (nopl (%rax)); has no architectural effect.
192.byte	0x0f,0x1f,0x00
193	addq	$16,%rax
194
195L$enc_loop4:
// aesenc %xmm1,{%xmm2..%xmm5}
196.byte	102,15,56,220,209
197.byte	102,15,56,220,217
198.byte	102,15,56,220,225
199.byte	102,15,56,220,233
200	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
201	addq	$32,%rax
// aesenc %xmm0,{%xmm2..%xmm5}
202.byte	102,15,56,220,208
203.byte	102,15,56,220,216
204.byte	102,15,56,220,224
205.byte	102,15,56,220,232
206	movups	-16(%rcx,%rax,1),%xmm0
207	jnz	L$enc_loop4
208
// Final pair: aesenc with %xmm1, then aesenclast with %xmm0.
209.byte	102,15,56,220,209
210.byte	102,15,56,220,217
211.byte	102,15,56,220,225
212.byte	102,15,56,220,233
213.byte	102,15,56,221,208
214.byte	102,15,56,221,216
215.byte	102,15,56,221,224
216.byte	102,15,56,221,232
// rep ret
217	.byte	0xf3,0xc3
218
219
// _aesni_decrypt4 -- file-local helper (no .globl): decrypt four blocks at once.
//   %xmm2-%xmm5 : the four blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
220.p2align	4
221_aesni_decrypt4:
222	movups	(%rcx),%xmm0
223	shll	$4,%eax
224	movups	16(%rcx),%xmm1
// Whitening with the first round key.
225	xorps	%xmm0,%xmm2
226	xorps	%xmm0,%xmm3
227	xorps	%xmm0,%xmm4
228	xorps	%xmm0,%xmm5
229	movups	32(%rcx),%xmm0
// %rcx -> 32 + 16*count bytes into the schedule; %rax = 16 - 16*count.
230	leaq	32(%rcx,%rax,1),%rcx
231	negq	%rax
// 3-byte NOP (nopl (%rax)); has no architectural effect.
232.byte	0x0f,0x1f,0x00
233	addq	$16,%rax
234
235L$dec_loop4:
// aesdec %xmm1,{%xmm2..%xmm5}
236.byte	102,15,56,222,209
237.byte	102,15,56,222,217
238.byte	102,15,56,222,225
239.byte	102,15,56,222,233
240	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
241	addq	$32,%rax
// aesdec %xmm0,{%xmm2..%xmm5}
242.byte	102,15,56,222,208
243.byte	102,15,56,222,216
244.byte	102,15,56,222,224
245.byte	102,15,56,222,232
246	movups	-16(%rcx,%rax,1),%xmm0
247	jnz	L$dec_loop4
248
// Final pair: aesdec with %xmm1, then aesdeclast with %xmm0.
249.byte	102,15,56,222,209
250.byte	102,15,56,222,217
251.byte	102,15,56,222,225
252.byte	102,15,56,222,233
253.byte	102,15,56,223,208
254.byte	102,15,56,223,216
255.byte	102,15,56,223,224
256.byte	102,15,56,223,232
// rep ret
257	.byte	0xf3,0xc3
258
259
// _aesni_encrypt6 -- file-local helper (no .globl): encrypt six blocks at once.
//   %xmm2-%xmm7 : the six blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round is interleaved with the whitening XORs, so execution joins
// the main loop part-way through at L$enc_loop6_enter.
// L$enc_loop6 is also used as a call target by the CTR32 code in this file.
260.p2align	4
261_aesni_encrypt6:
262	movups	(%rcx),%xmm0
263	shll	$4,%eax
264	movups	16(%rcx),%xmm1
265	xorps	%xmm0,%xmm2
266	pxor	%xmm0,%xmm3
267	pxor	%xmm0,%xmm4
// aesenc %xmm1,%xmm2
268.byte	102,15,56,220,209
// %rcx -> 32 + 16*count bytes into the schedule; %rax = -16*count for now.
269	leaq	32(%rcx,%rax,1),%rcx
270	negq	%rax
// aesenc %xmm1,%xmm3
271.byte	102,15,56,220,217
272	pxor	%xmm0,%xmm5
273	pxor	%xmm0,%xmm6
// aesenc %xmm1,%xmm4
274.byte	102,15,56,220,225
275	pxor	%xmm0,%xmm7
// With %rax = -16*count this reads the round key at original %rcx + 32.
276	movups	(%rcx,%rax,1),%xmm0
277	addq	$16,%rax
278	jmp	L$enc_loop6_enter
279.p2align	4
280L$enc_loop6:
// aesenc %xmm1,{%xmm2,%xmm3,%xmm4}
281.byte	102,15,56,220,209
282.byte	102,15,56,220,217
283.byte	102,15,56,220,225
284L$enc_loop6_enter:
// aesenc %xmm1,{%xmm5,%xmm6,%xmm7}
285.byte	102,15,56,220,233
286.byte	102,15,56,220,241
287.byte	102,15,56,220,249
288	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
289	addq	$32,%rax
// aesenc %xmm0,{%xmm2..%xmm7}
290.byte	102,15,56,220,208
291.byte	102,15,56,220,216
292.byte	102,15,56,220,224
293.byte	102,15,56,220,232
294.byte	102,15,56,220,240
295.byte	102,15,56,220,248
296	movups	-16(%rcx,%rax,1),%xmm0
297	jnz	L$enc_loop6
298
// Final pair: aesenc with %xmm1, then aesenclast with %xmm0.
299.byte	102,15,56,220,209
300.byte	102,15,56,220,217
301.byte	102,15,56,220,225
302.byte	102,15,56,220,233
303.byte	102,15,56,220,241
304.byte	102,15,56,220,249
305.byte	102,15,56,221,208
306.byte	102,15,56,221,216
307.byte	102,15,56,221,224
308.byte	102,15,56,221,232
309.byte	102,15,56,221,240
310.byte	102,15,56,221,248
// rep ret
311	.byte	0xf3,0xc3
312
313
// _aesni_decrypt6 -- file-local helper (no .globl): decrypt six blocks at once.
//   %xmm2-%xmm7 : the six blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round is interleaved with the whitening XORs, so execution joins
// the main loop part-way through at L$dec_loop6_enter.
314.p2align	4
315_aesni_decrypt6:
316	movups	(%rcx),%xmm0
317	shll	$4,%eax
318	movups	16(%rcx),%xmm1
319	xorps	%xmm0,%xmm2
320	pxor	%xmm0,%xmm3
321	pxor	%xmm0,%xmm4
// aesdec %xmm1,%xmm2
322.byte	102,15,56,222,209
// %rcx -> 32 + 16*count bytes into the schedule; %rax = -16*count for now.
323	leaq	32(%rcx,%rax,1),%rcx
324	negq	%rax
// aesdec %xmm1,%xmm3
325.byte	102,15,56,222,217
326	pxor	%xmm0,%xmm5
327	pxor	%xmm0,%xmm6
// aesdec %xmm1,%xmm4
328.byte	102,15,56,222,225
329	pxor	%xmm0,%xmm7
// With %rax = -16*count this reads the round key at original %rcx + 32.
330	movups	(%rcx,%rax,1),%xmm0
331	addq	$16,%rax
332	jmp	L$dec_loop6_enter
333.p2align	4
334L$dec_loop6:
// aesdec %xmm1,{%xmm2,%xmm3,%xmm4}
335.byte	102,15,56,222,209
336.byte	102,15,56,222,217
337.byte	102,15,56,222,225
338L$dec_loop6_enter:
// aesdec %xmm1,{%xmm5,%xmm6,%xmm7}
339.byte	102,15,56,222,233
340.byte	102,15,56,222,241
341.byte	102,15,56,222,249
342	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
343	addq	$32,%rax
// aesdec %xmm0,{%xmm2..%xmm7}
344.byte	102,15,56,222,208
345.byte	102,15,56,222,216
346.byte	102,15,56,222,224
347.byte	102,15,56,222,232
348.byte	102,15,56,222,240
349.byte	102,15,56,222,248
350	movups	-16(%rcx,%rax,1),%xmm0
351	jnz	L$dec_loop6
352
// Final pair: aesdec with %xmm1, then aesdeclast with %xmm0.
353.byte	102,15,56,222,209
354.byte	102,15,56,222,217
355.byte	102,15,56,222,225
356.byte	102,15,56,222,233
357.byte	102,15,56,222,241
358.byte	102,15,56,222,249
359.byte	102,15,56,223,208
360.byte	102,15,56,223,216
361.byte	102,15,56,223,224
362.byte	102,15,56,223,232
363.byte	102,15,56,223,240
364.byte	102,15,56,223,248
// rep ret
365	.byte	0xf3,0xc3
366
367
// _aesni_encrypt8 -- file-local helper (no .globl): encrypt eight blocks at once.
//   %xmm2-%xmm9 : the eight blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round on %xmm2/%xmm3 is interleaved with the whitening XORs, so
// execution joins the main loop at L$enc_loop8_inner.
// L$enc_loop8_enter is also used as a call target by the CTR32 tail code in
// this file.
368.p2align	4
369_aesni_encrypt8:
370	movups	(%rcx),%xmm0
371	shll	$4,%eax
372	movups	16(%rcx),%xmm1
373	xorps	%xmm0,%xmm2
374	xorps	%xmm0,%xmm3
375	pxor	%xmm0,%xmm4
376	pxor	%xmm0,%xmm5
377	pxor	%xmm0,%xmm6
// %rcx -> 32 + 16*count bytes into the schedule; %rax = -16*count for now.
378	leaq	32(%rcx,%rax,1),%rcx
379	negq	%rax
// aesenc %xmm1,%xmm2
380.byte	102,15,56,220,209
381	pxor	%xmm0,%xmm7
382	pxor	%xmm0,%xmm8
// aesenc %xmm1,%xmm3
383.byte	102,15,56,220,217
384	pxor	%xmm0,%xmm9
// With %rax = -16*count this reads the round key at original %rcx + 32.
385	movups	(%rcx,%rax,1),%xmm0
386	addq	$16,%rax
387	jmp	L$enc_loop8_inner
388.p2align	4
389L$enc_loop8:
// aesenc %xmm1,{%xmm2,%xmm3}
390.byte	102,15,56,220,209
391.byte	102,15,56,220,217
392L$enc_loop8_inner:
// aesenc %xmm1,{%xmm4..%xmm9}; the 68 byte is the REX prefix selecting %xmm8/%xmm9.
393.byte	102,15,56,220,225
394.byte	102,15,56,220,233
395.byte	102,15,56,220,241
396.byte	102,15,56,220,249
397.byte	102,68,15,56,220,193
398.byte	102,68,15,56,220,201
399L$enc_loop8_enter:
400	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
401	addq	$32,%rax
// aesenc %xmm0,{%xmm2..%xmm9}
402.byte	102,15,56,220,208
403.byte	102,15,56,220,216
404.byte	102,15,56,220,224
405.byte	102,15,56,220,232
406.byte	102,15,56,220,240
407.byte	102,15,56,220,248
408.byte	102,68,15,56,220,192
409.byte	102,68,15,56,220,200
410	movups	-16(%rcx,%rax,1),%xmm0
411	jnz	L$enc_loop8
412
// Final pair: aesenc with %xmm1, then aesenclast with %xmm0.
413.byte	102,15,56,220,209
414.byte	102,15,56,220,217
415.byte	102,15,56,220,225
416.byte	102,15,56,220,233
417.byte	102,15,56,220,241
418.byte	102,15,56,220,249
419.byte	102,68,15,56,220,193
420.byte	102,68,15,56,220,201
421.byte	102,15,56,221,208
422.byte	102,15,56,221,216
423.byte	102,15,56,221,224
424.byte	102,15,56,221,232
425.byte	102,15,56,221,240
426.byte	102,15,56,221,248
427.byte	102,68,15,56,221,192
428.byte	102,68,15,56,221,200
// rep ret
429	.byte	0xf3,0xc3
430
431
// _aesni_decrypt8 -- file-local helper (no .globl): decrypt eight blocks at once.
//   %xmm2-%xmm9 : the eight blocks, transformed in place
//   %rcx        : key schedule
//   %eax        : loop count (callers in this file load it from offset 240 of
//                 the key schedule); converted to a byte offset below
// Modifies %rax, %rcx, %xmm0, %xmm1 and flags.
// The first round on %xmm2/%xmm3 is interleaved with the whitening XORs, so
// execution joins the main loop at L$dec_loop8_inner.
432.p2align	4
433_aesni_decrypt8:
434	movups	(%rcx),%xmm0
435	shll	$4,%eax
436	movups	16(%rcx),%xmm1
437	xorps	%xmm0,%xmm2
438	xorps	%xmm0,%xmm3
439	pxor	%xmm0,%xmm4
440	pxor	%xmm0,%xmm5
441	pxor	%xmm0,%xmm6
// %rcx -> 32 + 16*count bytes into the schedule; %rax = -16*count for now.
442	leaq	32(%rcx,%rax,1),%rcx
443	negq	%rax
// aesdec %xmm1,%xmm2
444.byte	102,15,56,222,209
445	pxor	%xmm0,%xmm7
446	pxor	%xmm0,%xmm8
// aesdec %xmm1,%xmm3
447.byte	102,15,56,222,217
448	pxor	%xmm0,%xmm9
// With %rax = -16*count this reads the round key at original %rcx + 32.
449	movups	(%rcx,%rax,1),%xmm0
450	addq	$16,%rax
451	jmp	L$dec_loop8_inner
452.p2align	4
453L$dec_loop8:
// aesdec %xmm1,{%xmm2,%xmm3}
454.byte	102,15,56,222,209
455.byte	102,15,56,222,217
456L$dec_loop8_inner:
// aesdec %xmm1,{%xmm4..%xmm9}; the 68 byte is the REX prefix selecting %xmm8/%xmm9.
457.byte	102,15,56,222,225
458.byte	102,15,56,222,233
459.byte	102,15,56,222,241
460.byte	102,15,56,222,249
461.byte	102,68,15,56,222,193
462.byte	102,68,15,56,222,201
463L$dec_loop8_enter:
464	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
465	addq	$32,%rax
// aesdec %xmm0,{%xmm2..%xmm9}
466.byte	102,15,56,222,208
467.byte	102,15,56,222,216
468.byte	102,15,56,222,224
469.byte	102,15,56,222,232
470.byte	102,15,56,222,240
471.byte	102,15,56,222,248
472.byte	102,68,15,56,222,192
473.byte	102,68,15,56,222,200
474	movups	-16(%rcx,%rax,1),%xmm0
475	jnz	L$dec_loop8
476
// Final pair: aesdec with %xmm1, then aesdeclast with %xmm0.
477.byte	102,15,56,222,209
478.byte	102,15,56,222,217
479.byte	102,15,56,222,225
480.byte	102,15,56,222,233
481.byte	102,15,56,222,241
482.byte	102,15,56,222,249
483.byte	102,68,15,56,222,193
484.byte	102,68,15,56,222,201
485.byte	102,15,56,223,208
486.byte	102,15,56,223,216
487.byte	102,15,56,223,224
488.byte	102,15,56,223,232
489.byte	102,15,56,223,240
490.byte	102,15,56,223,248
491.byte	102,68,15,56,223,192
492.byte	102,68,15,56,223,200
// rep ret
493	.byte	0xf3,0xc3
494
// _aesni_ecb_encrypt -- process a buffer block-by-block (each 16-byte block
// independently) with the multi-block helpers above.
//   %rdi : input pointer
//   %rsi : output pointer
//   %rdx : length in bytes; rounded down to a multiple of 16 here, and a
//          result of zero returns immediately
//   %rcx : key schedule (dword at offset 240 is the loop count for helpers)
//   %r8d : non-zero selects the encrypt path, zero the decrypt path
// %r11 and %r10d keep copies of the key pointer and loop count because the
// helpers modify %rcx and %eax.
// Eight blocks are handled per pass while at least 128 bytes remain; a tail of
// one to seven blocks is then dispatched to the matching helper.
// The decrypt path zeroes each data register after storing it; the encrypt
// path does not. %xmm0/%xmm1 are zeroed at L$ecb_ret on every path.
495.globl	_aesni_ecb_encrypt
496.private_extern _aesni_ecb_encrypt
497
498.p2align	4
499_aesni_ecb_encrypt:
500	andq	$-16,%rdx
501	jz	L$ecb_ret
502
503	movl	240(%rcx),%eax
504	movups	(%rcx),%xmm0
505	movq	%rcx,%r11
506	movl	%eax,%r10d
507	testl	%r8d,%r8d
508	jz	L$ecb_decrypt
509
// ---- encrypt path ----
510	cmpq	$128,%rdx
511	jb	L$ecb_enc_tail
512
// Preload the first eight blocks and enter the loop at the helper call.
513	movdqu	(%rdi),%xmm2
514	movdqu	16(%rdi),%xmm3
515	movdqu	32(%rdi),%xmm4
516	movdqu	48(%rdi),%xmm5
517	movdqu	64(%rdi),%xmm6
518	movdqu	80(%rdi),%xmm7
519	movdqu	96(%rdi),%xmm8
520	movdqu	112(%rdi),%xmm9
521	leaq	128(%rdi),%rdi
522	subq	$128,%rdx
523	jmp	L$ecb_enc_loop8_enter
524.p2align	4
525L$ecb_enc_loop8:
// Store the previous eight results while loading the next eight inputs, and
// restore %rcx/%eax for the next helper call.
526	movups	%xmm2,(%rsi)
527	movq	%r11,%rcx
528	movdqu	(%rdi),%xmm2
529	movl	%r10d,%eax
530	movups	%xmm3,16(%rsi)
531	movdqu	16(%rdi),%xmm3
532	movups	%xmm4,32(%rsi)
533	movdqu	32(%rdi),%xmm4
534	movups	%xmm5,48(%rsi)
535	movdqu	48(%rdi),%xmm5
536	movups	%xmm6,64(%rsi)
537	movdqu	64(%rdi),%xmm6
538	movups	%xmm7,80(%rsi)
539	movdqu	80(%rdi),%xmm7
540	movups	%xmm8,96(%rsi)
541	movdqu	96(%rdi),%xmm8
542	movups	%xmm9,112(%rsi)
543	leaq	128(%rsi),%rsi
544	movdqu	112(%rdi),%xmm9
545	leaq	128(%rdi),%rdi
546L$ecb_enc_loop8_enter:
547
548	call	_aesni_encrypt8
549
// Continue while another full 128 bytes remain (no borrow from the subtract).
550	subq	$128,%rdx
551	jnc	L$ecb_enc_loop8
552
// Flush the last eight results; restore %rcx/%eax for a possible tail.
553	movups	%xmm2,(%rsi)
554	movq	%r11,%rcx
555	movups	%xmm3,16(%rsi)
556	movl	%r10d,%eax
557	movups	%xmm4,32(%rsi)
558	movups	%xmm5,48(%rsi)
559	movups	%xmm6,64(%rsi)
560	movups	%xmm7,80(%rsi)
561	movups	%xmm8,96(%rsi)
562	movups	%xmm9,112(%rsi)
563	leaq	128(%rsi),%rsi
// Undo the over-subtraction; zero means no tail is left.
564	addq	$128,%rdx
565	jz	L$ecb_ret
566
567L$ecb_enc_tail:
// 16..112 bytes remain. Each cmpq feeds both a jb and a je; the movups loads
// in between do not modify flags.
568	movups	(%rdi),%xmm2
569	cmpq	$32,%rdx
570	jb	L$ecb_enc_one
571	movups	16(%rdi),%xmm3
572	je	L$ecb_enc_two
573	movups	32(%rdi),%xmm4
574	cmpq	$64,%rdx
575	jb	L$ecb_enc_three
576	movups	48(%rdi),%xmm5
577	je	L$ecb_enc_four
578	movups	64(%rdi),%xmm6
579	cmpq	$96,%rdx
580	jb	L$ecb_enc_five
581	movups	80(%rdi),%xmm7
582	je	L$ecb_enc_six
// Seven blocks: run the 8-way helper with a zeroed dummy eighth block and
// store only seven results.
583	movdqu	96(%rdi),%xmm8
584	xorps	%xmm9,%xmm9
585	call	_aesni_encrypt8
586	movups	%xmm2,(%rsi)
587	movups	%xmm3,16(%rsi)
588	movups	%xmm4,32(%rsi)
589	movups	%xmm5,48(%rsi)
590	movups	%xmm6,64(%rsi)
591	movups	%xmm7,80(%rsi)
592	movups	%xmm8,96(%rsi)
593	jmp	L$ecb_ret
594.p2align	4
595L$ecb_enc_one:
// Single block: inline one-block loop, same shape as _aesni_encrypt.
596	movups	(%rcx),%xmm0
597	movups	16(%rcx),%xmm1
598	leaq	32(%rcx),%rcx
599	xorps	%xmm0,%xmm2
600L$oop_enc1_3:
// aesenc %xmm1,%xmm2
601.byte	102,15,56,220,209
602	decl	%eax
603	movups	(%rcx),%xmm1
604	leaq	16(%rcx),%rcx
605	jnz	L$oop_enc1_3
// aesenclast %xmm1,%xmm2
606.byte	102,15,56,221,209
607	movups	%xmm2,(%rsi)
608	jmp	L$ecb_ret
609.p2align	4
610L$ecb_enc_two:
611	call	_aesni_encrypt2
612	movups	%xmm2,(%rsi)
613	movups	%xmm3,16(%rsi)
614	jmp	L$ecb_ret
615.p2align	4
616L$ecb_enc_three:
617	call	_aesni_encrypt3
618	movups	%xmm2,(%rsi)
619	movups	%xmm3,16(%rsi)
620	movups	%xmm4,32(%rsi)
621	jmp	L$ecb_ret
622.p2align	4
623L$ecb_enc_four:
624	call	_aesni_encrypt4
625	movups	%xmm2,(%rsi)
626	movups	%xmm3,16(%rsi)
627	movups	%xmm4,32(%rsi)
628	movups	%xmm5,48(%rsi)
629	jmp	L$ecb_ret
630.p2align	4
631L$ecb_enc_five:
// Five blocks: 6-way helper with a zeroed dummy sixth block.
632	xorps	%xmm7,%xmm7
633	call	_aesni_encrypt6
634	movups	%xmm2,(%rsi)
635	movups	%xmm3,16(%rsi)
636	movups	%xmm4,32(%rsi)
637	movups	%xmm5,48(%rsi)
638	movups	%xmm6,64(%rsi)
639	jmp	L$ecb_ret
640.p2align	4
641L$ecb_enc_six:
642	call	_aesni_encrypt6
643	movups	%xmm2,(%rsi)
644	movups	%xmm3,16(%rsi)
645	movups	%xmm4,32(%rsi)
646	movups	%xmm5,48(%rsi)
647	movups	%xmm6,64(%rsi)
648	movups	%xmm7,80(%rsi)
649	jmp	L$ecb_ret
650
// ---- decrypt path ----
651.p2align	4
652L$ecb_decrypt:
653	cmpq	$128,%rdx
654	jb	L$ecb_dec_tail
655
// Preload the first eight blocks and enter the loop at the helper call.
656	movdqu	(%rdi),%xmm2
657	movdqu	16(%rdi),%xmm3
658	movdqu	32(%rdi),%xmm4
659	movdqu	48(%rdi),%xmm5
660	movdqu	64(%rdi),%xmm6
661	movdqu	80(%rdi),%xmm7
662	movdqu	96(%rdi),%xmm8
663	movdqu	112(%rdi),%xmm9
664	leaq	128(%rdi),%rdi
665	subq	$128,%rdx
666	jmp	L$ecb_dec_loop8_enter
667.p2align	4
668L$ecb_dec_loop8:
// Store the previous eight results while loading the next eight inputs, and
// restore %rcx/%eax for the next helper call.
669	movups	%xmm2,(%rsi)
670	movq	%r11,%rcx
671	movdqu	(%rdi),%xmm2
672	movl	%r10d,%eax
673	movups	%xmm3,16(%rsi)
674	movdqu	16(%rdi),%xmm3
675	movups	%xmm4,32(%rsi)
676	movdqu	32(%rdi),%xmm4
677	movups	%xmm5,48(%rsi)
678	movdqu	48(%rdi),%xmm5
679	movups	%xmm6,64(%rsi)
680	movdqu	64(%rdi),%xmm6
681	movups	%xmm7,80(%rsi)
682	movdqu	80(%rdi),%xmm7
683	movups	%xmm8,96(%rsi)
684	movdqu	96(%rdi),%xmm8
685	movups	%xmm9,112(%rsi)
686	leaq	128(%rsi),%rsi
687	movdqu	112(%rdi),%xmm9
688	leaq	128(%rdi),%rdi
689L$ecb_dec_loop8_enter:
690
691	call	_aesni_decrypt8
692
// Reload the first round key from the saved key pointer into %xmm0.
693	movups	(%r11),%xmm0
// Continue while another full 128 bytes remain (no borrow from the subtract).
694	subq	$128,%rdx
695	jnc	L$ecb_dec_loop8
696
// Flush the last eight results, zeroing each register once it is stored.
697	movups	%xmm2,(%rsi)
698	pxor	%xmm2,%xmm2
699	movq	%r11,%rcx
700	movups	%xmm3,16(%rsi)
701	pxor	%xmm3,%xmm3
702	movl	%r10d,%eax
703	movups	%xmm4,32(%rsi)
704	pxor	%xmm4,%xmm4
705	movups	%xmm5,48(%rsi)
706	pxor	%xmm5,%xmm5
707	movups	%xmm6,64(%rsi)
708	pxor	%xmm6,%xmm6
709	movups	%xmm7,80(%rsi)
710	pxor	%xmm7,%xmm7
711	movups	%xmm8,96(%rsi)
712	pxor	%xmm8,%xmm8
713	movups	%xmm9,112(%rsi)
714	pxor	%xmm9,%xmm9
715	leaq	128(%rsi),%rsi
// Undo the over-subtraction; zero means no tail is left.
716	addq	$128,%rdx
717	jz	L$ecb_ret
718
719L$ecb_dec_tail:
// 16..112 bytes remain. Each cmpq feeds both a jb and a je; the movups loads
// in between do not modify flags.
720	movups	(%rdi),%xmm2
721	cmpq	$32,%rdx
722	jb	L$ecb_dec_one
723	movups	16(%rdi),%xmm3
724	je	L$ecb_dec_two
725	movups	32(%rdi),%xmm4
726	cmpq	$64,%rdx
727	jb	L$ecb_dec_three
728	movups	48(%rdi),%xmm5
729	je	L$ecb_dec_four
730	movups	64(%rdi),%xmm6
731	cmpq	$96,%rdx
732	jb	L$ecb_dec_five
733	movups	80(%rdi),%xmm7
734	je	L$ecb_dec_six
// Seven blocks: run the 8-way helper with a zeroed dummy eighth block and
// store only seven results.
735	movups	96(%rdi),%xmm8
736	movups	(%rcx),%xmm0
737	xorps	%xmm9,%xmm9
738	call	_aesni_decrypt8
739	movups	%xmm2,(%rsi)
740	pxor	%xmm2,%xmm2
741	movups	%xmm3,16(%rsi)
742	pxor	%xmm3,%xmm3
743	movups	%xmm4,32(%rsi)
744	pxor	%xmm4,%xmm4
745	movups	%xmm5,48(%rsi)
746	pxor	%xmm5,%xmm5
747	movups	%xmm6,64(%rsi)
748	pxor	%xmm6,%xmm6
749	movups	%xmm7,80(%rsi)
750	pxor	%xmm7,%xmm7
751	movups	%xmm8,96(%rsi)
752	pxor	%xmm8,%xmm8
753	pxor	%xmm9,%xmm9
754	jmp	L$ecb_ret
755.p2align	4
756L$ecb_dec_one:
// Single block: inline one-block loop, same shape as _aesni_decrypt.
757	movups	(%rcx),%xmm0
758	movups	16(%rcx),%xmm1
759	leaq	32(%rcx),%rcx
760	xorps	%xmm0,%xmm2
761L$oop_dec1_4:
// aesdec %xmm1,%xmm2
762.byte	102,15,56,222,209
763	decl	%eax
764	movups	(%rcx),%xmm1
765	leaq	16(%rcx),%rcx
766	jnz	L$oop_dec1_4
// aesdeclast %xmm1,%xmm2
767.byte	102,15,56,223,209
768	movups	%xmm2,(%rsi)
769	pxor	%xmm2,%xmm2
770	jmp	L$ecb_ret
771.p2align	4
772L$ecb_dec_two:
773	call	_aesni_decrypt2
774	movups	%xmm2,(%rsi)
775	pxor	%xmm2,%xmm2
776	movups	%xmm3,16(%rsi)
777	pxor	%xmm3,%xmm3
778	jmp	L$ecb_ret
779.p2align	4
780L$ecb_dec_three:
781	call	_aesni_decrypt3
782	movups	%xmm2,(%rsi)
783	pxor	%xmm2,%xmm2
784	movups	%xmm3,16(%rsi)
785	pxor	%xmm3,%xmm3
786	movups	%xmm4,32(%rsi)
787	pxor	%xmm4,%xmm4
788	jmp	L$ecb_ret
789.p2align	4
790L$ecb_dec_four:
791	call	_aesni_decrypt4
792	movups	%xmm2,(%rsi)
793	pxor	%xmm2,%xmm2
794	movups	%xmm3,16(%rsi)
795	pxor	%xmm3,%xmm3
796	movups	%xmm4,32(%rsi)
797	pxor	%xmm4,%xmm4
798	movups	%xmm5,48(%rsi)
799	pxor	%xmm5,%xmm5
800	jmp	L$ecb_ret
801.p2align	4
802L$ecb_dec_five:
// Five blocks: 6-way helper with a zeroed dummy sixth block.
803	xorps	%xmm7,%xmm7
804	call	_aesni_decrypt6
805	movups	%xmm2,(%rsi)
806	pxor	%xmm2,%xmm2
807	movups	%xmm3,16(%rsi)
808	pxor	%xmm3,%xmm3
809	movups	%xmm4,32(%rsi)
810	pxor	%xmm4,%xmm4
811	movups	%xmm5,48(%rsi)
812	pxor	%xmm5,%xmm5
813	movups	%xmm6,64(%rsi)
814	pxor	%xmm6,%xmm6
815	pxor	%xmm7,%xmm7
816	jmp	L$ecb_ret
817.p2align	4
818L$ecb_dec_six:
// Six blocks: falls through into L$ecb_ret after the stores.
819	call	_aesni_decrypt6
820	movups	%xmm2,(%rsi)
821	pxor	%xmm2,%xmm2
822	movups	%xmm3,16(%rsi)
823	pxor	%xmm3,%xmm3
824	movups	%xmm4,32(%rsi)
825	pxor	%xmm4,%xmm4
826	movups	%xmm5,48(%rsi)
827	pxor	%xmm5,%xmm5
828	movups	%xmm6,64(%rsi)
829	pxor	%xmm6,%xmm6
830	movups	%xmm7,80(%rsi)
831	pxor	%xmm7,%xmm7
832
833L$ecb_ret:
// Common exit: clear the round-key registers, then rep ret.
834	xorps	%xmm0,%xmm0
835	pxor	%xmm1,%xmm1
836	.byte	0xf3,0xc3
837
// _aesni_ccm64_encrypt_blocks -- per block, encrypt a counter block and a
// running MAC value side by side (two-block interleaved AES), XOR the
// encrypted counter into the input to produce the output, and fold the input
// into the MAC.
//   %rdi : input pointer, advanced 16 bytes per block
//   %rsi : output pointer, advanced 16 bytes per block
//   %rdx : number of 16-byte blocks; decremented after each block is
//          processed, so the body runs at least once
//          (presumably callers never pass 0 -- TODO confirm)
//   %rcx : key schedule (dword at offset 240 is the loop count)
//   %r8  : 16-byte counter block; only read here
//   %r9  : 16-byte MAC value; read at entry, written back at exit
// Register roles: %xmm6 = counter in shuffled byte order, %xmm2 = counter block
// being encrypted, %xmm3 = MAC, %xmm8 = current data block,
// %xmm9 = L$increment64, %xmm7 = L$bswap_mask. Both constants are defined
// outside the visible part of this file; NOTE(review): the names suggest a
// 64-bit +1 and a byte-reversal mask -- confirm against their definitions.
// %xmm0-%xmm3, %xmm6 and %xmm8 are zeroed before returning.
838.globl	_aesni_ccm64_encrypt_blocks
839.private_extern _aesni_ccm64_encrypt_blocks
840
841.p2align	4
842_aesni_ccm64_encrypt_blocks:
843	movl	240(%rcx),%eax
844	movdqu	(%r8),%xmm6
845	movdqa	L$increment64(%rip),%xmm9
846	movdqa	L$bswap_mask(%rip),%xmm7
847
// %r11 keeps the schedule start; %rcx is moved 32 + 16*count bytes in and
// %r10 = 16 - 16*count is the starting (negative) offset for the round loop.
848	shll	$4,%eax
849	movl	$16,%r10d
850	leaq	0(%rcx),%r11
851	movdqu	(%r9),%xmm3
852	movdqa	%xmm6,%xmm2
853	leaq	32(%rcx,%rax,1),%rcx
// pshufb %xmm7,%xmm6 -- reorder the counter bytes using the mask.
854.byte	102,15,56,0,247
855	subq	%rax,%r10
856	jmp	L$ccm64_enc_outer
857.p2align	4
858L$ccm64_enc_outer:
859	movups	(%r11),%xmm0
860	movq	%r10,%rax
861	movups	(%rdi),%xmm8
862
// Whitening: counter ^= key0, and MAC ^= (input ^ key0).
863	xorps	%xmm0,%xmm2
864	movups	16(%r11),%xmm1
865	xorps	%xmm8,%xmm0
866	xorps	%xmm0,%xmm3
867	movups	32(%r11),%xmm0
868
869L$ccm64_enc2_loop:
// aesenc %xmm1,{%xmm2,%xmm3}
870.byte	102,15,56,220,209
871.byte	102,15,56,220,217
872	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
873	addq	$32,%rax
// aesenc %xmm0,{%xmm2,%xmm3}
874.byte	102,15,56,220,208
875.byte	102,15,56,220,216
876	movups	-16(%rcx,%rax,1),%xmm0
877	jnz	L$ccm64_enc2_loop
// aesenc %xmm1,{%xmm2,%xmm3}
878.byte	102,15,56,220,209
879.byte	102,15,56,220,217
// Advance the counter by the L$increment64 constant.
880	paddq	%xmm9,%xmm6
// decq sets ZF for the jnz at the bottom of the outer loop; none of the
// instructions in between modify flags.
881	decq	%rdx
// aesenclast %xmm0,{%xmm2,%xmm3}
882.byte	102,15,56,221,208
883.byte	102,15,56,221,216
884
885	leaq	16(%rdi),%rdi
// output = input ^ encrypted counter
886	xorps	%xmm2,%xmm8
887	movdqa	%xmm6,%xmm2
888	movups	%xmm8,(%rsi)
// pshufb %xmm7,%xmm2 -- next counter block, shuffled back by the same mask.
889.byte	102,15,56,0,215
890	leaq	16(%rsi),%rsi
891	jnz	L$ccm64_enc_outer
892
// Write the MAC back and clear the working registers.
893	pxor	%xmm0,%xmm0
894	pxor	%xmm1,%xmm1
895	pxor	%xmm2,%xmm2
896	movups	%xmm3,(%r9)
897	pxor	%xmm3,%xmm3
898	pxor	%xmm8,%xmm8
899	pxor	%xmm6,%xmm6
// rep ret
900	.byte	0xf3,0xc3
901
// _aesni_ccm64_decrypt_blocks -- counterpart of the CCM64 encrypt routine.
// The first counter block is encrypted on its own. After that, each pass
// recovers one output block (input ^ encrypted counter) and then encrypts the
// next counter block side by side with the MAC update for the block just
// recovered. The MAC for the final block is finished with a one-block loop.
// Only AES encryption rounds (aesenc/aesenclast) are used.
//   %rdi : input pointer, advanced 16 bytes per block
//   %rsi : output pointer, advanced 16 bytes per block
//   %rdx : number of 16-byte blocks; tested after the first block has been
//          written (presumably callers never pass 0 -- TODO confirm)
//   %rcx : key schedule (dword at offset 240 is the loop count)
//   %r8  : 16-byte counter block; only read here
//   %r9  : 16-byte MAC value; read at entry, written back at exit
// Register roles: %xmm6 = counter in shuffled byte order, %xmm2 = counter block
// / encrypted counter, %xmm3 = MAC, %xmm8 = current data block,
// %xmm9 = L$increment64, %xmm7 = L$bswap_mask (both constants are defined
// outside the visible part of this file).
// %xmm0-%xmm3, %xmm6 and %xmm8 are zeroed before returning.
902.globl	_aesni_ccm64_decrypt_blocks
903.private_extern _aesni_ccm64_decrypt_blocks
904
905.p2align	4
906_aesni_ccm64_decrypt_blocks:
907	movl	240(%rcx),%eax
908	movups	(%r8),%xmm6
909	movdqu	(%r9),%xmm3
910	movdqa	L$increment64(%rip),%xmm9
911	movdqa	L$bswap_mask(%rip),%xmm7
912
913	movaps	%xmm6,%xmm2
// Keep the loop count and schedule start; the one-block loop consumes
// %eax and %rcx.
914	movl	%eax,%r10d
915	movq	%rcx,%r11
// pshufb %xmm7,%xmm6 -- reorder the counter bytes using the mask.
916.byte	102,15,56,0,247
// Encrypt the first counter block (held in %xmm2) by itself.
917	movups	(%rcx),%xmm0
918	movups	16(%rcx),%xmm1
919	leaq	32(%rcx),%rcx
920	xorps	%xmm0,%xmm2
921L$oop_enc1_5:
// aesenc %xmm1,%xmm2
922.byte	102,15,56,220,209
923	decl	%eax
924	movups	(%rcx),%xmm1
925	leaq	16(%rcx),%rcx
926	jnz	L$oop_enc1_5
// aesenclast %xmm1,%xmm2
927.byte	102,15,56,221,209
// Set up the two-block round loop: %rcx = schedule + 32 + 16*count and
// %r10 = 16 - 16*count is the starting (negative) offset.
928	shll	$4,%r10d
929	movl	$16,%eax
930	movups	(%rdi),%xmm8
931	paddq	%xmm9,%xmm6
932	leaq	16(%rdi),%rdi
933	subq	%r10,%rax
934	leaq	32(%r11,%r10,1),%rcx
935	movq	%rax,%r10
936	jmp	L$ccm64_dec_outer
937.p2align	4
938L$ccm64_dec_outer:
// output = input ^ encrypted counter; then stage the next counter block.
939	xorps	%xmm2,%xmm8
940	movdqa	%xmm6,%xmm2
941	movups	%xmm8,(%rsi)
942	leaq	16(%rsi),%rsi
// pshufb %xmm7,%xmm2
943.byte	102,15,56,0,215
944
945	subq	$1,%rdx
946	jz	L$ccm64_dec_break
947
// More blocks follow. Whitening: counter ^= key0, and MAC ^= (output ^ key0).
948	movups	(%r11),%xmm0
949	movq	%r10,%rax
950	movups	16(%r11),%xmm1
951	xorps	%xmm0,%xmm8
952	xorps	%xmm0,%xmm2
953	xorps	%xmm8,%xmm3
954	movups	32(%r11),%xmm0
955	jmp	L$ccm64_dec2_loop
956.p2align	4
957L$ccm64_dec2_loop:
// aesenc %xmm1,{%xmm2,%xmm3}
958.byte	102,15,56,220,209
959.byte	102,15,56,220,217
960	movups	(%rcx,%rax,1),%xmm1
// addq sets ZF for the jnz; nothing in between modifies flags.
961	addq	$32,%rax
// aesenc %xmm0,{%xmm2,%xmm3}
962.byte	102,15,56,220,208
963.byte	102,15,56,220,216
964	movups	-16(%rcx,%rax,1),%xmm0
965	jnz	L$ccm64_dec2_loop
// Fetch the next input block and advance the counter while finishing rounds.
966	movups	(%rdi),%xmm8
967	paddq	%xmm9,%xmm6
// aesenc %xmm1,{%xmm2,%xmm3} then aesenclast %xmm0,{%xmm2,%xmm3}
968.byte	102,15,56,220,209
969.byte	102,15,56,220,217
970.byte	102,15,56,221,208
971.byte	102,15,56,221,216
972	leaq	16(%rdi),%rdi
973	jmp	L$ccm64_dec_outer
974
975.p2align	4
976L$ccm64_dec_break:
977
// Last block: fold it into the MAC and run the MAC alone through the cipher,
// walking the schedule with %r11.
978	movl	240(%r11),%eax
979	movups	(%r11),%xmm0
980	movups	16(%r11),%xmm1
981	xorps	%xmm0,%xmm8
982	leaq	32(%r11),%r11
983	xorps	%xmm8,%xmm3
984L$oop_enc1_6:
// aesenc %xmm1,%xmm3
985.byte	102,15,56,220,217
986	decl	%eax
987	movups	(%r11),%xmm1
988	leaq	16(%r11),%r11
989	jnz	L$oop_enc1_6
// aesenclast %xmm1,%xmm3
990.byte	102,15,56,221,217
// Write the MAC back and clear the working registers.
991	pxor	%xmm0,%xmm0
992	pxor	%xmm1,%xmm1
993	pxor	%xmm2,%xmm2
994	movups	%xmm3,(%r9)
995	pxor	%xmm3,%xmm3
996	pxor	%xmm8,%xmm8
997	pxor	%xmm6,%xmm6
// rep ret
998	.byte	0xf3,0xc3
999
1000.globl	_aesni_ctr32_encrypt_blocks
1001.private_extern _aesni_ctr32_encrypt_blocks
1002
1003.p2align	4
1004_aesni_ctr32_encrypt_blocks:
1005	cmpq	$1,%rdx
1006	jne	L$ctr32_bulk
1007
1008
1009
1010	movups	(%r8),%xmm2
1011	movups	(%rdi),%xmm3
1012	movl	240(%rcx),%edx
1013	movups	(%rcx),%xmm0
1014	movups	16(%rcx),%xmm1
1015	leaq	32(%rcx),%rcx
1016	xorps	%xmm0,%xmm2
1017L$oop_enc1_7:
1018.byte	102,15,56,220,209
1019	decl	%edx
1020	movups	(%rcx),%xmm1
1021	leaq	16(%rcx),%rcx
1022	jnz	L$oop_enc1_7
1023.byte	102,15,56,221,209
1024	pxor	%xmm0,%xmm0
1025	pxor	%xmm1,%xmm1
1026	xorps	%xmm3,%xmm2
1027	pxor	%xmm3,%xmm3
1028	movups	%xmm2,(%rsi)
1029	xorps	%xmm2,%xmm2
1030	jmp	L$ctr32_epilogue
1031
1032.p2align	4
1033L$ctr32_bulk:
1034	leaq	(%rsp),%rax
1035	pushq	%rbp
1036	subq	$128,%rsp
1037	andq	$-16,%rsp
1038	leaq	-8(%rax),%rbp
1039
1040
1041
1042
1043	movdqu	(%r8),%xmm2
1044	movdqu	(%rcx),%xmm0
1045	movl	12(%r8),%r8d
1046	pxor	%xmm0,%xmm2
1047	movl	12(%rcx),%r11d
1048	movdqa	%xmm2,0(%rsp)
1049	bswapl	%r8d
1050	movdqa	%xmm2,%xmm3
1051	movdqa	%xmm2,%xmm4
1052	movdqa	%xmm2,%xmm5
1053	movdqa	%xmm2,64(%rsp)
1054	movdqa	%xmm2,80(%rsp)
1055	movdqa	%xmm2,96(%rsp)
1056	movq	%rdx,%r10
1057	movdqa	%xmm2,112(%rsp)
1058
1059	leaq	1(%r8),%rax
1060	leaq	2(%r8),%rdx
1061	bswapl	%eax
1062	bswapl	%edx
1063	xorl	%r11d,%eax
1064	xorl	%r11d,%edx
1065.byte	102,15,58,34,216,3
1066	leaq	3(%r8),%rax
1067	movdqa	%xmm3,16(%rsp)
1068.byte	102,15,58,34,226,3
1069	bswapl	%eax
1070	movq	%r10,%rdx
1071	leaq	4(%r8),%r10
1072	movdqa	%xmm4,32(%rsp)
1073	xorl	%r11d,%eax
1074	bswapl	%r10d
1075.byte	102,15,58,34,232,3
1076	xorl	%r11d,%r10d
1077	movdqa	%xmm5,48(%rsp)
1078	leaq	5(%r8),%r9
1079	movl	%r10d,64+12(%rsp)
1080	bswapl	%r9d
1081	leaq	6(%r8),%r10
1082	movl	240(%rcx),%eax
1083	xorl	%r11d,%r9d
1084	bswapl	%r10d
1085	movl	%r9d,80+12(%rsp)
1086	xorl	%r11d,%r10d
1087	leaq	7(%r8),%r9
1088	movl	%r10d,96+12(%rsp)
1089	bswapl	%r9d
1090	movl	_OPENSSL_ia32cap_P+4(%rip),%r10d
1091	xorl	%r11d,%r9d
1092	andl	$71303168,%r10d
1093	movl	%r9d,112+12(%rsp)
1094
1095	movups	16(%rcx),%xmm1
1096
1097	movdqa	64(%rsp),%xmm6
1098	movdqa	80(%rsp),%xmm7
1099
1100	cmpq	$8,%rdx
1101	jb	L$ctr32_tail
1102
1103	subq	$6,%rdx
1104	cmpl	$4194304,%r10d
1105	je	L$ctr32_6x
1106
1107	leaq	128(%rcx),%rcx
1108	subq	$2,%rdx
1109	jmp	L$ctr32_loop8
1110
1111.p2align	4
1112L$ctr32_6x:
1113	shll	$4,%eax
1114	movl	$48,%r10d
1115	bswapl	%r11d
1116	leaq	32(%rcx,%rax,1),%rcx
1117	subq	%rax,%r10
1118	jmp	L$ctr32_loop6
1119
1120.p2align	4
1121L$ctr32_loop6:
1122	addl	$6,%r8d
1123	movups	-48(%rcx,%r10,1),%xmm0
1124.byte	102,15,56,220,209
1125	movl	%r8d,%eax
1126	xorl	%r11d,%eax
1127.byte	102,15,56,220,217
1128.byte	0x0f,0x38,0xf1,0x44,0x24,12
1129	leal	1(%r8),%eax
1130.byte	102,15,56,220,225
1131	xorl	%r11d,%eax
1132.byte	0x0f,0x38,0xf1,0x44,0x24,28
1133.byte	102,15,56,220,233
1134	leal	2(%r8),%eax
1135	xorl	%r11d,%eax
1136.byte	102,15,56,220,241
1137.byte	0x0f,0x38,0xf1,0x44,0x24,44
1138	leal	3(%r8),%eax
1139.byte	102,15,56,220,249
1140	movups	-32(%rcx,%r10,1),%xmm1
1141	xorl	%r11d,%eax
1142
1143.byte	102,15,56,220,208
1144.byte	0x0f,0x38,0xf1,0x44,0x24,60
1145	leal	4(%r8),%eax
1146.byte	102,15,56,220,216
1147	xorl	%r11d,%eax
1148.byte	0x0f,0x38,0xf1,0x44,0x24,76
1149.byte	102,15,56,220,224
1150	leal	5(%r8),%eax
1151	xorl	%r11d,%eax
1152.byte	102,15,56,220,232
1153.byte	0x0f,0x38,0xf1,0x44,0x24,92
1154	movq	%r10,%rax
1155.byte	102,15,56,220,240
1156.byte	102,15,56,220,248
1157	movups	-16(%rcx,%r10,1),%xmm0
1158
1159	call	L$enc_loop6
1160
1161	movdqu	(%rdi),%xmm8
1162	movdqu	16(%rdi),%xmm9
1163	movdqu	32(%rdi),%xmm10
1164	movdqu	48(%rdi),%xmm11
1165	movdqu	64(%rdi),%xmm12
1166	movdqu	80(%rdi),%xmm13
1167	leaq	96(%rdi),%rdi
1168	movups	-64(%rcx,%r10,1),%xmm1
1169	pxor	%xmm2,%xmm8
1170	movaps	0(%rsp),%xmm2
1171	pxor	%xmm3,%xmm9
1172	movaps	16(%rsp),%xmm3
1173	pxor	%xmm4,%xmm10
1174	movaps	32(%rsp),%xmm4
1175	pxor	%xmm5,%xmm11
1176	movaps	48(%rsp),%xmm5
1177	pxor	%xmm6,%xmm12
1178	movaps	64(%rsp),%xmm6
1179	pxor	%xmm7,%xmm13
1180	movaps	80(%rsp),%xmm7
1181	movdqu	%xmm8,(%rsi)
1182	movdqu	%xmm9,16(%rsi)
1183	movdqu	%xmm10,32(%rsi)
1184	movdqu	%xmm11,48(%rsi)
1185	movdqu	%xmm12,64(%rsi)
1186	movdqu	%xmm13,80(%rsi)
1187	leaq	96(%rsi),%rsi
1188
1189	subq	$6,%rdx
1190	jnc	L$ctr32_loop6
1191
1192	addq	$6,%rdx
1193	jz	L$ctr32_done
1194
1195	leal	-48(%r10),%eax
1196	leaq	-80(%rcx,%r10,1),%rcx
1197	negl	%eax
1198	shrl	$4,%eax
1199	jmp	L$ctr32_tail
1200
1201.p2align	5
1202L$ctr32_loop8:
1203	addl	$8,%r8d
1204	movdqa	96(%rsp),%xmm8
1205.byte	102,15,56,220,209
1206	movl	%r8d,%r9d
1207	movdqa	112(%rsp),%xmm9
1208.byte	102,15,56,220,217
1209	bswapl	%r9d
1210	movups	32-128(%rcx),%xmm0
1211.byte	102,15,56,220,225
1212	xorl	%r11d,%r9d
1213	nop
1214.byte	102,15,56,220,233
1215	movl	%r9d,0+12(%rsp)
1216	leaq	1(%r8),%r9
1217.byte	102,15,56,220,241
1218.byte	102,15,56,220,249
1219.byte	102,68,15,56,220,193
1220.byte	102,68,15,56,220,201
1221	movups	48-128(%rcx),%xmm1
1222	bswapl	%r9d
1223.byte	102,15,56,220,208
1224.byte	102,15,56,220,216
1225	xorl	%r11d,%r9d
1226.byte	0x66,0x90
1227.byte	102,15,56,220,224
1228.byte	102,15,56,220,232
1229	movl	%r9d,16+12(%rsp)
1230	leaq	2(%r8),%r9
1231.byte	102,15,56,220,240
1232.byte	102,15,56,220,248
1233.byte	102,68,15,56,220,192
1234.byte	102,68,15,56,220,200
1235	movups	64-128(%rcx),%xmm0
1236	bswapl	%r9d
1237.byte	102,15,56,220,209
1238.byte	102,15,56,220,217
1239	xorl	%r11d,%r9d
1240.byte	0x66,0x90
1241.byte	102,15,56,220,225
1242.byte	102,15,56,220,233
1243	movl	%r9d,32+12(%rsp)
1244	leaq	3(%r8),%r9
1245.byte	102,15,56,220,241
1246.byte	102,15,56,220,249
1247.byte	102,68,15,56,220,193
1248.byte	102,68,15,56,220,201
1249	movups	80-128(%rcx),%xmm1
1250	bswapl	%r9d
1251.byte	102,15,56,220,208
1252.byte	102,15,56,220,216
1253	xorl	%r11d,%r9d
1254.byte	0x66,0x90
1255.byte	102,15,56,220,224
1256.byte	102,15,56,220,232
1257	movl	%r9d,48+12(%rsp)
1258	leaq	4(%r8),%r9
1259.byte	102,15,56,220,240
1260.byte	102,15,56,220,248
1261.byte	102,68,15,56,220,192
1262.byte	102,68,15,56,220,200
1263	movups	96-128(%rcx),%xmm0
1264	bswapl	%r9d
1265.byte	102,15,56,220,209
1266.byte	102,15,56,220,217
1267	xorl	%r11d,%r9d
1268.byte	0x66,0x90
1269.byte	102,15,56,220,225
1270.byte	102,15,56,220,233
1271	movl	%r9d,64+12(%rsp)
1272	leaq	5(%r8),%r9
1273.byte	102,15,56,220,241
1274.byte	102,15,56,220,249
1275.byte	102,68,15,56,220,193
1276.byte	102,68,15,56,220,201
1277	movups	112-128(%rcx),%xmm1
1278	bswapl	%r9d
1279.byte	102,15,56,220,208
1280.byte	102,15,56,220,216
1281	xorl	%r11d,%r9d
1282.byte	0x66,0x90
1283.byte	102,15,56,220,224
1284.byte	102,15,56,220,232
1285	movl	%r9d,80+12(%rsp)
1286	leaq	6(%r8),%r9
1287.byte	102,15,56,220,240
1288.byte	102,15,56,220,248
1289.byte	102,68,15,56,220,192
1290.byte	102,68,15,56,220,200
1291	movups	128-128(%rcx),%xmm0
1292	bswapl	%r9d
1293.byte	102,15,56,220,209
1294.byte	102,15,56,220,217
1295	xorl	%r11d,%r9d
1296.byte	0x66,0x90
1297.byte	102,15,56,220,225
1298.byte	102,15,56,220,233
1299	movl	%r9d,96+12(%rsp)
1300	leaq	7(%r8),%r9
1301.byte	102,15,56,220,241
1302.byte	102,15,56,220,249
1303.byte	102,68,15,56,220,193
1304.byte	102,68,15,56,220,201
1305	movups	144-128(%rcx),%xmm1
1306	bswapl	%r9d
1307.byte	102,15,56,220,208
1308.byte	102,15,56,220,216
1309.byte	102,15,56,220,224
1310	xorl	%r11d,%r9d
1311	movdqu	0(%rdi),%xmm10
1312.byte	102,15,56,220,232
1313	movl	%r9d,112+12(%rsp)
1314	cmpl	$11,%eax
1315.byte	102,15,56,220,240
1316.byte	102,15,56,220,248
1317.byte	102,68,15,56,220,192
1318.byte	102,68,15,56,220,200
1319	movups	160-128(%rcx),%xmm0
1320
1321	jb	L$ctr32_enc_done
1322
1323.byte	102,15,56,220,209
1324.byte	102,15,56,220,217
1325.byte	102,15,56,220,225
1326.byte	102,15,56,220,233
1327.byte	102,15,56,220,241
1328.byte	102,15,56,220,249
1329.byte	102,68,15,56,220,193
1330.byte	102,68,15,56,220,201
1331	movups	176-128(%rcx),%xmm1
1332
1333.byte	102,15,56,220,208
1334.byte	102,15,56,220,216
1335.byte	102,15,56,220,224
1336.byte	102,15,56,220,232
1337.byte	102,15,56,220,240
1338.byte	102,15,56,220,248
1339.byte	102,68,15,56,220,192
1340.byte	102,68,15,56,220,200
1341	movups	192-128(%rcx),%xmm0
1342	je	L$ctr32_enc_done
1343
1344.byte	102,15,56,220,209
1345.byte	102,15,56,220,217
1346.byte	102,15,56,220,225
1347.byte	102,15,56,220,233
1348.byte	102,15,56,220,241
1349.byte	102,15,56,220,249
1350.byte	102,68,15,56,220,193
1351.byte	102,68,15,56,220,201
1352	movups	208-128(%rcx),%xmm1
1353
1354.byte	102,15,56,220,208
1355.byte	102,15,56,220,216
1356.byte	102,15,56,220,224
1357.byte	102,15,56,220,232
1358.byte	102,15,56,220,240
1359.byte	102,15,56,220,248
1360.byte	102,68,15,56,220,192
1361.byte	102,68,15,56,220,200
1362	movups	224-128(%rcx),%xmm0
1363	jmp	L$ctr32_enc_done
1364
1365.p2align	4
1366L$ctr32_enc_done:
1367	movdqu	16(%rdi),%xmm11
1368	pxor	%xmm0,%xmm10
1369	movdqu	32(%rdi),%xmm12
1370	pxor	%xmm0,%xmm11
1371	movdqu	48(%rdi),%xmm13
1372	pxor	%xmm0,%xmm12
1373	movdqu	64(%rdi),%xmm14
1374	pxor	%xmm0,%xmm13
1375	movdqu	80(%rdi),%xmm15
1376	pxor	%xmm0,%xmm14
1377	pxor	%xmm0,%xmm15
1378.byte	102,15,56,220,209
1379.byte	102,15,56,220,217
1380.byte	102,15,56,220,225
1381.byte	102,15,56,220,233
1382.byte	102,15,56,220,241
1383.byte	102,15,56,220,249
1384.byte	102,68,15,56,220,193
1385.byte	102,68,15,56,220,201
1386	movdqu	96(%rdi),%xmm1
1387	leaq	128(%rdi),%rdi
1388
1389.byte	102,65,15,56,221,210
1390	pxor	%xmm0,%xmm1
1391	movdqu	112-128(%rdi),%xmm10
1392.byte	102,65,15,56,221,219
1393	pxor	%xmm0,%xmm10
1394	movdqa	0(%rsp),%xmm11
1395.byte	102,65,15,56,221,228
1396.byte	102,65,15,56,221,237
1397	movdqa	16(%rsp),%xmm12
1398	movdqa	32(%rsp),%xmm13
1399.byte	102,65,15,56,221,246
1400.byte	102,65,15,56,221,255
1401	movdqa	48(%rsp),%xmm14
1402	movdqa	64(%rsp),%xmm15
1403.byte	102,68,15,56,221,193
1404	movdqa	80(%rsp),%xmm0
1405	movups	16-128(%rcx),%xmm1
1406.byte	102,69,15,56,221,202
1407
1408	movups	%xmm2,(%rsi)
1409	movdqa	%xmm11,%xmm2
1410	movups	%xmm3,16(%rsi)
1411	movdqa	%xmm12,%xmm3
1412	movups	%xmm4,32(%rsi)
1413	movdqa	%xmm13,%xmm4
1414	movups	%xmm5,48(%rsi)
1415	movdqa	%xmm14,%xmm5
1416	movups	%xmm6,64(%rsi)
1417	movdqa	%xmm15,%xmm6
1418	movups	%xmm7,80(%rsi)
1419	movdqa	%xmm0,%xmm7
1420	movups	%xmm8,96(%rsi)
1421	movups	%xmm9,112(%rsi)
1422	leaq	128(%rsi),%rsi
1423
1424	subq	$8,%rdx
1425	jnc	L$ctr32_loop8
1426
1427	addq	$8,%rdx
1428	jz	L$ctr32_done
1429	leaq	-128(%rcx),%rcx
1430
1431L$ctr32_tail:
1432
1433
1434	leaq	16(%rcx),%rcx
1435	cmpq	$4,%rdx
1436	jb	L$ctr32_loop3
1437	je	L$ctr32_loop4
1438
1439
1440	shll	$4,%eax
1441	movdqa	96(%rsp),%xmm8
1442	pxor	%xmm9,%xmm9
1443
1444	movups	16(%rcx),%xmm0
1445.byte	102,15,56,220,209
1446.byte	102,15,56,220,217
1447	leaq	32-16(%rcx,%rax,1),%rcx
1448	negq	%rax
1449.byte	102,15,56,220,225
1450	addq	$16,%rax
1451	movups	(%rdi),%xmm10
1452.byte	102,15,56,220,233
1453.byte	102,15,56,220,241
1454	movups	16(%rdi),%xmm11
1455	movups	32(%rdi),%xmm12
1456.byte	102,15,56,220,249
1457.byte	102,68,15,56,220,193
1458
1459	call	L$enc_loop8_enter
1460
1461	movdqu	48(%rdi),%xmm13
1462	pxor	%xmm10,%xmm2
1463	movdqu	64(%rdi),%xmm10
1464	pxor	%xmm11,%xmm3
1465	movdqu	%xmm2,(%rsi)
1466	pxor	%xmm12,%xmm4
1467	movdqu	%xmm3,16(%rsi)
1468	pxor	%xmm13,%xmm5
1469	movdqu	%xmm4,32(%rsi)
1470	pxor	%xmm10,%xmm6
1471	movdqu	%xmm5,48(%rsi)
1472	movdqu	%xmm6,64(%rsi)
1473	cmpq	$6,%rdx
1474	jb	L$ctr32_done
1475
1476	movups	80(%rdi),%xmm11
1477	xorps	%xmm11,%xmm7
1478	movups	%xmm7,80(%rsi)
1479	je	L$ctr32_done
1480
1481	movups	96(%rdi),%xmm12
1482	xorps	%xmm12,%xmm8
1483	movups	%xmm8,96(%rsi)
1484	jmp	L$ctr32_done
1485
1486.p2align	5
1487L$ctr32_loop4:
1488.byte	102,15,56,220,209
1489	leaq	16(%rcx),%rcx
1490	decl	%eax
1491.byte	102,15,56,220,217
1492.byte	102,15,56,220,225
1493.byte	102,15,56,220,233
1494	movups	(%rcx),%xmm1
1495	jnz	L$ctr32_loop4
1496.byte	102,15,56,221,209
1497.byte	102,15,56,221,217
1498	movups	(%rdi),%xmm10
1499	movups	16(%rdi),%xmm11
1500.byte	102,15,56,221,225
1501.byte	102,15,56,221,233
1502	movups	32(%rdi),%xmm12
1503	movups	48(%rdi),%xmm13
1504
1505	xorps	%xmm10,%xmm2
1506	movups	%xmm2,(%rsi)
1507	xorps	%xmm11,%xmm3
1508	movups	%xmm3,16(%rsi)
1509	pxor	%xmm12,%xmm4
1510	movdqu	%xmm4,32(%rsi)
1511	pxor	%xmm13,%xmm5
1512	movdqu	%xmm5,48(%rsi)
1513	jmp	L$ctr32_done
1514
1515.p2align	5
1516L$ctr32_loop3:
1517.byte	102,15,56,220,209
1518	leaq	16(%rcx),%rcx
1519	decl	%eax
1520.byte	102,15,56,220,217
1521.byte	102,15,56,220,225
1522	movups	(%rcx),%xmm1
1523	jnz	L$ctr32_loop3
1524.byte	102,15,56,221,209
1525.byte	102,15,56,221,217
1526.byte	102,15,56,221,225
1527
1528	movups	(%rdi),%xmm10
1529	xorps	%xmm10,%xmm2
1530	movups	%xmm2,(%rsi)
1531	cmpq	$2,%rdx
1532	jb	L$ctr32_done
1533
1534	movups	16(%rdi),%xmm11
1535	xorps	%xmm11,%xmm3
1536	movups	%xmm3,16(%rsi)
1537	je	L$ctr32_done
1538
1539	movups	32(%rdi),%xmm12
1540	xorps	%xmm12,%xmm4
1541	movups	%xmm4,32(%rsi)
1542
1543L$ctr32_done:
1544	xorps	%xmm0,%xmm0
1545	xorl	%r11d,%r11d
1546	pxor	%xmm1,%xmm1
1547	pxor	%xmm2,%xmm2
1548	pxor	%xmm3,%xmm3
1549	pxor	%xmm4,%xmm4
1550	pxor	%xmm5,%xmm5
1551	pxor	%xmm6,%xmm6
1552	pxor	%xmm7,%xmm7
1553	movaps	%xmm0,0(%rsp)
1554	pxor	%xmm8,%xmm8
1555	movaps	%xmm0,16(%rsp)
1556	pxor	%xmm9,%xmm9
1557	movaps	%xmm0,32(%rsp)
1558	pxor	%xmm10,%xmm10
1559	movaps	%xmm0,48(%rsp)
1560	pxor	%xmm11,%xmm11
1561	movaps	%xmm0,64(%rsp)
1562	pxor	%xmm12,%xmm12
1563	movaps	%xmm0,80(%rsp)
1564	pxor	%xmm13,%xmm13
1565	movaps	%xmm0,96(%rsp)
1566	pxor	%xmm14,%xmm14
1567	movaps	%xmm0,112(%rsp)
1568	pxor	%xmm15,%xmm15
1569	leaq	(%rbp),%rsp
1570	popq	%rbp
1571L$ctr32_epilogue:
1572	.byte	0xf3,0xc3
1573
// _aesni_xts_encrypt -- per its name, the AES-NI XTS-mode bulk encryptor.
//
// Hand-encoded AES-NI opcodes used below:
//   .byte 102,15,56,220,<modrm>  = aesenc
//   .byte 102,15,56,221,<modrm>  = aesenclast
//
// Register inputs, as the code below uses them:
//   %rdi  source pointer (16-byte blocks are loaded from it)
//   %rsi  destination pointer (results are stored to it)
//   %rdx  byte count
//   %rcx  key schedule applied to the data blocks
//   %r8   key schedule used once, to encrypt the initial tweak
//   %r9   pointer to the 16-byte value that is encrypted into the first tweak
// Each key schedule holds a 32-bit loop count at offset 240: one aesenc is
// issued per count, followed by a single aesenclast.
// NOTE(review): these roles line up with SysV argument order
// (in, out, len, key1, key2, iv) -- confirm against the C prototype.
//
// Stack: 112 bytes of scratch are reserved and %rsp is aligned down to 16.
// The scratch area and %xmm0-%xmm15 are zeroed before returning.
// A trailing partial block (len % 16 != 0) is handled by the byte-swapping
// loop at L$xts_enc_steal followed by one more single-block encryption.
1574.globl	_aesni_xts_encrypt
1575.private_extern _aesni_xts_encrypt
1576
1577.p2align	4
1578_aesni_xts_encrypt:
// Prologue: %rax = entry %rsp, so %rbp = %rax-8 addresses the slot written by
// pushq; the epilogue restores %rsp from %rbp and pops the saved %rbp.
1579	leaq	(%rsp),%rax
1580	pushq	%rbp
1581	subq	$112,%rsp
1582	andq	$-16,%rsp
1583	leaq	-8(%rax),%rbp
// Encrypt the 16 bytes at (%r9) with the schedule at %r8; %xmm2 ends up holding
// the initial tweak. %r10d picks up the data-key loop count in passing.
1584	movups	(%r9),%xmm2
1585	movl	240(%r8),%eax
1586	movl	240(%rcx),%r10d
1587	movups	(%r8),%xmm0
1588	movups	16(%r8),%xmm1
1589	leaq	32(%r8),%r8
1590	xorps	%xmm0,%xmm2
1591L$oop_enc1_8:
1592.byte	102,15,56,220,209
1593	decl	%eax
1594	movups	(%r8),%xmm1
1595	leaq	16(%r8),%r8
1596	jnz	L$oop_enc1_8
1597.byte	102,15,56,221,209
// Data-key setup: %xmm0 = round-0 key, %r11 = schedule base, %eax = loop count,
// %r10 = count*16, %r9 = original length (its low 4 bits are consulted at
// L$xts_enc_done), %rdx = length rounded down to whole blocks.
1598	movups	(%rcx),%xmm0
1599	movq	%rcx,%r11
1600	movl	%r10d,%eax
1601	shll	$4,%r10d
1602	movq	%rdx,%r9
1603	andq	$-16,%rdx
1604
// %xmm1 = last round key (offset 16 + count*16); XORed with the round-0 key
// below and parked at 96(%rsp).
1605	movups	16(%rcx,%r10,1),%xmm1
1606
// Derive the first tweaks. %xmm15 is the running tweak; each step doubles it:
// paddq shifts both 64-bit halves left by one, while a sign mask built from the
// dwords replicated by pshufd $95 (psrad $31) selects bits of L$xts_magic to XOR
// back in. paddd on %xmm9 advances the replicated copy so the next step can
// reuse it. %xmm10-%xmm14 receive tweaks 0-4, each pre-XORed with the round-0
// key; %xmm15 is left holding the sixth tweak without that XOR.
// NOTE(review): L$xts_magic is defined outside this chunk; presumably the
// carry/reduction constant for doubling in GF(2^128) -- confirm.
1607	movdqa	L$xts_magic(%rip),%xmm8
1608	movdqa	%xmm2,%xmm15
1609	pshufd	$95,%xmm2,%xmm9
1610	pxor	%xmm0,%xmm1
1611	movdqa	%xmm9,%xmm14
1612	paddd	%xmm9,%xmm9
1613	movdqa	%xmm15,%xmm10
1614	psrad	$31,%xmm14
1615	paddq	%xmm15,%xmm15
1616	pand	%xmm8,%xmm14
1617	pxor	%xmm0,%xmm10
1618	pxor	%xmm14,%xmm15
1619	movdqa	%xmm9,%xmm14
1620	paddd	%xmm9,%xmm9
1621	movdqa	%xmm15,%xmm11
1622	psrad	$31,%xmm14
1623	paddq	%xmm15,%xmm15
1624	pand	%xmm8,%xmm14
1625	pxor	%xmm0,%xmm11
1626	pxor	%xmm14,%xmm15
1627	movdqa	%xmm9,%xmm14
1628	paddd	%xmm9,%xmm9
1629	movdqa	%xmm15,%xmm12
1630	psrad	$31,%xmm14
1631	paddq	%xmm15,%xmm15
1632	pand	%xmm8,%xmm14
1633	pxor	%xmm0,%xmm12
1634	pxor	%xmm14,%xmm15
1635	movdqa	%xmm9,%xmm14
1636	paddd	%xmm9,%xmm9
1637	movdqa	%xmm15,%xmm13
1638	psrad	$31,%xmm14
1639	paddq	%xmm15,%xmm15
1640	pand	%xmm8,%xmm14
1641	pxor	%xmm0,%xmm13
1642	pxor	%xmm14,%xmm15
1643	movdqa	%xmm15,%xmm14
1644	psrad	$31,%xmm9
1645	paddq	%xmm15,%xmm15
1646	pand	%xmm8,%xmm9
1647	pxor	%xmm0,%xmm14
1648	pxor	%xmm9,%xmm15
// 96(%rsp) = round-0 key XOR last round key.
1649	movaps	%xmm1,96(%rsp)
1650
// Fewer than 96 bytes of whole blocks: go straight to the short path.
1651	subq	$96,%rdx
1652	jc	L$xts_enc_short
1653
// Six-block loop setup: %rcx = base + 32 + count*16 and %rax = %r10 =
// 112 - count*16, an index that L$xts_enc_loop6 steps by 32 until it hits zero.
// %r8 is repurposed to point at L$xts_magic.
1654	movl	$16+96,%eax
1655	leaq	32(%r11,%r10,1),%rcx
1656	subq	%r10,%rax
1657	movups	16(%r11),%xmm1
1658	movq	%rax,%r10
1659	leaq	L$xts_magic(%rip),%r8
1660	jmp	L$xts_enc_grandloop
1661
1662.p2align	5
// One iteration encrypts six blocks (96 bytes). Inputs are XORed with
// (tweak XOR round-0 key) while the first two rounds are already being issued.
// Each tweak is then XORed with 96(%rsp), turning it into
// (tweak XOR last round key), and spilled to 0..80(%rsp) so that the final
// aesenclast with a memory operand applies the last round key and the output
// tweak in one step.
1663L$xts_enc_grandloop:
1664	movdqu	0(%rdi),%xmm2
1665	movdqa	%xmm0,%xmm8
1666	movdqu	16(%rdi),%xmm3
1667	pxor	%xmm10,%xmm2
1668	movdqu	32(%rdi),%xmm4
1669	pxor	%xmm11,%xmm3
1670.byte	102,15,56,220,209
1671	movdqu	48(%rdi),%xmm5
1672	pxor	%xmm12,%xmm4
1673.byte	102,15,56,220,217
1674	movdqu	64(%rdi),%xmm6
1675	pxor	%xmm13,%xmm5
1676.byte	102,15,56,220,225
1677	movdqu	80(%rdi),%xmm7
1678	pxor	%xmm15,%xmm8
1679	movdqa	96(%rsp),%xmm9
1680	pxor	%xmm14,%xmm6
1681.byte	102,15,56,220,233
1682	movups	32(%r11),%xmm0
1683	leaq	96(%rdi),%rdi
1684	pxor	%xmm8,%xmm7
1685
1686	pxor	%xmm9,%xmm10
1687.byte	102,15,56,220,241
1688	pxor	%xmm9,%xmm11
1689	movdqa	%xmm10,0(%rsp)
1690.byte	102,15,56,220,249
1691	movups	48(%r11),%xmm1
1692	pxor	%xmm9,%xmm12
1693
1694.byte	102,15,56,220,208
1695	pxor	%xmm9,%xmm13
1696	movdqa	%xmm11,16(%rsp)
1697.byte	102,15,56,220,216
1698	pxor	%xmm9,%xmm14
1699	movdqa	%xmm12,32(%rsp)
1700.byte	102,15,56,220,224
1701.byte	102,15,56,220,232
1702	pxor	%xmm9,%xmm8
1703	movdqa	%xmm14,64(%rsp)
1704.byte	102,15,56,220,240
1705.byte	102,15,56,220,248
1706	movups	64(%r11),%xmm0
1707	movdqa	%xmm8,80(%rsp)
// Start the next batch of tweaks from the current %xmm15.
1708	pshufd	$95,%xmm15,%xmm9
1709	jmp	L$xts_enc_loop6
1710.p2align	5
// Middle rounds: two aesenc rounds on all six blocks per pass, alternating
// round keys between %xmm1 and %xmm0; exits when %rax reaches zero.
1711L$xts_enc_loop6:
1712.byte	102,15,56,220,209
1713.byte	102,15,56,220,217
1714.byte	102,15,56,220,225
1715.byte	102,15,56,220,233
1716.byte	102,15,56,220,241
1717.byte	102,15,56,220,249
1718	movups	-64(%rcx,%rax,1),%xmm1
1719	addq	$32,%rax
1720
1721.byte	102,15,56,220,208
1722.byte	102,15,56,220,216
1723.byte	102,15,56,220,224
1724.byte	102,15,56,220,232
1725.byte	102,15,56,220,240
1726.byte	102,15,56,220,248
1727	movups	-80(%rcx,%rax,1),%xmm0
1728	jnz	L$xts_enc_loop6
1729
// Remaining rounds, interleaved with computing the next six tweaks.
// %xmm10-%xmm14 are rebuilt as (round-0 key XOR new tweak); the round-0 key is
// reloaded from (%r11) and copied down the chain before each XOR.
1730	movdqa	(%r8),%xmm8
1731	movdqa	%xmm9,%xmm14
1732	paddd	%xmm9,%xmm9
1733.byte	102,15,56,220,209
1734	paddq	%xmm15,%xmm15
1735	psrad	$31,%xmm14
1736.byte	102,15,56,220,217
1737	pand	%xmm8,%xmm14
1738	movups	(%r11),%xmm10
1739.byte	102,15,56,220,225
1740.byte	102,15,56,220,233
1741.byte	102,15,56,220,241
1742	pxor	%xmm14,%xmm15
1743	movaps	%xmm10,%xmm11
1744.byte	102,15,56,220,249
1745	movups	-64(%rcx),%xmm1
1746
1747	movdqa	%xmm9,%xmm14
1748.byte	102,15,56,220,208
1749	paddd	%xmm9,%xmm9
1750	pxor	%xmm15,%xmm10
1751.byte	102,15,56,220,216
1752	psrad	$31,%xmm14
1753	paddq	%xmm15,%xmm15
1754.byte	102,15,56,220,224
1755.byte	102,15,56,220,232
1756	pand	%xmm8,%xmm14
1757	movaps	%xmm11,%xmm12
1758.byte	102,15,56,220,240
1759	pxor	%xmm14,%xmm15
1760	movdqa	%xmm9,%xmm14
1761.byte	102,15,56,220,248
1762	movups	-48(%rcx),%xmm0
1763
1764	paddd	%xmm9,%xmm9
1765.byte	102,15,56,220,209
1766	pxor	%xmm15,%xmm11
1767	psrad	$31,%xmm14
1768.byte	102,15,56,220,217
1769	paddq	%xmm15,%xmm15
1770	pand	%xmm8,%xmm14
1771.byte	102,15,56,220,225
1772.byte	102,15,56,220,233
// The fourth block's (tweak XOR last round key) is spilled here, just before
// %xmm13 is overwritten for the next batch.
1773	movdqa	%xmm13,48(%rsp)
1774	pxor	%xmm14,%xmm15
1775.byte	102,15,56,220,241
1776	movaps	%xmm12,%xmm13
1777	movdqa	%xmm9,%xmm14
1778.byte	102,15,56,220,249
1779	movups	-32(%rcx),%xmm1
1780
1781	paddd	%xmm9,%xmm9
1782.byte	102,15,56,220,208
1783	pxor	%xmm15,%xmm12
1784	psrad	$31,%xmm14
1785.byte	102,15,56,220,216
1786	paddq	%xmm15,%xmm15
1787	pand	%xmm8,%xmm14
1788.byte	102,15,56,220,224
1789.byte	102,15,56,220,232
1790.byte	102,15,56,220,240
1791	pxor	%xmm14,%xmm15
1792	movaps	%xmm13,%xmm14
1793.byte	102,15,56,220,248
1794
// %xmm0 serves briefly as the carry-mask temporary, then is reloaded with the
// round-0 key (and %xmm1 with the round-1 key) for the next iteration.
1795	movdqa	%xmm9,%xmm0
1796	paddd	%xmm9,%xmm9
1797.byte	102,15,56,220,209
1798	pxor	%xmm15,%xmm13
1799	psrad	$31,%xmm0
1800.byte	102,15,56,220,217
1801	paddq	%xmm15,%xmm15
1802	pand	%xmm8,%xmm0
1803.byte	102,15,56,220,225
1804.byte	102,15,56,220,233
1805	pxor	%xmm0,%xmm15
1806	movups	(%r11),%xmm0
1807.byte	102,15,56,220,241
1808.byte	102,15,56,220,249
1809	movups	16(%r11),%xmm1
1810
// aesenclast with memory operands 0,16,32,48,64,80(%rsp) finishes the six
// blocks; %rax is reset from %r10 for the next pass of L$xts_enc_loop6.
1811	pxor	%xmm15,%xmm14
1812.byte	102,15,56,221,84,36,0
1813	psrad	$31,%xmm9
1814	paddq	%xmm15,%xmm15
1815.byte	102,15,56,221,92,36,16
1816.byte	102,15,56,221,100,36,32
1817	pand	%xmm8,%xmm9
1818	movq	%r10,%rax
1819.byte	102,15,56,221,108,36,48
1820.byte	102,15,56,221,116,36,64
1821.byte	102,15,56,221,124,36,80
1822	pxor	%xmm9,%xmm15
1823
// Store the six results and continue while at least 96 more bytes remain.
1824	leaq	96(%rsi),%rsi
1825	movups	%xmm2,-96(%rsi)
1826	movups	%xmm3,-80(%rsi)
1827	movups	%xmm4,-64(%rsi)
1828	movups	%xmm5,-48(%rsi)
1829	movups	%xmm6,-32(%rsi)
1830	movups	%xmm7,-16(%rsi)
1831	subq	$96,%rdx
1832	jnc	L$xts_enc_grandloop
1833
// Undo the loop setup: %eax = (112 - %r10d) >> 4 recovers the loop count and
// %rcx points at the schedule base again.
1834	movl	$16+96,%eax
1835	subl	%r10d,%eax
1836	movq	%r11,%rcx
1837	shrl	$4,%eax
1838
// 0 to 5 whole blocks remain. %xmm0 is the round-0 key on both entry paths, so
// each pxor strips it from a tweak just before that tweak may be needed.
// The pxor instructions do not touch EFLAGS, so every je below still tests the
// cmpq that precedes the pxor.
1839L$xts_enc_short:
1840
1841	movl	%eax,%r10d
1842	pxor	%xmm0,%xmm10
1843	addq	$96,%rdx
1844	jz	L$xts_enc_done
1845
1846	pxor	%xmm0,%xmm11
1847	cmpq	$32,%rdx
1848	jb	L$xts_enc_one
1849	pxor	%xmm0,%xmm12
1850	je	L$xts_enc_two
1851
1852	pxor	%xmm0,%xmm13
1853	cmpq	$64,%rdx
1854	jb	L$xts_enc_three
1855	pxor	%xmm0,%xmm14
1856	je	L$xts_enc_four
1857
// Five blocks: tweak-XOR the inputs, run them through _aesni_encrypt6 with
// %xmm7 zeroed, XOR with the tweaks again and store. %xmm10 is left holding the
// next tweak for a possible partial-block tail.
// NOTE(review): _aesni_encrypt6 is defined outside this chunk; presumably it
// encrypts %xmm2-%xmm7 using %rcx/%eax -- confirm.
1858	movdqu	(%rdi),%xmm2
1859	movdqu	16(%rdi),%xmm3
1860	movdqu	32(%rdi),%xmm4
1861	pxor	%xmm10,%xmm2
1862	movdqu	48(%rdi),%xmm5
1863	pxor	%xmm11,%xmm3
1864	movdqu	64(%rdi),%xmm6
1865	leaq	80(%rdi),%rdi
1866	pxor	%xmm12,%xmm4
1867	pxor	%xmm13,%xmm5
1868	pxor	%xmm14,%xmm6
1869	pxor	%xmm7,%xmm7
1870
1871	call	_aesni_encrypt6
1872
1873	xorps	%xmm10,%xmm2
1874	movdqa	%xmm15,%xmm10
1875	xorps	%xmm11,%xmm3
1876	xorps	%xmm12,%xmm4
1877	movdqu	%xmm2,(%rsi)
1878	xorps	%xmm13,%xmm5
1879	movdqu	%xmm3,16(%rsi)
1880	xorps	%xmm14,%xmm6
1881	movdqu	%xmm4,32(%rsi)
1882	movdqu	%xmm5,48(%rsi)
1883	movdqu	%xmm6,64(%rsi)
1884	leaq	80(%rsi),%rsi
1885	jmp	L$xts_enc_done
1886
1887.p2align	4
// One block: inline single-block encryption, tweak XORed in before and after.
// %xmm10 then becomes the next tweak.
1888L$xts_enc_one:
1889	movups	(%rdi),%xmm2
1890	leaq	16(%rdi),%rdi
1891	xorps	%xmm10,%xmm2
1892	movups	(%rcx),%xmm0
1893	movups	16(%rcx),%xmm1
1894	leaq	32(%rcx),%rcx
1895	xorps	%xmm0,%xmm2
1896L$oop_enc1_9:
1897.byte	102,15,56,220,209
1898	decl	%eax
1899	movups	(%rcx),%xmm1
1900	leaq	16(%rcx),%rcx
1901	jnz	L$oop_enc1_9
1902.byte	102,15,56,221,209
1903	xorps	%xmm10,%xmm2
1904	movdqa	%xmm11,%xmm10
1905	movups	%xmm2,(%rsi)
1906	leaq	16(%rsi),%rsi
1907	jmp	L$xts_enc_done
1908
1909.p2align	4
// Two blocks via _aesni_encrypt2; %xmm10 becomes the third tweak.
1910L$xts_enc_two:
1911	movups	(%rdi),%xmm2
1912	movups	16(%rdi),%xmm3
1913	leaq	32(%rdi),%rdi
1914	xorps	%xmm10,%xmm2
1915	xorps	%xmm11,%xmm3
1916
1917	call	_aesni_encrypt2
1918
1919	xorps	%xmm10,%xmm2
1920	movdqa	%xmm12,%xmm10
1921	xorps	%xmm11,%xmm3
1922	movups	%xmm2,(%rsi)
1923	movups	%xmm3,16(%rsi)
1924	leaq	32(%rsi),%rsi
1925	jmp	L$xts_enc_done
1926
1927.p2align	4
// Three blocks via _aesni_encrypt3 (defined outside this chunk); %xmm10 becomes
// the fourth tweak.
1928L$xts_enc_three:
1929	movups	(%rdi),%xmm2
1930	movups	16(%rdi),%xmm3
1931	movups	32(%rdi),%xmm4
1932	leaq	48(%rdi),%rdi
1933	xorps	%xmm10,%xmm2
1934	xorps	%xmm11,%xmm3
1935	xorps	%xmm12,%xmm4
1936
1937	call	_aesni_encrypt3
1938
1939	xorps	%xmm10,%xmm2
1940	movdqa	%xmm13,%xmm10
1941	xorps	%xmm11,%xmm3
1942	xorps	%xmm12,%xmm4
1943	movups	%xmm2,(%rsi)
1944	movups	%xmm3,16(%rsi)
1945	movups	%xmm4,32(%rsi)
1946	leaq	48(%rsi),%rsi
1947	jmp	L$xts_enc_done
1948
1949.p2align	4
// Four blocks via _aesni_encrypt4 (defined outside this chunk); %xmm10 becomes
// the fifth tweak.
1950L$xts_enc_four:
1951	movups	(%rdi),%xmm2
1952	movups	16(%rdi),%xmm3
1953	movups	32(%rdi),%xmm4
1954	xorps	%xmm10,%xmm2
1955	movups	48(%rdi),%xmm5
1956	leaq	64(%rdi),%rdi
1957	xorps	%xmm11,%xmm3
1958	xorps	%xmm12,%xmm4
1959	xorps	%xmm13,%xmm5
1960
1961	call	_aesni_encrypt4
1962
1963	pxor	%xmm10,%xmm2
1964	movdqa	%xmm14,%xmm10
1965	pxor	%xmm11,%xmm3
1966	pxor	%xmm12,%xmm4
1967	movdqu	%xmm2,(%rsi)
1968	pxor	%xmm13,%xmm5
1969	movdqu	%xmm3,16(%rsi)
1970	movdqu	%xmm4,32(%rsi)
1971	movdqu	%xmm5,48(%rsi)
1972	leaq	64(%rsi),%rsi
1973	jmp	L$xts_enc_done
1974
1975.p2align	4
// All whole blocks are written. %r9 & 15 is the length of the trailing partial
// block; zero means there is nothing left to do.
1976L$xts_enc_done:
1977	andq	$15,%r9
1978	jz	L$xts_enc_ret
1979	movq	%r9,%rdx
1980
// Byte-wise swap for the partial tail: each remaining input byte replaces the
// corresponding byte of the last full output block (at -16(%rsi)), and the byte
// it displaces is written out as the tail at 0(%rsi).
1981L$xts_enc_steal:
1982	movzbl	(%rdi),%eax
1983	movzbl	-16(%rsi),%ecx
1984	leaq	1(%rdi),%rdi
1985	movb	%al,-16(%rsi)
1986	movb	%cl,0(%rsi)
1987	leaq	1(%rsi),%rsi
1988	subq	$1,%rdx
1989	jnz	L$xts_enc_steal
1990
// Rewind %rsi, then encrypt the patched block at -16(%rsi) in place using the
// tweak kept in %xmm10.
1991	subq	%r9,%rsi
1992	movq	%r11,%rcx
1993	movl	%r10d,%eax
1994
1995	movups	-16(%rsi),%xmm2
1996	xorps	%xmm10,%xmm2
1997	movups	(%rcx),%xmm0
1998	movups	16(%rcx),%xmm1
1999	leaq	32(%rcx),%rcx
2000	xorps	%xmm0,%xmm2
2001L$oop_enc1_10:
2002.byte	102,15,56,220,209
2003	decl	%eax
2004	movups	(%rcx),%xmm1
2005	leaq	16(%rcx),%rcx
2006	jnz	L$oop_enc1_10
2007.byte	102,15,56,221,209
2008	xorps	%xmm10,%xmm2
2009	movups	%xmm2,-16(%rsi)
2010
// Epilogue: clear every XMM register and the seven 16-byte stack slots that
// held key/tweak material, then unwind the frame.
2011L$xts_enc_ret:
2012	xorps	%xmm0,%xmm0
2013	pxor	%xmm1,%xmm1
2014	pxor	%xmm2,%xmm2
2015	pxor	%xmm3,%xmm3
2016	pxor	%xmm4,%xmm4
2017	pxor	%xmm5,%xmm5
2018	pxor	%xmm6,%xmm6
2019	pxor	%xmm7,%xmm7
2020	movaps	%xmm0,0(%rsp)
2021	pxor	%xmm8,%xmm8
2022	movaps	%xmm0,16(%rsp)
2023	pxor	%xmm9,%xmm9
2024	movaps	%xmm0,32(%rsp)
2025	pxor	%xmm10,%xmm10
2026	movaps	%xmm0,48(%rsp)
2027	pxor	%xmm11,%xmm11
2028	movaps	%xmm0,64(%rsp)
2029	pxor	%xmm12,%xmm12
2030	movaps	%xmm0,80(%rsp)
2031	pxor	%xmm13,%xmm13
2032	movaps	%xmm0,96(%rsp)
2033	pxor	%xmm14,%xmm14
2034	pxor	%xmm15,%xmm15
2035	leaq	(%rbp),%rsp
2036	popq	%rbp
2037L$xts_enc_epilogue:
// 0xf3,0xc3 encodes "rep ret".
2038	.byte	0xf3,0xc3
2039
// _aesni_xts_decrypt -- per its name, the AES-NI XTS-mode bulk decryptor.
//
// Hand-encoded AES-NI opcodes used below:
//   .byte 102,15,56,220,<modrm>  = aesenc       (initial tweak only)
//   .byte 102,15,56,221,<modrm>  = aesenclast   (initial tweak only)
//   .byte 102,15,56,222,<modrm>  = aesdec
//   .byte 102,15,56,223,<modrm>  = aesdeclast
//
// Register inputs, as the code below uses them:
//   %rdi  source pointer (16-byte blocks are loaded from it)
//   %rsi  destination pointer (results are stored to it)
//   %rdx  byte count
//   %rcx  key schedule applied to the data blocks (with aesdec)
//   %r8   key schedule used once, to ENCRYPT the initial tweak
//   %r9   pointer to the 16-byte value that is encrypted into the first tweak
// Each key schedule holds a 32-bit loop count at offset 240.
// NOTE(review): these roles line up with SysV argument order
// (in, out, len, key1, key2, iv) -- confirm against the C prototype.
//
// Stack: 112 bytes of scratch are reserved and %rsp is aligned down to 16.
// The scratch area and %xmm0-%xmm15 are zeroed before returning.
// When len % 16 != 0, one full block is held back from the bulk paths and
// processed together with the partial tail at L$xts_dec_done2, which needs two
// consecutive tweaks (%xmm10 and %xmm11).
2040.globl	_aesni_xts_decrypt
2041.private_extern _aesni_xts_decrypt
2042
2043.p2align	4
2044_aesni_xts_decrypt:
// Prologue: %rax = entry %rsp, so %rbp = %rax-8 addresses the slot written by
// pushq; the epilogue restores %rsp from %rbp and pops the saved %rbp.
2045	leaq	(%rsp),%rax
2046	pushq	%rbp
2047	subq	$112,%rsp
2048	andq	$-16,%rsp
2049	leaq	-8(%rax),%rbp
// Encrypt (aesenc/aesenclast, not aesdec) the 16 bytes at (%r9) with the
// schedule at %r8; %xmm2 ends up holding the initial tweak.
2050	movups	(%r9),%xmm2
2051	movl	240(%r8),%eax
2052	movl	240(%rcx),%r10d
2053	movups	(%r8),%xmm0
2054	movups	16(%r8),%xmm1
2055	leaq	32(%r8),%r8
2056	xorps	%xmm0,%xmm2
2057L$oop_enc1_11:
2058.byte	102,15,56,220,209
2059	decl	%eax
2060	movups	(%r8),%xmm1
2061	leaq	16(%r8),%r8
2062	jnz	L$oop_enc1_11
2063.byte	102,15,56,221,209
// If the length is not a multiple of 16, take 16 bytes off %rdx so that the
// last full block is left for the tail handling at L$xts_dec_done2.
2064	xorl	%eax,%eax
2065	testq	$15,%rdx
2066	setnz	%al
2067	shlq	$4,%rax
2068	subq	%rax,%rdx
2069
// Data-key setup: %xmm0 = round-0 key, %r11 = schedule base, %eax = loop count,
// %r10 = count*16, %r9 = adjusted length (subtracting 16 left its low 4 bits
// intact), %rdx = length rounded down to whole blocks.
2070	movups	(%rcx),%xmm0
2071	movq	%rcx,%r11
2072	movl	%r10d,%eax
2073	shll	$4,%r10d
2074	movq	%rdx,%r9
2075	andq	$-16,%rdx
2076
// %xmm1 = last round key (offset 16 + count*16); XORed with the round-0 key
// below and parked at 96(%rsp).
2077	movups	16(%rcx,%r10,1),%xmm1
2078
// Derive the first tweaks. %xmm15 is the running tweak; each step doubles it:
// paddq shifts both 64-bit halves left by one, while a sign mask built from the
// dwords replicated by pshufd $95 (psrad $31) selects bits of L$xts_magic to XOR
// back in. %xmm10-%xmm14 receive tweaks 0-4, each pre-XORed with the round-0
// key; %xmm15 is left holding the sixth tweak without that XOR.
// NOTE(review): L$xts_magic is defined outside this chunk; presumably the
// carry/reduction constant for doubling in GF(2^128) -- confirm.
2079	movdqa	L$xts_magic(%rip),%xmm8
2080	movdqa	%xmm2,%xmm15
2081	pshufd	$95,%xmm2,%xmm9
2082	pxor	%xmm0,%xmm1
2083	movdqa	%xmm9,%xmm14
2084	paddd	%xmm9,%xmm9
2085	movdqa	%xmm15,%xmm10
2086	psrad	$31,%xmm14
2087	paddq	%xmm15,%xmm15
2088	pand	%xmm8,%xmm14
2089	pxor	%xmm0,%xmm10
2090	pxor	%xmm14,%xmm15
2091	movdqa	%xmm9,%xmm14
2092	paddd	%xmm9,%xmm9
2093	movdqa	%xmm15,%xmm11
2094	psrad	$31,%xmm14
2095	paddq	%xmm15,%xmm15
2096	pand	%xmm8,%xmm14
2097	pxor	%xmm0,%xmm11
2098	pxor	%xmm14,%xmm15
2099	movdqa	%xmm9,%xmm14
2100	paddd	%xmm9,%xmm9
2101	movdqa	%xmm15,%xmm12
2102	psrad	$31,%xmm14
2103	paddq	%xmm15,%xmm15
2104	pand	%xmm8,%xmm14
2105	pxor	%xmm0,%xmm12
2106	pxor	%xmm14,%xmm15
2107	movdqa	%xmm9,%xmm14
2108	paddd	%xmm9,%xmm9
2109	movdqa	%xmm15,%xmm13
2110	psrad	$31,%xmm14
2111	paddq	%xmm15,%xmm15
2112	pand	%xmm8,%xmm14
2113	pxor	%xmm0,%xmm13
2114	pxor	%xmm14,%xmm15
2115	movdqa	%xmm15,%xmm14
2116	psrad	$31,%xmm9
2117	paddq	%xmm15,%xmm15
2118	pand	%xmm8,%xmm9
2119	pxor	%xmm0,%xmm14
2120	pxor	%xmm9,%xmm15
// 96(%rsp) = round-0 key XOR last round key.
2121	movaps	%xmm1,96(%rsp)
2122
// Fewer than 96 bytes of whole blocks: go straight to the short path.
2123	subq	$96,%rdx
2124	jc	L$xts_dec_short
2125
// Six-block loop setup: %rcx = base + 32 + count*16 and %rax = %r10 =
// 112 - count*16, an index that L$xts_dec_loop6 steps by 32 until it hits zero.
// %r8 is repurposed to point at L$xts_magic.
2126	movl	$16+96,%eax
2127	leaq	32(%r11,%r10,1),%rcx
2128	subq	%r10,%rax
2129	movups	16(%r11),%xmm1
2130	movq	%rax,%r10
2131	leaq	L$xts_magic(%rip),%r8
2132	jmp	L$xts_dec_grandloop
2133
2134.p2align	5
// One iteration decrypts six blocks (96 bytes). Inputs are XORed with
// (tweak XOR round-0 key) while the first two rounds are already being issued.
// Each tweak is then XORed with 96(%rsp), turning it into
// (tweak XOR last round key), and spilled to 0..80(%rsp) so that the final
// aesdeclast with a memory operand applies the last round key and the output
// tweak in one step.
2135L$xts_dec_grandloop:
2136	movdqu	0(%rdi),%xmm2
2137	movdqa	%xmm0,%xmm8
2138	movdqu	16(%rdi),%xmm3
2139	pxor	%xmm10,%xmm2
2140	movdqu	32(%rdi),%xmm4
2141	pxor	%xmm11,%xmm3
2142.byte	102,15,56,222,209
2143	movdqu	48(%rdi),%xmm5
2144	pxor	%xmm12,%xmm4
2145.byte	102,15,56,222,217
2146	movdqu	64(%rdi),%xmm6
2147	pxor	%xmm13,%xmm5
2148.byte	102,15,56,222,225
2149	movdqu	80(%rdi),%xmm7
2150	pxor	%xmm15,%xmm8
2151	movdqa	96(%rsp),%xmm9
2152	pxor	%xmm14,%xmm6
2153.byte	102,15,56,222,233
2154	movups	32(%r11),%xmm0
2155	leaq	96(%rdi),%rdi
2156	pxor	%xmm8,%xmm7
2157
2158	pxor	%xmm9,%xmm10
2159.byte	102,15,56,222,241
2160	pxor	%xmm9,%xmm11
2161	movdqa	%xmm10,0(%rsp)
2162.byte	102,15,56,222,249
2163	movups	48(%r11),%xmm1
2164	pxor	%xmm9,%xmm12
2165
2166.byte	102,15,56,222,208
2167	pxor	%xmm9,%xmm13
2168	movdqa	%xmm11,16(%rsp)
2169.byte	102,15,56,222,216
2170	pxor	%xmm9,%xmm14
2171	movdqa	%xmm12,32(%rsp)
2172.byte	102,15,56,222,224
2173.byte	102,15,56,222,232
2174	pxor	%xmm9,%xmm8
2175	movdqa	%xmm14,64(%rsp)
2176.byte	102,15,56,222,240
2177.byte	102,15,56,222,248
2178	movups	64(%r11),%xmm0
2179	movdqa	%xmm8,80(%rsp)
// Start the next batch of tweaks from the current %xmm15.
2180	pshufd	$95,%xmm15,%xmm9
2181	jmp	L$xts_dec_loop6
2182.p2align	5
// Middle rounds: two aesdec rounds on all six blocks per pass, alternating
// round keys between %xmm1 and %xmm0; exits when %rax reaches zero.
2183L$xts_dec_loop6:
2184.byte	102,15,56,222,209
2185.byte	102,15,56,222,217
2186.byte	102,15,56,222,225
2187.byte	102,15,56,222,233
2188.byte	102,15,56,222,241
2189.byte	102,15,56,222,249
2190	movups	-64(%rcx,%rax,1),%xmm1
2191	addq	$32,%rax
2192
2193.byte	102,15,56,222,208
2194.byte	102,15,56,222,216
2195.byte	102,15,56,222,224
2196.byte	102,15,56,222,232
2197.byte	102,15,56,222,240
2198.byte	102,15,56,222,248
2199	movups	-80(%rcx,%rax,1),%xmm0
2200	jnz	L$xts_dec_loop6
2201
// Remaining rounds, interleaved with computing the next six tweaks.
// %xmm8 is reloaded with L$xts_magic (via %r8) and keeps that value on loop
// exit. %xmm10-%xmm14 are rebuilt as (round-0 key XOR new tweak).
2202	movdqa	(%r8),%xmm8
2203	movdqa	%xmm9,%xmm14
2204	paddd	%xmm9,%xmm9
2205.byte	102,15,56,222,209
2206	paddq	%xmm15,%xmm15
2207	psrad	$31,%xmm14
2208.byte	102,15,56,222,217
2209	pand	%xmm8,%xmm14
2210	movups	(%r11),%xmm10
2211.byte	102,15,56,222,225
2212.byte	102,15,56,222,233
2213.byte	102,15,56,222,241
2214	pxor	%xmm14,%xmm15
2215	movaps	%xmm10,%xmm11
2216.byte	102,15,56,222,249
2217	movups	-64(%rcx),%xmm1
2218
2219	movdqa	%xmm9,%xmm14
2220.byte	102,15,56,222,208
2221	paddd	%xmm9,%xmm9
2222	pxor	%xmm15,%xmm10
2223.byte	102,15,56,222,216
2224	psrad	$31,%xmm14
2225	paddq	%xmm15,%xmm15
2226.byte	102,15,56,222,224
2227.byte	102,15,56,222,232
2228	pand	%xmm8,%xmm14
2229	movaps	%xmm11,%xmm12
2230.byte	102,15,56,222,240
2231	pxor	%xmm14,%xmm15
2232	movdqa	%xmm9,%xmm14
2233.byte	102,15,56,222,248
2234	movups	-48(%rcx),%xmm0
2235
2236	paddd	%xmm9,%xmm9
2237.byte	102,15,56,222,209
2238	pxor	%xmm15,%xmm11
2239	psrad	$31,%xmm14
2240.byte	102,15,56,222,217
2241	paddq	%xmm15,%xmm15
2242	pand	%xmm8,%xmm14
2243.byte	102,15,56,222,225
2244.byte	102,15,56,222,233
// The fourth block's (tweak XOR last round key) is spilled here, just before
// %xmm13 is overwritten for the next batch.
2245	movdqa	%xmm13,48(%rsp)
2246	pxor	%xmm14,%xmm15
2247.byte	102,15,56,222,241
2248	movaps	%xmm12,%xmm13
2249	movdqa	%xmm9,%xmm14
2250.byte	102,15,56,222,249
2251	movups	-32(%rcx),%xmm1
2252
2253	paddd	%xmm9,%xmm9
2254.byte	102,15,56,222,208
2255	pxor	%xmm15,%xmm12
2256	psrad	$31,%xmm14
2257.byte	102,15,56,222,216
2258	paddq	%xmm15,%xmm15
2259	pand	%xmm8,%xmm14
2260.byte	102,15,56,222,224
2261.byte	102,15,56,222,232
2262.byte	102,15,56,222,240
2263	pxor	%xmm14,%xmm15
2264	movaps	%xmm13,%xmm14
2265.byte	102,15,56,222,248
2266
// %xmm0 serves briefly as the carry-mask temporary, then is reloaded with the
// round-0 key (and %xmm1 with the round-1 key) for the next iteration.
2267	movdqa	%xmm9,%xmm0
2268	paddd	%xmm9,%xmm9
2269.byte	102,15,56,222,209
2270	pxor	%xmm15,%xmm13
2271	psrad	$31,%xmm0
2272.byte	102,15,56,222,217
2273	paddq	%xmm15,%xmm15
2274	pand	%xmm8,%xmm0
2275.byte	102,15,56,222,225
2276.byte	102,15,56,222,233
2277	pxor	%xmm0,%xmm15
2278	movups	(%r11),%xmm0
2279.byte	102,15,56,222,241
2280.byte	102,15,56,222,249
2281	movups	16(%r11),%xmm1
2282
// aesdeclast with memory operands 0,16,32,48,64,80(%rsp) finishes the six
// blocks; %rax is reset from %r10 for the next pass of L$xts_dec_loop6.
2283	pxor	%xmm15,%xmm14
2284.byte	102,15,56,223,84,36,0
2285	psrad	$31,%xmm9
2286	paddq	%xmm15,%xmm15
2287.byte	102,15,56,223,92,36,16
2288.byte	102,15,56,223,100,36,32
2289	pand	%xmm8,%xmm9
2290	movq	%r10,%rax
2291.byte	102,15,56,223,108,36,48
2292.byte	102,15,56,223,116,36,64
2293.byte	102,15,56,223,124,36,80
2294	pxor	%xmm9,%xmm15
2295
// Store the six results and continue while at least 96 more bytes remain.
2296	leaq	96(%rsi),%rsi
2297	movups	%xmm2,-96(%rsi)
2298	movups	%xmm3,-80(%rsi)
2299	movups	%xmm4,-64(%rsi)
2300	movups	%xmm5,-48(%rsi)
2301	movups	%xmm6,-32(%rsi)
2302	movups	%xmm7,-16(%rsi)
2303	subq	$96,%rdx
2304	jnc	L$xts_dec_grandloop
2305
// Undo the loop setup: %eax = (112 - %r10d) >> 4 recovers the loop count and
// %rcx points at the schedule base again.
2306	movl	$16+96,%eax
2307	subl	%r10d,%eax
2308	movq	%r11,%rcx
2309	shrl	$4,%eax
2310
// 0 to 5 whole blocks remain. %xmm0 is the round-0 key on both entry paths.
// Unlike the encrypt routine, the round-0 key is stripped one tweak ahead
// (%xmm10 AND %xmm11 before the zero check, and so on), because the tail code at
// L$xts_dec_done2 consumes two tweaks. The pxor instructions do not touch
// EFLAGS, so each je below still tests the preceding cmpq.
2311L$xts_dec_short:
2312
2313	movl	%eax,%r10d
2314	pxor	%xmm0,%xmm10
2315	pxor	%xmm0,%xmm11
2316	addq	$96,%rdx
2317	jz	L$xts_dec_done
2318
2319	pxor	%xmm0,%xmm12
2320	cmpq	$32,%rdx
2321	jb	L$xts_dec_one
2322	pxor	%xmm0,%xmm13
2323	je	L$xts_dec_two
2324
2325	pxor	%xmm0,%xmm14
2326	cmpq	$64,%rdx
2327	jb	L$xts_dec_three
2328	je	L$xts_dec_four
2329
// Five blocks: tweak-XOR the inputs, run them through _aesni_decrypt6, XOR with
// the tweaks again and store.
// NOTE(review): _aesni_decrypt6 is defined outside this chunk; presumably it
// decrypts %xmm2-%xmm7 using %rcx/%eax -- confirm.
2330	movdqu	(%rdi),%xmm2
2331	movdqu	16(%rdi),%xmm3
2332	movdqu	32(%rdi),%xmm4
2333	pxor	%xmm10,%xmm2
2334	movdqu	48(%rdi),%xmm5
2335	pxor	%xmm11,%xmm3
2336	movdqu	64(%rdi),%xmm6
2337	leaq	80(%rdi),%rdi
2338	pxor	%xmm12,%xmm4
2339	pxor	%xmm13,%xmm5
2340	pxor	%xmm14,%xmm6
2341
2342	call	_aesni_decrypt6
2343
// While storing, build in %xmm11 the carry mask for doubling %xmm15:
// pcmpgtd against zero yields all-ones in dwords whose sign bit is set, and
// pshufd $19 moves those dwords to the lanes that the magic constant masks.
2344	xorps	%xmm10,%xmm2
2345	xorps	%xmm11,%xmm3
2346	xorps	%xmm12,%xmm4
2347	movdqu	%xmm2,(%rsi)
2348	xorps	%xmm13,%xmm5
2349	movdqu	%xmm3,16(%rsi)
2350	xorps	%xmm14,%xmm6
2351	movdqu	%xmm4,32(%rsi)
2352	pxor	%xmm14,%xmm14
2353	movdqu	%xmm5,48(%rsi)
2354	pcmpgtd	%xmm15,%xmm14
2355	movdqu	%xmm6,64(%rsi)
2356	leaq	80(%rsi),%rsi
2357	pshufd	$19,%xmm14,%xmm11
2358	andq	$15,%r9
2359	jz	L$xts_dec_ret
2360
// A partial tail exists: %xmm10 = current tweak, %xmm11 = the doubled (next)
// tweak; enter the tail code past its own length check.
2361	movdqa	%xmm15,%xmm10
2362	paddq	%xmm15,%xmm15
2363	pand	%xmm8,%xmm11
2364	pxor	%xmm15,%xmm11
2365	jmp	L$xts_dec_done2
2366
2367.p2align	4
// One block: inline single-block decryption, tweak XORed in before and after.
// Afterwards %xmm10/%xmm11 hold the next two tweaks.
2368L$xts_dec_one:
2369	movups	(%rdi),%xmm2
2370	leaq	16(%rdi),%rdi
2371	xorps	%xmm10,%xmm2
2372	movups	(%rcx),%xmm0
2373	movups	16(%rcx),%xmm1
2374	leaq	32(%rcx),%rcx
2375	xorps	%xmm0,%xmm2
2376L$oop_dec1_12:
2377.byte	102,15,56,222,209
2378	decl	%eax
2379	movups	(%rcx),%xmm1
2380	leaq	16(%rcx),%rcx
2381	jnz	L$oop_dec1_12
2382.byte	102,15,56,223,209
2383	xorps	%xmm10,%xmm2
2384	movdqa	%xmm11,%xmm10
2385	movups	%xmm2,(%rsi)
2386	movdqa	%xmm12,%xmm11
2387	leaq	16(%rsi),%rsi
2388	jmp	L$xts_dec_done
2389
2390.p2align	4
// Two blocks via _aesni_decrypt2; %xmm10/%xmm11 become the next two tweaks.
2391L$xts_dec_two:
2392	movups	(%rdi),%xmm2
2393	movups	16(%rdi),%xmm3
2394	leaq	32(%rdi),%rdi
2395	xorps	%xmm10,%xmm2
2396	xorps	%xmm11,%xmm3
2397
2398	call	_aesni_decrypt2
2399
2400	xorps	%xmm10,%xmm2
2401	movdqa	%xmm12,%xmm10
2402	xorps	%xmm11,%xmm3
2403	movdqa	%xmm13,%xmm11
2404	movups	%xmm2,(%rsi)
2405	movups	%xmm3,16(%rsi)
2406	leaq	32(%rsi),%rsi
2407	jmp	L$xts_dec_done
2408
2409.p2align	4
// Three blocks via _aesni_decrypt3 (defined outside this chunk);
// %xmm10/%xmm11 become the next two tweaks.
2410L$xts_dec_three:
2411	movups	(%rdi),%xmm2
2412	movups	16(%rdi),%xmm3
2413	movups	32(%rdi),%xmm4
2414	leaq	48(%rdi),%rdi
2415	xorps	%xmm10,%xmm2
2416	xorps	%xmm11,%xmm3
2417	xorps	%xmm12,%xmm4
2418
2419	call	_aesni_decrypt3
2420
2421	xorps	%xmm10,%xmm2
2422	movdqa	%xmm13,%xmm10
2423	xorps	%xmm11,%xmm3
2424	movdqa	%xmm14,%xmm11
2425	xorps	%xmm12,%xmm4
2426	movups	%xmm2,(%rsi)
2427	movups	%xmm3,16(%rsi)
2428	movups	%xmm4,32(%rsi)
2429	leaq	48(%rsi),%rsi
2430	jmp	L$xts_dec_done
2431
2432.p2align	4
// Four blocks via _aesni_decrypt4 (defined outside this chunk);
// %xmm10/%xmm11 become the next two tweaks (%xmm15 never had the round-0 key
// XORed in, so it is used as is).
2433L$xts_dec_four:
2434	movups	(%rdi),%xmm2
2435	movups	16(%rdi),%xmm3
2436	movups	32(%rdi),%xmm4
2437	xorps	%xmm10,%xmm2
2438	movups	48(%rdi),%xmm5
2439	leaq	64(%rdi),%rdi
2440	xorps	%xmm11,%xmm3
2441	xorps	%xmm12,%xmm4
2442	xorps	%xmm13,%xmm5
2443
2444	call	_aesni_decrypt4
2445
2446	pxor	%xmm10,%xmm2
2447	movdqa	%xmm14,%xmm10
2448	pxor	%xmm11,%xmm3
2449	movdqa	%xmm15,%xmm11
2450	pxor	%xmm12,%xmm4
2451	movdqu	%xmm2,(%rsi)
2452	pxor	%xmm13,%xmm5
2453	movdqu	%xmm3,16(%rsi)
2454	movdqu	%xmm4,32(%rsi)
2455	movdqu	%xmm5,48(%rsi)
2456	leaq	64(%rsi),%rsi
2457	jmp	L$xts_dec_done
2458
2459.p2align	4
// All bulk blocks are written. %r9 & 15 is the length of the trailing partial
// block; zero means there is nothing left to do.
2460L$xts_dec_done:
2461	andq	$15,%r9
2462	jz	L$xts_dec_ret
// Tail, step 1: decrypt the held-back full block at (%rdi) using the LATER
// tweak (%xmm11) and store it at (%rsi).
2463L$xts_dec_done2:
2464	movq	%r9,%rdx
2465	movq	%r11,%rcx
2466	movl	%r10d,%eax
2467
2468	movups	(%rdi),%xmm2
2469	xorps	%xmm11,%xmm2
2470	movups	(%rcx),%xmm0
2471	movups	16(%rcx),%xmm1
2472	leaq	32(%rcx),%rcx
2473	xorps	%xmm0,%xmm2
2474L$oop_dec1_13:
2475.byte	102,15,56,222,209
2476	decl	%eax
2477	movups	(%rcx),%xmm1
2478	leaq	16(%rcx),%rcx
2479	jnz	L$oop_dec1_13
2480.byte	102,15,56,223,209
2481	xorps	%xmm11,%xmm2
2482	movups	%xmm2,(%rsi)
2483
// Tail, step 2: byte-wise swap. Each partial-tail input byte (at 16(%rdi))
// replaces the corresponding byte of the block just written at (%rsi), and the
// byte it displaces is written out as the tail at 16(%rsi).
2484L$xts_dec_steal:
2485	movzbl	16(%rdi),%eax
2486	movzbl	(%rsi),%ecx
2487	leaq	1(%rdi),%rdi
2488	movb	%al,(%rsi)
2489	movb	%cl,16(%rsi)
2490	leaq	1(%rsi),%rsi
2491	subq	$1,%rdx
2492	jnz	L$xts_dec_steal
2493
// Tail, step 3: rewind %rsi and decrypt the patched block at (%rsi) in place
// using the EARLIER tweak (%xmm10).
2494	subq	%r9,%rsi
2495	movq	%r11,%rcx
2496	movl	%r10d,%eax
2497
2498	movups	(%rsi),%xmm2
2499	xorps	%xmm10,%xmm2
2500	movups	(%rcx),%xmm0
2501	movups	16(%rcx),%xmm1
2502	leaq	32(%rcx),%rcx
2503	xorps	%xmm0,%xmm2
2504L$oop_dec1_14:
2505.byte	102,15,56,222,209
2506	decl	%eax
2507	movups	(%rcx),%xmm1
2508	leaq	16(%rcx),%rcx
2509	jnz	L$oop_dec1_14
2510.byte	102,15,56,223,209
2511	xorps	%xmm10,%xmm2
2512	movups	%xmm2,(%rsi)
2513
// Epilogue: clear every XMM register and the seven 16-byte stack slots that
// held key/tweak material, then unwind the frame.
2514L$xts_dec_ret:
2515	xorps	%xmm0,%xmm0
2516	pxor	%xmm1,%xmm1
2517	pxor	%xmm2,%xmm2
2518	pxor	%xmm3,%xmm3
2519	pxor	%xmm4,%xmm4
2520	pxor	%xmm5,%xmm5
2521	pxor	%xmm6,%xmm6
2522	pxor	%xmm7,%xmm7
2523	movaps	%xmm0,0(%rsp)
2524	pxor	%xmm8,%xmm8
2525	movaps	%xmm0,16(%rsp)
2526	pxor	%xmm9,%xmm9
2527	movaps	%xmm0,32(%rsp)
2528	pxor	%xmm10,%xmm10
2529	movaps	%xmm0,48(%rsp)
2530	pxor	%xmm11,%xmm11
2531	movaps	%xmm0,64(%rsp)
2532	pxor	%xmm12,%xmm12
2533	movaps	%xmm0,80(%rsp)
2534	pxor	%xmm13,%xmm13
2535	movaps	%xmm0,96(%rsp)
2536	pxor	%xmm14,%xmm14
2537	pxor	%xmm15,%xmm15
2538	leaq	(%rbp),%rsp
2539	popq	%rbp
2540L$xts_dec_epilogue:
// 0xf3,0xc3 encodes "rep ret".
2541	.byte	0xf3,0xc3
2542
2543.globl	_aesni_cbc_encrypt
2544.private_extern _aesni_cbc_encrypt
2545
2546.p2align	4
2547_aesni_cbc_encrypt:
2548	testq	%rdx,%rdx
2549	jz	L$cbc_ret
2550
2551	movl	240(%rcx),%r10d
2552	movq	%rcx,%r11
2553	testl	%r9d,%r9d
2554	jz	L$cbc_decrypt
2555
2556	movups	(%r8),%xmm2
2557	movl	%r10d,%eax
2558	cmpq	$16,%rdx
2559	jb	L$cbc_enc_tail
2560	subq	$16,%rdx
2561	jmp	L$cbc_enc_loop
2562.p2align	4
2563L$cbc_enc_loop:
2564	movups	(%rdi),%xmm3
2565	leaq	16(%rdi),%rdi
2566
2567	movups	(%rcx),%xmm0
2568	movups	16(%rcx),%xmm1
2569	xorps	%xmm0,%xmm3
2570	leaq	32(%rcx),%rcx
2571	xorps	%xmm3,%xmm2
2572L$oop_enc1_15:
2573.byte	102,15,56,220,209
2574	decl	%eax
2575	movups	(%rcx),%xmm1
2576	leaq	16(%rcx),%rcx
2577	jnz	L$oop_enc1_15
2578.byte	102,15,56,221,209
2579	movl	%r10d,%eax
2580	movq	%r11,%rcx
2581	movups	%xmm2,0(%rsi)
2582	leaq	16(%rsi),%rsi
2583	subq	$16,%rdx
2584	jnc	L$cbc_enc_loop
2585	addq	$16,%rdx
2586	jnz	L$cbc_enc_tail
2587	pxor	%xmm0,%xmm0
2588	pxor	%xmm1,%xmm1
2589	movups	%xmm2,(%r8)
2590	pxor	%xmm2,%xmm2
2591	pxor	%xmm3,%xmm3
2592	jmp	L$cbc_ret
2593
2594L$cbc_enc_tail:
2595	movq	%rdx,%rcx
2596	xchgq	%rdi,%rsi
2597.long	0x9066A4F3
2598	movl	$16,%ecx
2599	subq	%rdx,%rcx
2600	xorl	%eax,%eax
2601.long	0x9066AAF3
2602	leaq	-16(%rdi),%rdi
2603	movl	%r10d,%eax
2604	movq	%rdi,%rsi
2605	movq	%r11,%rcx
2606	xorq	%rdx,%rdx
2607	jmp	L$cbc_enc_loop
2608
2609.p2align	4
2610L$cbc_decrypt:
2611	cmpq	$16,%rdx
2612	jne	L$cbc_decrypt_bulk
2613
2614
2615
2616	movdqu	(%rdi),%xmm2
2617	movdqu	(%r8),%xmm3
2618	movdqa	%xmm2,%xmm4
2619	movups	(%rcx),%xmm0
2620	movups	16(%rcx),%xmm1
2621	leaq	32(%rcx),%rcx
2622	xorps	%xmm0,%xmm2
2623L$oop_dec1_16:
2624.byte	102,15,56,222,209
2625	decl	%r10d
2626	movups	(%rcx),%xmm1
2627	leaq	16(%rcx),%rcx
2628	jnz	L$oop_dec1_16
2629.byte	102,15,56,223,209
2630	pxor	%xmm0,%xmm0
2631	pxor	%xmm1,%xmm1
2632	movdqu	%xmm4,(%r8)
2633	xorps	%xmm3,%xmm2
2634	pxor	%xmm3,%xmm3
2635	movups	%xmm2,(%rsi)
2636	pxor	%xmm2,%xmm2
2637	jmp	L$cbc_ret
2638.p2align	4
2639L$cbc_decrypt_bulk:
2640	leaq	(%rsp),%rax
2641	pushq	%rbp
2642	subq	$16,%rsp
2643	andq	$-16,%rsp
2644	leaq	-8(%rax),%rbp
2645	movups	(%r8),%xmm10
2646	movl	%r10d,%eax
2647	cmpq	$80,%rdx
2648	jbe	L$cbc_dec_tail
2649
2650	movups	(%rcx),%xmm0
2651	movdqu	0(%rdi),%xmm2
2652	movdqu	16(%rdi),%xmm3
2653	movdqa	%xmm2,%xmm11
2654	movdqu	32(%rdi),%xmm4
2655	movdqa	%xmm3,%xmm12
2656	movdqu	48(%rdi),%xmm5
2657	movdqa	%xmm4,%xmm13
2658	movdqu	64(%rdi),%xmm6
2659	movdqa	%xmm5,%xmm14
2660	movdqu	80(%rdi),%xmm7
2661	movdqa	%xmm6,%xmm15
2662	movl	_OPENSSL_ia32cap_P+4(%rip),%r9d
2663	cmpq	$112,%rdx
2664	jbe	L$cbc_dec_six_or_seven
2665
2666	andl	$71303168,%r9d
2667	subq	$80,%rdx
2668	cmpl	$4194304,%r9d
2669	je	L$cbc_dec_loop6_enter
2670	subq	$32,%rdx
2671	leaq	112(%rcx),%rcx
2672	jmp	L$cbc_dec_loop8_enter
2673.p2align	4
2674L$cbc_dec_loop8:
2675	movups	%xmm9,(%rsi)
2676	leaq	16(%rsi),%rsi
2677L$cbc_dec_loop8_enter:
2678	movdqu	96(%rdi),%xmm8
2679	pxor	%xmm0,%xmm2
2680	movdqu	112(%rdi),%xmm9
2681	pxor	%xmm0,%xmm3
2682	movups	16-112(%rcx),%xmm1
2683	pxor	%xmm0,%xmm4
2684	xorq	%r11,%r11
2685	cmpq	$112,%rdx
2686	pxor	%xmm0,%xmm5
2687	pxor	%xmm0,%xmm6
2688	pxor	%xmm0,%xmm7
2689	pxor	%xmm0,%xmm8
2690
2691.byte	102,15,56,222,209
2692	pxor	%xmm0,%xmm9
2693	movups	32-112(%rcx),%xmm0
2694.byte	102,15,56,222,217
2695.byte	102,15,56,222,225
2696.byte	102,15,56,222,233
2697.byte	102,15,56,222,241
2698.byte	102,15,56,222,249
2699.byte	102,68,15,56,222,193
2700	setnc	%r11b
2701	shlq	$7,%r11
2702.byte	102,68,15,56,222,201
2703	addq	%rdi,%r11
2704	movups	48-112(%rcx),%xmm1
2705.byte	102,15,56,222,208
2706.byte	102,15,56,222,216
2707.byte	102,15,56,222,224
2708.byte	102,15,56,222,232
2709.byte	102,15,56,222,240
2710.byte	102,15,56,222,248
2711.byte	102,68,15,56,222,192
2712.byte	102,68,15,56,222,200
2713	movups	64-112(%rcx),%xmm0
2714	nop
2715.byte	102,15,56,222,209
2716.byte	102,15,56,222,217
2717.byte	102,15,56,222,225
2718.byte	102,15,56,222,233
2719.byte	102,15,56,222,241
2720.byte	102,15,56,222,249
2721.byte	102,68,15,56,222,193
2722.byte	102,68,15,56,222,201
2723	movups	80-112(%rcx),%xmm1
2724	nop
2725.byte	102,15,56,222,208
2726.byte	102,15,56,222,216
2727.byte	102,15,56,222,224
2728.byte	102,15,56,222,232
2729.byte	102,15,56,222,240
2730.byte	102,15,56,222,248
2731.byte	102,68,15,56,222,192
2732.byte	102,68,15,56,222,200
2733	movups	96-112(%rcx),%xmm0
2734	nop
2735.byte	102,15,56,222,209
2736.byte	102,15,56,222,217
2737.byte	102,15,56,222,225
2738.byte	102,15,56,222,233
2739.byte	102,15,56,222,241
2740.byte	102,15,56,222,249
2741.byte	102,68,15,56,222,193
2742.byte	102,68,15,56,222,201
2743	movups	112-112(%rcx),%xmm1
2744	nop
2745.byte	102,15,56,222,208
2746.byte	102,15,56,222,216
2747.byte	102,15,56,222,224
2748.byte	102,15,56,222,232
2749.byte	102,15,56,222,240
2750.byte	102,15,56,222,248
2751.byte	102,68,15,56,222,192
2752.byte	102,68,15,56,222,200
2753	movups	128-112(%rcx),%xmm0
2754	nop
2755.byte	102,15,56,222,209
2756.byte	102,15,56,222,217
2757.byte	102,15,56,222,225
2758.byte	102,15,56,222,233
2759.byte	102,15,56,222,241
2760.byte	102,15,56,222,249
2761.byte	102,68,15,56,222,193
2762.byte	102,68,15,56,222,201
2763	movups	144-112(%rcx),%xmm1
2764	cmpl	$11,%eax
2765.byte	102,15,56,222,208
2766.byte	102,15,56,222,216
2767.byte	102,15,56,222,224
2768.byte	102,15,56,222,232
2769.byte	102,15,56,222,240
2770.byte	102,15,56,222,248
2771.byte	102,68,15,56,222,192
2772.byte	102,68,15,56,222,200
2773	movups	160-112(%rcx),%xmm0
2774	jb	L$cbc_dec_done
2775.byte	102,15,56,222,209
2776.byte	102,15,56,222,217
2777.byte	102,15,56,222,225
2778.byte	102,15,56,222,233
2779.byte	102,15,56,222,241
2780.byte	102,15,56,222,249
2781.byte	102,68,15,56,222,193
2782.byte	102,68,15,56,222,201
2783	movups	176-112(%rcx),%xmm1
2784	nop
2785.byte	102,15,56,222,208
2786.byte	102,15,56,222,216
2787.byte	102,15,56,222,224
2788.byte	102,15,56,222,232
2789.byte	102,15,56,222,240
2790.byte	102,15,56,222,248
2791.byte	102,68,15,56,222,192
2792.byte	102,68,15,56,222,200
2793	movups	192-112(%rcx),%xmm0
2794	je	L$cbc_dec_done
2795.byte	102,15,56,222,209
2796.byte	102,15,56,222,217
2797.byte	102,15,56,222,225
2798.byte	102,15,56,222,233
2799.byte	102,15,56,222,241
2800.byte	102,15,56,222,249
2801.byte	102,68,15,56,222,193
2802.byte	102,68,15,56,222,201
2803	movups	208-112(%rcx),%xmm1
2804	nop
2805.byte	102,15,56,222,208
2806.byte	102,15,56,222,216
2807.byte	102,15,56,222,224
2808.byte	102,15,56,222,232
2809.byte	102,15,56,222,240
2810.byte	102,15,56,222,248
2811.byte	102,68,15,56,222,192
2812.byte	102,68,15,56,222,200
2813	movups	224-112(%rcx),%xmm0
2814	jmp	L$cbc_dec_done
2815.p2align	4
2816L$cbc_dec_done:
2817.byte	102,15,56,222,209
2818.byte	102,15,56,222,217
2819	pxor	%xmm0,%xmm10
2820	pxor	%xmm0,%xmm11
2821.byte	102,15,56,222,225
2822.byte	102,15,56,222,233
2823	pxor	%xmm0,%xmm12
2824	pxor	%xmm0,%xmm13
2825.byte	102,15,56,222,241
2826.byte	102,15,56,222,249
2827	pxor	%xmm0,%xmm14
2828	pxor	%xmm0,%xmm15
2829.byte	102,68,15,56,222,193
2830.byte	102,68,15,56,222,201
2831	movdqu	80(%rdi),%xmm1
2832
2833.byte	102,65,15,56,223,210
2834	movdqu	96(%rdi),%xmm10
2835	pxor	%xmm0,%xmm1
2836.byte	102,65,15,56,223,219
2837	pxor	%xmm0,%xmm10
2838	movdqu	112(%rdi),%xmm0
2839.byte	102,65,15,56,223,228
2840	leaq	128(%rdi),%rdi
2841	movdqu	0(%r11),%xmm11
2842.byte	102,65,15,56,223,237
2843.byte	102,65,15,56,223,246
2844	movdqu	16(%r11),%xmm12
2845	movdqu	32(%r11),%xmm13
2846.byte	102,65,15,56,223,255
2847.byte	102,68,15,56,223,193
2848	movdqu	48(%r11),%xmm14
2849	movdqu	64(%r11),%xmm15
2850.byte	102,69,15,56,223,202
2851	movdqa	%xmm0,%xmm10
2852	movdqu	80(%r11),%xmm1
2853	movups	-112(%rcx),%xmm0
2854
2855	movups	%xmm2,(%rsi)
2856	movdqa	%xmm11,%xmm2
2857	movups	%xmm3,16(%rsi)
2858	movdqa	%xmm12,%xmm3
2859	movups	%xmm4,32(%rsi)
2860	movdqa	%xmm13,%xmm4
2861	movups	%xmm5,48(%rsi)
2862	movdqa	%xmm14,%xmm5
2863	movups	%xmm6,64(%rsi)
2864	movdqa	%xmm15,%xmm6
2865	movups	%xmm7,80(%rsi)
2866	movdqa	%xmm1,%xmm7
2867	movups	%xmm8,96(%rsi)
2868	leaq	112(%rsi),%rsi
2869
2870	subq	$128,%rdx
2871	ja	L$cbc_dec_loop8
2872
2873	movaps	%xmm9,%xmm2
2874	leaq	-112(%rcx),%rcx
2875	addq	$112,%rdx
2876	jle	L$cbc_dec_clear_tail_collected
2877	movups	%xmm9,(%rsi)
2878	leaq	16(%rsi),%rsi
2879	cmpq	$80,%rdx
2880	jbe	L$cbc_dec_tail
2881
2882	movaps	%xmm11,%xmm2
2883L$cbc_dec_six_or_seven:
2884	cmpq	$96,%rdx
2885	ja	L$cbc_dec_seven
2886
2887	movaps	%xmm7,%xmm8
2888	call	_aesni_decrypt6
2889	pxor	%xmm10,%xmm2
2890	movaps	%xmm8,%xmm10
2891	pxor	%xmm11,%xmm3
2892	movdqu	%xmm2,(%rsi)
2893	pxor	%xmm12,%xmm4
2894	movdqu	%xmm3,16(%rsi)
2895	pxor	%xmm3,%xmm3
2896	pxor	%xmm13,%xmm5
2897	movdqu	%xmm4,32(%rsi)
2898	pxor	%xmm4,%xmm4
2899	pxor	%xmm14,%xmm6
2900	movdqu	%xmm5,48(%rsi)
2901	pxor	%xmm5,%xmm5
2902	pxor	%xmm15,%xmm7
2903	movdqu	%xmm6,64(%rsi)
2904	pxor	%xmm6,%xmm6
2905	leaq	80(%rsi),%rsi
2906	movdqa	%xmm7,%xmm2
2907	pxor	%xmm7,%xmm7
2908	jmp	L$cbc_dec_tail_collected
2909
2910.p2align	4
2911L$cbc_dec_seven:
2912	movups	96(%rdi),%xmm8
2913	xorps	%xmm9,%xmm9
2914	call	_aesni_decrypt8
2915	movups	80(%rdi),%xmm9
2916	pxor	%xmm10,%xmm2
2917	movups	96(%rdi),%xmm10
2918	pxor	%xmm11,%xmm3
2919	movdqu	%xmm2,(%rsi)
2920	pxor	%xmm12,%xmm4
2921	movdqu	%xmm3,16(%rsi)
2922	pxor	%xmm3,%xmm3
2923	pxor	%xmm13,%xmm5
2924	movdqu	%xmm4,32(%rsi)
2925	pxor	%xmm4,%xmm4
2926	pxor	%xmm14,%xmm6
2927	movdqu	%xmm5,48(%rsi)
2928	pxor	%xmm5,%xmm5
2929	pxor	%xmm15,%xmm7
2930	movdqu	%xmm6,64(%rsi)
2931	pxor	%xmm6,%xmm6
2932	pxor	%xmm9,%xmm8
2933	movdqu	%xmm7,80(%rsi)
2934	pxor	%xmm7,%xmm7
2935	leaq	96(%rsi),%rsi
2936	movdqa	%xmm8,%xmm2
2937	pxor	%xmm8,%xmm8
2938	pxor	%xmm9,%xmm9
2939	jmp	L$cbc_dec_tail_collected
2940
2941.p2align	4
2942L$cbc_dec_loop6:
2943	movups	%xmm7,(%rsi)
2944	leaq	16(%rsi),%rsi
2945	movdqu	0(%rdi),%xmm2
2946	movdqu	16(%rdi),%xmm3
2947	movdqa	%xmm2,%xmm11
2948	movdqu	32(%rdi),%xmm4
2949	movdqa	%xmm3,%xmm12
2950	movdqu	48(%rdi),%xmm5
2951	movdqa	%xmm4,%xmm13
2952	movdqu	64(%rdi),%xmm6
2953	movdqa	%xmm5,%xmm14
2954	movdqu	80(%rdi),%xmm7
2955	movdqa	%xmm6,%xmm15
2956L$cbc_dec_loop6_enter:
2957	leaq	96(%rdi),%rdi
2958	movdqa	%xmm7,%xmm8
2959
2960	call	_aesni_decrypt6
2961
2962	pxor	%xmm10,%xmm2
2963	movdqa	%xmm8,%xmm10
2964	pxor	%xmm11,%xmm3
2965	movdqu	%xmm2,(%rsi)
2966	pxor	%xmm12,%xmm4
2967	movdqu	%xmm3,16(%rsi)
2968	pxor	%xmm13,%xmm5
2969	movdqu	%xmm4,32(%rsi)
2970	pxor	%xmm14,%xmm6
2971	movq	%r11,%rcx
2972	movdqu	%xmm5,48(%rsi)
2973	pxor	%xmm15,%xmm7
2974	movl	%r10d,%eax
2975	movdqu	%xmm6,64(%rsi)
2976	leaq	80(%rsi),%rsi
2977	subq	$96,%rdx
2978	ja	L$cbc_dec_loop6
2979
2980	movdqa	%xmm7,%xmm2
2981	addq	$80,%rdx
2982	jle	L$cbc_dec_clear_tail_collected
2983	movups	%xmm7,(%rsi)
2984	leaq	16(%rsi),%rsi
2985
2986L$cbc_dec_tail:
2987	movups	(%rdi),%xmm2
2988	subq	$16,%rdx
2989	jbe	L$cbc_dec_one
2990
2991	movups	16(%rdi),%xmm3
2992	movaps	%xmm2,%xmm11
2993	subq	$16,%rdx
2994	jbe	L$cbc_dec_two
2995
2996	movups	32(%rdi),%xmm4
2997	movaps	%xmm3,%xmm12
2998	subq	$16,%rdx
2999	jbe	L$cbc_dec_three
3000
3001	movups	48(%rdi),%xmm5
3002	movaps	%xmm4,%xmm13
3003	subq	$16,%rdx
3004	jbe	L$cbc_dec_four
3005
3006	movups	64(%rdi),%xmm6
3007	movaps	%xmm5,%xmm14
3008	movaps	%xmm6,%xmm15
3009	xorps	%xmm7,%xmm7
3010	call	_aesni_decrypt6
3011	pxor	%xmm10,%xmm2
3012	movaps	%xmm15,%xmm10
3013	pxor	%xmm11,%xmm3
3014	movdqu	%xmm2,(%rsi)
3015	pxor	%xmm12,%xmm4
3016	movdqu	%xmm3,16(%rsi)
3017	pxor	%xmm3,%xmm3
3018	pxor	%xmm13,%xmm5
3019	movdqu	%xmm4,32(%rsi)
3020	pxor	%xmm4,%xmm4
3021	pxor	%xmm14,%xmm6
3022	movdqu	%xmm5,48(%rsi)
3023	pxor	%xmm5,%xmm5
3024	leaq	64(%rsi),%rsi
3025	movdqa	%xmm6,%xmm2
3026	pxor	%xmm6,%xmm6
3027	pxor	%xmm7,%xmm7
3028	subq	$16,%rdx
3029	jmp	L$cbc_dec_tail_collected
3030
3031.p2align	4
3032L$cbc_dec_one:
3033	movaps	%xmm2,%xmm11
3034	movups	(%rcx),%xmm0
3035	movups	16(%rcx),%xmm1
3036	leaq	32(%rcx),%rcx
3037	xorps	%xmm0,%xmm2
3038L$oop_dec1_17:
3039.byte	102,15,56,222,209
3040	decl	%eax
3041	movups	(%rcx),%xmm1
3042	leaq	16(%rcx),%rcx
3043	jnz	L$oop_dec1_17
3044.byte	102,15,56,223,209
3045	xorps	%xmm10,%xmm2
3046	movaps	%xmm11,%xmm10
3047	jmp	L$cbc_dec_tail_collected
3048.p2align	4
3049L$cbc_dec_two:
3050	movaps	%xmm3,%xmm12
3051	call	_aesni_decrypt2
3052	pxor	%xmm10,%xmm2
3053	movaps	%xmm12,%xmm10
3054	pxor	%xmm11,%xmm3
3055	movdqu	%xmm2,(%rsi)
3056	movdqa	%xmm3,%xmm2
3057	pxor	%xmm3,%xmm3
3058	leaq	16(%rsi),%rsi
3059	jmp	L$cbc_dec_tail_collected
3060.p2align	4
3061L$cbc_dec_three:
3062	movaps	%xmm4,%xmm13
3063	call	_aesni_decrypt3
3064	pxor	%xmm10,%xmm2
3065	movaps	%xmm13,%xmm10
3066	pxor	%xmm11,%xmm3
3067	movdqu	%xmm2,(%rsi)
3068	pxor	%xmm12,%xmm4
3069	movdqu	%xmm3,16(%rsi)
3070	pxor	%xmm3,%xmm3
3071	movdqa	%xmm4,%xmm2
3072	pxor	%xmm4,%xmm4
3073	leaq	32(%rsi),%rsi
3074	jmp	L$cbc_dec_tail_collected
3075.p2align	4
3076L$cbc_dec_four:
3077	movaps	%xmm5,%xmm14
3078	call	_aesni_decrypt4
3079	pxor	%xmm10,%xmm2
3080	movaps	%xmm14,%xmm10
3081	pxor	%xmm11,%xmm3
3082	movdqu	%xmm2,(%rsi)
3083	pxor	%xmm12,%xmm4
3084	movdqu	%xmm3,16(%rsi)
3085	pxor	%xmm3,%xmm3
3086	pxor	%xmm13,%xmm5
3087	movdqu	%xmm4,32(%rsi)
3088	pxor	%xmm4,%xmm4
3089	movdqa	%xmm5,%xmm2
3090	pxor	%xmm5,%xmm5
3091	leaq	48(%rsi),%rsi
3092	jmp	L$cbc_dec_tail_collected
3093
3094.p2align	4
3095L$cbc_dec_clear_tail_collected:
3096	pxor	%xmm3,%xmm3
3097	pxor	%xmm4,%xmm4
3098	pxor	%xmm5,%xmm5
3099	pxor	%xmm6,%xmm6
3100	pxor	%xmm7,%xmm7
3101	pxor	%xmm8,%xmm8
3102	pxor	%xmm9,%xmm9
3103L$cbc_dec_tail_collected:
3104	movups	%xmm10,(%r8)
3105	andq	$15,%rdx
3106	jnz	L$cbc_dec_tail_partial
3107	movups	%xmm2,(%rsi)
3108	pxor	%xmm2,%xmm2
3109	jmp	L$cbc_dec_ret
3110.p2align	4
3111L$cbc_dec_tail_partial:
3112	movaps	%xmm2,(%rsp)
3113	pxor	%xmm2,%xmm2
3114	movq	$16,%rcx
3115	movq	%rsi,%rdi
3116	subq	%rdx,%rcx
3117	leaq	(%rsp),%rsi
3118.long	0x9066A4F3
3119	movdqa	%xmm2,(%rsp)
3120
3121L$cbc_dec_ret:
3122	xorps	%xmm0,%xmm0
3123	pxor	%xmm1,%xmm1
3124	leaq	(%rbp),%rsp
3125	popq	%rbp
3126L$cbc_ret:
3127	.byte	0xf3,0xc3
3128
/*
 * _aesni_set_decrypt_key
 *
 * Builds a decryption key schedule: first runs __aesni_set_encrypt_key with
 * the incoming arguments untouched, then reverses the order of the round
 * keys in place and applies AESIMC to every round key except the first and
 * the last.
 *
 * In:    %rdi, %esi, %rdx are forwarded as-is to __aesni_set_encrypt_key
 *        (there: %rdi = user key, %esi = key length in bits,
 *        %rdx = key schedule buffer).
 * Out:   %eax = return code of __aesni_set_encrypt_key; when it is nonzero
 *        the schedule is not touched any further.
 * Clobb: %rdx, %rsi, %rdi, flags; %xmm0 and %xmm1 are zeroed on the success
 *        path, and the callee zeroes %xmm0-%xmm5 on every path.
 */
3129.globl	_aesni_set_decrypt_key
3130.private_extern _aesni_set_decrypt_key
3131
3132.p2align	4
3133_aesni_set_decrypt_key:
3134.byte	0x48,0x83,0xEC,0x08	/* raw encoding of: subq $8,%rsp (undone at L$dec_key_ret; presumably keeps the stack 16-byte aligned for the call) */
3135	call	__aesni_set_encrypt_key
/* On success the callee leaves its round count (9, 11 or 13) in %esi and
   does not modify %rdx, so %esi*16 + 16 is the offset of the last round key. */
3136	shll	$4,%esi
3137	testl	%eax,%eax
3138	jnz	L$dec_key_ret
3139	leaq	16(%rdx,%rsi,1),%rdi	/* %rdi -> last round key, %rdx -> first round key */
3140
/* Swap the first and last round keys without transforming them. */
3141	movups	(%rdx),%xmm0
3142	movups	(%rdi),%xmm1
3143	movups	%xmm0,(%rdi)
3144	movups	%xmm1,(%rdx)
3145	leaq	16(%rdx),%rdx
3146	leaq	-16(%rdi),%rdi
3147
/* Walk inwards from both ends: transform the pair with AESIMC and store each
   key into the slot the other one came from. */
3148L$dec_key_inverse:
3149	movups	(%rdx),%xmm0
3150	movups	(%rdi),%xmm1
3151.byte	102,15,56,219,192	/* aesimc %xmm0,%xmm0 */
3152.byte	102,15,56,219,201	/* aesimc %xmm1,%xmm1 */
3153	leaq	16(%rdx),%rdx
3154	leaq	-16(%rdi),%rdi
3155	movups	%xmm0,16(%rdi)	/* front key -> former back slot */
3156	movups	%xmm1,-16(%rdx)	/* back key -> former front slot */
3157	cmpq	%rdx,%rdi
3158	ja	L$dec_key_inverse	/* unsigned compare: continue while the back pointer is above the front pointer */
3159
/* Both pointers now address the middle round key: transform it in place. */
3160	movups	(%rdx),%xmm0
3161.byte	102,15,56,219,192	/* aesimc %xmm0,%xmm0 */
3162	pxor	%xmm1,%xmm1	/* scrub key material from the register */
3163	movups	%xmm0,(%rdi)
3164	pxor	%xmm0,%xmm0	/* scrub key material from the register */
3165L$dec_key_ret:
3166	addq	$8,%rsp
3167	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
/* End-of-function marker label; not referenced in the visible part of the file. */
3168L$SEH_end_set_decrypt_key:
3169
/*
 * _aesni_set_encrypt_key / __aesni_set_encrypt_key
 *
 * Expands a 128-, 192- or 256-bit AES key into an encryption key schedule.
 * The second label is an alias for the same entry point; it is the name
 * _aesni_set_decrypt_key calls.
 *
 * In:    %rdi = user key bytes, %esi = key length in bits (128/192/256),
 *        %rdx = key schedule buffer.
 * Out:   %rax = 0 on success, -1 if %rdi or %rdx is NULL, -2 if %esi is not
 *        one of the three supported sizes.
 *        On success the round keys are stored consecutively from (%rdx) and
 *        the value 9, 11 or 13 (left in %esi) is stored at 240(%rdx) on every
 *        path.  %rdx itself is never modified.
 * Clobb: %rax, %esi, %r10d, flags; %xmm0-%xmm5 are zeroed before returning.
 *
 * Each key size has two implementations.  The first is built from
 * AESKEYGENASSIST plus the L$key_expansion_* helper routines; the "_alt"
 * variant uses PSHUFB + AESENCLAST loops instead.  Selection is made from two
 * bits of the second dword of _OPENSSL_ia32cap_P (see below).
 */
3170.globl	_aesni_set_encrypt_key
3171.private_extern _aesni_set_encrypt_key
3172
3173.p2align	4
3174_aesni_set_encrypt_key:
3175__aesni_set_encrypt_key:
3176.byte	0x48,0x83,0xEC,0x08	/* raw encoding of: subq $8,%rsp (undone at L$enc_key_ret) */
3177	movq	$-1,%rax	/* return value for the NULL-pointer exits */
3178	testq	%rdi,%rdi
3179	jz	L$enc_key_ret
3180	testq	%rdx,%rdx
3181	jz	L$enc_key_ret
3182
/* 268437504 = 0x10000800 (bits 28 and 11).  After the AND, %r10d equals
   0x10000000 only when bit 28 is set and bit 11 is clear; that case takes the
   "_alt" paths.  NOTE(review): presumably these are the AVX and XOP
   capability flags - confirm against the OPENSSL_ia32cap documentation. */
3183	movl	$268437504,%r10d
3184	movups	(%rdi),%xmm0	/* first 16 bytes of the user key */
3185	xorps	%xmm4,%xmm4	/* zeroed scratch used by the helper routines */
3186	andl	_OPENSSL_ia32cap_P+4(%rip),%r10d
3187	leaq	16(%rdx),%rax	/* %rax = write cursor, starting at the second round-key slot */
3188	cmpl	$256,%esi
3189	je	L$14rounds
3190	cmpl	$192,%esi
3191	je	L$12rounds
3192	cmpl	$128,%esi
3193	jne	L$bad_keybits
3194
/* ---- 128-bit key: round count value 9 ---- */
3195L$10rounds:
3196	movl	$9,%esi
3197	cmpl	$268435456,%r10d	/* 0x10000000 */
3198	je	L$10rounds_alt
3199
3200	movups	%xmm0,(%rdx)	/* round key 0 is the user key itself */
/* Each .byte line below is aeskeygenassist $rcon,%xmm0,%xmm1.  The "_cold"
   entry skips the store of the previous key, which was just written above. */
3201.byte	102,15,58,223,200,1
3202	call	L$key_expansion_128_cold
3203.byte	102,15,58,223,200,2
3204	call	L$key_expansion_128
3205.byte	102,15,58,223,200,4
3206	call	L$key_expansion_128
3207.byte	102,15,58,223,200,8
3208	call	L$key_expansion_128
3209.byte	102,15,58,223,200,16
3210	call	L$key_expansion_128
3211.byte	102,15,58,223,200,32
3212	call	L$key_expansion_128
3213.byte	102,15,58,223,200,64
3214	call	L$key_expansion_128
3215.byte	102,15,58,223,200,128
3216	call	L$key_expansion_128
3217.byte	102,15,58,223,200,27
3218	call	L$key_expansion_128
3219.byte	102,15,58,223,200,54
3220	call	L$key_expansion_128
3221	movups	%xmm0,(%rax)	/* last round key; %rax is %rdx+160 here */
3222	movl	%esi,80(%rax)	/* round count value -> 240(%rdx) */
3223	xorl	%eax,%eax	/* return 0 */
3224	jmp	L$enc_key_ret
3225
/* 128-bit key, alternative path.  %xmm5 = byte-shuffle mask, %xmm4 = round
   constant vector (doubled each iteration), %xmm2 = copy of the previous
   round key, %xmm0 = previous round key / work register. */
3226.p2align	4
3227L$10rounds_alt:
3228	movdqa	L$key_rotate(%rip),%xmm5
3229	movl	$8,%r10d	/* eight loop iterations */
3230	movdqa	L$key_rcon1(%rip),%xmm4
3231	movdqa	%xmm0,%xmm2
3232	movdqu	%xmm0,(%rdx)	/* round key 0 is the user key itself */
3233	jmp	L$oop_key128
3234
3235.p2align	4
3236L$oop_key128:
3237.byte	102,15,56,0,197	/* pshufb %xmm5,%xmm0: replicate the byte-rotated last word into all four words */
3238.byte	102,15,56,221,196	/* aesenclast %xmm4,%xmm0: round constant vector acts as the round key */
3239	pslld	$1,%xmm4	/* next round constant */
3240	leaq	16(%rax),%rax
3241
/* %xmm2 = running XOR of the four words of the previous round key. */
3242	movdqa	%xmm2,%xmm3
3243	pslldq	$4,%xmm2
3244	pxor	%xmm2,%xmm3
3245	pslldq	$4,%xmm2
3246	pxor	%xmm2,%xmm3
3247	pslldq	$4,%xmm2
3248	pxor	%xmm3,%xmm2
3249
3250	pxor	%xmm2,%xmm0	/* new round key */
3251	movdqu	%xmm0,-16(%rax)
3252	movdqa	%xmm0,%xmm2
3253
3254	decl	%r10d
3255	jnz	L$oop_key128
3256
/* The last two round keys are produced outside the loop, starting from the
   0x1b constant vector (which the pslld below turns into 0x36). */
3257	movdqa	L$key_rcon1b(%rip),%xmm4
3258
3259.byte	102,15,56,0,197	/* pshufb %xmm5,%xmm0 */
3260.byte	102,15,56,221,196	/* aesenclast %xmm4,%xmm0 */
3261	pslld	$1,%xmm4
3262
3263	movdqa	%xmm2,%xmm3
3264	pslldq	$4,%xmm2
3265	pxor	%xmm2,%xmm3
3266	pslldq	$4,%xmm2
3267	pxor	%xmm2,%xmm3
3268	pslldq	$4,%xmm2
3269	pxor	%xmm3,%xmm2
3270
3271	pxor	%xmm2,%xmm0
3272	movdqu	%xmm0,(%rax)	/* %rax is %rdx+144 here */
3273
3274	movdqa	%xmm0,%xmm2
3275.byte	102,15,56,0,197	/* pshufb %xmm5,%xmm0 */
3276.byte	102,15,56,221,196	/* aesenclast %xmm4,%xmm0 */
3277
3278	movdqa	%xmm2,%xmm3
3279	pslldq	$4,%xmm2
3280	pxor	%xmm2,%xmm3
3281	pslldq	$4,%xmm2
3282	pxor	%xmm2,%xmm3
3283	pslldq	$4,%xmm2
3284	pxor	%xmm3,%xmm2
3285
3286	pxor	%xmm2,%xmm0
3287	movdqu	%xmm0,16(%rax)	/* last round key at 160(%rdx) */
3288
3289	movl	%esi,96(%rax)	/* round count value -> 240(%rdx) */
3290	xorl	%eax,%eax	/* return 0 */
3291	jmp	L$enc_key_ret
3292
/* ---- 192-bit key: round count value 11 ---- */
3293.p2align	4
3294L$12rounds:
3295	movq	16(%rdi),%xmm2	/* key bytes 16..23 into the low half of %xmm2 */
3296	movl	$11,%esi
3297	cmpl	$268435456,%r10d	/* 0x10000000 */
3298	je	L$12rounds_alt
3299
3300	movups	%xmm0,(%rdx)
/* Each .byte line below is aeskeygenassist $rcon,%xmm2,%xmm1.  The 192a and
   192b helpers alternate; 192b stores 32 bytes per call, 192a stores 16. */
3301.byte	102,15,58,223,202,1
3302	call	L$key_expansion_192a_cold
3303.byte	102,15,58,223,202,2
3304	call	L$key_expansion_192b
3305.byte	102,15,58,223,202,4
3306	call	L$key_expansion_192a
3307.byte	102,15,58,223,202,8
3308	call	L$key_expansion_192b
3309.byte	102,15,58,223,202,16
3310	call	L$key_expansion_192a
3311.byte	102,15,58,223,202,32
3312	call	L$key_expansion_192b
3313.byte	102,15,58,223,202,64
3314	call	L$key_expansion_192a
3315.byte	102,15,58,223,202,128
3316	call	L$key_expansion_192b
3317	movups	%xmm0,(%rax)	/* last round key; %rax is %rdx+192 here */
3318	movl	%esi,48(%rax)	/* round count value -> 240(%rdx) */
3319	xorq	%rax,%rax	/* return 0 */
3320	jmp	L$enc_key_ret
3321
/* 192-bit key, alternative path.  Each iteration emits 24 bytes of schedule:
   the low 8 bytes of %xmm2 followed by the 16 bytes of %xmm0. */
3322.p2align	4
3323L$12rounds_alt:
3324	movdqa	L$key_rotate192(%rip),%xmm5
3325	movdqa	L$key_rcon1(%rip),%xmm4
3326	movl	$8,%r10d	/* eight loop iterations */
3327	movdqu	%xmm0,(%rdx)
3328	jmp	L$oop_key192
3329
3330.p2align	4
3331L$oop_key192:
3332	movq	%xmm2,0(%rax)	/* store the low 8 bytes of %xmm2 */
3333	movdqa	%xmm2,%xmm1
3334.byte	102,15,56,0,213	/* pshufb %xmm5,%xmm2 */
3335.byte	102,15,56,221,212	/* aesenclast %xmm4,%xmm2 */
3336	pslld	$1,%xmm4	/* next round constant */
3337	leaq	24(%rax),%rax
3338
/* %xmm0 = running XOR of its own four words. */
3339	movdqa	%xmm0,%xmm3
3340	pslldq	$4,%xmm0
3341	pxor	%xmm0,%xmm3
3342	pslldq	$4,%xmm0
3343	pxor	%xmm0,%xmm3
3344	pslldq	$4,%xmm0
3345	pxor	%xmm3,%xmm0
3346
3347	pshufd	$255,%xmm0,%xmm3	/* broadcast the top word of %xmm0 */
3348	pxor	%xmm1,%xmm3
3349	pslldq	$4,%xmm1
3350	pxor	%xmm1,%xmm3
3351
3352	pxor	%xmm2,%xmm0
3353	pxor	%xmm3,%xmm2
3354	movdqu	%xmm0,-16(%rax)	/* 16 bytes directly after the 8 stored at the loop top */
3355
3356	decl	%r10d
3357	jnz	L$oop_key192
3358
3359	movl	%esi,32(%rax)	/* %rax is %rdx+208 here: round count value -> 240(%rdx) */
3360	xorl	%eax,%eax	/* return 0 */
3361	jmp	L$enc_key_ret
3362
/* ---- 256-bit key: round count value 13 ---- */
3363.p2align	4
3364L$14rounds:
3365	movups	16(%rdi),%xmm2	/* second 16 bytes of the user key */
3366	movl	$13,%esi
3367	leaq	16(%rax),%rax	/* cursor now at 32(%rdx): the first two slots hold the raw key */
3368	cmpl	$268435456,%r10d	/* 0x10000000 */
3369	je	L$14rounds_alt
3370
3371	movups	%xmm0,(%rdx)
3372	movups	%xmm2,16(%rdx)
/* .byte lines ending 202,N are aeskeygenassist $N,%xmm2,%xmm1 (followed by
   the 256a helper, which updates %xmm0); those ending 200,N are
   aeskeygenassist $N,%xmm0,%xmm1 (followed by 256b, which updates %xmm2). */
3373.byte	102,15,58,223,202,1
3374	call	L$key_expansion_256a_cold
3375.byte	102,15,58,223,200,1
3376	call	L$key_expansion_256b
3377.byte	102,15,58,223,202,2
3378	call	L$key_expansion_256a
3379.byte	102,15,58,223,200,2
3380	call	L$key_expansion_256b
3381.byte	102,15,58,223,202,4
3382	call	L$key_expansion_256a
3383.byte	102,15,58,223,200,4
3384	call	L$key_expansion_256b
3385.byte	102,15,58,223,202,8
3386	call	L$key_expansion_256a
3387.byte	102,15,58,223,200,8
3388	call	L$key_expansion_256b
3389.byte	102,15,58,223,202,16
3390	call	L$key_expansion_256a
3391.byte	102,15,58,223,200,16
3392	call	L$key_expansion_256b
3393.byte	102,15,58,223,202,32
3394	call	L$key_expansion_256a
3395.byte	102,15,58,223,200,32
3396	call	L$key_expansion_256b
3397.byte	102,15,58,223,202,64
3398	call	L$key_expansion_256a
3399	movups	%xmm0,(%rax)	/* last round key; %rax is %rdx+224 here */
3400	movl	%esi,16(%rax)	/* round count value -> 240(%rdx) */
3401	xorq	%rax,%rax	/* return 0 */
3402	jmp	L$enc_key_ret
3403
/* 256-bit key, alternative path.  %xmm0 / %xmm1 track the two most recent
   round keys, %xmm2 is the work register, %xmm4 the round constant vector. */
3404.p2align	4
3405L$14rounds_alt:
3406	movdqa	L$key_rotate(%rip),%xmm5
3407	movdqa	L$key_rcon1(%rip),%xmm4
3408	movl	$7,%r10d	/* seven passes; the last one leaves via L$done_key256 */
3409	movdqu	%xmm0,0(%rdx)
3410	movdqa	%xmm2,%xmm1
3411	movdqu	%xmm2,16(%rdx)
3412	jmp	L$oop_key256
3413
3414.p2align	4
3415L$oop_key256:
3416.byte	102,15,56,0,213	/* pshufb %xmm5,%xmm2 */
3417.byte	102,15,56,221,212	/* aesenclast %xmm4,%xmm2 */
3418
/* %xmm0 = running XOR of its own four words. */
3419	movdqa	%xmm0,%xmm3
3420	pslldq	$4,%xmm0
3421	pxor	%xmm0,%xmm3
3422	pslldq	$4,%xmm0
3423	pxor	%xmm0,%xmm3
3424	pslldq	$4,%xmm0
3425	pxor	%xmm3,%xmm0
3426	pslld	$1,%xmm4	/* next round constant */
3427
3428	pxor	%xmm2,%xmm0
3429	movdqu	%xmm0,(%rax)
3430
3431	decl	%r10d
3432	jz	L$done_key256
3433
/* Second half of the pass: derive the following round key from %xmm1, using
   an all-zero round key for the aesenclast step (no round constant). */
3434	pshufd	$255,%xmm0,%xmm2	/* broadcast the top word of the key just stored */
3435	pxor	%xmm3,%xmm3
3436.byte	102,15,56,221,211	/* aesenclast %xmm3,%xmm2 */
3437
/* %xmm1 = running XOR of its own four words. */
3438	movdqa	%xmm1,%xmm3
3439	pslldq	$4,%xmm1
3440	pxor	%xmm1,%xmm3
3441	pslldq	$4,%xmm1
3442	pxor	%xmm1,%xmm3
3443	pslldq	$4,%xmm1
3444	pxor	%xmm3,%xmm1
3445
3446	pxor	%xmm1,%xmm2
3447	movdqu	%xmm2,16(%rax)
3448	leaq	32(%rax),%rax
3449	movdqa	%xmm2,%xmm1
3450
3451	jmp	L$oop_key256
3452
3453L$done_key256:
3454	movl	%esi,16(%rax)	/* %rax is %rdx+224 here: round count value -> 240(%rdx) */
3455	xorl	%eax,%eax	/* return 0 */
3456	jmp	L$enc_key_ret
3457
3458.p2align	4
3459L$bad_keybits:
3460	movq	$-2,%rax	/* unsupported key length */
/* Common exit: zero every vector register that was used above, restore the
   stack pointer and return. */
3461L$enc_key_ret:
3462	pxor	%xmm0,%xmm0
3463	pxor	%xmm1,%xmm1
3464	pxor	%xmm2,%xmm2
3465	pxor	%xmm3,%xmm3
3466	pxor	%xmm4,%xmm4
3467	pxor	%xmm5,%xmm5
3468	addq	$8,%rsp
3469	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
/* End-of-function marker label; not referenced in the visible part of the file. */
3470L$SEH_end_set_encrypt_key:
3471
/*
 * L$key_expansion_128 / L$key_expansion_128_cold
 *
 * Helper for the 128-bit key schedule.
 * In:    %xmm0 = current round key, %xmm1 = AESKEYGENASSIST result for it,
 *        %xmm4 = scratch whose low word is zero, %rax = write cursor.
 * Out:   %xmm0 = next round key.  The normal entry first stores the current
 *        round key at (%rax) and advances %rax by 16; the "_cold" entry skips
 *        that store.
 * Clobb: %xmm1, %xmm4 (its low word stays zero).
 */
3472.p2align	4
3473L$key_expansion_128:
3474	movups	%xmm0,(%rax)
3475	leaq	16(%rax),%rax
3476L$key_expansion_128_cold:
/* The two shufps/xorps pairs turn %xmm0 into the running XOR of its words. */
3477	shufps	$16,%xmm0,%xmm4
3478	xorps	%xmm4,%xmm0
3479	shufps	$140,%xmm0,%xmm4
3480	xorps	%xmm4,%xmm0
3481	shufps	$255,%xmm1,%xmm1	/* broadcast word 3 of the AESKEYGENASSIST result */
3482	xorps	%xmm1,%xmm0
3483	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
3484
/*
 * L$key_expansion_192a / _192a_cold / _192b_warm / L$key_expansion_192b
 *
 * Helpers for the 192-bit key schedule; the caller alternates between the
 * "a" and "b" entries.
 * In:    %xmm0 = key words 0..3, %xmm2 = key words 4..5 in its low half,
 *        %xmm1 = AESKEYGENASSIST result, %xmm4 = scratch whose low word is
 *        zero, %rax = write cursor.
 * Out:   %xmm0 / %xmm2 updated to the next six key words.
 *   192a:      stores %xmm0 at (%rax), advances %rax by 16, saves %xmm2 in
 *              %xmm5, then runs the shared update.
 *   192a_cold: as 192a but without the store.
 *   192b:      stores 32 bytes assembled from %xmm5 (saved by the previous
 *              "a" call), %xmm0 and %xmm2, advances %rax by 32, then jumps to
 *              the shared update at 192b_warm.
 * Clobb: %xmm1, %xmm3, %xmm4, %xmm5.
 */
3485.p2align	4
3486L$key_expansion_192a:
3487	movups	%xmm0,(%rax)
3488	leaq	16(%rax),%rax
3489L$key_expansion_192a_cold:
3490	movaps	%xmm2,%xmm5	/* keep %xmm2 for the store in the next 192b call */
3491L$key_expansion_192b_warm:
/* Shared update.  The shufps/xorps pairs form the running XOR of the words of
   %xmm0; the %xmm3 instructions interleaved with them update %xmm2. */
3492	shufps	$16,%xmm0,%xmm4
3493	movdqa	%xmm2,%xmm3
3494	xorps	%xmm4,%xmm0
3495	shufps	$140,%xmm0,%xmm4
3496	pslldq	$4,%xmm3
3497	xorps	%xmm4,%xmm0
3498	pshufd	$85,%xmm1,%xmm1	/* broadcast word 1 of the AESKEYGENASSIST result */
3499	pxor	%xmm3,%xmm2
3500	pxor	%xmm1,%xmm0
3501	pshufd	$255,%xmm0,%xmm3	/* broadcast the top word of the updated %xmm0 */
3502	pxor	%xmm3,%xmm2
3503	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
3504
3505.p2align	4
3506L$key_expansion_192b:
3507	movaps	%xmm0,%xmm3
3508	shufps	$68,%xmm0,%xmm5	/* %xmm5 = low two words of %xmm5, then low two words of %xmm0 */
3509	movups	%xmm5,(%rax)
3510	shufps	$78,%xmm2,%xmm3	/* %xmm3 = high two words of %xmm0, then low two words of %xmm2 */
3511	movups	%xmm3,16(%rax)
3512	leaq	32(%rax),%rax
3513	jmp	L$key_expansion_192b_warm
3514
/*
 * L$key_expansion_256a / L$key_expansion_256a_cold
 *
 * Helper for the 256-bit key schedule; updates %xmm0.
 * In:    %xmm0 = round key to update, %xmm1 = AESKEYGENASSIST result,
 *        %xmm2 = the other live round key, %xmm4 = scratch whose low word is
 *        zero, %rax = write cursor.
 * Out:   %xmm0 = next round key.  The normal entry first stores %xmm2 at
 *        (%rax) and advances %rax by 16; the "_cold" entry skips that store.
 * Clobb: %xmm1, %xmm4 (its low word stays zero).
 */
3515.p2align	4
3516L$key_expansion_256a:
3517	movups	%xmm2,(%rax)
3518	leaq	16(%rax),%rax
3519L$key_expansion_256a_cold:
/* The two shufps/xorps pairs turn %xmm0 into the running XOR of its words. */
3520	shufps	$16,%xmm0,%xmm4
3521	xorps	%xmm4,%xmm0
3522	shufps	$140,%xmm0,%xmm4
3523	xorps	%xmm4,%xmm0
3524	shufps	$255,%xmm1,%xmm1	/* broadcast word 3 of the AESKEYGENASSIST result */
3525	xorps	%xmm1,%xmm0
3526	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
3527
/*
 * L$key_expansion_256b
 *
 * Helper for the 256-bit key schedule; updates %xmm2.
 * In:    %xmm0 = round key just produced, %xmm2 = round key to update,
 *        %xmm1 = AESKEYGENASSIST result, %xmm4 = scratch whose low word is
 *        zero, %rax = write cursor.
 * Out:   %xmm0 is stored at (%rax) and %rax advances by 16;
 *        %xmm2 = next round key.
 * Clobb: %xmm1, %xmm4 (its low word stays zero).
 */
3528.p2align	4
3529L$key_expansion_256b:
3530	movups	%xmm0,(%rax)
3531	leaq	16(%rax),%rax
3532
/* The two shufps/xorps pairs turn %xmm2 into the running XOR of its words. */
3533	shufps	$16,%xmm2,%xmm4
3534	xorps	%xmm4,%xmm2
3535	shufps	$140,%xmm2,%xmm4
3536	xorps	%xmm4,%xmm2
3537	shufps	$170,%xmm1,%xmm1	/* broadcast word 2 of the AESKEYGENASSIST result (word 3 is used by the 256a helper) */
3538	xorps	%xmm1,%xmm2
3539	.byte	0xf3,0xc3	/* raw encoding of: rep ret */
3540
3541
/*
 * Constant tables, aligned to 64 bytes.  Each table is 16 bytes long, so all
 * of them stay 16-byte aligned (the key-setup code loads L$key_* with movdqa).
 */
3542.p2align	6
/* Byte indices 15..0: as a byte-shuffle mask this reverses a 16-byte vector.
   Not referenced in the visible part of the file. */
3543L$bswap_mask:
3544.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* Dword vector {6,6,6,0}.  Not referenced in the visible part of the file. */
3545L$increment32:
3546.long	6,6,6,0
/* Dword vector {1,0,0,0}.  Not referenced in the visible part of the file. */
3547L$increment64:
3548.long	1,0,0,0
/* Dword vector {0x87,0,1,0}.  Not referenced in the visible part of the file;
   presumably the reduction constant for XTS tweak doubling - confirm at the
   use site. */
3549L$xts_magic:
3550.long	0x87,0,1,0
/* Fifteen zero bytes followed by 1.  Not referenced in the visible part of
   the file. */
3551L$increment1:
3552.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
/* Byte-shuffle mask used by the 128- and 256-bit "_alt" key schedules: every
   dword selects source bytes 13,14,15,12 (the last word rotated by one byte). */
3553L$key_rotate:
3554.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
/* Byte-shuffle mask used by the 192-bit "_alt" key schedule: every dword
   selects source bytes 5,6,7,4 (word 1 rotated by one byte). */
3555L$key_rotate192:
3556.long	0x04070605,0x04070605,0x04070605,0x04070605
/* Initial round constant vector for the "_alt" key schedules; the loops
   double it with pslld $1 after each use. */
3557L$key_rcon1:
3558.long	1,1,1,1
/* Round constant 0x1b, loaded by the 128-bit "_alt" path after its eight loop
   iterations. */
3559L$key_rcon1b:
3560.long	0x1b,0x1b,0x1b,0x1b
3561
/* NUL-terminated ASCII string:
   "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>" */
3562.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3563.p2align	6
3564#endif
3565