1#if defined(__x86_64__)
2.text
3
// _aesni_encrypt -- encrypt one 16-byte block with AES-NI.
//
// The .byte sequences are hand-encoded instructions:
//   102,15,56,220,209 = aesenc     %xmm1,%xmm2
//   102,15,56,221,209 = aesenclast %xmm1,%xmm2
//   0xf3,0xc3         = rep ret
//
// In:    %rdi = source block (16 bytes, unaligned load)
//        %rsi = destination block (16 bytes, unaligned store)
//        %rdx = key schedule: consecutive 16-byte round keys from offset 0
//               and a 32-bit loop count at offset 240
// Out:   16 bytes written to (%rsi)
// Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning
// NOTE(review): presumably the C prototype is
//   void aesni_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key)
// -- confirm against the declaring header.
4.globl	_aesni_encrypt
5.private_extern _aesni_encrypt
6
7.p2align	4
8_aesni_encrypt:
9	movups	(%rdi),%xmm2	// xmm2 = input block
10	movl	240(%rdx),%eax	// eax = number of aesenc iterations
11	movups	(%rdx),%xmm0	// round key 0
12	movups	16(%rdx),%xmm1	// round key 1
13	leaq	32(%rdx),%rdx	// rdx -> round key 2
14	xorps	%xmm0,%xmm2	// initial whitening: block ^= round key 0
// One aesenc per iteration. decl sets ZF; the movups/leaq that follow do not
// touch flags, so jnz still tests the decrement.
15L$oop_enc1_1:
16.byte	102,15,56,220,209
17	decl	%eax
18	movups	(%rdx),%xmm1	// fetch the next round key
19	leaq	16(%rdx),%rdx
20	jnz	L$oop_enc1_1
// xmm1 now holds the key fetched by the final iteration; use it for the last round.
21.byte	102,15,56,221,209
// Scrub key material and the result from the vector registers.
22	pxor	%xmm0,%xmm0
23	pxor	%xmm1,%xmm1
24	movups	%xmm2,(%rsi)	// store the output block
25	pxor	%xmm2,%xmm2
26	.byte	0xf3,0xc3
27
28
// _aesni_decrypt -- decrypt one 16-byte block with AES-NI.
//
// The .byte sequences are hand-encoded instructions:
//   102,15,56,222,209 = aesdec     %xmm1,%xmm2
//   102,15,56,223,209 = aesdeclast %xmm1,%xmm2
//   0xf3,0xc3         = rep ret
//
// In:    %rdi = source block (16 bytes, unaligned load)
//        %rsi = destination block (16 bytes, unaligned store)
//        %rdx = key schedule: consecutive 16-byte round keys from offset 0
//               and a 32-bit loop count at offset 240
// Out:   16 bytes written to (%rsi)
// Clobb: %eax, %rdx, flags; %xmm0-%xmm2 are zeroed before returning
// NOTE(review): assumes the schedule at %rdx was prepared for decryption
// (aesdec expects inverse-transformed round keys) -- confirm against key setup.
29.globl	_aesni_decrypt
30.private_extern _aesni_decrypt
31
32.p2align	4
33_aesni_decrypt:
34	movups	(%rdi),%xmm2	// xmm2 = input block
35	movl	240(%rdx),%eax	// eax = number of aesdec iterations
36	movups	(%rdx),%xmm0	// round key 0
37	movups	16(%rdx),%xmm1	// round key 1
38	leaq	32(%rdx),%rdx	// rdx -> round key 2
39	xorps	%xmm0,%xmm2	// initial whitening: block ^= round key 0
// One aesdec per iteration. decl sets ZF; the movups/leaq that follow do not
// touch flags, so jnz still tests the decrement.
40L$oop_dec1_2:
41.byte	102,15,56,222,209
42	decl	%eax
43	movups	(%rdx),%xmm1	// fetch the next round key
44	leaq	16(%rdx),%rdx
45	jnz	L$oop_dec1_2
// xmm1 now holds the key fetched by the final iteration; use it for the last round.
46.byte	102,15,56,223,209
// Scrub key material and the result from the vector registers.
47	pxor	%xmm0,%xmm0
48	pxor	%xmm1,%xmm1
49	movups	%xmm2,(%rsi)	// store the output block
50	pxor	%xmm2,%xmm2
51	.byte	0xf3,0xc3
52
53
// _aesni_encrypt2 -- file-local helper (no .globl): encrypt two blocks held
// in registers, interleaving the rounds of both blocks.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast, where
//   M = 209/217 -> %xmm1 into %xmm2/%xmm3, M = 208/216 -> %xmm0 into %xmm2/%xmm3.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2,%xmm3 = blocks
// Out:   %xmm2,%xmm3 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// Index scheme: after setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n, so
// (%rcx,%rax) walks the schedule as %rax climbs by 32 toward zero. The loop
// exits when %rax reaches exactly 0, which requires n to be odd.
54.p2align	4
55_aesni_encrypt2:
56	movups	(%rcx),%xmm0	// round key 0
57	shll	$4,%eax	// n * 16 = byte length of n round keys
58	movups	16(%rcx),%xmm1	// round key 1
59	xorps	%xmm0,%xmm2	// whitening
60	xorps	%xmm0,%xmm3
61	movups	32(%rcx),%xmm0	// round key 2
62	leaq	32(%rcx,%rax,1),%rcx
63	negq	%rax
64	addq	$16,%rax
65
// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key register.
// addq sets ZF; the following aesenc/movups leave flags alone for jnz.
66L$enc_loop2:
67.byte	102,15,56,220,209
68.byte	102,15,56,220,217
69	movups	(%rcx,%rax,1),%xmm1
70	addq	$32,%rax
71.byte	102,15,56,220,208
72.byte	102,15,56,220,216
73	movups	-16(%rcx,%rax,1),%xmm0
74	jnz	L$enc_loop2
75
// Final pair: one more aesenc with %xmm1, then aesenclast with %xmm0.
76.byte	102,15,56,220,209
77.byte	102,15,56,220,217
78.byte	102,15,56,221,208
79.byte	102,15,56,221,216
80	.byte	0xf3,0xc3
81
82
// _aesni_decrypt2 -- file-local helper (no .globl): decrypt two blocks held
// in registers, interleaving the rounds of both blocks.
//
// Encodings: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast, where
//   M = 209/217 -> %xmm1 into %xmm2/%xmm3, M = 208/216 -> %xmm0 into %xmm2/%xmm3.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2,%xmm3 = blocks
// Out:   %xmm2,%xmm3 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// Index scheme: after setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n, so
// (%rcx,%rax) walks the schedule as %rax climbs by 32 toward zero. The loop
// exits when %rax reaches exactly 0, which requires n to be odd.
83.p2align	4
84_aesni_decrypt2:
85	movups	(%rcx),%xmm0	// round key 0
86	shll	$4,%eax	// n * 16 = byte length of n round keys
87	movups	16(%rcx),%xmm1	// round key 1
88	xorps	%xmm0,%xmm2	// whitening
89	xorps	%xmm0,%xmm3
90	movups	32(%rcx),%xmm0	// round key 2
91	leaq	32(%rcx,%rax,1),%rcx
92	negq	%rax
93	addq	$16,%rax
94
// Two rounds per iteration, alternating %xmm1 / %xmm0 as the key register.
// addq sets ZF; the following aesdec/movups leave flags alone for jnz.
95L$dec_loop2:
96.byte	102,15,56,222,209
97.byte	102,15,56,222,217
98	movups	(%rcx,%rax,1),%xmm1
99	addq	$32,%rax
100.byte	102,15,56,222,208
101.byte	102,15,56,222,216
102	movups	-16(%rcx,%rax,1),%xmm0
103	jnz	L$dec_loop2
104
// Final pair: one more aesdec with %xmm1, then aesdeclast with %xmm0.
105.byte	102,15,56,222,209
106.byte	102,15,56,222,217
107.byte	102,15,56,223,208
108.byte	102,15,56,223,216
109	.byte	0xf3,0xc3
110
111
// _aesni_encrypt3 -- file-local helper (no .globl): encrypt three blocks held
// in registers, interleaving the rounds of all three.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast, where
//   M = 209/217/225 -> %xmm1 into %xmm2/%xmm3/%xmm4,
//   M = 208/216/224 -> %xmm0 into %xmm2/%xmm3/%xmm4.  0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm4 = blocks
// Out:   %xmm2-%xmm4 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// After setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n; %rax climbs by 32
// per iteration and the loop exits when it reaches 0 (n must be odd).
112.p2align	4
113_aesni_encrypt3:
114	movups	(%rcx),%xmm0	// round key 0
115	shll	$4,%eax	// n * 16
116	movups	16(%rcx),%xmm1	// round key 1
117	xorps	%xmm0,%xmm2	// whitening
118	xorps	%xmm0,%xmm3
119	xorps	%xmm0,%xmm4
120	movups	32(%rcx),%xmm0	// round key 2
121	leaq	32(%rcx,%rax,1),%rcx
122	negq	%rax
123	addq	$16,%rax
124
// Two rounds per iteration; flags for jnz come from addq.
125L$enc_loop3:
126.byte	102,15,56,220,209
127.byte	102,15,56,220,217
128.byte	102,15,56,220,225
129	movups	(%rcx,%rax,1),%xmm1
130	addq	$32,%rax
131.byte	102,15,56,220,208
132.byte	102,15,56,220,216
133.byte	102,15,56,220,224
134	movups	-16(%rcx,%rax,1),%xmm0
135	jnz	L$enc_loop3
136
// Final aesenc with %xmm1, then aesenclast with %xmm0.
137.byte	102,15,56,220,209
138.byte	102,15,56,220,217
139.byte	102,15,56,220,225
140.byte	102,15,56,221,208
141.byte	102,15,56,221,216
142.byte	102,15,56,221,224
143	.byte	0xf3,0xc3
144
145
// _aesni_decrypt3 -- file-local helper (no .globl): decrypt three blocks held
// in registers, interleaving the rounds of all three.
//
// Encodings: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast, where
//   M = 209/217/225 -> %xmm1 into %xmm2/%xmm3/%xmm4,
//   M = 208/216/224 -> %xmm0 into %xmm2/%xmm3/%xmm4.  0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm4 = blocks
// Out:   %xmm2-%xmm4 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// After setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n; %rax climbs by 32
// per iteration and the loop exits when it reaches 0 (n must be odd).
146.p2align	4
147_aesni_decrypt3:
148	movups	(%rcx),%xmm0	// round key 0
149	shll	$4,%eax	// n * 16
150	movups	16(%rcx),%xmm1	// round key 1
151	xorps	%xmm0,%xmm2	// whitening
152	xorps	%xmm0,%xmm3
153	xorps	%xmm0,%xmm4
154	movups	32(%rcx),%xmm0	// round key 2
155	leaq	32(%rcx,%rax,1),%rcx
156	negq	%rax
157	addq	$16,%rax
158
// Two rounds per iteration; flags for jnz come from addq.
159L$dec_loop3:
160.byte	102,15,56,222,209
161.byte	102,15,56,222,217
162.byte	102,15,56,222,225
163	movups	(%rcx,%rax,1),%xmm1
164	addq	$32,%rax
165.byte	102,15,56,222,208
166.byte	102,15,56,222,216
167.byte	102,15,56,222,224
168	movups	-16(%rcx,%rax,1),%xmm0
169	jnz	L$dec_loop3
170
// Final aesdec with %xmm1, then aesdeclast with %xmm0.
171.byte	102,15,56,222,209
172.byte	102,15,56,222,217
173.byte	102,15,56,222,225
174.byte	102,15,56,223,208
175.byte	102,15,56,223,216
176.byte	102,15,56,223,224
177	.byte	0xf3,0xc3
178
179
// _aesni_encrypt4 -- file-local helper (no .globl): encrypt four blocks held
// in registers, interleaving the rounds of all four.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast, where
//   M = 209/217/225/233 -> %xmm1 into %xmm2..%xmm5,
//   M = 208/216/224/232 -> %xmm0 into %xmm2..%xmm5.  0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm5 = blocks
// Out:   %xmm2-%xmm5 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// After setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n; %rax climbs by 32
// per iteration and the loop exits when it reaches 0 (n must be odd).
180.p2align	4
181_aesni_encrypt4:
182	movups	(%rcx),%xmm0	// round key 0
183	shll	$4,%eax	// n * 16
184	movups	16(%rcx),%xmm1	// round key 1
185	xorps	%xmm0,%xmm2	// whitening
186	xorps	%xmm0,%xmm3
187	xorps	%xmm0,%xmm4
188	xorps	%xmm0,%xmm5
189	movups	32(%rcx),%xmm0	// round key 2
190	leaq	32(%rcx,%rax,1),%rcx
191	negq	%rax
// 0x0f,0x1f,0x00 = nopl (%rax), a 3-byte NOP; presumably padding so the loop
// head lands on a preferred boundary -- confirm with the generator script.
192.byte	0x0f,0x1f,0x00
193	addq	$16,%rax
194
// Two rounds per iteration; flags for jnz come from addq.
195L$enc_loop4:
196.byte	102,15,56,220,209
197.byte	102,15,56,220,217
198.byte	102,15,56,220,225
199.byte	102,15,56,220,233
200	movups	(%rcx,%rax,1),%xmm1
201	addq	$32,%rax
202.byte	102,15,56,220,208
203.byte	102,15,56,220,216
204.byte	102,15,56,220,224
205.byte	102,15,56,220,232
206	movups	-16(%rcx,%rax,1),%xmm0
207	jnz	L$enc_loop4
208
// Final aesenc with %xmm1, then aesenclast with %xmm0.
209.byte	102,15,56,220,209
210.byte	102,15,56,220,217
211.byte	102,15,56,220,225
212.byte	102,15,56,220,233
213.byte	102,15,56,221,208
214.byte	102,15,56,221,216
215.byte	102,15,56,221,224
216.byte	102,15,56,221,232
217	.byte	0xf3,0xc3
218
219
// _aesni_decrypt4 -- file-local helper (no .globl): decrypt four blocks held
// in registers, interleaving the rounds of all four.
//
// Encodings: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast, where
//   M = 209/217/225/233 -> %xmm1 into %xmm2..%xmm5,
//   M = 208/216/224/232 -> %xmm0 into %xmm2..%xmm5.  0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm5 = blocks
// Out:   %xmm2-%xmm5 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// After setup %rcx = key + 32 + 16*n and %rax = 16 - 16*n; %rax climbs by 32
// per iteration and the loop exits when it reaches 0 (n must be odd).
220.p2align	4
221_aesni_decrypt4:
222	movups	(%rcx),%xmm0	// round key 0
223	shll	$4,%eax	// n * 16
224	movups	16(%rcx),%xmm1	// round key 1
225	xorps	%xmm0,%xmm2	// whitening
226	xorps	%xmm0,%xmm3
227	xorps	%xmm0,%xmm4
228	xorps	%xmm0,%xmm5
229	movups	32(%rcx),%xmm0	// round key 2
230	leaq	32(%rcx,%rax,1),%rcx
231	negq	%rax
// 0x0f,0x1f,0x00 = nopl (%rax), a 3-byte NOP; presumably padding so the loop
// head lands on a preferred boundary -- confirm with the generator script.
232.byte	0x0f,0x1f,0x00
233	addq	$16,%rax
234
// Two rounds per iteration; flags for jnz come from addq.
235L$dec_loop4:
236.byte	102,15,56,222,209
237.byte	102,15,56,222,217
238.byte	102,15,56,222,225
239.byte	102,15,56,222,233
240	movups	(%rcx,%rax,1),%xmm1
241	addq	$32,%rax
242.byte	102,15,56,222,208
243.byte	102,15,56,222,216
244.byte	102,15,56,222,224
245.byte	102,15,56,222,232
246	movups	-16(%rcx,%rax,1),%xmm0
247	jnz	L$dec_loop4
248
// Final aesdec with %xmm1, then aesdeclast with %xmm0.
249.byte	102,15,56,222,209
250.byte	102,15,56,222,217
251.byte	102,15,56,222,225
252.byte	102,15,56,222,233
253.byte	102,15,56,223,208
254.byte	102,15,56,223,216
255.byte	102,15,56,223,224
256.byte	102,15,56,223,232
257	.byte	0xf3,0xc3
258
259
// _aesni_encrypt6 -- file-local helper (no .globl): encrypt six blocks held
// in registers. The whitening XORs are interleaved with the first aesenc
// round so that round 1 of %xmm2-%xmm4 starts before %xmm5-%xmm7 are whitened.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast, where
//   M = 209/217/225/233/241/249 -> %xmm1 into %xmm2..%xmm7,
//   M = 208/216/224/232/240/248 -> %xmm0 into %xmm2..%xmm7.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm7 = blocks
// Out:   %xmm2-%xmm7 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// L$enc_loop6 is also a call target from L$ctr32_loop6 in
// _aesni_ctr32_encrypt_blocks, which enters with its own register setup.
260.p2align	4
261_aesni_encrypt6:
262	movups	(%rcx),%xmm0	// round key 0
263	shll	$4,%eax	// n * 16
264	movups	16(%rcx),%xmm1	// round key 1
265	xorps	%xmm0,%xmm2	// whitening of blocks 0-2
266	pxor	%xmm0,%xmm3
267	pxor	%xmm0,%xmm4
268.byte	102,15,56,220,209
269	leaq	32(%rcx,%rax,1),%rcx	// rcx = key + 32 + 16*n
270	negq	%rax	// rax = -16*n
271.byte	102,15,56,220,217
272	pxor	%xmm0,%xmm5	// whitening of blocks 3-5
273	pxor	%xmm0,%xmm6
274.byte	102,15,56,220,225
275	pxor	%xmm0,%xmm7
276	movups	(%rcx,%rax,1),%xmm0	// = key + 32, i.e. round key 2
277	addq	$16,%rax	// rax = 16 - 16*n
278	jmp	L$enc_loop6_enter	// round 1 is already done for xmm2-xmm4
279.p2align	4
// Two rounds per iteration; flags for jnz come from addq $32.
280L$enc_loop6:
281.byte	102,15,56,220,209
282.byte	102,15,56,220,217
283.byte	102,15,56,220,225
284L$enc_loop6_enter:
285.byte	102,15,56,220,233
286.byte	102,15,56,220,241
287.byte	102,15,56,220,249
288	movups	(%rcx,%rax,1),%xmm1
289	addq	$32,%rax
290.byte	102,15,56,220,208
291.byte	102,15,56,220,216
292.byte	102,15,56,220,224
293.byte	102,15,56,220,232
294.byte	102,15,56,220,240
295.byte	102,15,56,220,248
296	movups	-16(%rcx,%rax,1),%xmm0
297	jnz	L$enc_loop6
298
// Final aesenc with %xmm1, then aesenclast with %xmm0.
299.byte	102,15,56,220,209
300.byte	102,15,56,220,217
301.byte	102,15,56,220,225
302.byte	102,15,56,220,233
303.byte	102,15,56,220,241
304.byte	102,15,56,220,249
305.byte	102,15,56,221,208
306.byte	102,15,56,221,216
307.byte	102,15,56,221,224
308.byte	102,15,56,221,232
309.byte	102,15,56,221,240
310.byte	102,15,56,221,248
311	.byte	0xf3,0xc3
312
313
// _aesni_decrypt6 -- file-local helper (no .globl): decrypt six blocks held
// in registers. The whitening XORs are interleaved with the first aesdec
// round so that round 1 of %xmm2-%xmm4 starts before %xmm5-%xmm7 are whitened.
//
// Encodings: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast, where
//   M = 209/217/225/233/241/249 -> %xmm1 into %xmm2..%xmm7,
//   M = 208/216/224/232/240/248 -> %xmm0 into %xmm2..%xmm7.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm7 = blocks
// Out:   %xmm2-%xmm7 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
314.p2align	4
315_aesni_decrypt6:
316	movups	(%rcx),%xmm0	// round key 0
317	shll	$4,%eax	// n * 16
318	movups	16(%rcx),%xmm1	// round key 1
319	xorps	%xmm0,%xmm2	// whitening of blocks 0-2
320	pxor	%xmm0,%xmm3
321	pxor	%xmm0,%xmm4
322.byte	102,15,56,222,209
323	leaq	32(%rcx,%rax,1),%rcx	// rcx = key + 32 + 16*n
324	negq	%rax	// rax = -16*n
325.byte	102,15,56,222,217
326	pxor	%xmm0,%xmm5	// whitening of blocks 3-5
327	pxor	%xmm0,%xmm6
328.byte	102,15,56,222,225
329	pxor	%xmm0,%xmm7
330	movups	(%rcx,%rax,1),%xmm0	// = key + 32, i.e. round key 2
331	addq	$16,%rax	// rax = 16 - 16*n
332	jmp	L$dec_loop6_enter	// round 1 is already done for xmm2-xmm4
333.p2align	4
// Two rounds per iteration; flags for jnz come from addq $32.
334L$dec_loop6:
335.byte	102,15,56,222,209
336.byte	102,15,56,222,217
337.byte	102,15,56,222,225
338L$dec_loop6_enter:
339.byte	102,15,56,222,233
340.byte	102,15,56,222,241
341.byte	102,15,56,222,249
342	movups	(%rcx,%rax,1),%xmm1
343	addq	$32,%rax
344.byte	102,15,56,222,208
345.byte	102,15,56,222,216
346.byte	102,15,56,222,224
347.byte	102,15,56,222,232
348.byte	102,15,56,222,240
349.byte	102,15,56,222,248
350	movups	-16(%rcx,%rax,1),%xmm0
351	jnz	L$dec_loop6
352
// Final aesdec with %xmm1, then aesdeclast with %xmm0.
353.byte	102,15,56,222,209
354.byte	102,15,56,222,217
355.byte	102,15,56,222,225
356.byte	102,15,56,222,233
357.byte	102,15,56,222,241
358.byte	102,15,56,222,249
359.byte	102,15,56,223,208
360.byte	102,15,56,223,216
361.byte	102,15,56,223,224
362.byte	102,15,56,223,232
363.byte	102,15,56,223,240
364.byte	102,15,56,223,248
365	.byte	0xf3,0xc3
366
367
// _aesni_encrypt8 -- file-local helper (no .globl): encrypt eight blocks held
// in registers. Whitening is interleaved with the first aesenc round.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast, where
//   M = 209/217/225/233/241/249 -> %xmm1 into %xmm2..%xmm7,
//   M = 208/216/224/232/240/248 -> %xmm0 into %xmm2..%xmm7.
//   The 6-byte forms 102,68,15,56,OP,M carry a REX.R prefix (68 = 0x44):
//   M = 193/201 -> %xmm1 into %xmm8/%xmm9, M = 192/200 -> %xmm0 into %xmm8/%xmm9.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm9 = blocks
// Out:   %xmm2-%xmm9 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
//
// L$enc_loop8_enter is also a call target from L$ctr32_tail in
// _aesni_ctr32_encrypt_blocks, which enters with its own register setup.
368.p2align	4
369_aesni_encrypt8:
370	movups	(%rcx),%xmm0	// round key 0
371	shll	$4,%eax	// n * 16
372	movups	16(%rcx),%xmm1	// round key 1
373	xorps	%xmm0,%xmm2	// whitening
374	xorps	%xmm0,%xmm3
375	pxor	%xmm0,%xmm4
376	pxor	%xmm0,%xmm5
377	pxor	%xmm0,%xmm6
378	leaq	32(%rcx,%rax,1),%rcx	// rcx = key + 32 + 16*n
379	negq	%rax	// rax = -16*n
380.byte	102,15,56,220,209
381	pxor	%xmm0,%xmm7
382	pxor	%xmm0,%xmm8
383.byte	102,15,56,220,217
384	pxor	%xmm0,%xmm9
385	movups	(%rcx,%rax,1),%xmm0	// = key + 32, i.e. round key 2
386	addq	$16,%rax	// rax = 16 - 16*n
387	jmp	L$enc_loop8_inner	// round 1 is already done for xmm2-xmm3
388.p2align	4
// Two rounds per iteration; flags for jnz come from addq $32.
389L$enc_loop8:
390.byte	102,15,56,220,209
391.byte	102,15,56,220,217
392L$enc_loop8_inner:
393.byte	102,15,56,220,225
394.byte	102,15,56,220,233
395.byte	102,15,56,220,241
396.byte	102,15,56,220,249
397.byte	102,68,15,56,220,193
398.byte	102,68,15,56,220,201
399L$enc_loop8_enter:
400	movups	(%rcx,%rax,1),%xmm1
401	addq	$32,%rax
402.byte	102,15,56,220,208
403.byte	102,15,56,220,216
404.byte	102,15,56,220,224
405.byte	102,15,56,220,232
406.byte	102,15,56,220,240
407.byte	102,15,56,220,248
408.byte	102,68,15,56,220,192
409.byte	102,68,15,56,220,200
410	movups	-16(%rcx,%rax,1),%xmm0
411	jnz	L$enc_loop8
412
// Final aesenc with %xmm1, then aesenclast with %xmm0.
413.byte	102,15,56,220,209
414.byte	102,15,56,220,217
415.byte	102,15,56,220,225
416.byte	102,15,56,220,233
417.byte	102,15,56,220,241
418.byte	102,15,56,220,249
419.byte	102,68,15,56,220,193
420.byte	102,68,15,56,220,201
421.byte	102,15,56,221,208
422.byte	102,15,56,221,216
423.byte	102,15,56,221,224
424.byte	102,15,56,221,232
425.byte	102,15,56,221,240
426.byte	102,15,56,221,248
427.byte	102,68,15,56,221,192
428.byte	102,68,15,56,221,200
429	.byte	0xf3,0xc3
430
431
// _aesni_decrypt8 -- file-local helper (no .globl): decrypt eight blocks held
// in registers. Whitening is interleaved with the first aesdec round.
//
// Encodings: 102,15,56,222,M = aesdec, 102,15,56,223,M = aesdeclast, where
//   M = 209/217/225/233/241/249 -> %xmm1 into %xmm2..%xmm7,
//   M = 208/216/224/232/240/248 -> %xmm0 into %xmm2..%xmm7.
//   The 6-byte forms 102,68,15,56,OP,M carry a REX.R prefix (68 = 0x44):
//   M = 193/201 -> %xmm1 into %xmm8/%xmm9, M = 192/200 -> %xmm0 into %xmm8/%xmm9.
//   0xf3,0xc3 = rep ret.
//
// In:    %rcx = key schedule, %eax = loop count n, %xmm2-%xmm9 = blocks
// Out:   %xmm2-%xmm9 = processed blocks
// Clobb: %rax, %rcx, %xmm0, %xmm1 (left holding round keys), flags
432.p2align	4
433_aesni_decrypt8:
434	movups	(%rcx),%xmm0	// round key 0
435	shll	$4,%eax	// n * 16
436	movups	16(%rcx),%xmm1	// round key 1
437	xorps	%xmm0,%xmm2	// whitening
438	xorps	%xmm0,%xmm3
439	pxor	%xmm0,%xmm4
440	pxor	%xmm0,%xmm5
441	pxor	%xmm0,%xmm6
442	leaq	32(%rcx,%rax,1),%rcx	// rcx = key + 32 + 16*n
443	negq	%rax	// rax = -16*n
444.byte	102,15,56,222,209
445	pxor	%xmm0,%xmm7
446	pxor	%xmm0,%xmm8
447.byte	102,15,56,222,217
448	pxor	%xmm0,%xmm9
449	movups	(%rcx,%rax,1),%xmm0	// = key + 32, i.e. round key 2
450	addq	$16,%rax	// rax = 16 - 16*n
451	jmp	L$dec_loop8_inner	// round 1 is already done for xmm2-xmm3
452.p2align	4
// Two rounds per iteration; flags for jnz come from addq $32.
453L$dec_loop8:
454.byte	102,15,56,222,209
455.byte	102,15,56,222,217
456L$dec_loop8_inner:
457.byte	102,15,56,222,225
458.byte	102,15,56,222,233
459.byte	102,15,56,222,241
460.byte	102,15,56,222,249
461.byte	102,68,15,56,222,193
462.byte	102,68,15,56,222,201
463L$dec_loop8_enter:
464	movups	(%rcx,%rax,1),%xmm1
465	addq	$32,%rax
466.byte	102,15,56,222,208
467.byte	102,15,56,222,216
468.byte	102,15,56,222,224
469.byte	102,15,56,222,232
470.byte	102,15,56,222,240
471.byte	102,15,56,222,248
472.byte	102,68,15,56,222,192
473.byte	102,68,15,56,222,200
474	movups	-16(%rcx,%rax,1),%xmm0
475	jnz	L$dec_loop8
476
// Final aesdec with %xmm1, then aesdeclast with %xmm0.
477.byte	102,15,56,222,209
478.byte	102,15,56,222,217
479.byte	102,15,56,222,225
480.byte	102,15,56,222,233
481.byte	102,15,56,222,241
482.byte	102,15,56,222,249
483.byte	102,68,15,56,222,193
484.byte	102,68,15,56,222,201
485.byte	102,15,56,223,208
486.byte	102,15,56,223,216
487.byte	102,15,56,223,224
488.byte	102,15,56,223,232
489.byte	102,15,56,223,240
490.byte	102,15,56,223,248
491.byte	102,68,15,56,223,192
492.byte	102,68,15,56,223,200
493	.byte	0xf3,0xc3
494
// _aesni_ecb_encrypt -- process a buffer block-by-block (each 16-byte block
// independently), eight blocks per pass with a 1..7 block tail.
//
// In:    %rdi = input buffer, %rsi = output buffer
//        %rdx = length in bytes; rounded DOWN to a multiple of 16 on entry,
//               and a rounded length of zero returns immediately
//        %rcx = key schedule (32-bit loop count read from offset 240)
//        %r8d = direction: nonzero takes the encrypt path, zero the decrypt path
// Saved: %r11 = key pointer, %r10d = loop count. The _aesni_{en,de}cryptN
//        helpers modify %rcx and %rax, so both are reloaded from these copies.
// Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm9, flags
//
// Encodings in the single-block paths: 102,15,56,220,209 = aesenc %xmm1,%xmm2;
// 221 = aesenclast; 222 = aesdec; 223 = aesdeclast. 0xf3,0xc3 = rep ret.
//
// The decrypt paths zero each data register after storing it; the encrypt
// paths do not. Both finish at L$ecb_ret, which zeroes %xmm0 and %xmm1.
495.globl	_aesni_ecb_encrypt
496.private_extern _aesni_ecb_encrypt
497
498.p2align	4
499_aesni_ecb_encrypt:
500	andq	$-16,%rdx	// drop any partial trailing block
501	jz	L$ecb_ret
502
503	movl	240(%rcx),%eax	// loop count
504	movups	(%rcx),%xmm0	// round key 0
505	movq	%rcx,%r11	// keep key pointer across helper calls
506	movl	%eax,%r10d	// keep loop count across helper calls
507	testl	%r8d,%r8d
508	jz	L$ecb_decrypt
509
// ---- encrypt: bulk path, 0x80 bytes (8 blocks) per pass ----
510	cmpq	$0x80,%rdx
511	jb	L$ecb_enc_tail
512
513	movdqu	(%rdi),%xmm2
514	movdqu	16(%rdi),%xmm3
515	movdqu	32(%rdi),%xmm4
516	movdqu	48(%rdi),%xmm5
517	movdqu	64(%rdi),%xmm6
518	movdqu	80(%rdi),%xmm7
519	movdqu	96(%rdi),%xmm8
520	movdqu	112(%rdi),%xmm9
521	leaq	128(%rdi),%rdi
522	subq	$0x80,%rdx
523	jmp	L$ecb_enc_loop8_enter
524.p2align	4
// Loop body: store the previous eight results while loading the next eight
// inputs, and restore %rcx/%eax for the next helper call.
525L$ecb_enc_loop8:
526	movups	%xmm2,(%rsi)
527	movq	%r11,%rcx
528	movdqu	(%rdi),%xmm2
529	movl	%r10d,%eax
530	movups	%xmm3,16(%rsi)
531	movdqu	16(%rdi),%xmm3
532	movups	%xmm4,32(%rsi)
533	movdqu	32(%rdi),%xmm4
534	movups	%xmm5,48(%rsi)
535	movdqu	48(%rdi),%xmm5
536	movups	%xmm6,64(%rsi)
537	movdqu	64(%rdi),%xmm6
538	movups	%xmm7,80(%rsi)
539	movdqu	80(%rdi),%xmm7
540	movups	%xmm8,96(%rsi)
541	movdqu	96(%rdi),%xmm8
542	movups	%xmm9,112(%rsi)
543	leaq	128(%rsi),%rsi
544	movdqu	112(%rdi),%xmm9
545	leaq	128(%rdi),%rdi
546L$ecb_enc_loop8_enter:
547
548	call	_aesni_encrypt8
549
// No borrow means at least eight more blocks remain after the current batch.
550	subq	$0x80,%rdx
551	jnc	L$ecb_enc_loop8
552
// Flush the last full batch.
553	movups	%xmm2,(%rsi)
554	movq	%r11,%rcx
555	movups	%xmm3,16(%rsi)
556	movl	%r10d,%eax
557	movups	%xmm4,32(%rsi)
558	movups	%xmm5,48(%rsi)
559	movups	%xmm6,64(%rsi)
560	movups	%xmm7,80(%rsi)
561	movups	%xmm8,96(%rsi)
562	movups	%xmm9,112(%rsi)
563	leaq	128(%rsi),%rsi
564	addq	$0x80,%rdx	// undo the over-subtraction: rdx = leftover bytes
565	jz	L$ecb_ret
566
// ---- encrypt: tail of 1..7 blocks ----
// Each cmpq sets flags once; the movups loads between the jb and je do not
// modify flags, so both branches test the same comparison.
567L$ecb_enc_tail:
568	movups	(%rdi),%xmm2
569	cmpq	$0x20,%rdx
570	jb	L$ecb_enc_one
571	movups	16(%rdi),%xmm3
572	je	L$ecb_enc_two
573	movups	32(%rdi),%xmm4
574	cmpq	$0x40,%rdx
575	jb	L$ecb_enc_three
576	movups	48(%rdi),%xmm5
577	je	L$ecb_enc_four
578	movups	64(%rdi),%xmm6
579	cmpq	$0x60,%rdx
580	jb	L$ecb_enc_five
581	movups	80(%rdi),%xmm7
582	je	L$ecb_enc_six
// Seven blocks: run the 8-way helper with a zeroed eighth lane and store
// only the first seven results.
583	movdqu	96(%rdi),%xmm8
584	xorps	%xmm9,%xmm9
585	call	_aesni_encrypt8
586	movups	%xmm2,(%rsi)
587	movups	%xmm3,16(%rsi)
588	movups	%xmm4,32(%rsi)
589	movups	%xmm5,48(%rsi)
590	movups	%xmm6,64(%rsi)
591	movups	%xmm7,80(%rsi)
592	movups	%xmm8,96(%rsi)
593	jmp	L$ecb_ret
594.p2align	4
// One block: inline single-block loop (one aesenc per iteration, then aesenclast).
595L$ecb_enc_one:
596	movups	(%rcx),%xmm0
597	movups	16(%rcx),%xmm1
598	leaq	32(%rcx),%rcx
599	xorps	%xmm0,%xmm2
600L$oop_enc1_3:
601.byte	102,15,56,220,209
602	decl	%eax
603	movups	(%rcx),%xmm1
604	leaq	16(%rcx),%rcx
605	jnz	L$oop_enc1_3
606.byte	102,15,56,221,209
607	movups	%xmm2,(%rsi)
608	jmp	L$ecb_ret
609.p2align	4
610L$ecb_enc_two:
611	call	_aesni_encrypt2
612	movups	%xmm2,(%rsi)
613	movups	%xmm3,16(%rsi)
614	jmp	L$ecb_ret
615.p2align	4
616L$ecb_enc_three:
617	call	_aesni_encrypt3
618	movups	%xmm2,(%rsi)
619	movups	%xmm3,16(%rsi)
620	movups	%xmm4,32(%rsi)
621	jmp	L$ecb_ret
622.p2align	4
623L$ecb_enc_four:
624	call	_aesni_encrypt4
625	movups	%xmm2,(%rsi)
626	movups	%xmm3,16(%rsi)
627	movups	%xmm4,32(%rsi)
628	movups	%xmm5,48(%rsi)
629	jmp	L$ecb_ret
630.p2align	4
// Five blocks: run the 6-way helper with a zeroed sixth lane, store five.
631L$ecb_enc_five:
632	xorps	%xmm7,%xmm7
633	call	_aesni_encrypt6
634	movups	%xmm2,(%rsi)
635	movups	%xmm3,16(%rsi)
636	movups	%xmm4,32(%rsi)
637	movups	%xmm5,48(%rsi)
638	movups	%xmm6,64(%rsi)
639	jmp	L$ecb_ret
640.p2align	4
641L$ecb_enc_six:
642	call	_aesni_encrypt6
643	movups	%xmm2,(%rsi)
644	movups	%xmm3,16(%rsi)
645	movups	%xmm4,32(%rsi)
646	movups	%xmm5,48(%rsi)
647	movups	%xmm6,64(%rsi)
648	movups	%xmm7,80(%rsi)
649	jmp	L$ecb_ret
650
// ---- decrypt: bulk path, same structure as the encrypt side ----
651.p2align	4
652L$ecb_decrypt:
653	cmpq	$0x80,%rdx
654	jb	L$ecb_dec_tail
655
656	movdqu	(%rdi),%xmm2
657	movdqu	16(%rdi),%xmm3
658	movdqu	32(%rdi),%xmm4
659	movdqu	48(%rdi),%xmm5
660	movdqu	64(%rdi),%xmm6
661	movdqu	80(%rdi),%xmm7
662	movdqu	96(%rdi),%xmm8
663	movdqu	112(%rdi),%xmm9
664	leaq	128(%rdi),%rdi
665	subq	$0x80,%rdx
666	jmp	L$ecb_dec_loop8_enter
667.p2align	4
// Loop body: store the previous eight results while loading the next eight
// inputs, and restore %rcx/%eax for the next helper call.
668L$ecb_dec_loop8:
669	movups	%xmm2,(%rsi)
670	movq	%r11,%rcx
671	movdqu	(%rdi),%xmm2
672	movl	%r10d,%eax
673	movups	%xmm3,16(%rsi)
674	movdqu	16(%rdi),%xmm3
675	movups	%xmm4,32(%rsi)
676	movdqu	32(%rdi),%xmm4
677	movups	%xmm5,48(%rsi)
678	movdqu	48(%rdi),%xmm5
679	movups	%xmm6,64(%rsi)
680	movdqu	64(%rdi),%xmm6
681	movups	%xmm7,80(%rsi)
682	movdqu	80(%rdi),%xmm7
683	movups	%xmm8,96(%rsi)
684	movdqu	96(%rdi),%xmm8
685	movups	%xmm9,112(%rsi)
686	leaq	128(%rsi),%rsi
687	movdqu	112(%rdi),%xmm9
688	leaq	128(%rdi),%rdi
689L$ecb_dec_loop8_enter:
690
691	call	_aesni_decrypt8
692
693	movups	(%r11),%xmm0	// reload round key 0 via the saved key pointer
694	subq	$0x80,%rdx
695	jnc	L$ecb_dec_loop8
696
// Flush the last full batch, zeroing each data register once it is stored.
697	movups	%xmm2,(%rsi)
698	pxor	%xmm2,%xmm2
699	movq	%r11,%rcx
700	movups	%xmm3,16(%rsi)
701	pxor	%xmm3,%xmm3
702	movl	%r10d,%eax
703	movups	%xmm4,32(%rsi)
704	pxor	%xmm4,%xmm4
705	movups	%xmm5,48(%rsi)
706	pxor	%xmm5,%xmm5
707	movups	%xmm6,64(%rsi)
708	pxor	%xmm6,%xmm6
709	movups	%xmm7,80(%rsi)
710	pxor	%xmm7,%xmm7
711	movups	%xmm8,96(%rsi)
712	pxor	%xmm8,%xmm8
713	movups	%xmm9,112(%rsi)
714	pxor	%xmm9,%xmm9
715	leaq	128(%rsi),%rsi
716	addq	$0x80,%rdx	// undo the over-subtraction: rdx = leftover bytes
717	jz	L$ecb_ret
718
// ---- decrypt: tail of 1..7 blocks (same flag-reuse dispatch as encrypt) ----
719L$ecb_dec_tail:
720	movups	(%rdi),%xmm2
721	cmpq	$0x20,%rdx
722	jb	L$ecb_dec_one
723	movups	16(%rdi),%xmm3
724	je	L$ecb_dec_two
725	movups	32(%rdi),%xmm4
726	cmpq	$0x40,%rdx
727	jb	L$ecb_dec_three
728	movups	48(%rdi),%xmm5
729	je	L$ecb_dec_four
730	movups	64(%rdi),%xmm6
731	cmpq	$0x60,%rdx
732	jb	L$ecb_dec_five
733	movups	80(%rdi),%xmm7
734	je	L$ecb_dec_six
// Seven blocks: 8-way helper with a zeroed eighth lane; store seven results.
735	movups	96(%rdi),%xmm8
736	movups	(%rcx),%xmm0
737	xorps	%xmm9,%xmm9
738	call	_aesni_decrypt8
739	movups	%xmm2,(%rsi)
740	pxor	%xmm2,%xmm2
741	movups	%xmm3,16(%rsi)
742	pxor	%xmm3,%xmm3
743	movups	%xmm4,32(%rsi)
744	pxor	%xmm4,%xmm4
745	movups	%xmm5,48(%rsi)
746	pxor	%xmm5,%xmm5
747	movups	%xmm6,64(%rsi)
748	pxor	%xmm6,%xmm6
749	movups	%xmm7,80(%rsi)
750	pxor	%xmm7,%xmm7
751	movups	%xmm8,96(%rsi)
752	pxor	%xmm8,%xmm8
753	pxor	%xmm9,%xmm9
754	jmp	L$ecb_ret
755.p2align	4
// One block: inline single-block loop (one aesdec per iteration, then aesdeclast).
756L$ecb_dec_one:
757	movups	(%rcx),%xmm0
758	movups	16(%rcx),%xmm1
759	leaq	32(%rcx),%rcx
760	xorps	%xmm0,%xmm2
761L$oop_dec1_4:
762.byte	102,15,56,222,209
763	decl	%eax
764	movups	(%rcx),%xmm1
765	leaq	16(%rcx),%rcx
766	jnz	L$oop_dec1_4
767.byte	102,15,56,223,209
768	movups	%xmm2,(%rsi)
769	pxor	%xmm2,%xmm2
770	jmp	L$ecb_ret
771.p2align	4
772L$ecb_dec_two:
773	call	_aesni_decrypt2
774	movups	%xmm2,(%rsi)
775	pxor	%xmm2,%xmm2
776	movups	%xmm3,16(%rsi)
777	pxor	%xmm3,%xmm3
778	jmp	L$ecb_ret
779.p2align	4
780L$ecb_dec_three:
781	call	_aesni_decrypt3
782	movups	%xmm2,(%rsi)
783	pxor	%xmm2,%xmm2
784	movups	%xmm3,16(%rsi)
785	pxor	%xmm3,%xmm3
786	movups	%xmm4,32(%rsi)
787	pxor	%xmm4,%xmm4
788	jmp	L$ecb_ret
789.p2align	4
790L$ecb_dec_four:
791	call	_aesni_decrypt4
792	movups	%xmm2,(%rsi)
793	pxor	%xmm2,%xmm2
794	movups	%xmm3,16(%rsi)
795	pxor	%xmm3,%xmm3
796	movups	%xmm4,32(%rsi)
797	pxor	%xmm4,%xmm4
798	movups	%xmm5,48(%rsi)
799	pxor	%xmm5,%xmm5
800	jmp	L$ecb_ret
801.p2align	4
// Five blocks: 6-way helper with a zeroed sixth lane; store five results.
802L$ecb_dec_five:
803	xorps	%xmm7,%xmm7
804	call	_aesni_decrypt6
805	movups	%xmm2,(%rsi)
806	pxor	%xmm2,%xmm2
807	movups	%xmm3,16(%rsi)
808	pxor	%xmm3,%xmm3
809	movups	%xmm4,32(%rsi)
810	pxor	%xmm4,%xmm4
811	movups	%xmm5,48(%rsi)
812	pxor	%xmm5,%xmm5
813	movups	%xmm6,64(%rsi)
814	pxor	%xmm6,%xmm6
815	pxor	%xmm7,%xmm7
816	jmp	L$ecb_ret
817.p2align	4
// Six blocks: falls through into L$ecb_ret after the last store.
818L$ecb_dec_six:
819	call	_aesni_decrypt6
820	movups	%xmm2,(%rsi)
821	pxor	%xmm2,%xmm2
822	movups	%xmm3,16(%rsi)
823	pxor	%xmm3,%xmm3
824	movups	%xmm4,32(%rsi)
825	pxor	%xmm4,%xmm4
826	movups	%xmm5,48(%rsi)
827	pxor	%xmm5,%xmm5
828	movups	%xmm6,64(%rsi)
829	pxor	%xmm6,%xmm6
830	movups	%xmm7,80(%rsi)
831	pxor	%xmm7,%xmm7
832
// Common exit: zero the two round-key registers and return.
833L$ecb_ret:
834	xorps	%xmm0,%xmm0
835	pxor	%xmm1,%xmm1
836	.byte	0xf3,0xc3
837
// _aesni_ccm64_encrypt_blocks -- per block, run two AES encryptions in
// lock-step: one over a counter value (its output is XORed with the input
// block to form the output block) and one over a running 16-byte state that
// absorbs each input block.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast
//   (M = 209/217 -> %xmm1 into %xmm2/%xmm3, 208/216 -> %xmm0 into %xmm2/%xmm3);
//   102,15,56,0,247 = pshufb %xmm7,%xmm6; 102,15,56,0,215 = pshufb %xmm7,%xmm2;
//   0xf3,0xc3 = rep ret.
//
// In:    %rdi = input, %rsi = output, %rdx = number of 16-byte blocks
//        %rcx = key schedule (32-bit loop count at offset 240)
//        %r8  = 16-byte counter block (read only)
//        %r9  = 16-byte running state, read on entry and written back on exit
//               (presumably the CCM CBC-MAC value -- confirm against the caller)
// Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm3, %xmm6-%xmm9, flags
//
// L$increment64 and L$bswap_mask are constants defined elsewhere in the file;
// they are read with movdqa and therefore must be 16-byte aligned.
// The block count is tested only after a block has been processed, so a count
// of zero is not handled here.
838.globl	_aesni_ccm64_encrypt_blocks
839.private_extern _aesni_ccm64_encrypt_blocks
840
841.p2align	4
842_aesni_ccm64_encrypt_blocks:
843	movl	240(%rcx),%eax	// loop count n
844	movdqu	(%r8),%xmm6	// counter block
845	movdqa	L$increment64(%rip),%xmm9
846	movdqa	L$bswap_mask(%rip),%xmm7
847
848	shll	$4,%eax	// n * 16
849	movl	$16,%r10d
850	leaq	0(%rcx),%r11	// r11 = start of key schedule
851	movdqu	(%r9),%xmm3	// running state
852	movdqa	%xmm6,%xmm2	// first counter value, as supplied
853	leaq	32(%rcx,%rax,1),%rcx	// rcx = key + 32 + 16*n
854.byte	102,15,56,0,247
855	subq	%rax,%r10	// r10 = 16 - 16*n, initial index for the round loop
856	jmp	L$ccm64_enc_outer
857.p2align	4
858L$ccm64_enc_outer:
859	movups	(%r11),%xmm0	// round key 0
860	movq	%r10,%rax
861	movups	(%rdi),%xmm8	// input block
862
863	xorps	%xmm0,%xmm2	// whiten the counter value
864	movups	16(%r11),%xmm1	// round key 1
865	xorps	%xmm8,%xmm0	// input ^ round key 0
866	xorps	%xmm0,%xmm3	// state ^= input ^ round key 0 (absorb + whiten)
867	movups	32(%r11),%xmm0	// round key 2
868
// Two rounds per iteration on both lanes; flags for jnz come from addq.
869L$ccm64_enc2_loop:
870.byte	102,15,56,220,209
871.byte	102,15,56,220,217
872	movups	(%rcx,%rax,1),%xmm1
873	addq	$32,%rax
874.byte	102,15,56,220,208
875.byte	102,15,56,220,216
876	movups	-16(%rcx,%rax,1),%xmm0
877	jnz	L$ccm64_enc2_loop
878.byte	102,15,56,220,209
879.byte	102,15,56,220,217
880	paddq	%xmm9,%xmm6	// advance the shuffled counter by L$increment64
// decq sets ZF for the jnz at the bottom; nothing in between modifies flags.
881	decq	%rdx
882.byte	102,15,56,221,208
883.byte	102,15,56,221,216
884
885	leaq	16(%rdi),%rdi
886	xorps	%xmm2,%xmm8	// output = input ^ encrypted counter
887	movdqa	%xmm6,%xmm2	// next counter (shuffled form)
888	movups	%xmm8,(%rsi)
889.byte	102,15,56,0,215
890	leaq	16(%rsi),%rsi
891	jnz	L$ccm64_enc_outer
892
// Write the running state back and zero the data/key registers.
893	pxor	%xmm0,%xmm0
894	pxor	%xmm1,%xmm1
895	pxor	%xmm2,%xmm2
896	movups	%xmm3,(%r9)
897	pxor	%xmm3,%xmm3
898	pxor	%xmm8,%xmm8
899	pxor	%xmm6,%xmm6
900	.byte	0xf3,0xc3
901
// _aesni_ccm64_decrypt_blocks -- counterpart of _aesni_ccm64_encrypt_blocks.
// Only the forward cipher is used (every AES opcode here is aesenc or
// aesenclast). The first counter value is encrypted on its own; after that,
// each pass absorbs the block just recovered into the running state while
// encrypting the next counter value in the second lane.
//
// Encodings: 102,15,56,220,M = aesenc, 102,15,56,221,M = aesenclast
//   (M = 209/217 -> %xmm1 into %xmm2/%xmm3, 208/216 -> %xmm0 into %xmm2/%xmm3);
//   102,15,56,0,247 = pshufb %xmm7,%xmm6; 102,15,56,0,215 = pshufb %xmm7,%xmm2;
//   0xf3,0xc3 = rep ret.
//
// In:    %rdi = input, %rsi = output, %rdx = number of 16-byte blocks
//        %rcx = key schedule (32-bit loop count at offset 240)
//        %r8  = 16-byte counter block (read only)
//        %r9  = 16-byte running state, read on entry and written back on exit
//               (presumably the CCM CBC-MAC value -- confirm against the caller)
// Clobb: %rax, %rcx, %rdx, %rdi, %rsi, %r10, %r11, %xmm0-%xmm3, %xmm6-%xmm9, flags
//
// L$increment64 and L$bswap_mask are constants defined elsewhere in the file;
// they are read with movdqa and therefore must be 16-byte aligned.
// One block is always processed before the count is tested, so a count of
// zero is not handled here.
902.globl	_aesni_ccm64_decrypt_blocks
903.private_extern _aesni_ccm64_decrypt_blocks
904
905.p2align	4
906_aesni_ccm64_decrypt_blocks:
907	movl	240(%rcx),%eax	// loop count n
908	movups	(%r8),%xmm6	// counter block
909	movdqu	(%r9),%xmm3	// running state
910	movdqa	L$increment64(%rip),%xmm9
911	movdqa	L$bswap_mask(%rip),%xmm7
912
913	movaps	%xmm6,%xmm2	// first counter value, as supplied
914	movl	%eax,%r10d	// keep n; eax is consumed by the loop below
915	movq	%rcx,%r11	// keep key pointer
916.byte	102,15,56,0,247
// Encrypt the first counter value (single lane, one aesenc per iteration).
917	movups	(%rcx),%xmm0
918	movups	16(%rcx),%xmm1
919	leaq	32(%rcx),%rcx
920	xorps	%xmm0,%xmm2
921L$oop_enc1_5:
922.byte	102,15,56,220,209
923	decl	%eax
924	movups	(%rcx),%xmm1
925	leaq	16(%rcx),%rcx
926	jnz	L$oop_enc1_5
927.byte	102,15,56,221,209
// Set up the two-lane round loop: rcx = key + 32 + 16*n, r10 = 16 - 16*n.
928	shll	$4,%r10d
929	movl	$16,%eax
930	movups	(%rdi),%xmm8	// first input block
931	paddq	%xmm9,%xmm6	// advance the shuffled counter
932	leaq	16(%rdi),%rdi
933	subq	%r10,%rax
934	leaq	32(%r11,%r10,1),%rcx
935	movq	%rax,%r10
936	jmp	L$ccm64_dec_outer
937.p2align	4
938L$ccm64_dec_outer:
939	xorps	%xmm2,%xmm8	// output = input ^ encrypted counter
940	movdqa	%xmm6,%xmm2	// next counter (shuffled form)
941	movups	%xmm8,(%rsi)
942	leaq	16(%rsi),%rsi
943.byte	102,15,56,0,215
944
945	subq	$1,%rdx
946	jz	L$ccm64_dec_break	// last block: only the state update remains
947
948	movups	(%r11),%xmm0	// round key 0
949	movq	%r10,%rax
950	movups	16(%r11),%xmm1	// round key 1
951	xorps	%xmm0,%xmm8	// recovered block ^ round key 0
952	xorps	%xmm0,%xmm2	// whiten the counter value
953	xorps	%xmm8,%xmm3	// state ^= recovered block ^ round key 0
954	movups	32(%r11),%xmm0	// round key 2
955	jmp	L$ccm64_dec2_loop
956.p2align	4
// Two rounds per iteration on both lanes; flags for jnz come from addq.
957L$ccm64_dec2_loop:
958.byte	102,15,56,220,209
959.byte	102,15,56,220,217
960	movups	(%rcx,%rax,1),%xmm1
961	addq	$32,%rax
962.byte	102,15,56,220,208
963.byte	102,15,56,220,216
964	movups	-16(%rcx,%rax,1),%xmm0
965	jnz	L$ccm64_dec2_loop
966	movups	(%rdi),%xmm8	// next input block
967	paddq	%xmm9,%xmm6	// advance the shuffled counter
968.byte	102,15,56,220,209
969.byte	102,15,56,220,217
970.byte	102,15,56,221,208
971.byte	102,15,56,221,216
972	leaq	16(%rdi),%rdi
973	jmp	L$ccm64_dec_outer
974
975.p2align	4
// Final state update: absorb the last recovered block and encrypt the state
// alone (single lane on %xmm3), walking the schedule through %r11.
976L$ccm64_dec_break:
977
978	movl	240(%r11),%eax
979	movups	(%r11),%xmm0
980	movups	16(%r11),%xmm1
981	xorps	%xmm0,%xmm8
982	leaq	32(%r11),%r11
983	xorps	%xmm8,%xmm3
984L$oop_enc1_6:
985.byte	102,15,56,220,217
986	decl	%eax
987	movups	(%r11),%xmm1
988	leaq	16(%r11),%r11
989	jnz	L$oop_enc1_6
990.byte	102,15,56,221,217
// Write the running state back and zero the data/key registers.
991	pxor	%xmm0,%xmm0
992	pxor	%xmm1,%xmm1
993	pxor	%xmm2,%xmm2
994	movups	%xmm3,(%r9)
995	pxor	%xmm3,%xmm3
996	pxor	%xmm8,%xmm8
997	pxor	%xmm6,%xmm6
998	.byte	0xf3,0xc3
999
1000.globl	_aesni_ctr32_encrypt_blocks
1001.private_extern _aesni_ctr32_encrypt_blocks
1002
1003.p2align	4
1004_aesni_ctr32_encrypt_blocks:
1005	cmpq	$1,%rdx
1006	jne	L$ctr32_bulk
1007
1008
1009
1010	movups	(%r8),%xmm2
1011	movups	(%rdi),%xmm3
1012	movl	240(%rcx),%edx
1013	movups	(%rcx),%xmm0
1014	movups	16(%rcx),%xmm1
1015	leaq	32(%rcx),%rcx
1016	xorps	%xmm0,%xmm2
1017L$oop_enc1_7:
1018.byte	102,15,56,220,209
1019	decl	%edx
1020	movups	(%rcx),%xmm1
1021	leaq	16(%rcx),%rcx
1022	jnz	L$oop_enc1_7
1023.byte	102,15,56,221,209
1024	pxor	%xmm0,%xmm0
1025	pxor	%xmm1,%xmm1
1026	xorps	%xmm3,%xmm2
1027	pxor	%xmm3,%xmm3
1028	movups	%xmm2,(%rsi)
1029	xorps	%xmm2,%xmm2
1030	jmp	L$ctr32_epilogue
1031
1032.p2align	4
1033L$ctr32_bulk:
1034	leaq	(%rsp),%r11
1035	pushq	%rbp
1036	subq	$128,%rsp
1037	andq	$-16,%rsp
1038
1039
1040
1041
1042	movdqu	(%r8),%xmm2
1043	movdqu	(%rcx),%xmm0
1044	movl	12(%r8),%r8d
1045	pxor	%xmm0,%xmm2
1046	movl	12(%rcx),%ebp
1047	movdqa	%xmm2,0(%rsp)
1048	bswapl	%r8d
1049	movdqa	%xmm2,%xmm3
1050	movdqa	%xmm2,%xmm4
1051	movdqa	%xmm2,%xmm5
1052	movdqa	%xmm2,64(%rsp)
1053	movdqa	%xmm2,80(%rsp)
1054	movdqa	%xmm2,96(%rsp)
1055	movq	%rdx,%r10
1056	movdqa	%xmm2,112(%rsp)
1057
1058	leaq	1(%r8),%rax
1059	leaq	2(%r8),%rdx
1060	bswapl	%eax
1061	bswapl	%edx
1062	xorl	%ebp,%eax
1063	xorl	%ebp,%edx
1064.byte	102,15,58,34,216,3
1065	leaq	3(%r8),%rax
1066	movdqa	%xmm3,16(%rsp)
1067.byte	102,15,58,34,226,3
1068	bswapl	%eax
1069	movq	%r10,%rdx
1070	leaq	4(%r8),%r10
1071	movdqa	%xmm4,32(%rsp)
1072	xorl	%ebp,%eax
1073	bswapl	%r10d
1074.byte	102,15,58,34,232,3
1075	xorl	%ebp,%r10d
1076	movdqa	%xmm5,48(%rsp)
1077	leaq	5(%r8),%r9
1078	movl	%r10d,64+12(%rsp)
1079	bswapl	%r9d
1080	leaq	6(%r8),%r10
1081	movl	240(%rcx),%eax
1082	xorl	%ebp,%r9d
1083	bswapl	%r10d
1084	movl	%r9d,80+12(%rsp)
1085	xorl	%ebp,%r10d
1086	leaq	7(%r8),%r9
1087	movl	%r10d,96+12(%rsp)
1088	bswapl	%r9d
1089	movl	_OPENSSL_ia32cap_P+4(%rip),%r10d
1090	xorl	%ebp,%r9d
1091	andl	$71303168,%r10d
1092	movl	%r9d,112+12(%rsp)
1093
1094	movups	16(%rcx),%xmm1
1095
1096	movdqa	64(%rsp),%xmm6
1097	movdqa	80(%rsp),%xmm7
1098
1099	cmpq	$8,%rdx
1100	jb	L$ctr32_tail
1101
1102	subq	$6,%rdx
1103	cmpl	$4194304,%r10d
1104	je	L$ctr32_6x
1105
1106	leaq	128(%rcx),%rcx
1107	subq	$2,%rdx
1108	jmp	L$ctr32_loop8
1109
1110.p2align	4
1111L$ctr32_6x:
1112	shll	$4,%eax
1113	movl	$48,%r10d
1114	bswapl	%ebp
1115	leaq	32(%rcx,%rax,1),%rcx
1116	subq	%rax,%r10
1117	jmp	L$ctr32_loop6
1118
1119.p2align	4
1120L$ctr32_loop6:
1121	addl	$6,%r8d
1122	movups	-48(%rcx,%r10,1),%xmm0
1123.byte	102,15,56,220,209
1124	movl	%r8d,%eax
1125	xorl	%ebp,%eax
1126.byte	102,15,56,220,217
1127.byte	0x0f,0x38,0xf1,0x44,0x24,12
1128	leal	1(%r8),%eax
1129.byte	102,15,56,220,225
1130	xorl	%ebp,%eax
1131.byte	0x0f,0x38,0xf1,0x44,0x24,28
1132.byte	102,15,56,220,233
1133	leal	2(%r8),%eax
1134	xorl	%ebp,%eax
1135.byte	102,15,56,220,241
1136.byte	0x0f,0x38,0xf1,0x44,0x24,44
1137	leal	3(%r8),%eax
1138.byte	102,15,56,220,249
1139	movups	-32(%rcx,%r10,1),%xmm1
1140	xorl	%ebp,%eax
1141
1142.byte	102,15,56,220,208
1143.byte	0x0f,0x38,0xf1,0x44,0x24,60
1144	leal	4(%r8),%eax
1145.byte	102,15,56,220,216
1146	xorl	%ebp,%eax
1147.byte	0x0f,0x38,0xf1,0x44,0x24,76
1148.byte	102,15,56,220,224
1149	leal	5(%r8),%eax
1150	xorl	%ebp,%eax
1151.byte	102,15,56,220,232
1152.byte	0x0f,0x38,0xf1,0x44,0x24,92
1153	movq	%r10,%rax
1154.byte	102,15,56,220,240
1155.byte	102,15,56,220,248
1156	movups	-16(%rcx,%r10,1),%xmm0
1157
1158	call	L$enc_loop6
1159
1160	movdqu	(%rdi),%xmm8
1161	movdqu	16(%rdi),%xmm9
1162	movdqu	32(%rdi),%xmm10
1163	movdqu	48(%rdi),%xmm11
1164	movdqu	64(%rdi),%xmm12
1165	movdqu	80(%rdi),%xmm13
1166	leaq	96(%rdi),%rdi
1167	movups	-64(%rcx,%r10,1),%xmm1
1168	pxor	%xmm2,%xmm8
1169	movaps	0(%rsp),%xmm2
1170	pxor	%xmm3,%xmm9
1171	movaps	16(%rsp),%xmm3
1172	pxor	%xmm4,%xmm10
1173	movaps	32(%rsp),%xmm4
1174	pxor	%xmm5,%xmm11
1175	movaps	48(%rsp),%xmm5
1176	pxor	%xmm6,%xmm12
1177	movaps	64(%rsp),%xmm6
1178	pxor	%xmm7,%xmm13
1179	movaps	80(%rsp),%xmm7
1180	movdqu	%xmm8,(%rsi)
1181	movdqu	%xmm9,16(%rsi)
1182	movdqu	%xmm10,32(%rsi)
1183	movdqu	%xmm11,48(%rsi)
1184	movdqu	%xmm12,64(%rsi)
1185	movdqu	%xmm13,80(%rsi)
1186	leaq	96(%rsi),%rsi
1187
1188	subq	$6,%rdx
1189	jnc	L$ctr32_loop6
1190
1191	addq	$6,%rdx
1192	jz	L$ctr32_done
1193
1194	leal	-48(%r10),%eax
1195	leaq	-80(%rcx,%r10,1),%rcx
1196	negl	%eax
1197	shrl	$4,%eax
1198	jmp	L$ctr32_tail
1199
1200.p2align	5
1201L$ctr32_loop8:
1202	addl	$8,%r8d
1203	movdqa	96(%rsp),%xmm8
1204.byte	102,15,56,220,209
1205	movl	%r8d,%r9d
1206	movdqa	112(%rsp),%xmm9
1207.byte	102,15,56,220,217
1208	bswapl	%r9d
1209	movups	32-128(%rcx),%xmm0
1210.byte	102,15,56,220,225
1211	xorl	%ebp,%r9d
1212	nop
1213.byte	102,15,56,220,233
1214	movl	%r9d,0+12(%rsp)
1215	leaq	1(%r8),%r9
1216.byte	102,15,56,220,241
1217.byte	102,15,56,220,249
1218.byte	102,68,15,56,220,193
1219.byte	102,68,15,56,220,201
1220	movups	48-128(%rcx),%xmm1
1221	bswapl	%r9d
1222.byte	102,15,56,220,208
1223.byte	102,15,56,220,216
1224	xorl	%ebp,%r9d
1225.byte	0x66,0x90
1226.byte	102,15,56,220,224
1227.byte	102,15,56,220,232
1228	movl	%r9d,16+12(%rsp)
1229	leaq	2(%r8),%r9
1230.byte	102,15,56,220,240
1231.byte	102,15,56,220,248
1232.byte	102,68,15,56,220,192
1233.byte	102,68,15,56,220,200
1234	movups	64-128(%rcx),%xmm0
1235	bswapl	%r9d
1236.byte	102,15,56,220,209
1237.byte	102,15,56,220,217
1238	xorl	%ebp,%r9d
1239.byte	0x66,0x90
1240.byte	102,15,56,220,225
1241.byte	102,15,56,220,233
1242	movl	%r9d,32+12(%rsp)
1243	leaq	3(%r8),%r9
1244.byte	102,15,56,220,241
1245.byte	102,15,56,220,249
1246.byte	102,68,15,56,220,193
1247.byte	102,68,15,56,220,201
1248	movups	80-128(%rcx),%xmm1
1249	bswapl	%r9d
1250.byte	102,15,56,220,208
1251.byte	102,15,56,220,216
1252	xorl	%ebp,%r9d
1253.byte	0x66,0x90
1254.byte	102,15,56,220,224
1255.byte	102,15,56,220,232
1256	movl	%r9d,48+12(%rsp)
1257	leaq	4(%r8),%r9
1258.byte	102,15,56,220,240
1259.byte	102,15,56,220,248
1260.byte	102,68,15,56,220,192
1261.byte	102,68,15,56,220,200
1262	movups	96-128(%rcx),%xmm0
1263	bswapl	%r9d
1264.byte	102,15,56,220,209
1265.byte	102,15,56,220,217
1266	xorl	%ebp,%r9d
1267.byte	0x66,0x90
1268.byte	102,15,56,220,225
1269.byte	102,15,56,220,233
1270	movl	%r9d,64+12(%rsp)
1271	leaq	5(%r8),%r9
1272.byte	102,15,56,220,241
1273.byte	102,15,56,220,249
1274.byte	102,68,15,56,220,193
1275.byte	102,68,15,56,220,201
1276	movups	112-128(%rcx),%xmm1
1277	bswapl	%r9d
1278.byte	102,15,56,220,208
1279.byte	102,15,56,220,216
1280	xorl	%ebp,%r9d
1281.byte	0x66,0x90
1282.byte	102,15,56,220,224
1283.byte	102,15,56,220,232
1284	movl	%r9d,80+12(%rsp)
1285	leaq	6(%r8),%r9
1286.byte	102,15,56,220,240
1287.byte	102,15,56,220,248
1288.byte	102,68,15,56,220,192
1289.byte	102,68,15,56,220,200
1290	movups	128-128(%rcx),%xmm0
1291	bswapl	%r9d
1292.byte	102,15,56,220,209
1293.byte	102,15,56,220,217
1294	xorl	%ebp,%r9d
1295.byte	0x66,0x90
1296.byte	102,15,56,220,225
1297.byte	102,15,56,220,233
1298	movl	%r9d,96+12(%rsp)
1299	leaq	7(%r8),%r9
1300.byte	102,15,56,220,241
1301.byte	102,15,56,220,249
1302.byte	102,68,15,56,220,193
1303.byte	102,68,15,56,220,201
1304	movups	144-128(%rcx),%xmm1
1305	bswapl	%r9d
1306.byte	102,15,56,220,208
1307.byte	102,15,56,220,216
1308.byte	102,15,56,220,224
1309	xorl	%ebp,%r9d
1310	movdqu	0(%rdi),%xmm10
1311.byte	102,15,56,220,232
1312	movl	%r9d,112+12(%rsp)
1313	cmpl	$11,%eax
1314.byte	102,15,56,220,240
1315.byte	102,15,56,220,248
1316.byte	102,68,15,56,220,192
1317.byte	102,68,15,56,220,200
1318	movups	160-128(%rcx),%xmm0
1319
1320	jb	L$ctr32_enc_done
1321
1322.byte	102,15,56,220,209
1323.byte	102,15,56,220,217
1324.byte	102,15,56,220,225
1325.byte	102,15,56,220,233
1326.byte	102,15,56,220,241
1327.byte	102,15,56,220,249
1328.byte	102,68,15,56,220,193
1329.byte	102,68,15,56,220,201
1330	movups	176-128(%rcx),%xmm1
1331
1332.byte	102,15,56,220,208
1333.byte	102,15,56,220,216
1334.byte	102,15,56,220,224
1335.byte	102,15,56,220,232
1336.byte	102,15,56,220,240
1337.byte	102,15,56,220,248
1338.byte	102,68,15,56,220,192
1339.byte	102,68,15,56,220,200
1340	movups	192-128(%rcx),%xmm0
1341	je	L$ctr32_enc_done
1342
1343.byte	102,15,56,220,209
1344.byte	102,15,56,220,217
1345.byte	102,15,56,220,225
1346.byte	102,15,56,220,233
1347.byte	102,15,56,220,241
1348.byte	102,15,56,220,249
1349.byte	102,68,15,56,220,193
1350.byte	102,68,15,56,220,201
1351	movups	208-128(%rcx),%xmm1
1352
1353.byte	102,15,56,220,208
1354.byte	102,15,56,220,216
1355.byte	102,15,56,220,224
1356.byte	102,15,56,220,232
1357.byte	102,15,56,220,240
1358.byte	102,15,56,220,248
1359.byte	102,68,15,56,220,192
1360.byte	102,68,15,56,220,200
1361	movups	224-128(%rcx),%xmm0
1362	jmp	L$ctr32_enc_done
1363
1364.p2align	4
1365L$ctr32_enc_done:
1366	movdqu	16(%rdi),%xmm11
1367	pxor	%xmm0,%xmm10
1368	movdqu	32(%rdi),%xmm12
1369	pxor	%xmm0,%xmm11
1370	movdqu	48(%rdi),%xmm13
1371	pxor	%xmm0,%xmm12
1372	movdqu	64(%rdi),%xmm14
1373	pxor	%xmm0,%xmm13
1374	movdqu	80(%rdi),%xmm15
1375	pxor	%xmm0,%xmm14
1376	pxor	%xmm0,%xmm15
1377.byte	102,15,56,220,209
1378.byte	102,15,56,220,217
1379.byte	102,15,56,220,225
1380.byte	102,15,56,220,233
1381.byte	102,15,56,220,241
1382.byte	102,15,56,220,249
1383.byte	102,68,15,56,220,193
1384.byte	102,68,15,56,220,201
1385	movdqu	96(%rdi),%xmm1
1386	leaq	128(%rdi),%rdi
1387
1388.byte	102,65,15,56,221,210
1389	pxor	%xmm0,%xmm1
1390	movdqu	112-128(%rdi),%xmm10
1391.byte	102,65,15,56,221,219
1392	pxor	%xmm0,%xmm10
1393	movdqa	0(%rsp),%xmm11
1394.byte	102,65,15,56,221,228
1395.byte	102,65,15,56,221,237
1396	movdqa	16(%rsp),%xmm12
1397	movdqa	32(%rsp),%xmm13
1398.byte	102,65,15,56,221,246
1399.byte	102,65,15,56,221,255
1400	movdqa	48(%rsp),%xmm14
1401	movdqa	64(%rsp),%xmm15
1402.byte	102,68,15,56,221,193
1403	movdqa	80(%rsp),%xmm0
1404	movups	16-128(%rcx),%xmm1
1405.byte	102,69,15,56,221,202
1406
1407	movups	%xmm2,(%rsi)
1408	movdqa	%xmm11,%xmm2
1409	movups	%xmm3,16(%rsi)
1410	movdqa	%xmm12,%xmm3
1411	movups	%xmm4,32(%rsi)
1412	movdqa	%xmm13,%xmm4
1413	movups	%xmm5,48(%rsi)
1414	movdqa	%xmm14,%xmm5
1415	movups	%xmm6,64(%rsi)
1416	movdqa	%xmm15,%xmm6
1417	movups	%xmm7,80(%rsi)
1418	movdqa	%xmm0,%xmm7
1419	movups	%xmm8,96(%rsi)
1420	movups	%xmm9,112(%rsi)
1421	leaq	128(%rsi),%rsi
1422
1423	subq	$8,%rdx
1424	jnc	L$ctr32_loop8
1425
1426	addq	$8,%rdx
1427	jz	L$ctr32_done
1428	leaq	-128(%rcx),%rcx
1429
1430L$ctr32_tail:
1431
1432
1433	leaq	16(%rcx),%rcx
1434	cmpq	$4,%rdx
1435	jb	L$ctr32_loop3
1436	je	L$ctr32_loop4
1437
1438
1439	shll	$4,%eax
1440	movdqa	96(%rsp),%xmm8
1441	pxor	%xmm9,%xmm9
1442
1443	movups	16(%rcx),%xmm0
1444.byte	102,15,56,220,209
1445.byte	102,15,56,220,217
1446	leaq	32-16(%rcx,%rax,1),%rcx
1447	negq	%rax
1448.byte	102,15,56,220,225
1449	addq	$16,%rax
1450	movups	(%rdi),%xmm10
1451.byte	102,15,56,220,233
1452.byte	102,15,56,220,241
1453	movups	16(%rdi),%xmm11
1454	movups	32(%rdi),%xmm12
1455.byte	102,15,56,220,249
1456.byte	102,68,15,56,220,193
1457
1458	call	L$enc_loop8_enter
1459
1460	movdqu	48(%rdi),%xmm13
1461	pxor	%xmm10,%xmm2
1462	movdqu	64(%rdi),%xmm10
1463	pxor	%xmm11,%xmm3
1464	movdqu	%xmm2,(%rsi)
1465	pxor	%xmm12,%xmm4
1466	movdqu	%xmm3,16(%rsi)
1467	pxor	%xmm13,%xmm5
1468	movdqu	%xmm4,32(%rsi)
1469	pxor	%xmm10,%xmm6
1470	movdqu	%xmm5,48(%rsi)
1471	movdqu	%xmm6,64(%rsi)
1472	cmpq	$6,%rdx
1473	jb	L$ctr32_done
1474
1475	movups	80(%rdi),%xmm11
1476	xorps	%xmm11,%xmm7
1477	movups	%xmm7,80(%rsi)
1478	je	L$ctr32_done
1479
1480	movups	96(%rdi),%xmm12
1481	xorps	%xmm12,%xmm8
1482	movups	%xmm8,96(%rsi)
1483	jmp	L$ctr32_done
1484
1485.p2align	5
1486L$ctr32_loop4:
1487.byte	102,15,56,220,209
1488	leaq	16(%rcx),%rcx
1489	decl	%eax
1490.byte	102,15,56,220,217
1491.byte	102,15,56,220,225
1492.byte	102,15,56,220,233
1493	movups	(%rcx),%xmm1
1494	jnz	L$ctr32_loop4
1495.byte	102,15,56,221,209
1496.byte	102,15,56,221,217
1497	movups	(%rdi),%xmm10
1498	movups	16(%rdi),%xmm11
1499.byte	102,15,56,221,225
1500.byte	102,15,56,221,233
1501	movups	32(%rdi),%xmm12
1502	movups	48(%rdi),%xmm13
1503
1504	xorps	%xmm10,%xmm2
1505	movups	%xmm2,(%rsi)
1506	xorps	%xmm11,%xmm3
1507	movups	%xmm3,16(%rsi)
1508	pxor	%xmm12,%xmm4
1509	movdqu	%xmm4,32(%rsi)
1510	pxor	%xmm13,%xmm5
1511	movdqu	%xmm5,48(%rsi)
1512	jmp	L$ctr32_done
1513
1514.p2align	5
1515L$ctr32_loop3:
1516.byte	102,15,56,220,209
1517	leaq	16(%rcx),%rcx
1518	decl	%eax
1519.byte	102,15,56,220,217
1520.byte	102,15,56,220,225
1521	movups	(%rcx),%xmm1
1522	jnz	L$ctr32_loop3
1523.byte	102,15,56,221,209
1524.byte	102,15,56,221,217
1525.byte	102,15,56,221,225
1526
1527	movups	(%rdi),%xmm10
1528	xorps	%xmm10,%xmm2
1529	movups	%xmm2,(%rsi)
1530	cmpq	$2,%rdx
1531	jb	L$ctr32_done
1532
1533	movups	16(%rdi),%xmm11
1534	xorps	%xmm11,%xmm3
1535	movups	%xmm3,16(%rsi)
1536	je	L$ctr32_done
1537
1538	movups	32(%rdi),%xmm12
1539	xorps	%xmm12,%xmm4
1540	movups	%xmm4,32(%rsi)
1541
1542L$ctr32_done:
1543	xorps	%xmm0,%xmm0
1544	xorl	%ebp,%ebp
1545	pxor	%xmm1,%xmm1
1546	pxor	%xmm2,%xmm2
1547	pxor	%xmm3,%xmm3
1548	pxor	%xmm4,%xmm4
1549	pxor	%xmm5,%xmm5
1550	pxor	%xmm6,%xmm6
1551	pxor	%xmm7,%xmm7
1552	movaps	%xmm0,0(%rsp)
1553	pxor	%xmm8,%xmm8
1554	movaps	%xmm0,16(%rsp)
1555	pxor	%xmm9,%xmm9
1556	movaps	%xmm0,32(%rsp)
1557	pxor	%xmm10,%xmm10
1558	movaps	%xmm0,48(%rsp)
1559	pxor	%xmm11,%xmm11
1560	movaps	%xmm0,64(%rsp)
1561	pxor	%xmm12,%xmm12
1562	movaps	%xmm0,80(%rsp)
1563	pxor	%xmm13,%xmm13
1564	movaps	%xmm0,96(%rsp)
1565	pxor	%xmm14,%xmm14
1566	movaps	%xmm0,112(%rsp)
1567	pxor	%xmm15,%xmm15
1568	movq	-8(%r11),%rbp
1569	leaq	(%r11),%rsp
1570L$ctr32_epilogue:
1571	.byte	0xf3,0xc3
1572
/*
 * _aesni_xts_encrypt
 *
 * Encrypts a buffer in 16-byte blocks. Every block is XORed with a
 * per-block tweak before and after the AES rounds. A trailing partial
 * block is handled at L$xts_enc_steal by swapping bytes with the last
 * full output block and encrypting that block once more.
 *
 * Registers as used by the code (the C-level argument meaning is inferred,
 * presumably in, out, length, key1, key2, iv -- TODO confirm against the
 * prototype):
 *   %rdi  input pointer, advanced as blocks are consumed
 *   %rsi  output pointer, advanced as blocks are produced
 *   %rdx  length in bytes
 *   %rcx  key schedule for the data blocks; round count read at offset 240
 *   %r8   key schedule used once, to encrypt the 16 bytes at (%r9)
 *   %r9   pointer to the 16 bytes the first tweak is derived from
 *
 * The .byte sequences are hand-encoded AES-NI instructions:
 *   102,15,56,220,modrm = aesenc      102,15,56,221,modrm = aesenclast
 * and .byte 0xf3,0xc3 is "rep ret".
 *
 * Frame: %r11 keeps the entry %rsp, %rbp is pushed, and a 16-byte aligned
 * scratch area of at least 112 bytes is reserved. The scratch area and all
 * xmm registers are zeroed before returning.
 */
1573.globl	_aesni_xts_encrypt
1574.private_extern _aesni_xts_encrypt
1575
1576.p2align	4
1577_aesni_xts_encrypt:
	/* Frame setup: remember the entry %rsp, save %rbp, carve out scratch. */
1578	leaq	(%rsp),%r11
1579	pushq	%rbp
1580	subq	$112,%rsp
1581	andq	$-16,%rsp
	/* Encrypt the 16 bytes at (%r9) with the %r8 key schedule; the result
	   in %xmm2 is the first tweak. %r10d picks up the round count of the
	   data key along the way. */
1582	movups	(%r9),%xmm2
1583	movl	240(%r8),%eax
1584	movl	240(%rcx),%r10d
1585	movups	(%r8),%xmm0
1586	movups	16(%r8),%xmm1
1587	leaq	32(%r8),%r8
1588	xorps	%xmm0,%xmm2
	/* One aesenc per pass, %eax passes, then a single aesenclast. */
1589L$oop_enc1_8:
1590.byte	102,15,56,220,209
1591	decl	%eax
1592	movups	(%r8),%xmm1
1593	leaq	16(%r8),%r8
1594	jnz	L$oop_enc1_8
1595.byte	102,15,56,221,209
	/* %xmm0 = round key 0 of the data key, %rbp = key schedule base,
	   %eax = round count, %r10 = 16 * round count,
	   %r9 = byte length as passed, %rdx = length rounded down to 16. */
1596	movups	(%rcx),%xmm0
1597	movq	%rcx,%rbp
1598	movl	%r10d,%eax
1599	shll	$4,%r10d
1600	movq	%rdx,%r9
1601	andq	$-16,%rdx
1602
	/* %xmm1 = key at offset 16 + 16*rounds (the one aesenclast consumes).
	   It is XORed with round key 0 below and parked at 96(%rsp). */
1603	movups	16(%rcx,%r10,1),%xmm1
1604
	/* Derive five tweaks into %xmm10..%xmm14, each pre-XORed with round
	   key 0 (%xmm0), and leave the following tweak, not XORed, in %xmm15.
	   Each step doubles both 64-bit halves with paddq and folds the
	   shifted-out bits back in through a mask built from psrad $31 on the
	   shuffled copy in %xmm9 and the constant L$xts_magic (defined outside
	   this chunk; presumably the GF(2^128) reduction constant -- confirm
	   at its definition). */
1605	movdqa	L$xts_magic(%rip),%xmm8
1606	movdqa	%xmm2,%xmm15
1607	pshufd	$0x5f,%xmm2,%xmm9
1608	pxor	%xmm0,%xmm1
1609	movdqa	%xmm9,%xmm14
1610	paddd	%xmm9,%xmm9
1611	movdqa	%xmm15,%xmm10
1612	psrad	$31,%xmm14
1613	paddq	%xmm15,%xmm15
1614	pand	%xmm8,%xmm14
1615	pxor	%xmm0,%xmm10
1616	pxor	%xmm14,%xmm15
1617	movdqa	%xmm9,%xmm14
1618	paddd	%xmm9,%xmm9
1619	movdqa	%xmm15,%xmm11
1620	psrad	$31,%xmm14
1621	paddq	%xmm15,%xmm15
1622	pand	%xmm8,%xmm14
1623	pxor	%xmm0,%xmm11
1624	pxor	%xmm14,%xmm15
1625	movdqa	%xmm9,%xmm14
1626	paddd	%xmm9,%xmm9
1627	movdqa	%xmm15,%xmm12
1628	psrad	$31,%xmm14
1629	paddq	%xmm15,%xmm15
1630	pand	%xmm8,%xmm14
1631	pxor	%xmm0,%xmm12
1632	pxor	%xmm14,%xmm15
1633	movdqa	%xmm9,%xmm14
1634	paddd	%xmm9,%xmm9
1635	movdqa	%xmm15,%xmm13
1636	psrad	$31,%xmm14
1637	paddq	%xmm15,%xmm15
1638	pand	%xmm8,%xmm14
1639	pxor	%xmm0,%xmm13
1640	pxor	%xmm14,%xmm15
1641	movdqa	%xmm15,%xmm14
1642	psrad	$31,%xmm9
1643	paddq	%xmm15,%xmm15
1644	pand	%xmm8,%xmm9
1645	pxor	%xmm0,%xmm14
1646	pxor	%xmm9,%xmm15
	/* 96(%rsp) = round key 0 ^ last round key. */
1647	movaps	%xmm1,96(%rsp)
1648
	/* Fewer than six whole blocks: go straight to the tail code. */
1649	subq	$96,%rdx
1650	jc	L$xts_enc_short
1651
	/* %rcx = key + 32 + 16*rounds; %rax = %r10 = 112 - 16*rounds is the
	   index L$xts_enc_loop6 steps by 32 until it reaches zero.
	   %xmm1 = round key 1, %r8 = address of L$xts_magic. */
1652	movl	$16+96,%eax
1653	leaq	32(%rbp,%r10,1),%rcx
1654	subq	%r10,%rax
1655	movups	16(%rbp),%xmm1
1656	movq	%rax,%r10
1657	leaq	L$xts_magic(%rip),%r8
1658	jmp	L$xts_enc_grandloop
1659
	/* Six blocks per iteration. Loads and the tweak XOR are interleaved
	   with the first AES rounds. Each tweak (already XORed with round
	   key 0) is also XORed with 96(%rsp), which turns it into
	   tweak ^ last round key, and is stored at 0..80(%rsp) to serve as the
	   memory operand of aesenclast further down. The sixth tweak is built
	   in %xmm8 from %xmm0 and %xmm15. */
1660.p2align	5
1661L$xts_enc_grandloop:
1662	movdqu	0(%rdi),%xmm2
1663	movdqa	%xmm0,%xmm8
1664	movdqu	16(%rdi),%xmm3
1665	pxor	%xmm10,%xmm2
1666	movdqu	32(%rdi),%xmm4
1667	pxor	%xmm11,%xmm3
1668.byte	102,15,56,220,209
1669	movdqu	48(%rdi),%xmm5
1670	pxor	%xmm12,%xmm4
1671.byte	102,15,56,220,217
1672	movdqu	64(%rdi),%xmm6
1673	pxor	%xmm13,%xmm5
1674.byte	102,15,56,220,225
1675	movdqu	80(%rdi),%xmm7
1676	pxor	%xmm15,%xmm8
1677	movdqa	96(%rsp),%xmm9
1678	pxor	%xmm14,%xmm6
1679.byte	102,15,56,220,233
1680	movups	32(%rbp),%xmm0
1681	leaq	96(%rdi),%rdi
1682	pxor	%xmm8,%xmm7
1683
1684	pxor	%xmm9,%xmm10
1685.byte	102,15,56,220,241
1686	pxor	%xmm9,%xmm11
1687	movdqa	%xmm10,0(%rsp)
1688.byte	102,15,56,220,249
1689	movups	48(%rbp),%xmm1
1690	pxor	%xmm9,%xmm12
1691
1692.byte	102,15,56,220,208
1693	pxor	%xmm9,%xmm13
1694	movdqa	%xmm11,16(%rsp)
1695.byte	102,15,56,220,216
1696	pxor	%xmm9,%xmm14
1697	movdqa	%xmm12,32(%rsp)
1698.byte	102,15,56,220,224
1699.byte	102,15,56,220,232
1700	pxor	%xmm9,%xmm8
1701	movdqa	%xmm14,64(%rsp)
1702.byte	102,15,56,220,240
1703.byte	102,15,56,220,248
1704	movups	64(%rbp),%xmm0
1705	movdqa	%xmm8,80(%rsp)
1706	pshufd	$0x5f,%xmm15,%xmm9
1707	jmp	L$xts_enc_loop6
	/* Middle rounds, two round keys per pass, addressed off %rcx by %rax.
	   jnz tests the flags left by addq; the movups in between does not
	   modify flags. */
1708.p2align	5
1709L$xts_enc_loop6:
1710.byte	102,15,56,220,209
1711.byte	102,15,56,220,217
1712.byte	102,15,56,220,225
1713.byte	102,15,56,220,233
1714.byte	102,15,56,220,241
1715.byte	102,15,56,220,249
1716	movups	-64(%rcx,%rax,1),%xmm1
1717	addq	$32,%rax
1718
1719.byte	102,15,56,220,208
1720.byte	102,15,56,220,216
1721.byte	102,15,56,220,224
1722.byte	102,15,56,220,232
1723.byte	102,15,56,220,240
1724.byte	102,15,56,220,248
1725	movups	-80(%rcx,%rax,1),%xmm0
1726	jnz	L$xts_enc_loop6
1727
	/* Remaining aesenc rounds, interleaved with generating the tweaks for
	   the next iteration: %xmm10..%xmm14 = tweak ^ round key 0 (round
	   key 0 reloaded from (%rbp)), %xmm15 = the tweak after those. */
1728	movdqa	(%r8),%xmm8
1729	movdqa	%xmm9,%xmm14
1730	paddd	%xmm9,%xmm9
1731.byte	102,15,56,220,209
1732	paddq	%xmm15,%xmm15
1733	psrad	$31,%xmm14
1734.byte	102,15,56,220,217
1735	pand	%xmm8,%xmm14
1736	movups	(%rbp),%xmm10
1737.byte	102,15,56,220,225
1738.byte	102,15,56,220,233
1739.byte	102,15,56,220,241
1740	pxor	%xmm14,%xmm15
1741	movaps	%xmm10,%xmm11
1742.byte	102,15,56,220,249
1743	movups	-64(%rcx),%xmm1
1744
1745	movdqa	%xmm9,%xmm14
1746.byte	102,15,56,220,208
1747	paddd	%xmm9,%xmm9
1748	pxor	%xmm15,%xmm10
1749.byte	102,15,56,220,216
1750	psrad	$31,%xmm14
1751	paddq	%xmm15,%xmm15
1752.byte	102,15,56,220,224
1753.byte	102,15,56,220,232
1754	pand	%xmm8,%xmm14
1755	movaps	%xmm11,%xmm12
1756.byte	102,15,56,220,240
1757	pxor	%xmm14,%xmm15
1758	movdqa	%xmm9,%xmm14
1759.byte	102,15,56,220,248
1760	movups	-48(%rcx),%xmm0
1761
1762	paddd	%xmm9,%xmm9
1763.byte	102,15,56,220,209
1764	pxor	%xmm15,%xmm11
1765	psrad	$31,%xmm14
1766.byte	102,15,56,220,217
1767	paddq	%xmm15,%xmm15
1768	pand	%xmm8,%xmm14
1769.byte	102,15,56,220,225
1770.byte	102,15,56,220,233
	/* Fourth tweak of the current batch (already ^ last round key) goes to
	   its stack slot only now, just before %xmm13 is reused. */
1771	movdqa	%xmm13,48(%rsp)
1772	pxor	%xmm14,%xmm15
1773.byte	102,15,56,220,241
1774	movaps	%xmm12,%xmm13
1775	movdqa	%xmm9,%xmm14
1776.byte	102,15,56,220,249
1777	movups	-32(%rcx),%xmm1
1778
1779	paddd	%xmm9,%xmm9
1780.byte	102,15,56,220,208
1781	pxor	%xmm15,%xmm12
1782	psrad	$31,%xmm14
1783.byte	102,15,56,220,216
1784	paddq	%xmm15,%xmm15
1785	pand	%xmm8,%xmm14
1786.byte	102,15,56,220,224
1787.byte	102,15,56,220,232
1788.byte	102,15,56,220,240
1789	pxor	%xmm14,%xmm15
1790	movaps	%xmm13,%xmm14
1791.byte	102,15,56,220,248
1792
1793	movdqa	%xmm9,%xmm0
1794	paddd	%xmm9,%xmm9
1795.byte	102,15,56,220,209
1796	pxor	%xmm15,%xmm13
1797	psrad	$31,%xmm0
1798.byte	102,15,56,220,217
1799	paddq	%xmm15,%xmm15
1800	pand	%xmm8,%xmm0
1801.byte	102,15,56,220,225
1802.byte	102,15,56,220,233
1803	pxor	%xmm0,%xmm15
	/* %xmm0 / %xmm1 = round keys 0 and 1 again, ready for the next pass. */
1804	movups	(%rbp),%xmm0
1805.byte	102,15,56,220,241
1806.byte	102,15,56,220,249
1807	movups	16(%rbp),%xmm1
1808
	/* aesenclast with the stack slots as operand: last round key and the
	   output tweak are applied in one step. %rax is reset from %r10. */
1809	pxor	%xmm15,%xmm14
1810.byte	102,15,56,221,84,36,0
1811	psrad	$31,%xmm9
1812	paddq	%xmm15,%xmm15
1813.byte	102,15,56,221,92,36,16
1814.byte	102,15,56,221,100,36,32
1815	pand	%xmm8,%xmm9
1816	movq	%r10,%rax
1817.byte	102,15,56,221,108,36,48
1818.byte	102,15,56,221,116,36,64
1819.byte	102,15,56,221,124,36,80
1820	pxor	%xmm9,%xmm15
1821
	/* Store six output blocks; loop while at least 96 bytes remain. */
1822	leaq	96(%rsi),%rsi
1823	movups	%xmm2,-96(%rsi)
1824	movups	%xmm3,-80(%rsi)
1825	movups	%xmm4,-64(%rsi)
1826	movups	%xmm5,-48(%rsi)
1827	movups	%xmm6,-32(%rsi)
1828	movups	%xmm7,-16(%rsi)
1829	subq	$96,%rdx
1830	jnc	L$xts_enc_grandloop
1831
	/* Recover the round count: %eax = (112 - %r10d) >> 4, and point %rcx
	   back at the start of the key schedule. */
1832	movl	$16+96,%eax
1833	subl	%r10d,%eax
1834	movq	%rbp,%rcx
1835	shrl	$4,%eax
1836
	/* Tail: zero to five whole blocks remain. %xmm0 holds round key 0 on
	   both paths into here, so each pxor below strips it from a tweak.
	   The pxor instructions sit between cmpq and the conditional jumps
	   because they leave the flags untouched. */
1837L$xts_enc_short:
1838
	/* %r10d = round count, kept for the stealing code. */
1839	movl	%eax,%r10d
1840	pxor	%xmm0,%xmm10
1841	addq	$96,%rdx
1842	jz	L$xts_enc_done
1843
1844	pxor	%xmm0,%xmm11
1845	cmpq	$0x20,%rdx
1846	jb	L$xts_enc_one
1847	pxor	%xmm0,%xmm12
1848	je	L$xts_enc_two
1849
1850	pxor	%xmm0,%xmm13
1851	cmpq	$0x40,%rdx
1852	jb	L$xts_enc_three
1853	pxor	%xmm0,%xmm14
1854	je	L$xts_enc_four
1855
	/* Five blocks: run them through _aesni_encrypt6 (defined outside this
	   chunk) with %xmm7 zeroed as the unused sixth lane. */
1856	movdqu	(%rdi),%xmm2
1857	movdqu	16(%rdi),%xmm3
1858	movdqu	32(%rdi),%xmm4
1859	pxor	%xmm10,%xmm2
1860	movdqu	48(%rdi),%xmm5
1861	pxor	%xmm11,%xmm3
1862	movdqu	64(%rdi),%xmm6
1863	leaq	80(%rdi),%rdi
1864	pxor	%xmm12,%xmm4
1865	pxor	%xmm13,%xmm5
1866	pxor	%xmm14,%xmm6
1867	pxor	%xmm7,%xmm7
1868
1869	call	_aesni_encrypt6
1870
	/* Output tweak XOR; %xmm10 = next unused tweak for L$xts_enc_done. */
1871	xorps	%xmm10,%xmm2
1872	movdqa	%xmm15,%xmm10
1873	xorps	%xmm11,%xmm3
1874	xorps	%xmm12,%xmm4
1875	movdqu	%xmm2,(%rsi)
1876	xorps	%xmm13,%xmm5
1877	movdqu	%xmm3,16(%rsi)
1878	xorps	%xmm14,%xmm6
1879	movdqu	%xmm4,32(%rsi)
1880	movdqu	%xmm5,48(%rsi)
1881	movdqu	%xmm6,64(%rsi)
1882	leaq	80(%rsi),%rsi
1883	jmp	L$xts_enc_done
1884
	/* One block: inline single-block encryption (%rcx = key, %eax = round
	   count). %xmm10 = next unused tweak afterwards. */
1885.p2align	4
1886L$xts_enc_one:
1887	movups	(%rdi),%xmm2
1888	leaq	16(%rdi),%rdi
1889	xorps	%xmm10,%xmm2
1890	movups	(%rcx),%xmm0
1891	movups	16(%rcx),%xmm1
1892	leaq	32(%rcx),%rcx
1893	xorps	%xmm0,%xmm2
1894L$oop_enc1_9:
1895.byte	102,15,56,220,209
1896	decl	%eax
1897	movups	(%rcx),%xmm1
1898	leaq	16(%rcx),%rcx
1899	jnz	L$oop_enc1_9
1900.byte	102,15,56,221,209
1901	xorps	%xmm10,%xmm2
1902	movdqa	%xmm11,%xmm10
1903	movups	%xmm2,(%rsi)
1904	leaq	16(%rsi),%rsi
1905	jmp	L$xts_enc_done
1906
	/* Two blocks via _aesni_encrypt2; %xmm10 = next unused tweak. */
1907.p2align	4
1908L$xts_enc_two:
1909	movups	(%rdi),%xmm2
1910	movups	16(%rdi),%xmm3
1911	leaq	32(%rdi),%rdi
1912	xorps	%xmm10,%xmm2
1913	xorps	%xmm11,%xmm3
1914
1915	call	_aesni_encrypt2
1916
1917	xorps	%xmm10,%xmm2
1918	movdqa	%xmm12,%xmm10
1919	xorps	%xmm11,%xmm3
1920	movups	%xmm2,(%rsi)
1921	movups	%xmm3,16(%rsi)
1922	leaq	32(%rsi),%rsi
1923	jmp	L$xts_enc_done
1924
	/* Three blocks via _aesni_encrypt3 (defined outside this chunk). */
1925.p2align	4
1926L$xts_enc_three:
1927	movups	(%rdi),%xmm2
1928	movups	16(%rdi),%xmm3
1929	movups	32(%rdi),%xmm4
1930	leaq	48(%rdi),%rdi
1931	xorps	%xmm10,%xmm2
1932	xorps	%xmm11,%xmm3
1933	xorps	%xmm12,%xmm4
1934
1935	call	_aesni_encrypt3
1936
1937	xorps	%xmm10,%xmm2
1938	movdqa	%xmm13,%xmm10
1939	xorps	%xmm11,%xmm3
1940	xorps	%xmm12,%xmm4
1941	movups	%xmm2,(%rsi)
1942	movups	%xmm3,16(%rsi)
1943	movups	%xmm4,32(%rsi)
1944	leaq	48(%rsi),%rsi
1945	jmp	L$xts_enc_done
1946
	/* Four blocks via _aesni_encrypt4 (defined outside this chunk). */
1947.p2align	4
1948L$xts_enc_four:
1949	movups	(%rdi),%xmm2
1950	movups	16(%rdi),%xmm3
1951	movups	32(%rdi),%xmm4
1952	xorps	%xmm10,%xmm2
1953	movups	48(%rdi),%xmm5
1954	leaq	64(%rdi),%rdi
1955	xorps	%xmm11,%xmm3
1956	xorps	%xmm12,%xmm4
1957	xorps	%xmm13,%xmm5
1958
1959	call	_aesni_encrypt4
1960
1961	pxor	%xmm10,%xmm2
1962	movdqa	%xmm14,%xmm10
1963	pxor	%xmm11,%xmm3
1964	pxor	%xmm12,%xmm4
1965	movdqu	%xmm2,(%rsi)
1966	pxor	%xmm13,%xmm5
1967	movdqu	%xmm3,16(%rsi)
1968	movdqu	%xmm4,32(%rsi)
1969	movdqu	%xmm5,48(%rsi)
1970	leaq	64(%rsi),%rsi
1971	jmp	L$xts_enc_done
1972
	/* All whole blocks are written. %r9 = length mod 16; zero means done. */
1973.p2align	4
1974L$xts_enc_done:
1975	andq	$15,%r9
1976	jz	L$xts_enc_ret
1977	movq	%r9,%rdx
1978
	/* For each trailing input byte: the byte at the same position of the
	   last output block (-16(%rsi)) moves out to the partial output block
	   at (%rsi), and the input byte takes its place. */
1979L$xts_enc_steal:
1980	movzbl	(%rdi),%eax
1981	movzbl	-16(%rsi),%ecx
1982	leaq	1(%rdi),%rdi
1983	movb	%al,-16(%rsi)
1984	movb	%cl,0(%rsi)
1985	leaq	1(%rsi),%rsi
1986	subq	$1,%rdx
1987	jnz	L$xts_enc_steal
1988
	/* Rewind %rsi by the tail length, restore key pointer and round count,
	   then encrypt the patched block at -16(%rsi) in place using the
	   tweak in %xmm10. */
1989	subq	%r9,%rsi
1990	movq	%rbp,%rcx
1991	movl	%r10d,%eax
1992
1993	movups	-16(%rsi),%xmm2
1994	xorps	%xmm10,%xmm2
1995	movups	(%rcx),%xmm0
1996	movups	16(%rcx),%xmm1
1997	leaq	32(%rcx),%rcx
1998	xorps	%xmm0,%xmm2
1999L$oop_enc1_10:
2000.byte	102,15,56,220,209
2001	decl	%eax
2002	movups	(%rcx),%xmm1
2003	leaq	16(%rcx),%rcx
2004	jnz	L$oop_enc1_10
2005.byte	102,15,56,221,209
2006	xorps	%xmm10,%xmm2
2007	movups	%xmm2,-16(%rsi)
2008
	/* Zero every xmm register and the 112 bytes of scratch at 0..111(%rsp),
	   then restore %rbp and %rsp from the frame anchored at %r11. */
2009L$xts_enc_ret:
2010	xorps	%xmm0,%xmm0
2011	pxor	%xmm1,%xmm1
2012	pxor	%xmm2,%xmm2
2013	pxor	%xmm3,%xmm3
2014	pxor	%xmm4,%xmm4
2015	pxor	%xmm5,%xmm5
2016	pxor	%xmm6,%xmm6
2017	pxor	%xmm7,%xmm7
2018	movaps	%xmm0,0(%rsp)
2019	pxor	%xmm8,%xmm8
2020	movaps	%xmm0,16(%rsp)
2021	pxor	%xmm9,%xmm9
2022	movaps	%xmm0,32(%rsp)
2023	pxor	%xmm10,%xmm10
2024	movaps	%xmm0,48(%rsp)
2025	pxor	%xmm11,%xmm11
2026	movaps	%xmm0,64(%rsp)
2027	pxor	%xmm12,%xmm12
2028	movaps	%xmm0,80(%rsp)
2029	pxor	%xmm13,%xmm13
2030	movaps	%xmm0,96(%rsp)
2031	pxor	%xmm14,%xmm14
2032	pxor	%xmm15,%xmm15
2033	movq	-8(%r11),%rbp
2034	leaq	(%r11),%rsp
2035L$xts_enc_epilogue:
2036	.byte	0xf3,0xc3
2037
/*
 * _aesni_xts_decrypt
 *
 * Decrypts a buffer in 16-byte blocks. Every block is XORed with a
 * per-block tweak before and after the AES rounds. When the length is not
 * a multiple of 16, one whole block is held back and processed together
 * with the partial tail at L$xts_dec_done2 / L$xts_dec_steal.
 *
 * Registers as used by the code (the C-level argument meaning is inferred,
 * presumably in, out, length, key1, key2, iv -- TODO confirm against the
 * prototype):
 *   %rdi  input pointer, advanced as blocks are consumed
 *   %rsi  output pointer, advanced as blocks are produced
 *   %rdx  length in bytes
 *   %rcx  key schedule for the data blocks; round count read at offset 240
 *   %r8   key schedule used once, to ENCRYPT the 16 bytes at (%r9)
 *   %r9   pointer to the 16 bytes the first tweak is derived from
 *
 * The .byte sequences are hand-encoded AES-NI instructions:
 *   102,15,56,220,modrm = aesenc      102,15,56,221,modrm = aesenclast
 *   102,15,56,222,modrm = aesdec      102,15,56,223,modrm = aesdeclast
 * and .byte 0xf3,0xc3 is "rep ret".
 *
 * Frame: %r11 keeps the entry %rsp, %rbp is pushed, and a 16-byte aligned
 * scratch area of at least 112 bytes is reserved. The scratch area and all
 * xmm registers are zeroed before returning.
 */
2038.globl	_aesni_xts_decrypt
2039.private_extern _aesni_xts_decrypt
2040
2041.p2align	4
2042_aesni_xts_decrypt:
	/* Frame setup: remember the entry %rsp, save %rbp, carve out scratch. */
2043	leaq	(%rsp),%r11
2044	pushq	%rbp
2045	subq	$112,%rsp
2046	andq	$-16,%rsp
	/* Encrypt (aesenc, not aesdec) the 16 bytes at (%r9) with the %r8 key
	   schedule; the result in %xmm2 is the first tweak. %r10d picks up
	   the round count of the data key along the way. */
2047	movups	(%r9),%xmm2
2048	movl	240(%r8),%eax
2049	movl	240(%rcx),%r10d
2050	movups	(%r8),%xmm0
2051	movups	16(%r8),%xmm1
2052	leaq	32(%r8),%r8
2053	xorps	%xmm0,%xmm2
2054L$oop_enc1_11:
2055.byte	102,15,56,220,209
2056	decl	%eax
2057	movups	(%r8),%xmm1
2058	leaq	16(%r8),%r8
2059	jnz	L$oop_enc1_11
2060.byte	102,15,56,221,209
	/* %rax = 16 if the length is not a multiple of 16, else 0; subtracting
	   it holds one whole block back for the stealing code. */
2061	xorl	%eax,%eax
2062	testq	$15,%rdx
2063	setnz	%al
2064	shlq	$4,%rax
2065	subq	%rax,%rdx
2066
	/* %xmm0 = round key 0 of the data key, %rbp = key schedule base,
	   %eax = round count, %r10 = 16 * round count,
	   %r9 = adjusted byte length, %rdx = that length rounded down to 16. */
2067	movups	(%rcx),%xmm0
2068	movq	%rcx,%rbp
2069	movl	%r10d,%eax
2070	shll	$4,%r10d
2071	movq	%rdx,%r9
2072	andq	$-16,%rdx
2073
	/* %xmm1 = key at offset 16 + 16*rounds (the one aesdeclast consumes).
	   It is XORed with round key 0 below and parked at 96(%rsp). */
2074	movups	16(%rcx,%r10,1),%xmm1
2075
	/* Derive five tweaks into %xmm10..%xmm14, each pre-XORed with round
	   key 0 (%xmm0), and leave the following tweak, not XORed, in %xmm15.
	   Each step doubles both 64-bit halves with paddq and folds the
	   shifted-out bits back in through a mask built from psrad $31 on the
	   shuffled copy in %xmm9 and the constant L$xts_magic (defined outside
	   this chunk; presumably the GF(2^128) reduction constant -- confirm
	   at its definition). */
2076	movdqa	L$xts_magic(%rip),%xmm8
2077	movdqa	%xmm2,%xmm15
2078	pshufd	$0x5f,%xmm2,%xmm9
2079	pxor	%xmm0,%xmm1
2080	movdqa	%xmm9,%xmm14
2081	paddd	%xmm9,%xmm9
2082	movdqa	%xmm15,%xmm10
2083	psrad	$31,%xmm14
2084	paddq	%xmm15,%xmm15
2085	pand	%xmm8,%xmm14
2086	pxor	%xmm0,%xmm10
2087	pxor	%xmm14,%xmm15
2088	movdqa	%xmm9,%xmm14
2089	paddd	%xmm9,%xmm9
2090	movdqa	%xmm15,%xmm11
2091	psrad	$31,%xmm14
2092	paddq	%xmm15,%xmm15
2093	pand	%xmm8,%xmm14
2094	pxor	%xmm0,%xmm11
2095	pxor	%xmm14,%xmm15
2096	movdqa	%xmm9,%xmm14
2097	paddd	%xmm9,%xmm9
2098	movdqa	%xmm15,%xmm12
2099	psrad	$31,%xmm14
2100	paddq	%xmm15,%xmm15
2101	pand	%xmm8,%xmm14
2102	pxor	%xmm0,%xmm12
2103	pxor	%xmm14,%xmm15
2104	movdqa	%xmm9,%xmm14
2105	paddd	%xmm9,%xmm9
2106	movdqa	%xmm15,%xmm13
2107	psrad	$31,%xmm14
2108	paddq	%xmm15,%xmm15
2109	pand	%xmm8,%xmm14
2110	pxor	%xmm0,%xmm13
2111	pxor	%xmm14,%xmm15
2112	movdqa	%xmm15,%xmm14
2113	psrad	$31,%xmm9
2114	paddq	%xmm15,%xmm15
2115	pand	%xmm8,%xmm9
2116	pxor	%xmm0,%xmm14
2117	pxor	%xmm9,%xmm15
	/* 96(%rsp) = round key 0 ^ last round key. */
2118	movaps	%xmm1,96(%rsp)
2119
	/* Fewer than six whole blocks: go straight to the tail code. */
2120	subq	$96,%rdx
2121	jc	L$xts_dec_short
2122
	/* %rcx = key + 32 + 16*rounds; %rax = %r10 = 112 - 16*rounds is the
	   index L$xts_dec_loop6 steps by 32 until it reaches zero.
	   %xmm1 = round key 1, %r8 = address of L$xts_magic. */
2123	movl	$16+96,%eax
2124	leaq	32(%rbp,%r10,1),%rcx
2125	subq	%r10,%rax
2126	movups	16(%rbp),%xmm1
2127	movq	%rax,%r10
2128	leaq	L$xts_magic(%rip),%r8
2129	jmp	L$xts_dec_grandloop
2130
	/* Six blocks per iteration. Loads and the tweak XOR are interleaved
	   with the first AES rounds. Each tweak (already XORed with round
	   key 0) is also XORed with 96(%rsp), which turns it into
	   tweak ^ last round key, and is stored at 0..80(%rsp) to serve as the
	   memory operand of aesdeclast further down. The sixth tweak is built
	   in %xmm8 from %xmm0 and %xmm15. */
2131.p2align	5
2132L$xts_dec_grandloop:
2133	movdqu	0(%rdi),%xmm2
2134	movdqa	%xmm0,%xmm8
2135	movdqu	16(%rdi),%xmm3
2136	pxor	%xmm10,%xmm2
2137	movdqu	32(%rdi),%xmm4
2138	pxor	%xmm11,%xmm3
2139.byte	102,15,56,222,209
2140	movdqu	48(%rdi),%xmm5
2141	pxor	%xmm12,%xmm4
2142.byte	102,15,56,222,217
2143	movdqu	64(%rdi),%xmm6
2144	pxor	%xmm13,%xmm5
2145.byte	102,15,56,222,225
2146	movdqu	80(%rdi),%xmm7
2147	pxor	%xmm15,%xmm8
2148	movdqa	96(%rsp),%xmm9
2149	pxor	%xmm14,%xmm6
2150.byte	102,15,56,222,233
2151	movups	32(%rbp),%xmm0
2152	leaq	96(%rdi),%rdi
2153	pxor	%xmm8,%xmm7
2154
2155	pxor	%xmm9,%xmm10
2156.byte	102,15,56,222,241
2157	pxor	%xmm9,%xmm11
2158	movdqa	%xmm10,0(%rsp)
2159.byte	102,15,56,222,249
2160	movups	48(%rbp),%xmm1
2161	pxor	%xmm9,%xmm12
2162
2163.byte	102,15,56,222,208
2164	pxor	%xmm9,%xmm13
2165	movdqa	%xmm11,16(%rsp)
2166.byte	102,15,56,222,216
2167	pxor	%xmm9,%xmm14
2168	movdqa	%xmm12,32(%rsp)
2169.byte	102,15,56,222,224
2170.byte	102,15,56,222,232
2171	pxor	%xmm9,%xmm8
2172	movdqa	%xmm14,64(%rsp)
2173.byte	102,15,56,222,240
2174.byte	102,15,56,222,248
2175	movups	64(%rbp),%xmm0
2176	movdqa	%xmm8,80(%rsp)
2177	pshufd	$0x5f,%xmm15,%xmm9
2178	jmp	L$xts_dec_loop6
	/* Middle rounds, two round keys per pass, addressed off %rcx by %rax.
	   jnz tests the flags left by addq; the movups in between does not
	   modify flags. */
2179.p2align	5
2180L$xts_dec_loop6:
2181.byte	102,15,56,222,209
2182.byte	102,15,56,222,217
2183.byte	102,15,56,222,225
2184.byte	102,15,56,222,233
2185.byte	102,15,56,222,241
2186.byte	102,15,56,222,249
2187	movups	-64(%rcx,%rax,1),%xmm1
2188	addq	$32,%rax
2189
2190.byte	102,15,56,222,208
2191.byte	102,15,56,222,216
2192.byte	102,15,56,222,224
2193.byte	102,15,56,222,232
2194.byte	102,15,56,222,240
2195.byte	102,15,56,222,248
2196	movups	-80(%rcx,%rax,1),%xmm0
2197	jnz	L$xts_dec_loop6
2198
	/* Remaining aesdec rounds, interleaved with generating the tweaks for
	   the next iteration: %xmm10..%xmm14 = tweak ^ round key 0 (round
	   key 0 reloaded from (%rbp)), %xmm15 = the tweak after those. */
2199	movdqa	(%r8),%xmm8
2200	movdqa	%xmm9,%xmm14
2201	paddd	%xmm9,%xmm9
2202.byte	102,15,56,222,209
2203	paddq	%xmm15,%xmm15
2204	psrad	$31,%xmm14
2205.byte	102,15,56,222,217
2206	pand	%xmm8,%xmm14
2207	movups	(%rbp),%xmm10
2208.byte	102,15,56,222,225
2209.byte	102,15,56,222,233
2210.byte	102,15,56,222,241
2211	pxor	%xmm14,%xmm15
2212	movaps	%xmm10,%xmm11
2213.byte	102,15,56,222,249
2214	movups	-64(%rcx),%xmm1
2215
2216	movdqa	%xmm9,%xmm14
2217.byte	102,15,56,222,208
2218	paddd	%xmm9,%xmm9
2219	pxor	%xmm15,%xmm10
2220.byte	102,15,56,222,216
2221	psrad	$31,%xmm14
2222	paddq	%xmm15,%xmm15
2223.byte	102,15,56,222,224
2224.byte	102,15,56,222,232
2225	pand	%xmm8,%xmm14
2226	movaps	%xmm11,%xmm12
2227.byte	102,15,56,222,240
2228	pxor	%xmm14,%xmm15
2229	movdqa	%xmm9,%xmm14
2230.byte	102,15,56,222,248
2231	movups	-48(%rcx),%xmm0
2232
2233	paddd	%xmm9,%xmm9
2234.byte	102,15,56,222,209
2235	pxor	%xmm15,%xmm11
2236	psrad	$31,%xmm14
2237.byte	102,15,56,222,217
2238	paddq	%xmm15,%xmm15
2239	pand	%xmm8,%xmm14
2240.byte	102,15,56,222,225
2241.byte	102,15,56,222,233
	/* Fourth tweak of the current batch (already ^ last round key) goes to
	   its stack slot only now, just before %xmm13 is reused. */
2242	movdqa	%xmm13,48(%rsp)
2243	pxor	%xmm14,%xmm15
2244.byte	102,15,56,222,241
2245	movaps	%xmm12,%xmm13
2246	movdqa	%xmm9,%xmm14
2247.byte	102,15,56,222,249
2248	movups	-32(%rcx),%xmm1
2249
2250	paddd	%xmm9,%xmm9
2251.byte	102,15,56,222,208
2252	pxor	%xmm15,%xmm12
2253	psrad	$31,%xmm14
2254.byte	102,15,56,222,216
2255	paddq	%xmm15,%xmm15
2256	pand	%xmm8,%xmm14
2257.byte	102,15,56,222,224
2258.byte	102,15,56,222,232
2259.byte	102,15,56,222,240
2260	pxor	%xmm14,%xmm15
2261	movaps	%xmm13,%xmm14
2262.byte	102,15,56,222,248
2263
2264	movdqa	%xmm9,%xmm0
2265	paddd	%xmm9,%xmm9
2266.byte	102,15,56,222,209
2267	pxor	%xmm15,%xmm13
2268	psrad	$31,%xmm0
2269.byte	102,15,56,222,217
2270	paddq	%xmm15,%xmm15
2271	pand	%xmm8,%xmm0
2272.byte	102,15,56,222,225
2273.byte	102,15,56,222,233
2274	pxor	%xmm0,%xmm15
	/* %xmm0 / %xmm1 = round keys 0 and 1 again, ready for the next pass. */
2275	movups	(%rbp),%xmm0
2276.byte	102,15,56,222,241
2277.byte	102,15,56,222,249
2278	movups	16(%rbp),%xmm1
2279
	/* aesdeclast with the stack slots as operand: last round key and the
	   output tweak are applied in one step. %rax is reset from %r10. */
2280	pxor	%xmm15,%xmm14
2281.byte	102,15,56,223,84,36,0
2282	psrad	$31,%xmm9
2283	paddq	%xmm15,%xmm15
2284.byte	102,15,56,223,92,36,16
2285.byte	102,15,56,223,100,36,32
2286	pand	%xmm8,%xmm9
2287	movq	%r10,%rax
2288.byte	102,15,56,223,108,36,48
2289.byte	102,15,56,223,116,36,64
2290.byte	102,15,56,223,124,36,80
2291	pxor	%xmm9,%xmm15
2292
	/* Store six output blocks; loop while at least 96 bytes remain. */
2293	leaq	96(%rsi),%rsi
2294	movups	%xmm2,-96(%rsi)
2295	movups	%xmm3,-80(%rsi)
2296	movups	%xmm4,-64(%rsi)
2297	movups	%xmm5,-48(%rsi)
2298	movups	%xmm6,-32(%rsi)
2299	movups	%xmm7,-16(%rsi)
2300	subq	$96,%rdx
2301	jnc	L$xts_dec_grandloop
2302
	/* Recover the round count: %eax = (112 - %r10d) >> 4, and point %rcx
	   back at the start of the key schedule. */
2303	movl	$16+96,%eax
2304	subl	%r10d,%eax
2305	movq	%rbp,%rcx
2306	shrl	$4,%eax
2307
	/* Tail: zero to five whole blocks remain. %xmm0 holds round key 0 on
	   both paths into here, so each pxor below strips it from a tweak.
	   %xmm10 and %xmm11 are both stripped before the first branch because
	   L$xts_dec_done2 uses the two of them even when no whole block is
	   left here. The pxor instructions do not disturb the flags tested by
	   the conditional jumps. */
2308L$xts_dec_short:
2309
	/* %r10d = round count, kept for the stealing code. */
2310	movl	%eax,%r10d
2311	pxor	%xmm0,%xmm10
2312	pxor	%xmm0,%xmm11
2313	addq	$96,%rdx
2314	jz	L$xts_dec_done
2315
2316	pxor	%xmm0,%xmm12
2317	cmpq	$0x20,%rdx
2318	jb	L$xts_dec_one
2319	pxor	%xmm0,%xmm13
2320	je	L$xts_dec_two
2321
2322	pxor	%xmm0,%xmm14
2323	cmpq	$0x40,%rdx
2324	jb	L$xts_dec_three
2325	je	L$xts_dec_four
2326
	/* Five blocks via _aesni_decrypt6 (defined outside this chunk).
	   NOTE(review): unlike the encrypt path, %xmm7 is not cleared before
	   the call here; its result lane is never stored. */
2327	movdqu	(%rdi),%xmm2
2328	movdqu	16(%rdi),%xmm3
2329	movdqu	32(%rdi),%xmm4
2330	pxor	%xmm10,%xmm2
2331	movdqu	48(%rdi),%xmm5
2332	pxor	%xmm11,%xmm3
2333	movdqu	64(%rdi),%xmm6
2334	leaq	80(%rdi),%rdi
2335	pxor	%xmm12,%xmm4
2336	pxor	%xmm13,%xmm5
2337	pxor	%xmm14,%xmm6
2338
2339	call	_aesni_decrypt6
2340
	/* Output tweak XOR and stores, interleaved with the start of one more
	   tweak-doubling step: %xmm14 = per-dword "is negative" mask of
	   %xmm15 (pcmpgtd against zero), shuffled into %xmm11. */
2341	xorps	%xmm10,%xmm2
2342	xorps	%xmm11,%xmm3
2343	xorps	%xmm12,%xmm4
2344	movdqu	%xmm2,(%rsi)
2345	xorps	%xmm13,%xmm5
2346	movdqu	%xmm3,16(%rsi)
2347	xorps	%xmm14,%xmm6
2348	movdqu	%xmm4,32(%rsi)
2349	pxor	%xmm14,%xmm14
2350	movdqu	%xmm5,48(%rsi)
2351	pcmpgtd	%xmm15,%xmm14
2352	movdqu	%xmm6,64(%rsi)
2353	leaq	80(%rsi),%rsi
2354	pshufd	$0x13,%xmm14,%xmm11
2355	andq	$15,%r9
2356	jz	L$xts_dec_ret
2357
	/* Partial tail present: %xmm10 = current tweak, %xmm11 = the tweak
	   after it (doubling finished here; %xmm8 is expected to still hold
	   the L$xts_magic constant -- verify _aesni_decrypt6 preserves it). */
2358	movdqa	%xmm15,%xmm10
2359	paddq	%xmm15,%xmm15
2360	pand	%xmm8,%xmm11
2361	pxor	%xmm15,%xmm11
2362	jmp	L$xts_dec_done2
2363
	/* One block: inline single-block decryption (%rcx = key, %eax = round
	   count). Afterwards %xmm10 / %xmm11 = the next two unused tweaks. */
2364.p2align	4
2365L$xts_dec_one:
2366	movups	(%rdi),%xmm2
2367	leaq	16(%rdi),%rdi
2368	xorps	%xmm10,%xmm2
2369	movups	(%rcx),%xmm0
2370	movups	16(%rcx),%xmm1
2371	leaq	32(%rcx),%rcx
2372	xorps	%xmm0,%xmm2
2373L$oop_dec1_12:
2374.byte	102,15,56,222,209
2375	decl	%eax
2376	movups	(%rcx),%xmm1
2377	leaq	16(%rcx),%rcx
2378	jnz	L$oop_dec1_12
2379.byte	102,15,56,223,209
2380	xorps	%xmm10,%xmm2
2381	movdqa	%xmm11,%xmm10
2382	movups	%xmm2,(%rsi)
2383	movdqa	%xmm12,%xmm11
2384	leaq	16(%rsi),%rsi
2385	jmp	L$xts_dec_done
2386
	/* Two blocks via _aesni_decrypt2; %xmm10 / %xmm11 = next two tweaks. */
2387.p2align	4
2388L$xts_dec_two:
2389	movups	(%rdi),%xmm2
2390	movups	16(%rdi),%xmm3
2391	leaq	32(%rdi),%rdi
2392	xorps	%xmm10,%xmm2
2393	xorps	%xmm11,%xmm3
2394
2395	call	_aesni_decrypt2
2396
2397	xorps	%xmm10,%xmm2
2398	movdqa	%xmm12,%xmm10
2399	xorps	%xmm11,%xmm3
2400	movdqa	%xmm13,%xmm11
2401	movups	%xmm2,(%rsi)
2402	movups	%xmm3,16(%rsi)
2403	leaq	32(%rsi),%rsi
2404	jmp	L$xts_dec_done
2405
	/* Three blocks via _aesni_decrypt3 (defined outside this chunk). */
2406.p2align	4
2407L$xts_dec_three:
2408	movups	(%rdi),%xmm2
2409	movups	16(%rdi),%xmm3
2410	movups	32(%rdi),%xmm4
2411	leaq	48(%rdi),%rdi
2412	xorps	%xmm10,%xmm2
2413	xorps	%xmm11,%xmm3
2414	xorps	%xmm12,%xmm4
2415
2416	call	_aesni_decrypt3
2417
2418	xorps	%xmm10,%xmm2
2419	movdqa	%xmm13,%xmm10
2420	xorps	%xmm11,%xmm3
2421	movdqa	%xmm14,%xmm11
2422	xorps	%xmm12,%xmm4
2423	movups	%xmm2,(%rsi)
2424	movups	%xmm3,16(%rsi)
2425	movups	%xmm4,32(%rsi)
2426	leaq	48(%rsi),%rsi
2427	jmp	L$xts_dec_done
2428
	/* Four blocks via _aesni_decrypt4 (defined outside this chunk). */
2429.p2align	4
2430L$xts_dec_four:
2431	movups	(%rdi),%xmm2
2432	movups	16(%rdi),%xmm3
2433	movups	32(%rdi),%xmm4
2434	xorps	%xmm10,%xmm2
2435	movups	48(%rdi),%xmm5
2436	leaq	64(%rdi),%rdi
2437	xorps	%xmm11,%xmm3
2438	xorps	%xmm12,%xmm4
2439	xorps	%xmm13,%xmm5
2440
2441	call	_aesni_decrypt4
2442
2443	pxor	%xmm10,%xmm2
2444	movdqa	%xmm14,%xmm10
2445	pxor	%xmm11,%xmm3
2446	movdqa	%xmm15,%xmm11
2447	pxor	%xmm12,%xmm4
2448	movdqu	%xmm2,(%rsi)
2449	pxor	%xmm13,%xmm5
2450	movdqu	%xmm3,16(%rsi)
2451	movdqu	%xmm4,32(%rsi)
2452	movdqu	%xmm5,48(%rsi)
2453	leaq	64(%rsi),%rsi
2454	jmp	L$xts_dec_done
2455
	/* %r9 = length mod 16; zero means there is no partial tail. */
2456.p2align	4
2457L$xts_dec_done:
2458	andq	$15,%r9
2459	jz	L$xts_dec_ret
	/* Held-back whole block: decrypt the block at (%rdi) with the LATER
	   tweak (%xmm11) and store it at (%rsi). %rdx = tail byte count. */
2460L$xts_dec_done2:
2461	movq	%r9,%rdx
2462	movq	%rbp,%rcx
2463	movl	%r10d,%eax
2464
2465	movups	(%rdi),%xmm2
2466	xorps	%xmm11,%xmm2
2467	movups	(%rcx),%xmm0
2468	movups	16(%rcx),%xmm1
2469	leaq	32(%rcx),%rcx
2470	xorps	%xmm0,%xmm2
2471L$oop_dec1_13:
2472.byte	102,15,56,222,209
2473	decl	%eax
2474	movups	(%rcx),%xmm1
2475	leaq	16(%rcx),%rcx
2476	jnz	L$oop_dec1_13
2477.byte	102,15,56,223,209
2478	xorps	%xmm11,%xmm2
2479	movups	%xmm2,(%rsi)
2480
	/* For each trailing input byte at 16(%rdi): the byte at the same
	   position of the block just written at (%rsi) moves out to 16(%rsi),
	   and the input byte takes its place. */
2481L$xts_dec_steal:
2482	movzbl	16(%rdi),%eax
2483	movzbl	(%rsi),%ecx
2484	leaq	1(%rdi),%rdi
2485	movb	%al,(%rsi)
2486	movb	%cl,16(%rsi)
2487	leaq	1(%rsi),%rsi
2488	subq	$1,%rdx
2489	jnz	L$xts_dec_steal
2490
	/* Rewind %rsi by the tail length, restore key pointer and round count,
	   then decrypt the patched block at (%rsi) in place using the earlier
	   tweak in %xmm10. */
2491	subq	%r9,%rsi
2492	movq	%rbp,%rcx
2493	movl	%r10d,%eax
2494
2495	movups	(%rsi),%xmm2
2496	xorps	%xmm10,%xmm2
2497	movups	(%rcx),%xmm0
2498	movups	16(%rcx),%xmm1
2499	leaq	32(%rcx),%rcx
2500	xorps	%xmm0,%xmm2
2501L$oop_dec1_14:
2502.byte	102,15,56,222,209
2503	decl	%eax
2504	movups	(%rcx),%xmm1
2505	leaq	16(%rcx),%rcx
2506	jnz	L$oop_dec1_14
2507.byte	102,15,56,223,209
2508	xorps	%xmm10,%xmm2
2509	movups	%xmm2,(%rsi)
2510
	/* Zero every xmm register and the 112 bytes of scratch at 0..111(%rsp),
	   then restore %rbp and %rsp from the frame anchored at %r11. */
2511L$xts_dec_ret:
2512	xorps	%xmm0,%xmm0
2513	pxor	%xmm1,%xmm1
2514	pxor	%xmm2,%xmm2
2515	pxor	%xmm3,%xmm3
2516	pxor	%xmm4,%xmm4
2517	pxor	%xmm5,%xmm5
2518	pxor	%xmm6,%xmm6
2519	pxor	%xmm7,%xmm7
2520	movaps	%xmm0,0(%rsp)
2521	pxor	%xmm8,%xmm8
2522	movaps	%xmm0,16(%rsp)
2523	pxor	%xmm9,%xmm9
2524	movaps	%xmm0,32(%rsp)
2525	pxor	%xmm10,%xmm10
2526	movaps	%xmm0,48(%rsp)
2527	pxor	%xmm11,%xmm11
2528	movaps	%xmm0,64(%rsp)
2529	pxor	%xmm12,%xmm12
2530	movaps	%xmm0,80(%rsp)
2531	pxor	%xmm13,%xmm13
2532	movaps	%xmm0,96(%rsp)
2533	pxor	%xmm14,%xmm14
2534	pxor	%xmm15,%xmm15
2535	movq	-8(%r11),%rbp
2536	leaq	(%r11),%rsp
2537L$xts_dec_epilogue:
2538	.byte	0xf3,0xc3
2539
/*
 * _aesni_ocb_encrypt
 *
 * Encrypts %rdx 16-byte blocks from (%rdi) to (%rsi). The per-block work
 * is done by the local helpers __ocb_encrypt1/4/6, which XOR a running
 * offset into each block around the AES rounds and fold every input block
 * into a running checksum (%xmm8).
 *
 * Registers as used by the code (the C-level argument meaning is inferred
 * -- TODO confirm against the prototype):
 *   %rdi  input pointer
 *   %rsi  output pointer
 *   %rdx  number of 16-byte blocks (decremented by 1 and by 6 below)
 *   %rcx  key schedule; round count read at offset 240
 *   %r8   block counter; bsf of it and of later values selects table rows
 *   %r9   pointer to the 16-byte running offset: read on entry, written
 *         back on exit
 *   first stack argument  -> %rbx: base of a table of 16-byte entries
 *   second stack argument -> %rbp: pointer to the 16-byte checksum: read
 *         on entry, written back on exit
 *
 * State prepared for the helpers:
 *   %r11 = key schedule base, %rcx = key + 32 + 16*rounds,
 *   %rax = %r10 = 48 - 16*rounds (round-key index stepped up to zero),
 *   %xmm1 = round key 1, %xmm9 = round key 0 ^ last round key,
 *   %xmm15 = offset ^ last round key, %xmm10 = table entry at (%rbx),
 *   %xmm8 = checksum, %r12/%r13/%r14 = table byte offsets.
 *
 * Saves and restores %rbx, %rbp, %r12, %r13, %r14; zeroes all xmm
 * registers before returning. .byte 0xf3,0xc3 is "rep ret".
 */
2540.globl	_aesni_ocb_encrypt
2541.private_extern _aesni_ocb_encrypt
2542
2543.p2align	5
2544_aesni_ocb_encrypt:
	/* %rax = entry %rsp, so 8(%rax) and 16(%rax) are the first two stack
	   slots above the return address. */
2545	leaq	(%rsp),%rax
2546	pushq	%rbx
2547	pushq	%rbp
2548	pushq	%r12
2549	pushq	%r13
2550	pushq	%r14
2551	movq	8(%rax),%rbx
2552	movq	8+8(%rax),%rbp
2553
	/* %r10 = 16 * round count; %xmm9 = round key 0; %xmm1 = key at
	   offset 16 + 16*rounds (last round key). */
2554	movl	240(%rcx),%r10d
2555	movq	%rcx,%r11
2556	shll	$4,%r10d
2557	movups	(%rcx),%xmm9
2558	movups	16(%rcx,%r10,1),%xmm1
2559
	/* Fold the last round key into both the round-0 key and the offset so
	   the helpers can apply whitening and offsets with single XORs. */
2560	movdqu	(%r9),%xmm15
2561	pxor	%xmm1,%xmm9
2562	pxor	%xmm1,%xmm15
2563
2564	movl	$16+32,%eax
2565	leaq	32(%r11,%r10,1),%rcx
2566	movups	16(%r11),%xmm1
2567	subq	%r10,%rax
2568	movq	%rax,%r10
2569
2570	movdqu	(%rbx),%xmm10
2571	movdqu	(%rbp),%xmm8
2572
	/* If the block counter is even, process one block on its own so the
	   multi-block code always starts at an odd counter value. */
2573	testq	$1,%r8
2574	jnz	L$ocb_enc_odd
2575
	/* Table row = bsf(counter), scaled by 16 bytes per entry. */
2576	bsfq	%r8,%r12
2577	addq	$1,%r8
2578	shlq	$4,%r12
2579	movdqu	(%rbx,%r12,1),%xmm7
2580	movdqu	(%rdi),%xmm2
2581	leaq	16(%rdi),%rdi
2582
2583	call	__ocb_encrypt1
2584
	/* %xmm7 comes back as the updated offset state. */
2585	movdqa	%xmm7,%xmm15
2586	movups	%xmm2,(%rsi)
2587	leaq	16(%rsi),%rsi
2588	subq	$1,%rdx
2589	jz	L$ocb_enc_done
2590
	/* Pre-compute the three table offsets the next helper call needs:
	   bsf(counter+1), bsf(counter+3), bsf(counter+5), each times 16.
	   The counter itself is advanced by 6 up front. */
2591L$ocb_enc_odd:
2592	leaq	1(%r8),%r12
2593	leaq	3(%r8),%r13
2594	leaq	5(%r8),%r14
2595	leaq	6(%r8),%r8
2596	bsfq	%r12,%r12
2597	bsfq	%r13,%r13
2598	bsfq	%r14,%r14
2599	shlq	$4,%r12
2600	shlq	$4,%r13
2601	shlq	$4,%r14
2602
	/* Fewer than six blocks left: handle them in the tail code. */
2603	subq	$6,%rdx
2604	jc	L$ocb_enc_short
2605	jmp	L$ocb_enc_grandloop
2606
	/* Main loop: six blocks per iteration through __ocb_encrypt6. */
2607.p2align	5
2608L$ocb_enc_grandloop:
2609	movdqu	0(%rdi),%xmm2
2610	movdqu	16(%rdi),%xmm3
2611	movdqu	32(%rdi),%xmm4
2612	movdqu	48(%rdi),%xmm5
2613	movdqu	64(%rdi),%xmm6
2614	movdqu	80(%rdi),%xmm7
2615	leaq	96(%rdi),%rdi
2616
2617	call	__ocb_encrypt6
2618
2619	movups	%xmm2,0(%rsi)
2620	movups	%xmm3,16(%rsi)
2621	movups	%xmm4,32(%rsi)
2622	movups	%xmm5,48(%rsi)
2623	movups	%xmm6,64(%rsi)
2624	movups	%xmm7,80(%rsi)
2625	leaq	96(%rsi),%rsi
2626	subq	$6,%rdx
2627	jnc	L$ocb_enc_grandloop
2628
	/* Tail: one to five blocks (zero goes straight to done). Input blocks
	   are loaded between the compare and the conditional jumps; movdqu
	   does not modify flags. Unused lanes are zeroed before the helper
	   call, so they contribute nothing to the checksum XOR. After each
	   helper, %xmm15 is set to the offset register of the last real
	   block. */
2629L$ocb_enc_short:
2630	addq	$6,%rdx
2631	jz	L$ocb_enc_done
2632
2633	movdqu	0(%rdi),%xmm2
2634	cmpq	$2,%rdx
2635	jb	L$ocb_enc_one
2636	movdqu	16(%rdi),%xmm3
2637	je	L$ocb_enc_two
2638
2639	movdqu	32(%rdi),%xmm4
2640	cmpq	$4,%rdx
2641	jb	L$ocb_enc_three
2642	movdqu	48(%rdi),%xmm5
2643	je	L$ocb_enc_four
2644
	/* Five blocks: six-lane helper with a zero sixth lane. */
2645	movdqu	64(%rdi),%xmm6
2646	pxor	%xmm7,%xmm7
2647
2648	call	__ocb_encrypt6
2649
2650	movdqa	%xmm14,%xmm15
2651	movups	%xmm2,0(%rsi)
2652	movups	%xmm3,16(%rsi)
2653	movups	%xmm4,32(%rsi)
2654	movups	%xmm5,48(%rsi)
2655	movups	%xmm6,64(%rsi)
2656
2657	jmp	L$ocb_enc_done
2658
	/* One block: %xmm7 = table entry 0 (held in %xmm10). */
2659.p2align	4
2660L$ocb_enc_one:
2661	movdqa	%xmm10,%xmm7
2662
2663	call	__ocb_encrypt1
2664
2665	movdqa	%xmm7,%xmm15
2666	movups	%xmm2,0(%rsi)
2667	jmp	L$ocb_enc_done
2668
	/* Two blocks: four-lane helper with lanes 3 and 4 zeroed. */
2669.p2align	4
2670L$ocb_enc_two:
2671	pxor	%xmm4,%xmm4
2672	pxor	%xmm5,%xmm5
2673
2674	call	__ocb_encrypt4
2675
2676	movdqa	%xmm11,%xmm15
2677	movups	%xmm2,0(%rsi)
2678	movups	%xmm3,16(%rsi)
2679
2680	jmp	L$ocb_enc_done
2681
	/* Three blocks: four-lane helper with lane 4 zeroed. */
2682.p2align	4
2683L$ocb_enc_three:
2684	pxor	%xmm5,%xmm5
2685
2686	call	__ocb_encrypt4
2687
2688	movdqa	%xmm12,%xmm15
2689	movups	%xmm2,0(%rsi)
2690	movups	%xmm3,16(%rsi)
2691	movups	%xmm4,32(%rsi)
2692
2693	jmp	L$ocb_enc_done
2694
	/* Four blocks. */
2695.p2align	4
2696L$ocb_enc_four:
2697	call	__ocb_encrypt4
2698
2699	movdqa	%xmm13,%xmm15
2700	movups	%xmm2,0(%rsi)
2701	movups	%xmm3,16(%rsi)
2702	movups	%xmm4,32(%rsi)
2703	movups	%xmm5,48(%rsi)
2704
	/* XOR %xmm0 out of the offset before writing it back. After
	   __ocb_encrypt4/6, %xmm0 holds the key at -16(%rcx), i.e. the last
	   round key that was folded in at entry (presumably the same holds
	   after __ocb_encrypt1 -- its tail is outside this chunk). Then store
	   the checksum and the offset. */
2705L$ocb_enc_done:
2706	pxor	%xmm0,%xmm15
2707	movdqu	%xmm8,(%rbp)
2708	movdqu	%xmm15,(%r9)
2709
	/* Zero all xmm registers. */
2710	xorps	%xmm0,%xmm0
2711	pxor	%xmm1,%xmm1
2712	pxor	%xmm2,%xmm2
2713	pxor	%xmm3,%xmm3
2714	pxor	%xmm4,%xmm4
2715	pxor	%xmm5,%xmm5
2716	pxor	%xmm6,%xmm6
2717	pxor	%xmm7,%xmm7
2718	pxor	%xmm8,%xmm8
2719	pxor	%xmm9,%xmm9
2720	pxor	%xmm10,%xmm10
2721	pxor	%xmm11,%xmm11
2722	pxor	%xmm12,%xmm12
2723	pxor	%xmm13,%xmm13
2724	pxor	%xmm14,%xmm14
2725	pxor	%xmm15,%xmm15
	/* %rax = address just above the five saved registers; reload them in
	   reverse push order and drop the frame. */
2726	leaq	40(%rsp),%rax
2727	movq	-40(%rax),%r14
2728	movq	-32(%rax),%r13
2729	movq	-24(%rax),%r12
2730	movq	-16(%rax),%rbp
2731	movq	-8(%rax),%rbx
2732	leaq	(%rax),%rsp
2733L$ocb_enc_epilogue:
2734	.byte	0xf3,0xc3
2735
2736
2737
/*
 * __ocb_encrypt6 -- local helper: six blocks per call.
 *
 * In (register state as prepared by _aesni_ocb_encrypt):
 *   %xmm2..%xmm7    input blocks
 *   %xmm8           running checksum
 *   %xmm9           round key 0 ^ last round key
 *   %xmm10          table entry at (%rbx)
 *   %xmm15          current offset ^ last round key
 *   %xmm1           round key at 16(%r11)
 *   %rbx            table base; %r12/%r13/%r14 byte offsets into it
 *   %r11            key schedule base; %rcx/%rax pointer and index for
 *                   the middle round keys
 *   %r8             block counter
 * Out:
 *   %xmm2..%xmm7    encrypted blocks
 *   %xmm8           checksum XORed with all six input blocks
 *   %xmm11..%xmm15  offsets of blocks 2..6, each ^ last round key
 *   %xmm10          reloaded from (%rbx); %xmm1 reloaded from 16(%r11)
 *   %xmm0           key at -16(%rcx)
 *   %r12/%r13/%r14  table offsets for the next call; %r8 advanced by 6
 *   %rax            reset from %r10
 *
 * .byte 102,15,56,220,modrm = aesenc; 102,65,15,56,221,modrm = aesenclast
 * with an %xmm8+ source register; .byte 0xf3,0xc3 = "rep ret".
 */
2738.p2align	5
2739__ocb_encrypt6:
	/* %xmm15 ^= %xmm9 swaps the folded-in last round key for round key 0.
	   From there each block offset is the previous one XORed with a table
	   entry: alternately the entry at (%rbx) (copies of %xmm10) and the
	   entries at %r12, %r13, %r14. Every input block is XORed into the
	   checksum first and then with its offset. */
2740	pxor	%xmm9,%xmm15
2741	movdqu	(%rbx,%r12,1),%xmm11
2742	movdqa	%xmm10,%xmm12
2743	movdqu	(%rbx,%r13,1),%xmm13
2744	movdqa	%xmm10,%xmm14
2745	pxor	%xmm15,%xmm10
2746	movdqu	(%rbx,%r14,1),%xmm15
2747	pxor	%xmm10,%xmm11
2748	pxor	%xmm2,%xmm8
2749	pxor	%xmm10,%xmm2
2750	pxor	%xmm11,%xmm12
2751	pxor	%xmm3,%xmm8
2752	pxor	%xmm11,%xmm3
2753	pxor	%xmm12,%xmm13
2754	pxor	%xmm4,%xmm8
2755	pxor	%xmm12,%xmm4
2756	pxor	%xmm13,%xmm14
2757	pxor	%xmm5,%xmm8
2758	pxor	%xmm13,%xmm5
2759	pxor	%xmm14,%xmm15
2760	pxor	%xmm6,%xmm8
2761	pxor	%xmm14,%xmm6
2762	pxor	%xmm7,%xmm8
2763	pxor	%xmm15,%xmm7
2764	movups	32(%r11),%xmm0
2765
	/* Start computing the table offsets for the NEXT call
	   (bsf of counter+1, +3, +5) and advance the counter by 6. */
2766	leaq	1(%r8),%r12
2767	leaq	3(%r8),%r13
2768	leaq	5(%r8),%r14
2769	addq	$6,%r8
2770	pxor	%xmm9,%xmm10
2771	bsfq	%r12,%r12
2772	bsfq	%r13,%r13
2773	bsfq	%r14,%r14
2774
	/* First rounds. In between, each offset register is XORed with %xmm9,
	   turning offset ^ round key 0 into offset ^ last round key for use
	   as the aesenclast operand. */
2775.byte	102,15,56,220,209
2776.byte	102,15,56,220,217
2777.byte	102,15,56,220,225
2778.byte	102,15,56,220,233
2779	pxor	%xmm9,%xmm11
2780	pxor	%xmm9,%xmm12
2781.byte	102,15,56,220,241
2782	pxor	%xmm9,%xmm13
2783	pxor	%xmm9,%xmm14
2784.byte	102,15,56,220,249
2785	movups	48(%r11),%xmm1
2786	pxor	%xmm9,%xmm15
2787
2788.byte	102,15,56,220,208
2789.byte	102,15,56,220,216
2790.byte	102,15,56,220,224
2791.byte	102,15,56,220,232
2792.byte	102,15,56,220,240
2793.byte	102,15,56,220,248
2794	movups	64(%r11),%xmm0
	/* Scale the new table indices to byte offsets (16 bytes per entry);
	   %r14 is scaled after the loop. */
2795	shlq	$4,%r12
2796	shlq	$4,%r13
2797	jmp	L$ocb_enc_loop6
2798
	/* Middle rounds, two round keys per pass. jnz tests the flags left by
	   addq; the instructions in between do not modify flags. */
2799.p2align	5
2800L$ocb_enc_loop6:
2801.byte	102,15,56,220,209
2802.byte	102,15,56,220,217
2803.byte	102,15,56,220,225
2804.byte	102,15,56,220,233
2805.byte	102,15,56,220,241
2806.byte	102,15,56,220,249
2807	movups	(%rcx,%rax,1),%xmm1
2808	addq	$32,%rax
2809
2810.byte	102,15,56,220,208
2811.byte	102,15,56,220,216
2812.byte	102,15,56,220,224
2813.byte	102,15,56,220,232
2814.byte	102,15,56,220,240
2815.byte	102,15,56,220,248
2816	movups	-16(%rcx,%rax,1),%xmm0
2817	jnz	L$ocb_enc_loop6
2818
	/* One more aesenc round, then restore %xmm1 for the next call. */
2819.byte	102,15,56,220,209
2820.byte	102,15,56,220,217
2821.byte	102,15,56,220,225
2822.byte	102,15,56,220,233
2823.byte	102,15,56,220,241
2824.byte	102,15,56,220,249
2825	movups	16(%r11),%xmm1
2826	shlq	$4,%r14
2827
	/* aesenclast with offset ^ last round key as the key operand applies
	   the final round and the output offset XOR together. %xmm10 is
	   reloaded from (%rbx) as soon as its lane has been consumed, and
	   %rax is reset from %r10. */
2828.byte	102,65,15,56,221,210
2829	movdqu	(%rbx),%xmm10
2830	movq	%r10,%rax
2831.byte	102,65,15,56,221,219
2832.byte	102,65,15,56,221,228
2833.byte	102,65,15,56,221,237
2834.byte	102,65,15,56,221,246
2835.byte	102,65,15,56,221,255
2836	.byte	0xf3,0xc3
2837
2838
2839
/*
 * __ocb_encrypt4 -- local helper: four blocks per call.
 *
 * In (register state as prepared by _aesni_ocb_encrypt):
 *   %xmm2..%xmm5    input blocks
 *   %xmm8           running checksum
 *   %xmm9           round key 0 ^ last round key
 *   %xmm10          table entry at (%rbx)
 *   %xmm15          current offset ^ last round key
 *   %xmm1           round key at 16(%r11)
 *   %rbx            table base; %r12/%r13 byte offsets into it
 *   %r11            key schedule base; %rcx/%rax pointer and index for
 *                   the middle round keys
 * Out:
 *   %xmm2..%xmm5    encrypted blocks
 *   %xmm8           checksum XORed with all four input blocks
 *   %xmm10..%xmm13  offsets of blocks 1..4, each ^ last round key
 *   %xmm0           key at -16(%rcx); %xmm1 reloaded from 16(%r11)
 *   %rax            reset from %r10
 * Unlike __ocb_encrypt6 it does not touch %r8, %r12, %r13 or %r14 and does
 * not reload %xmm10 from the table.
 *
 * .byte 102,15,56,220,modrm = aesenc; 102,65,15,56,221,modrm = aesenclast
 * with an %xmm8+ source register; .byte 0xf3,0xc3 = "rep ret".
 */
2840.p2align	5
2841__ocb_encrypt4:
	/* %xmm15 ^= %xmm9 swaps the folded-in last round key for round key 0.
	   Each block offset is the previous one XORed with a table entry
	   ((%rbx) copy, entry at %r12, (%rbx) copy, entry at %r13). Every
	   input block is XORed into the checksum first, then with its
	   offset. */
2842	pxor	%xmm9,%xmm15
2843	movdqu	(%rbx,%r12,1),%xmm11
2844	movdqa	%xmm10,%xmm12
2845	movdqu	(%rbx,%r13,1),%xmm13
2846	pxor	%xmm15,%xmm10
2847	pxor	%xmm10,%xmm11
2848	pxor	%xmm2,%xmm8
2849	pxor	%xmm10,%xmm2
2850	pxor	%xmm11,%xmm12
2851	pxor	%xmm3,%xmm8
2852	pxor	%xmm11,%xmm3
2853	pxor	%xmm12,%xmm13
2854	pxor	%xmm4,%xmm8
2855	pxor	%xmm12,%xmm4
2856	pxor	%xmm5,%xmm8
2857	pxor	%xmm13,%xmm5
2858	movups	32(%r11),%xmm0
2859
	/* Turn offset ^ round key 0 into offset ^ last round key, for use as
	   the aesenclast operand at the end. */
2860	pxor	%xmm9,%xmm10
2861	pxor	%xmm9,%xmm11
2862	pxor	%xmm9,%xmm12
2863	pxor	%xmm9,%xmm13
2864
	/* First rounds with the keys at 16, 32(%r11); 48 and 64(%r11) are
	   preloaded for the loop. */
2865.byte	102,15,56,220,209
2866.byte	102,15,56,220,217
2867.byte	102,15,56,220,225
2868.byte	102,15,56,220,233
2869	movups	48(%r11),%xmm1
2870
2871.byte	102,15,56,220,208
2872.byte	102,15,56,220,216
2873.byte	102,15,56,220,224
2874.byte	102,15,56,220,232
2875	movups	64(%r11),%xmm0
2876	jmp	L$ocb_enc_loop4
2877
	/* Middle rounds, two round keys per pass. jnz tests the flags left by
	   addq; the instructions in between do not modify flags. */
2878.p2align	5
2879L$ocb_enc_loop4:
2880.byte	102,15,56,220,209
2881.byte	102,15,56,220,217
2882.byte	102,15,56,220,225
2883.byte	102,15,56,220,233
2884	movups	(%rcx,%rax,1),%xmm1
2885	addq	$32,%rax
2886
2887.byte	102,15,56,220,208
2888.byte	102,15,56,220,216
2889.byte	102,15,56,220,224
2890.byte	102,15,56,220,232
2891	movups	-16(%rcx,%rax,1),%xmm0
2892	jnz	L$ocb_enc_loop4
2893
	/* One more aesenc round; then restore %xmm1 and %rax for the next
	   helper call. */
2894.byte	102,15,56,220,209
2895.byte	102,15,56,220,217
2896.byte	102,15,56,220,225
2897.byte	102,15,56,220,233
2898	movups	16(%r11),%xmm1
2899	movq	%r10,%rax
2900
	/* aesenclast with offset ^ last round key as the key operand applies
	   the final round and the output offset XOR together. */
2901.byte	102,65,15,56,221,210
2902.byte	102,65,15,56,221,219
2903.byte	102,65,15,56,221,228
2904.byte	102,65,15,56,221,237
2905	.byte	0xf3,0xc3
2906
2907
2908
2909.p2align	5
/*
 * __ocb_encrypt1 -- file-local helper: single-lane variant of the OCB
 * encrypt helpers.  Processes xmm2 only.
 *
 * Reads:  xmm2 (data), xmm7 (per-block mask term), xmm15 and xmm9 (mask
 *         inputs), xmm8 (accumulator), xmm1 (first round key, preloaded by
 *         the caller), r11, rcx+rax (round-key cursor), r10.
 * Writes: xmm2 (result), xmm8 ^= input xmm2, xmm7 = input xmm7 ^ xmm15,
 *         xmm0, xmm1 (left holding 16(%r11)), rax = r10, flags.
 */
__ocb_encrypt1:
	pxor	%xmm15,%xmm7
	pxor	%xmm9,%xmm7
	/* Accumulate the unmasked input before masking it. */
	pxor	%xmm2,%xmm8
	pxor	%xmm7,%xmm2
	movups	32(%r11),%xmm0

	/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,209
	movups	48(%r11),%xmm1
	/* Second XOR with xmm9 cancels the first; xmm7 is now input ^ xmm15. */
	pxor	%xmm9,%xmm7

	/* aesenc %xmm0,%xmm2 */
.byte	102,15,56,220,208
	movups	64(%r11),%xmm0
	jmp	L$ocb_enc_loop1

.p2align	5
	/* Two rounds per iteration; exits when rax has been stepped to zero. */
L$ocb_enc_loop1:
	/* aesenc %xmm1,%xmm2 */
.byte	102,15,56,220,209
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax

	/* aesenc %xmm0,%xmm2 */
.byte	102,15,56,220,208
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ocb_enc_loop1

	/* aesenc %xmm1,%xmm2, then restore xmm1 and rax for the next call. */
.byte	102,15,56,220,209
	movups	16(%r11),%xmm1
	movq	%r10,%rax

	/* aesenclast %xmm7,%xmm2: xmm7 serves as the final round-key operand. */
.byte	102,15,56,221,215
	/* rep ret */
	.byte	0xf3,0xc3
2941
2942
2943.globl	_aesni_ocb_decrypt
2944.private_extern _aesni_ocb_decrypt
2945
2946.p2align	5
/*
 * _aesni_ocb_decrypt -- exported entry point.
 *
 * Inputs, as consumed by the code below:
 *   rdi       source pointer, read 16 bytes per block
 *   rsi       destination pointer, written 16 bytes per block
 *   rdx       number of 16-byte blocks to process
 *   rcx       key schedule; the 32-bit word at offset 240 (call it n) locates
 *             the 16 bytes at key+16+16*n, which are pre-folded into the masks
 *   r8        block counter; only its parity and bsf (lowest set bit) are used
 *   r9        pointer to a 16-byte value loaded into xmm15, rewritten on exit
 *   8(%rsp)   pointer (kept in rbx) to a table of 16-byte entries
 *   16(%rsp)  pointer (kept in rbp) to a 16-byte value loaded into xmm8,
 *             XORed with every output block, and rewritten on exit
 * Presumably r9 / rbx / rbp are the OCB offset, L_ table and checksum --
 * inferred from the symbol name; confirm against the C prototype.
 *
 * Saves and restores rbx, rbp, r12, r13, r14.  All sixteen xmm registers
 * are zeroed before returning.
 *
 * NOTE(review): rdx == 0 is not handled.  With r8 even, one block is
 * processed unconditionally; with r8 odd, L$ocb_dec_done is reached without
 * any helper having loaded xmm0.  Assumes callers pass at least one block.
 */
_aesni_ocb_decrypt:
	/* rax remembers the entry rsp so the stack arguments can be fetched. */
	leaq	(%rsp),%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	movq	8(%rax),%rbx
	movq	8+8(%rax),%rbp

	/* r10 = n*16; xmm9 = key[0]; xmm1 = 16 bytes at key+16+n*16. */
	movl	240(%rcx),%r10d
	movq	%rcx,%r11
	shll	$4,%r10d
	movups	(%rcx),%xmm9
	movups	16(%rcx,%r10,1),%xmm1

	/* Fold that key block into both xmm9 and the value loaded from (r9). */
	movdqu	(%r9),%xmm15
	pxor	%xmm1,%xmm9
	pxor	%xmm1,%xmm15

	/*
	 * Round-key cursor used by the helpers: rcx = key+32+n*16 and
	 * rax = r10 = 48 - n*16, so (rcx,rax) starts at key+80 and
	 * -16(rcx,rax) reaches key+16+n*16 when rax hits zero.
	 */
	movl	$16+32,%eax
	leaq	32(%r11,%r10,1),%rcx
	movups	16(%r11),%xmm1
	subq	%r10,%rax
	movq	%rax,%r10

	/* xmm10 = table entry 0; xmm8 = accumulator loaded from (rbp). */
	movdqu	(%rbx),%xmm10
	movdqu	(%rbp),%xmm8

	/* If r8 is even, handle one block first so that r8 becomes odd. */
	testq	$1,%r8
	jnz	L$ocb_dec_odd

	bsfq	%r8,%r12
	addq	$1,%r8
	shlq	$4,%r12
	movdqu	(%rbx,%r12,1),%xmm7
	movdqu	(%rdi),%xmm2
	leaq	16(%rdi),%rdi

	call	__ocb_decrypt1

	/* xmm7 returned by the helper becomes the new running mask. */
	movdqa	%xmm7,%xmm15
	movups	%xmm2,(%rsi)
	xorps	%xmm2,%xmm8
	leaq	16(%rsi),%rsi
	subq	$1,%rdx
	jz	L$ocb_dec_done

	/*
	 * r8 is odd here, so r8+1, r8+3 and r8+5 are even: their bsf values,
	 * scaled by 16, index the table for lanes 2, 4 and 6.  The remaining
	 * lanes have odd counters (bsf == 0) and use entry 0 held in xmm10.
	 */
L$ocb_dec_odd:
	leaq	1(%r8),%r12
	leaq	3(%r8),%r13
	leaq	5(%r8),%r14
	leaq	6(%r8),%r8
	bsfq	%r12,%r12
	bsfq	%r13,%r13
	bsfq	%r14,%r14
	shlq	$4,%r12
	shlq	$4,%r13
	shlq	$4,%r14

	/* Carry set means fewer than six blocks remain. */
	subq	$6,%rdx
	jc	L$ocb_dec_short
	jmp	L$ocb_dec_grandloop

.p2align	5
	/* Main loop: six blocks per iteration. */
L$ocb_dec_grandloop:
	movdqu	0(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqu	32(%rdi),%xmm4
	movdqu	48(%rdi),%xmm5
	movdqu	64(%rdi),%xmm6
	movdqu	80(%rdi),%xmm7
	leaq	96(%rdi),%rdi

	call	__ocb_decrypt6

	/* Store each result and fold it into the accumulator xmm8. */
	movups	%xmm2,0(%rsi)
	pxor	%xmm2,%xmm8
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm8
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm8
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm8
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm8
	movups	%xmm7,80(%rsi)
	pxor	%xmm7,%xmm8
	leaq	96(%rsi),%rsi
	subq	$6,%rdx
	jnc	L$ocb_dec_grandloop

	/* 0..5 blocks left: undo the bias and dispatch on the exact count. */
L$ocb_dec_short:
	addq	$6,%rdx
	jz	L$ocb_dec_done

	movdqu	0(%rdi),%xmm2
	cmpq	$2,%rdx
	jb	L$ocb_dec_one
	movdqu	16(%rdi),%xmm3
	je	L$ocb_dec_two

	movdqu	32(%rdi),%xmm4
	cmpq	$4,%rdx
	jb	L$ocb_dec_three
	movdqu	48(%rdi),%xmm5
	je	L$ocb_dec_four

	/* Five blocks: use the 6-lane helper with a zeroed sixth lane. */
	movdqu	64(%rdi),%xmm6
	pxor	%xmm7,%xmm7

	call	__ocb_decrypt6

	/* Mask of the fifth lane is the one carried forward. */
	movdqa	%xmm14,%xmm15
	movups	%xmm2,0(%rsi)
	pxor	%xmm2,%xmm8
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm8
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm8
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm8
	movups	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm8

	jmp	L$ocb_dec_done

.p2align	4
	/* One block: counter is odd, so table entry 0 (xmm10) is the mask term. */
L$ocb_dec_one:
	movdqa	%xmm10,%xmm7

	call	__ocb_decrypt1

	movdqa	%xmm7,%xmm15
	movups	%xmm2,0(%rsi)
	xorps	%xmm2,%xmm8
	jmp	L$ocb_dec_done

.p2align	4
	/* Two blocks: 4-lane helper with lanes 3 and 4 zeroed. */
L$ocb_dec_two:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	call	__ocb_decrypt4

	movdqa	%xmm11,%xmm15
	movups	%xmm2,0(%rsi)
	xorps	%xmm2,%xmm8
	movups	%xmm3,16(%rsi)
	xorps	%xmm3,%xmm8

	jmp	L$ocb_dec_done

.p2align	4
	/* Three blocks: 4-lane helper with lane 4 zeroed. */
L$ocb_dec_three:
	pxor	%xmm5,%xmm5

	call	__ocb_decrypt4

	movdqa	%xmm12,%xmm15
	movups	%xmm2,0(%rsi)
	xorps	%xmm2,%xmm8
	movups	%xmm3,16(%rsi)
	xorps	%xmm3,%xmm8
	movups	%xmm4,32(%rsi)
	xorps	%xmm4,%xmm8

	jmp	L$ocb_dec_done

.p2align	4
	/* Four blocks: all lanes of the 4-lane helper are live. */
L$ocb_dec_four:
	call	__ocb_decrypt4

	movdqa	%xmm13,%xmm15
	movups	%xmm2,0(%rsi)
	pxor	%xmm2,%xmm8
	movups	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm8
	movups	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm8
	movups	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm8

L$ocb_dec_done:
	/*
	 * xmm0 was last loaded by a helper from key+16+n*16, the same block
	 * that was XORed into xmm15 on entry; XORing again removes it before
	 * the running values are written back through rbp and r9.
	 */
	pxor	%xmm0,%xmm15
	movdqu	%xmm8,(%rbp)
	movdqu	%xmm15,(%r9)

	/* Zero every xmm register before returning. */
	xorps	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	/* Reload the five saved registers (5 * 8 = 40 bytes) and unwind rsp. */
	leaq	40(%rsp),%rax
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
L$ocb_dec_epilogue:
	/* rep ret */
	.byte	0xf3,0xc3
3160
3161
3162
3163.p2align	5
/*
 * __ocb_decrypt6 -- file-local helper: runs six 16-byte lanes, xmm2..xmm7,
 * through the AES decryption rounds with a per-lane XOR mask, and prepares
 * the table indices for the following six blocks.
 *
 * Reads:  xmm2-xmm7 (lanes), xmm9, xmm10 (table entry 0), xmm15 (running
 *         mask), xmm1 (first round key, preloaded by the caller),
 *         rbx+r12 / rbx+r13 / rbx+r14 (table entries), r8 (block counter),
 *         r11 (key schedule base), rcx+rax (round-key cursor), r10.
 * Writes: xmm2-xmm7 (results), xmm10-xmm15 (the six lane masks while the
 *         rounds run; xmm10 is reloaded from (%rbx) before returning),
 *         r8 += 6, r12/r13/r14 = 16*bsf(r8+1 / r8+3 / r8+5) using the r8
 *         value from entry, xmm0, xmm1 = 16(%r11), rax = r10, flags.
 * Unlike the encrypt helpers, xmm8 is not touched here; the caller
 * accumulates the results after the call.
 */
__ocb_decrypt6:
	/*
	 * Chain of masks: each of xmm10..xmm15 is the previous mask XOR one
	 * more term (entry 0 for lanes 1/3/5, a looked-up entry for 2/4/6).
	 * Each lane is XORed with its mask as soon as the mask is ready.
	 */
	pxor	%xmm9,%xmm15
	movdqu	(%rbx,%r12,1),%xmm11
	movdqa	%xmm10,%xmm12
	movdqu	(%rbx,%r13,1),%xmm13
	movdqa	%xmm10,%xmm14
	pxor	%xmm15,%xmm10
	movdqu	(%rbx,%r14,1),%xmm15
	pxor	%xmm10,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm12
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm13
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm14
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm15
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movups	32(%r11),%xmm0

	/* Table indices for the NEXT call, overlapped with the AES rounds. */
	leaq	1(%r8),%r12
	leaq	3(%r8),%r13
	leaq	5(%r8),%r14
	addq	$6,%r8
	/* Second XOR with xmm9 cancels the xmm9 term folded in via xmm15. */
	pxor	%xmm9,%xmm10
	bsfq	%r12,%r12
	bsfq	%r13,%r13
	bsfq	%r14,%r14

	/* aesdec %xmm1,%xmm2 .. %xmm1,%xmm7, interleaved with mask fix-ups. */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	pxor	%xmm9,%xmm11
	pxor	%xmm9,%xmm12
.byte	102,15,56,222,241
	pxor	%xmm9,%xmm13
	pxor	%xmm9,%xmm14
.byte	102,15,56,222,249
	movups	48(%r11),%xmm1
	pxor	%xmm9,%xmm15

	/* aesdec %xmm0,%xmm2 .. %xmm0,%xmm7 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
	movups	64(%r11),%xmm0
	shlq	$4,%r12
	shlq	$4,%r13
	jmp	L$ocb_dec_loop6

.p2align	5
	/* Two rounds per iteration; exits when rax has been stepped to zero. */
L$ocb_dec_loop6:
	/* aesdec %xmm1,%xmm2 .. %xmm1,%xmm7 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax

	/* aesdec %xmm0,%xmm2 .. %xmm0,%xmm7 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ocb_dec_loop6

	/* Last full round with xmm1; then restore xmm1 and finish r14. */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	movups	16(%r11),%xmm1
	shlq	$4,%r14

	/*
	 * aesdeclast %xmm10,%xmm2 .. %xmm15,%xmm7: each lane mask is the
	 * final round-key operand.  xmm10 is reloaded with table entry 0
	 * right after its use, and rax is reset for the next call.
	 */
.byte	102,65,15,56,223,210
	movdqu	(%rbx),%xmm10
	movq	%r10,%rax
.byte	102,65,15,56,223,219
.byte	102,65,15,56,223,228
.byte	102,65,15,56,223,237
.byte	102,65,15,56,223,246
.byte	102,65,15,56,223,255
	/* rep ret */
	.byte	0xf3,0xc3
3256
3257
3258
3259.p2align	5
/*
 * __ocb_decrypt4 -- file-local helper: four-lane variant of __ocb_decrypt6.
 * Runs xmm2..xmm5 through the AES decryption rounds with per-lane masks.
 *
 * Reads:  xmm2-xmm5 (lanes), xmm9, xmm10, xmm15 (mask inputs), xmm1 (first
 *         round key, preloaded by the caller), rbx+r12 / rbx+r13 (table
 *         entries), r11, rcx+rax (round-key cursor), r10.
 * Writes: xmm2-xmm5 (results), xmm10-xmm13 (lane masks), xmm15, xmm0,
 *         xmm1 = 16(%r11), rax = r10, flags.
 * r8 and r12..r14 are left unchanged (this helper is only called from the
 * short-tail paths of _aesni_ocb_decrypt).
 */
__ocb_decrypt4:
	/* Mask chain: each of xmm10..xmm13 is the previous mask XOR one term. */
	pxor	%xmm9,%xmm15
	movdqu	(%rbx,%r12,1),%xmm11
	movdqa	%xmm10,%xmm12
	movdqu	(%rbx,%r13,1),%xmm13
	pxor	%xmm15,%xmm10
	pxor	%xmm10,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm12
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm13
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	movups	32(%r11),%xmm0

	/* Second XOR with xmm9 cancels the xmm9 term folded in via xmm15. */
	pxor	%xmm9,%xmm10
	pxor	%xmm9,%xmm11
	pxor	%xmm9,%xmm12
	pxor	%xmm9,%xmm13

	/* aesdec %xmm1,%xmm2 .. %xmm1,%xmm5 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	48(%r11),%xmm1

	/* aesdec %xmm0,%xmm2 .. %xmm0,%xmm5 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	64(%r11),%xmm0
	jmp	L$ocb_dec_loop4

.p2align	5
	/* Two rounds per iteration; exits when rax has been stepped to zero. */
L$ocb_dec_loop4:
	/* aesdec %xmm1,%xmm2 .. %xmm1,%xmm5 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax

	/* aesdec %xmm0,%xmm2 .. %xmm0,%xmm5 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ocb_dec_loop4

	/* Last full round with xmm1; then restore xmm1 and rax. */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	16(%r11),%xmm1
	movq	%r10,%rax

	/* aesdeclast %xmm10,%xmm2 / %xmm11,%xmm3 / %xmm12,%xmm4 / %xmm13,%xmm5 */
.byte	102,65,15,56,223,210
.byte	102,65,15,56,223,219
.byte	102,65,15,56,223,228
.byte	102,65,15,56,223,237
	/* rep ret */
	.byte	0xf3,0xc3
3321
3322
3323
3324.p2align	5
/*
 * __ocb_decrypt1 -- file-local helper: single-lane OCB decrypt step.
 *
 * Reads:  xmm2 (data), xmm7 (per-block mask term), xmm15 and xmm9 (mask
 *         inputs), xmm1 (first round key, preloaded by the caller), r11,
 *         rcx+rax (round-key cursor), r10.
 * Writes: xmm2 (result), xmm7 = input xmm7 ^ xmm15, xmm0,
 *         xmm1 = 16(%r11), rax = r10, flags.
 */
__ocb_decrypt1:
	pxor	%xmm15,%xmm7
	pxor	%xmm9,%xmm7
	pxor	%xmm7,%xmm2
	movups	32(%r11),%xmm0

	/* aesdec %xmm1,%xmm2 */
.byte	102,15,56,222,209
	movups	48(%r11),%xmm1
	/* Second XOR with xmm9 cancels the first; xmm7 is now input ^ xmm15. */
	pxor	%xmm9,%xmm7

	/* aesdec %xmm0,%xmm2 */
.byte	102,15,56,222,208
	movups	64(%r11),%xmm0
	jmp	L$ocb_dec_loop1

.p2align	5
	/* Two rounds per iteration; exits when rax has been stepped to zero. */
L$ocb_dec_loop1:
	/* aesdec %xmm1,%xmm2 */
.byte	102,15,56,222,209
	movups	(%rcx,%rax,1),%xmm1
	addq	$32,%rax

	/* aesdec %xmm0,%xmm2 */
.byte	102,15,56,222,208
	movups	-16(%rcx,%rax,1),%xmm0
	jnz	L$ocb_dec_loop1

	/* aesdec %xmm1,%xmm2, then restore xmm1 and rax for the next call. */
.byte	102,15,56,222,209
	movups	16(%r11),%xmm1
	movq	%r10,%rax

	/* aesdeclast %xmm7,%xmm2: xmm7 serves as the final round-key operand. */
.byte	102,15,56,223,215
	/* rep ret */
	.byte	0xf3,0xc3
3355
3356.globl	_aesni_cbc_encrypt
3357.private_extern _aesni_cbc_encrypt
3358
3359.p2align	4
/*
 * _aesni_cbc_encrypt -- exported entry point; CBC-mode encrypt or decrypt.
 *
 * Inputs, as consumed by the code below:
 *   rdi  input pointer
 *   rsi  output pointer
 *   rdx  length in bytes; zero returns immediately
 *   rcx  key schedule; the 32-bit word at offset 240 is the round-loop count
 *   r8   pointer to the 16-byte chaining value; rewritten before returning
 *   r9d  direction: non-zero takes the encrypt path, zero the decrypt path
 *
 * The bulk decrypt path saves rbp and reserves a 16-byte aligned stack slot;
 * both are undone through r11 at L$cbc_dec_ret.  Calls _aesni_decrypt2/3/4/
 * 6/8, which are defined elsewhere in this file.
 */
_aesni_cbc_encrypt:
	testq	%rdx,%rdx
	jz	L$cbc_ret

	/* r10d / r11 keep pristine copies of the round count and key pointer. */
	movl	240(%rcx),%r10d
	movq	%rcx,%r11
	testl	%r9d,%r9d
	jz	L$cbc_decrypt

	/* ---- encrypt: xmm2 carries the chaining value ---- */
	movups	(%r8),%xmm2
	movl	%r10d,%eax
	cmpq	$16,%rdx
	jb	L$cbc_enc_tail
	subq	$16,%rdx
	jmp	L$cbc_enc_loop
.p2align	4
L$cbc_enc_loop:
	movups	(%rdi),%xmm3
	leaq	16(%rdi),%rdi

	/* xmm2 = chain ^ input ^ key[0]; xmm1 = key[1]; rcx -> key[2]. */
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	xorps	%xmm0,%xmm3
	leaq	32(%rcx),%rcx
	xorps	%xmm3,%xmm2
L$oop_enc1_15:
	/* aesenc %xmm1,%xmm2, repeated eax times */
.byte	102,15,56,220,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_enc1_15
	/* aesenclast %xmm1,%xmm2 */
.byte	102,15,56,221,209
	/* Rewind the round counter and key pointer for the next block. */
	movl	%r10d,%eax
	movq	%r11,%rcx
	movups	%xmm2,0(%rsi)
	leaq	16(%rsi),%rsi
	subq	$16,%rdx
	jnc	L$cbc_enc_loop
	/* rdx went negative: restore it; non-zero means a partial block is left. */
	addq	$16,%rdx
	jnz	L$cbc_enc_tail
	/* Done: write back the chaining value and scrub the registers used. */
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	movups	%xmm2,(%r8)
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	jmp	L$cbc_ret

	/*
	 * Partial final block: copy the rdx leftover bytes into the output
	 * buffer, zero-fill up to 16 bytes, then run the loop once more with
	 * both rdi and rsi pointing at that padded block.
	 */
L$cbc_enc_tail:
	movq	%rdx,%rcx
	xchgq	%rdi,%rsi
	/* rep movsb ; 2-byte nop */
.long	0x9066A4F3
	movl	$16,%ecx
	subq	%rdx,%rcx
	xorl	%eax,%eax
	/* rep stosb ; 2-byte nop */
.long	0x9066AAF3
	leaq	-16(%rdi),%rdi
	movl	%r10d,%eax
	movq	%rdi,%rsi
	movq	%r11,%rcx
	xorq	%rdx,%rdx
	jmp	L$cbc_enc_loop

.p2align	4
L$cbc_decrypt:
	cmpq	$16,%rdx
	jne	L$cbc_decrypt_bulk



	/* ---- decrypt, exactly one block: no stack frame needed ---- */
	movdqu	(%rdi),%xmm2
	movdqu	(%r8),%xmm3
	/* Keep the ciphertext: it becomes the next chaining value. */
	movdqa	%xmm2,%xmm4
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_16:
	/* aesdec %xmm1,%xmm2, repeated r10d times */
.byte	102,15,56,222,209
	decl	%r10d
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_16
	/* aesdeclast %xmm1,%xmm2 */
.byte	102,15,56,223,209
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	movdqu	%xmm4,(%r8)
	xorps	%xmm3,%xmm2
	pxor	%xmm3,%xmm3
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	jmp	L$cbc_ret
.p2align	4
	/*
	 * ---- decrypt, any other length ----
	 * r11 now holds the entry rsp (the key pointer lives in rbp instead).
	 * xmm10 carries the chaining value throughout.
	 */
L$cbc_decrypt_bulk:
	leaq	(%rsp),%r11
	pushq	%rbp
	subq	$16,%rsp
	andq	$-16,%rsp
	movq	%rcx,%rbp
	movups	(%r8),%xmm10
	movl	%r10d,%eax
	cmpq	$0x50,%rdx
	jbe	L$cbc_dec_tail

	/* Load six blocks; xmm11..xmm15 keep ciphertext copies for chaining. */
	movups	(%rcx),%xmm0
	movdqu	0(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqa	%xmm2,%xmm11
	movdqu	32(%rdi),%xmm4
	movdqa	%xmm3,%xmm12
	movdqu	48(%rdi),%xmm5
	movdqa	%xmm4,%xmm13
	movdqu	64(%rdi),%xmm6
	movdqa	%xmm5,%xmm14
	movdqu	80(%rdi),%xmm7
	movdqa	%xmm6,%xmm15
	movl	_OPENSSL_ia32cap_P+4(%rip),%r9d
	cmpq	$0x70,%rdx
	jbe	L$cbc_dec_six_or_seven

	/*
	 * 71303168 = 0x04400000 (bits 22 and 26); 4194304 = 0x00400000.
	 * The 6-block loop is chosen when, of those two bits, only bit 22 is
	 * set.  Presumably MOVBE-without-XSAVE in CPUID.1:ECX -- confirm
	 * against the OPENSSL_ia32cap_P layout.
	 */
	andl	$71303168,%r9d
	subq	$0x50,%rdx
	cmpl	$4194304,%r9d
	je	L$cbc_dec_loop6_enter
	subq	$0x20,%rdx
	/* Bias rcx by 112 so round keys are addressed as N-112(%rcx). */
	leaq	112(%rcx),%rcx
	jmp	L$cbc_dec_loop8_enter
.p2align	4
	/* 8-block loop.  xmm9 holds the eighth result of the previous pass. */
L$cbc_dec_loop8:
	movups	%xmm9,(%rsi)
	leaq	16(%rsi),%rsi
L$cbc_dec_loop8_enter:
	movdqu	96(%rdi),%xmm8
	pxor	%xmm0,%xmm2
	movdqu	112(%rdi),%xmm9
	pxor	%xmm0,%xmm3
	movups	16-112(%rcx),%xmm1
	pxor	%xmm0,%xmm4
	/*
	 * rbp = rdi+128 if rdx >= 0x70, else rdi: the cmpq sets CF when
	 * rdx < 0x70, adcq turns -1 into 0 in that case, and the andq/addq
	 * below finish the selection.  None of the SSE/AES instructions in
	 * between modify the flags.
	 */
	movq	$-1,%rbp
	cmpq	$0x70,%rdx
	pxor	%xmm0,%xmm5
	pxor	%xmm0,%xmm6
	pxor	%xmm0,%xmm7
	pxor	%xmm0,%xmm8

	/* Round with xmm1: aesdec %xmm1,%xmm2 .. %xmm1,%xmm9 */
.byte	102,15,56,222,209
	pxor	%xmm0,%xmm9
	movups	32-112(%rcx),%xmm0
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
	adcq	$0,%rbp
	andq	$128,%rbp
.byte	102,68,15,56,222,201
	addq	%rdi,%rbp
	movups	48-112(%rcx),%xmm1
	/* Round with xmm0: aesdec %xmm0,%xmm2 .. %xmm0,%xmm9 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	64-112(%rcx),%xmm0
	nop
	/* Round with xmm1 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movups	80-112(%rcx),%xmm1
	nop
	/* Round with xmm0 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	96-112(%rcx),%xmm0
	nop
	/* Round with xmm1 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movups	112-112(%rcx),%xmm1
	nop
	/* Round with xmm0 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	128-112(%rcx),%xmm0
	nop
	/* Round with xmm1 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movups	144-112(%rcx),%xmm1
	/*
	 * eax is the round-count word from key+240.  The flags from this
	 * compare survive the AES instructions and feed both the jb and the
	 * je below: below 11 stops here, equal to 11 runs two more rounds,
	 * above 11 runs four more.
	 */
	cmpl	$11,%eax
	/* Round with xmm0 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	160-112(%rcx),%xmm0
	jb	L$cbc_dec_done
	/* Round with xmm1 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movups	176-112(%rcx),%xmm1
	nop
	/* Round with xmm0 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	192-112(%rcx),%xmm0
	je	L$cbc_dec_done
	/* Round with xmm1 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movups	208-112(%rcx),%xmm1
	nop
	/* Round with xmm0 */
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
.byte	102,68,15,56,222,192
.byte	102,68,15,56,222,200
	movups	224-112(%rcx),%xmm0
	jmp	L$cbc_dec_done
.p2align	4
	/*
	 * Final stage of the 8-block pass.  xmm0 is the last round key: it is
	 * XORed into the previous-ciphertext registers (xmm10..xmm15, then
	 * xmm1 and xmm10 reloaded from 80/96(%rdi)), and those registers are
	 * used as the aesdeclast key operand, so the last AddRoundKey and the
	 * CBC chaining XOR happen in one instruction.
	 */
L$cbc_dec_done:
	/* Last full round: aesdec %xmm1,%xmm2 .. %xmm1,%xmm9 */
.byte	102,15,56,222,209
.byte	102,15,56,222,217
	pxor	%xmm0,%xmm10
	pxor	%xmm0,%xmm11
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	pxor	%xmm0,%xmm12
	pxor	%xmm0,%xmm13
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	pxor	%xmm0,%xmm14
	pxor	%xmm0,%xmm15
.byte	102,68,15,56,222,193
.byte	102,68,15,56,222,201
	movdqu	80(%rdi),%xmm1

	/* aesdeclast %xmm10,%xmm2 */
.byte	102,65,15,56,223,210
	movdqu	96(%rdi),%xmm10
	pxor	%xmm0,%xmm1
	/* aesdeclast %xmm11,%xmm3 */
.byte	102,65,15,56,223,219
	pxor	%xmm0,%xmm10
	/* Eighth ciphertext block: becomes the next chaining value below. */
	movdqu	112(%rdi),%xmm0
	/* aesdeclast %xmm12,%xmm4 */
.byte	102,65,15,56,223,228
	leaq	128(%rdi),%rdi
	/* Start fetching the next pass through rbp (selected above). */
	movdqu	0(%rbp),%xmm11
	/* aesdeclast %xmm13,%xmm5 ; aesdeclast %xmm14,%xmm6 */
.byte	102,65,15,56,223,237
.byte	102,65,15,56,223,246
	movdqu	16(%rbp),%xmm12
	movdqu	32(%rbp),%xmm13
	/* aesdeclast %xmm15,%xmm7 ; aesdeclast %xmm1,%xmm8 */
.byte	102,65,15,56,223,255
.byte	102,68,15,56,223,193
	movdqu	48(%rbp),%xmm14
	movdqu	64(%rbp),%xmm15
	/* aesdeclast %xmm10,%xmm9 */
.byte	102,69,15,56,223,202
	movdqa	%xmm0,%xmm10
	movdqu	80(%rbp),%xmm1
	movups	-112(%rcx),%xmm0

	/* Store seven results while moving the prefetched blocks into place. */
	movups	%xmm2,(%rsi)
	movdqa	%xmm11,%xmm2
	movups	%xmm3,16(%rsi)
	movdqa	%xmm12,%xmm3
	movups	%xmm4,32(%rsi)
	movdqa	%xmm13,%xmm4
	movups	%xmm5,48(%rsi)
	movdqa	%xmm14,%xmm5
	movups	%xmm6,64(%rsi)
	movdqa	%xmm15,%xmm6
	movups	%xmm7,80(%rsi)
	movdqa	%xmm1,%xmm7
	movups	%xmm8,96(%rsi)
	leaq	112(%rsi),%rsi

	subq	$0x80,%rdx
	ja	L$cbc_dec_loop8

	/* Leaving the 8-block loop: xmm9 (eighth result) is still unstored. */
	movaps	%xmm9,%xmm2
	leaq	-112(%rcx),%rcx
	addq	$0x70,%rdx
	jle	L$cbc_dec_clear_tail_collected
	movups	%xmm9,(%rsi)
	leaq	16(%rsi),%rsi
	cmpq	$0x50,%rdx
	jbe	L$cbc_dec_tail

	movaps	%xmm11,%xmm2
	/* More than 0x50 bytes left: six blocks, or seven if rdx > 0x60. */
L$cbc_dec_six_or_seven:
	cmpq	$0x60,%rdx
	ja	L$cbc_dec_seven

	/* xmm8 keeps the sixth ciphertext block; it becomes the new chain. */
	movaps	%xmm7,%xmm8
	call	_aesni_decrypt6
	pxor	%xmm10,%xmm2
	movaps	%xmm8,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	pxor	%xmm13,%xmm5
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	pxor	%xmm14,%xmm6
	movdqu	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	pxor	%xmm15,%xmm7
	movdqu	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	leaq	80(%rsi),%rsi
	/* The last result travels in xmm2 to L$cbc_dec_tail_collected. */
	movdqa	%xmm7,%xmm2
	pxor	%xmm7,%xmm7
	jmp	L$cbc_dec_tail_collected

.p2align	4
	/* Seven blocks: 8-lane helper with a zeroed eighth lane. */
L$cbc_dec_seven:
	movups	96(%rdi),%xmm8
	xorps	%xmm9,%xmm9
	call	_aesni_decrypt8
	/* Re-read ciphertext 6 (chains block 7) and 7 (new chaining value). */
	movups	80(%rdi),%xmm9
	pxor	%xmm10,%xmm2
	movups	96(%rdi),%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	pxor	%xmm13,%xmm5
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	pxor	%xmm14,%xmm6
	movdqu	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	pxor	%xmm15,%xmm7
	movdqu	%xmm6,64(%rsi)
	pxor	%xmm6,%xmm6
	pxor	%xmm9,%xmm8
	movdqu	%xmm7,80(%rsi)
	pxor	%xmm7,%xmm7
	leaq	96(%rsi),%rsi
	movdqa	%xmm8,%xmm2
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	jmp	L$cbc_dec_tail_collected

.p2align	4
	/* 6-block loop.  xmm7 holds the sixth result of the previous pass. */
L$cbc_dec_loop6:
	movups	%xmm7,(%rsi)
	leaq	16(%rsi),%rsi
	movdqu	0(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqa	%xmm2,%xmm11
	movdqu	32(%rdi),%xmm4
	movdqa	%xmm3,%xmm12
	movdqu	48(%rdi),%xmm5
	movdqa	%xmm4,%xmm13
	movdqu	64(%rdi),%xmm6
	movdqa	%xmm5,%xmm14
	movdqu	80(%rdi),%xmm7
	movdqa	%xmm6,%xmm15
L$cbc_dec_loop6_enter:
	leaq	96(%rdi),%rdi
	movdqa	%xmm7,%xmm8

	call	_aesni_decrypt6

	/*
	 * Chain each result with the preceding ciphertext block.  rcx and
	 * eax are reloaded from rbp / r10d, presumably because
	 * _aesni_decrypt6 changes them -- confirm against that routine.
	 */
	pxor	%xmm10,%xmm2
	movdqa	%xmm8,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm13,%xmm5
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm14,%xmm6
	movq	%rbp,%rcx
	movdqu	%xmm5,48(%rsi)
	pxor	%xmm15,%xmm7
	movl	%r10d,%eax
	movdqu	%xmm6,64(%rsi)
	leaq	80(%rsi),%rsi
	subq	$0x60,%rdx
	ja	L$cbc_dec_loop6

	movdqa	%xmm7,%xmm2
	addq	$0x50,%rdx
	jle	L$cbc_dec_clear_tail_collected
	movups	%xmm7,(%rsi)
	leaq	16(%rsi),%rsi

	/*
	 * At most 0x50 bytes left.  Each subq peels 16 bytes; jbe fires as
	 * soon as the running count is <= 0, selecting a 1..4 block handler.
	 * Falling through all four means five blocks.
	 */
L$cbc_dec_tail:
	movups	(%rdi),%xmm2
	subq	$0x10,%rdx
	jbe	L$cbc_dec_one

	movups	16(%rdi),%xmm3
	movaps	%xmm2,%xmm11
	subq	$0x10,%rdx
	jbe	L$cbc_dec_two

	movups	32(%rdi),%xmm4
	movaps	%xmm3,%xmm12
	subq	$0x10,%rdx
	jbe	L$cbc_dec_three

	movups	48(%rdi),%xmm5
	movaps	%xmm4,%xmm13
	subq	$0x10,%rdx
	jbe	L$cbc_dec_four

	/* Five blocks: 6-lane helper with a zeroed sixth lane. */
	movups	64(%rdi),%xmm6
	movaps	%xmm5,%xmm14
	movaps	%xmm6,%xmm15
	xorps	%xmm7,%xmm7
	call	_aesni_decrypt6
	pxor	%xmm10,%xmm2
	movaps	%xmm15,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	pxor	%xmm13,%xmm5
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	pxor	%xmm14,%xmm6
	movdqu	%xmm5,48(%rsi)
	pxor	%xmm5,%xmm5
	leaq	64(%rsi),%rsi
	movdqa	%xmm6,%xmm2
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	subq	$0x10,%rdx
	jmp	L$cbc_dec_tail_collected

.p2align	4
	/* One block: inline single-block decrypt, eax rounds. */
L$cbc_dec_one:
	movaps	%xmm2,%xmm11
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_17:
	/* aesdec %xmm1,%xmm2 */
.byte	102,15,56,222,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_17
	/* aesdeclast %xmm1,%xmm2 */
.byte	102,15,56,223,209
	xorps	%xmm10,%xmm2
	movaps	%xmm11,%xmm10
	jmp	L$cbc_dec_tail_collected
.p2align	4
	/* Two blocks. */
L$cbc_dec_two:
	movaps	%xmm3,%xmm12
	call	_aesni_decrypt2
	pxor	%xmm10,%xmm2
	movaps	%xmm12,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	movdqa	%xmm3,%xmm2
	pxor	%xmm3,%xmm3
	leaq	16(%rsi),%rsi
	jmp	L$cbc_dec_tail_collected
.p2align	4
	/* Three blocks. */
L$cbc_dec_three:
	movaps	%xmm4,%xmm13
	call	_aesni_decrypt3
	pxor	%xmm10,%xmm2
	movaps	%xmm13,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	movdqa	%xmm4,%xmm2
	pxor	%xmm4,%xmm4
	leaq	32(%rsi),%rsi
	jmp	L$cbc_dec_tail_collected
.p2align	4
	/* Four blocks. */
L$cbc_dec_four:
	movaps	%xmm5,%xmm14
	call	_aesni_decrypt4
	pxor	%xmm10,%xmm2
	movaps	%xmm14,%xmm10
	pxor	%xmm11,%xmm3
	movdqu	%xmm2,(%rsi)
	pxor	%xmm12,%xmm4
	movdqu	%xmm3,16(%rsi)
	pxor	%xmm3,%xmm3
	pxor	%xmm13,%xmm5
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm4,%xmm4
	movdqa	%xmm5,%xmm2
	pxor	%xmm5,%xmm5
	leaq	48(%rsi),%rsi
	jmp	L$cbc_dec_tail_collected

.p2align	4
	/* Entry used by the loop exits, which have not scrubbed xmm3..xmm9. */
L$cbc_dec_clear_tail_collected:
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	/*
	 * Common exit: xmm10 = chaining value to write back, xmm2 = last
	 * result block, not yet stored.  rdx & 15 != 0 means the length was
	 * not a multiple of 16.
	 */
L$cbc_dec_tail_collected:
	movups	%xmm10,(%r8)
	andq	$15,%rdx
	jnz	L$cbc_dec_tail_partial
	movups	%xmm2,(%rsi)
	pxor	%xmm2,%xmm2
	jmp	L$cbc_dec_ret
.p2align	4
	/*
	 * Partial last block: spill xmm2 to the aligned stack slot and copy
	 * from there with rep movsb, then overwrite the slot with zeros.
	 * NOTE(review): the count placed in rcx is 16 - (rdx & 15), not
	 * rdx & 15 -- confirm this is the intended number of bytes.
	 */
L$cbc_dec_tail_partial:
	movaps	%xmm2,(%rsp)
	pxor	%xmm2,%xmm2
	movq	$16,%rcx
	movq	%rsi,%rdi
	subq	%rdx,%rcx
	leaq	(%rsp),%rsi
	/* rep movsb ; 2-byte nop */
.long	0x9066A4F3
	movdqa	%xmm2,(%rsp)

	/* Restore rbp and rsp from the entry-rsp copy kept in r11. */
L$cbc_dec_ret:
	xorps	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	movq	-8(%r11),%rbp
	leaq	(%r11),%rsp
L$cbc_ret:
	/* rep ret */
	.byte	0xf3,0xc3
3941
3942.globl	_aesni_set_decrypt_key
3943.private_extern _aesni_set_decrypt_key
3944
3945.p2align	4
/*
 * _aesni_set_decrypt_key -- exported entry point.
 *
 * Calls __aesni_set_encrypt_key with the incoming arguments untouched, then
 * converts the resulting schedule in place: the round keys are reversed
 * end-for-end and every key except the two outermost ones is passed
 * through aesimc.
 *
 * Returns (eax) whatever __aesni_set_encrypt_key returned; on a non-zero
 * result the schedule is left as that routine produced it.
 * After the call the code relies on rdx still pointing at the schedule and
 * on esi holding the round-count word that routine stored at offset 240.
 */
_aesni_set_decrypt_key:
	/* sub $8,%rsp -- keeps the stack 16-byte aligned across the call */
.byte	0x48,0x83,0xEC,0x08
	call	__aesni_set_encrypt_key
	/* esi = 16 * round count; rdi = address of the last round key. */
	shll	$4,%esi
	testl	%eax,%eax
	jnz	L$dec_key_ret
	leaq	16(%rdx,%rsi,1),%rdi

	/* Swap the first and last round keys unchanged. */
	movups	(%rdx),%xmm0
	movups	(%rdi),%xmm1
	movups	%xmm0,(%rdi)
	movups	%xmm1,(%rdx)
	leaq	16(%rdx),%rdx
	leaq	-16(%rdi),%rdi

	/* Walk inwards from both ends, swapping pairs through aesimc. */
L$dec_key_inverse:
	movups	(%rdx),%xmm0
	movups	(%rdi),%xmm1
	/* aesimc %xmm0,%xmm0 ; aesimc %xmm1,%xmm1 */
.byte	102,15,56,219,192
.byte	102,15,56,219,201
	leaq	16(%rdx),%rdx
	leaq	-16(%rdi),%rdi
	movups	%xmm0,16(%rdi)
	movups	%xmm1,-16(%rdx)
	cmpq	%rdx,%rdi
	ja	L$dec_key_inverse

	/* rdx == rdi here: the middle key is transformed in place. */
	movups	(%rdx),%xmm0
	/* aesimc %xmm0,%xmm0 */
.byte	102,15,56,219,192
	pxor	%xmm1,%xmm1
	movups	%xmm0,(%rdi)
	pxor	%xmm0,%xmm0
L$dec_key_ret:
	addq	$8,%rsp
	/* rep ret */
	.byte	0xf3,0xc3
L$SEH_end_set_decrypt_key:
3982
3983.globl	_aesni_set_encrypt_key
3984.private_extern _aesni_set_encrypt_key
3985
3986.p2align	4
3987_aesni_set_encrypt_key:
3988__aesni_set_encrypt_key:
3989.byte	0x48,0x83,0xEC,0x08
3990	movq	$-1,%rax
3991	testq	%rdi,%rdi
3992	jz	L$enc_key_ret
3993	testq	%rdx,%rdx
3994	jz	L$enc_key_ret
3995
3996	movl	$268437504,%r10d
3997	movups	(%rdi),%xmm0
3998	xorps	%xmm4,%xmm4
3999	andl	_OPENSSL_ia32cap_P+4(%rip),%r10d
4000	leaq	16(%rdx),%rax
4001	cmpl	$256,%esi
4002	je	L$14rounds
4003	cmpl	$192,%esi
4004	je	L$12rounds
4005	cmpl	$128,%esi
4006	jne	L$bad_keybits
4007
4008L$10rounds:
4009	movl	$9,%esi
4010	cmpl	$268435456,%r10d
4011	je	L$10rounds_alt
4012
4013	movups	%xmm0,(%rdx)
4014.byte	102,15,58,223,200,1
4015	call	L$key_expansion_128_cold
4016.byte	102,15,58,223,200,2
4017	call	L$key_expansion_128
4018.byte	102,15,58,223,200,4
4019	call	L$key_expansion_128
4020.byte	102,15,58,223,200,8
4021	call	L$key_expansion_128
4022.byte	102,15,58,223,200,16
4023	call	L$key_expansion_128
4024.byte	102,15,58,223,200,32
4025	call	L$key_expansion_128
4026.byte	102,15,58,223,200,64
4027	call	L$key_expansion_128
4028.byte	102,15,58,223,200,128
4029	call	L$key_expansion_128
4030.byte	102,15,58,223,200,27
4031	call	L$key_expansion_128
4032.byte	102,15,58,223,200,54
4033	call	L$key_expansion_128
4034	movups	%xmm0,(%rax)
4035	movl	%esi,80(%rax)
4036	xorl	%eax,%eax
4037	jmp	L$enc_key_ret
4038
4039.p2align	4
4040L$10rounds_alt:
4041	movdqa	L$key_rotate(%rip),%xmm5
4042	movl	$8,%r10d
4043	movdqa	L$key_rcon1(%rip),%xmm4
4044	movdqa	%xmm0,%xmm2
4045	movdqu	%xmm0,(%rdx)
4046	jmp	L$oop_key128
4047
4048.p2align	4
4049L$oop_key128:
4050.byte	102,15,56,0,197
4051.byte	102,15,56,221,196
4052	pslld	$1,%xmm4
4053	leaq	16(%rax),%rax
4054
4055	movdqa	%xmm2,%xmm3
4056	pslldq	$4,%xmm2
4057	pxor	%xmm2,%xmm3
4058	pslldq	$4,%xmm2
4059	pxor	%xmm2,%xmm3
4060	pslldq	$4,%xmm2
4061	pxor	%xmm3,%xmm2
4062
4063	pxor	%xmm2,%xmm0
4064	movdqu	%xmm0,-16(%rax)
4065	movdqa	%xmm0,%xmm2
4066
4067	decl	%r10d
4068	jnz	L$oop_key128
4069
4070	movdqa	L$key_rcon1b(%rip),%xmm4
4071
4072.byte	102,15,56,0,197
4073.byte	102,15,56,221,196
4074	pslld	$1,%xmm4
4075
4076	movdqa	%xmm2,%xmm3
4077	pslldq	$4,%xmm2
4078	pxor	%xmm2,%xmm3
4079	pslldq	$4,%xmm2
4080	pxor	%xmm2,%xmm3
4081	pslldq	$4,%xmm2
4082	pxor	%xmm3,%xmm2
4083
4084	pxor	%xmm2,%xmm0
4085	movdqu	%xmm0,(%rax)
4086
4087	movdqa	%xmm0,%xmm2
4088.byte	102,15,56,0,197
4089.byte	102,15,56,221,196
4090
4091	movdqa	%xmm2,%xmm3
4092	pslldq	$4,%xmm2
4093	pxor	%xmm2,%xmm3
4094	pslldq	$4,%xmm2
4095	pxor	%xmm2,%xmm3
4096	pslldq	$4,%xmm2
4097	pxor	%xmm3,%xmm2
4098
4099	pxor	%xmm2,%xmm0
4100	movdqu	%xmm0,16(%rax)
4101
4102	movl	%esi,96(%rax)
4103	xorl	%eax,%eax
4104	jmp	L$enc_key_ret
4105
4106.p2align	4
4107L$12rounds:
4108	movq	16(%rdi),%xmm2
4109	movl	$11,%esi
4110	cmpl	$268435456,%r10d
4111	je	L$12rounds_alt
4112
4113	movups	%xmm0,(%rdx)
4114.byte	102,15,58,223,202,1
4115	call	L$key_expansion_192a_cold
4116.byte	102,15,58,223,202,2
4117	call	L$key_expansion_192b
4118.byte	102,15,58,223,202,4
4119	call	L$key_expansion_192a
4120.byte	102,15,58,223,202,8
4121	call	L$key_expansion_192b
4122.byte	102,15,58,223,202,16
4123	call	L$key_expansion_192a
4124.byte	102,15,58,223,202,32
4125	call	L$key_expansion_192b
4126.byte	102,15,58,223,202,64
4127	call	L$key_expansion_192a
4128.byte	102,15,58,223,202,128
4129	call	L$key_expansion_192b
4130	movups	%xmm0,(%rax)
4131	movl	%esi,48(%rax)
4132	xorq	%rax,%rax
4133	jmp	L$enc_key_ret
4134
4135.p2align	4
4136L$12rounds_alt:
4137	movdqa	L$key_rotate192(%rip),%xmm5
4138	movdqa	L$key_rcon1(%rip),%xmm4
4139	movl	$8,%r10d
4140	movdqu	%xmm0,(%rdx)
4141	jmp	L$oop_key192
4142
4143.p2align	4
4144L$oop_key192:
4145	movq	%xmm2,0(%rax)
4146	movdqa	%xmm2,%xmm1
4147.byte	102,15,56,0,213
4148.byte	102,15,56,221,212
4149	pslld	$1,%xmm4
4150	leaq	24(%rax),%rax
4151
4152	movdqa	%xmm0,%xmm3
4153	pslldq	$4,%xmm0
4154	pxor	%xmm0,%xmm3
4155	pslldq	$4,%xmm0
4156	pxor	%xmm0,%xmm3
4157	pslldq	$4,%xmm0
4158	pxor	%xmm3,%xmm0
4159
4160	pshufd	$0xff,%xmm0,%xmm3
4161	pxor	%xmm1,%xmm3
4162	pslldq	$4,%xmm1
4163	pxor	%xmm1,%xmm3
4164
4165	pxor	%xmm2,%xmm0
4166	pxor	%xmm3,%xmm2
4167	movdqu	%xmm0,-16(%rax)
4168
4169	decl	%r10d
4170	jnz	L$oop_key192
4171
4172	movl	%esi,32(%rax)
4173	xorl	%eax,%eax
4174	jmp	L$enc_key_ret
4175
4176.p2align	4
4177L$14rounds:
4178	movups	16(%rdi),%xmm2
4179	movl	$13,%esi
4180	leaq	16(%rax),%rax
4181	cmpl	$268435456,%r10d
4182	je	L$14rounds_alt
4183
4184	movups	%xmm0,(%rdx)
4185	movups	%xmm2,16(%rdx)
4186.byte	102,15,58,223,202,1
4187	call	L$key_expansion_256a_cold
4188.byte	102,15,58,223,200,1
4189	call	L$key_expansion_256b
4190.byte	102,15,58,223,202,2
4191	call	L$key_expansion_256a
4192.byte	102,15,58,223,200,2
4193	call	L$key_expansion_256b
4194.byte	102,15,58,223,202,4
4195	call	L$key_expansion_256a
4196.byte	102,15,58,223,200,4
4197	call	L$key_expansion_256b
4198.byte	102,15,58,223,202,8
4199	call	L$key_expansion_256a
4200.byte	102,15,58,223,200,8
4201	call	L$key_expansion_256b
4202.byte	102,15,58,223,202,16
4203	call	L$key_expansion_256a
4204.byte	102,15,58,223,200,16
4205	call	L$key_expansion_256b
4206.byte	102,15,58,223,202,32
4207	call	L$key_expansion_256a
4208.byte	102,15,58,223,200,32
4209	call	L$key_expansion_256b
4210.byte	102,15,58,223,202,64
4211	call	L$key_expansion_256a
4212	movups	%xmm0,(%rax)
4213	movl	%esi,16(%rax)
4214	xorq	%rax,%rax
4215	jmp	L$enc_key_ret
4216
4217.p2align	4
4218L$14rounds_alt:
4219	movdqa	L$key_rotate(%rip),%xmm5
4220	movdqa	L$key_rcon1(%rip),%xmm4
4221	movl	$7,%r10d
4222	movdqu	%xmm0,0(%rdx)
4223	movdqa	%xmm2,%xmm1
4224	movdqu	%xmm2,16(%rdx)
4225	jmp	L$oop_key256
4226
/* L$oop_key256: one pass produces up to two round keys. */
/* Loop state: */
/*   xmm0 = previous even key, xmm1 = previous odd key, */
/*   xmm2 = copy of previous odd key (input to the byte shuffle), */
/*   xmm4 = round constant, xmm5 = shuffle mask, */
/*   rax  = write cursor, r10d = passes remaining. */
.p2align	4
L$oop_key256:
/* pshufb %xmm5,%xmm2: with the L$key_rotate mask every dword becomes */
/* bytes 13,14,15,12 of xmm2, i.e. the top dword rotated by one byte. */
.byte	102,15,56,0,213
/* aesenclast %xmm4,%xmm2: byte substitution, then xor with xmm4. */
/* All four dwords are equal here, so the row shift changes nothing. */
.byte	102,15,56,221,212

/* xmm0 = x ^ (x<<32) ^ (x<<64) ^ (x<<96), x being the old xmm0, */
/* which gives the running xor of its dwords. xmm3 is scratch. */
	movdqa	%xmm0,%xmm3
	pslldq	$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	$4,%xmm0
	pxor	%xmm3,%xmm0
/* Double the round constant for the next pass (1,2,4,...,64). */
	pslld	$1,%xmm4

/* Fold in the substituted word and store the new even key. */
	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

/* The 7th pass stops here, after its even key has been written. */
	decl	%r10d
	jz	L$done_key256

/* Broadcast the top dword of the new even key into xmm2, then */
/* aesenclast %xmm3,%xmm2 with xmm3 = 0: substitution only, with no */
/* rotation and no round constant. */
	pshufd	$0xff,%xmm0,%xmm2
	pxor	%xmm3,%xmm3
.byte	102,15,56,221,211

/* Same running-xor construction applied to the previous odd key. */
	movdqa	%xmm1,%xmm3
	pslldq	$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	$4,%xmm1
	pxor	%xmm3,%xmm1

/* New odd key: store it, advance the cursor past both keys, and keep */
/* a copy in xmm1 because xmm2 is overwritten at the top of the loop. */
	pxor	%xmm1,%xmm2
	movdqu	%xmm2,16(%rax)
	leaq	32(%rax),%rax
	movdqa	%xmm2,%xmm1

	jmp	L$oop_key256

/* Write esi (13, set at L$14rounds) right after the last key stored */
/* at (rax), set the result to 0 and leave through the common exit. */
L$done_key256:
	movl	%esi,16(%rax)
	xorl	%eax,%eax
	jmp	L$enc_key_ret
4270
/* L$bad_keybits: error exit, result rax = -2. The branch that reaches */
/* it is outside this view; the label suggests an unsupported key size. */
.p2align	4
L$bad_keybits:
	movq	$-2,%rax
/* L$enc_key_ret: common exit. rax already holds the result. */
L$enc_key_ret:
/* Zero xmm0-xmm5, presumably so that no key material stays behind in */
/* the registers used by the schedule code. */
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
/* Release 8 bytes of stack; the matching adjustment is presumably made */
/* in the prologue, which is outside this view - TODO confirm. */
	addq	$8,%rsp
/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
L$SEH_end_set_encrypt_key:
4284
/* L$key_expansion_128: internal helper, reached with call. */
/* In:  xmm0 = current round key, xmm1 = keygen-assist result, */
/*      rax = schedule write cursor. */
/*      NOTE(review): the shuffles below appear to rely on dword 0 of */
/*      xmm4 being zero on entry; that setup is outside this view. */
/* Out: xmm0 = next round key, rax advanced by 16 (main entry only). */
/* Clobbers xmm1 and xmm4. */
.p2align	4
L$key_expansion_128:
/* Store the current key before deriving the next one. */
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax
/* _cold entry: same derivation without the store and cursor advance. */
L$key_expansion_128_cold:
/* Two shufps/xorps pairs fold the lower dwords of xmm0 into the higher */
/* ones (a running xor across the four dwords). */
	shufps	$16,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
/* Broadcast dword 3 of xmm1 and xor it into every dword of xmm0. */
	shufps	$255,%xmm1,%xmm1
	xorps	%xmm1,%xmm0
/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
4297
/* L$key_expansion_192a: internal helper with three entry points. */
/* In:  xmm0 and xmm2 = current key material, xmm1 = keygen-assist */
/*      result, rax = schedule write cursor. */
/*      NOTE(review): dword 0 of xmm4 is presumably zero on entry; */
/*      that setup is outside this view. */
/* Out: xmm0 and xmm2 updated. Clobbers xmm1, xmm3, xmm4. */
/*      xmm5 = copy of the incoming xmm2 unless entered at _192b_warm. */
.p2align	4
L$key_expansion_192a:
/* Main entry: store xmm0 and advance the cursor first. */
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax
/* _cold entry: no store. Saves xmm2 in xmm5 for L$key_expansion_192b. */
L$key_expansion_192a_cold:
	movaps	%xmm2,%xmm5
/* _warm entry: jumped to from L$key_expansion_192b; xmm5 is left as is. */
L$key_expansion_192b_warm:
/* Running xor across the dwords of xmm0 (shufps/xorps pairs), */
/* interleaved with xmm3 = xmm2 shifted left by one dword. */
	shufps	$16,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	pslldq	$4,%xmm3
	xorps	%xmm4,%xmm0
/* Broadcast dword 1 of xmm1 and fold it into xmm0. */
	pshufd	$85,%xmm1,%xmm1
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
/* Fold the top dword of the new xmm0 into every dword of xmm2. */
	pshufd	$255,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
4317
/* L$key_expansion_192b: internal helper. Repacks key material into two */
/* 16-byte blocks, stores them, then continues at the _warm entry of */
/* L$key_expansion_192a. */
/* In:  xmm0, xmm2 = current key material, xmm5 = value saved by an */
/*      earlier pass through L$key_expansion_192a, rax = write cursor. */
/* Out: 32 bytes written at rax, rax advanced by 32, then as for */
/*      L$key_expansion_192b_warm. */
.p2align	4
L$key_expansion_192b:
	movaps	%xmm0,%xmm3
/* xmm5 = (low qword of xmm5, low qword of xmm0). */
	shufps	$68,%xmm0,%xmm5
	movups	%xmm5,(%rax)
/* xmm3 = (high qword of xmm0, low qword of xmm2). */
	shufps	$78,%xmm2,%xmm3
	movups	%xmm3,16(%rax)
	leaq	32(%rax),%rax
/* Tail jump: the ret at the end of the _warm code returns to the */
/* caller of this helper. */
	jmp	L$key_expansion_192b_warm
4327
/* L$key_expansion_256a: internal helper, called from L$14rounds. */
/* In:  xmm0 = even key to update, xmm2 = odd key to store, */
/*      xmm1 = keygen-assist result, rax = schedule write cursor. */
/*      NOTE(review): dword 0 of xmm4 is presumably zero on entry; */
/*      that setup is outside this view. */
/* Out: xmm0 = next even key, rax advanced by 16 (main entry only). */
/* Clobbers xmm1 and xmm4. */
.p2align	4
L$key_expansion_256a:
/* Store the odd key produced by the preceding step. */
	movups	%xmm2,(%rax)
	leaq	16(%rax),%rax
/* _cold entry: used for the first step, when there is nothing to store. */
L$key_expansion_256a_cold:
/* Running xor across the four dwords of xmm0. */
	shufps	$16,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
/* Broadcast dword 3 of xmm1 and xor it into every dword of xmm0. */
	shufps	$255,%xmm1,%xmm1
	xorps	%xmm1,%xmm0
/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
4340
/* L$key_expansion_256b: internal helper, called from L$14rounds. */
/* In:  xmm0 = even key to store, xmm2 = odd key to update, */
/*      xmm1 = keygen-assist result, rax = schedule write cursor. */
/*      NOTE(review): dword 0 of xmm4 is presumably zero on entry; */
/*      that setup is outside this view. */
/* Out: xmm2 = next odd key, rax advanced by 16. */
/* Clobbers xmm1 and xmm4. */
.p2align	4
L$key_expansion_256b:
/* Store the even key produced by the preceding step. */
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax

/* Running xor across the four dwords of xmm2. */
	shufps	$16,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	$140,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
/* Broadcast dword 2 of xmm1 (not dword 3 as in the 256a helper) and */
/* xor it into every dword of xmm2. */
	shufps	$170,%xmm1,%xmm1
	xorps	%xmm1,%xmm2
/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
4353
4354
/* Constant pool: 16-byte entries starting on a 64-byte boundary, so */
/* every label below is 16-byte aligned. Apart from L$key_rotate and */
/* L$key_rcon1 (loaded at L$14rounds_alt), the users of these constants */
/* are outside this view; usage notes are hedged accordingly. */
.p2align	6
/* Byte indices 15..0; reverses byte order if used as a pshufb mask. */
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* Dwords 6,6,6,0. */
L$increment32:
.long	6,6,6,0
/* Dwords 1,0,0,0 (value 1 in the low qword). */
L$increment64:
.long	1,0,0,0
/* Dwords 0x87,0,1,0. */
L$xts_magic:
.long	0x87,0,1,0
/* Fifteen zero bytes followed by a single 1 in the last byte. */
L$increment1:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
/* pshufb mask: each dword picks source bytes 13,14,15,12. */
L$key_rotate:
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
/* pshufb mask: each dword picks source bytes 5,6,7,4. */
L$key_rotate192:
.long	0x04070605,0x04070605,0x04070605,0x04070605
/* Initial round constant 1 replicated in all four dwords. */
L$key_rcon1:
.long	1,1,1,1
/* 0x1b in all four dwords; presumably the round constant that follows */
/* 0x80 - TODO confirm against the user outside this view. */
L$key_rcon1b:
.long	0x1b,0x1b,0x1b,0x1b

/* NUL-terminated ASCII: */
/* "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>" */
.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/* Pad the end of the section contents to a 64-byte boundary. */
.p2align	6
4377#endif
4378