1#if defined(__x86_64__)
2.text
3
# _aesni_encrypt: encrypt a single 16-byte block using the AES-NI round
# instructions (emitted below as raw .byte sequences).
#   In:    %rdi = address of the 16-byte input block
#          %rsi = address that receives the 16-byte result
#          %rdx = address of the key schedule: consecutive 16-byte round keys
#                 start at offset 0 and a 32-bit loop count is read from
#                 offset 240
#   Clobb: %eax, %rdx, %xmm0, %xmm1, %xmm2, flags
# NOTE(review): presumably called from C as (in, out, key); confirm against
# the prototype.
4.globl	_aesni_encrypt
5.private_extern _aesni_encrypt
6
7.p2align	4
8_aesni_encrypt:
9	movups	(%rdi),%xmm2
# %eax = loop count: number of aesenc rounds run before the final aesenclast
10	movl	240(%rdx),%eax
11	movups	(%rdx),%xmm0
12	movups	16(%rdx),%xmm1
13	leaq	32(%rdx),%rdx
# XOR round key 0 into the block
14	xorps	%xmm0,%xmm2
15L$oop_enc1_1:
# aesenc %xmm1,%xmm2
16.byte	102,15,56,220,209
17	decl	%eax
# fetch the next round key; movups and leaq leave the flags from decl intact
18	movups	(%rdx),%xmm1
19	leaq	16(%rdx),%rdx
20	jnz	L$oop_enc1_1
# aesenclast %xmm1,%xmm2
21.byte	102,15,56,221,209
22	movups	%xmm2,(%rsi)
# rep ret
23	.byte	0xf3,0xc3
24
25
# _aesni_decrypt: decrypt a single 16-byte block using the AES-NI round
# instructions (emitted below as raw .byte sequences).
#   In:    %rdi = address of the 16-byte input block
#          %rsi = address that receives the 16-byte result
#          %rdx = address of the key schedule: consecutive 16-byte round keys
#                 start at offset 0 and a 32-bit loop count is read from
#                 offset 240
#   Clobb: %eax, %rdx, %xmm0, %xmm1, %xmm2, flags
# NOTE(review): presumably the key schedule must be one prepared for
# decryption; confirm against the key setup code.
26.globl	_aesni_decrypt
27.private_extern _aesni_decrypt
28
29.p2align	4
30_aesni_decrypt:
31	movups	(%rdi),%xmm2
# %eax = loop count: number of aesdec rounds run before the final aesdeclast
32	movl	240(%rdx),%eax
33	movups	(%rdx),%xmm0
34	movups	16(%rdx),%xmm1
35	leaq	32(%rdx),%rdx
# XOR round key 0 into the block
36	xorps	%xmm0,%xmm2
37L$oop_dec1_2:
# aesdec %xmm1,%xmm2
38.byte	102,15,56,222,209
39	decl	%eax
# fetch the next round key; movups and leaq leave the flags from decl intact
40	movups	(%rdx),%xmm1
41	leaq	16(%rdx),%rdx
42	jnz	L$oop_dec1_2
# aesdeclast %xmm1,%xmm2
43.byte	102,15,56,223,209
44	movups	%xmm2,(%rsi)
# rep ret
45	.byte	0xf3,0xc3
46
47
# _aesni_encrypt2: file-local helper (no .globl) that runs two blocks
# through the AES encryption rounds side by side.
#   In:    %xmm2, %xmm3 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2, %xmm3 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
48.p2align	4
49_aesni_encrypt2:
50	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
51	shll	$4,%eax
52	movups	16(%rcx),%xmm1
# XOR round key 0 into both blocks
53	xorps	%xmm0,%xmm2
54	xorps	%xmm0,%xmm3
55	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
56	leaq	32(%rcx,%rax,1),%rcx
57	negq	%rax
58	addq	$16,%rax
59
60L$enc_loop2:
# aesenc %xmm1 into %xmm2, %xmm3
61.byte	102,15,56,220,209
62.byte	102,15,56,220,217
63	movups	(%rcx,%rax,1),%xmm1
64	addq	$32,%rax
# aesenc %xmm0 into %xmm2, %xmm3
65.byte	102,15,56,220,208
66.byte	102,15,56,220,216
# movups leaves the flags from addq intact for the jnz below
67	movups	-16(%rcx,%rax,1),%xmm0
68	jnz	L$enc_loop2
69
# final two rounds: aesenc with %xmm1, then aesenclast with %xmm0
70.byte	102,15,56,220,209
71.byte	102,15,56,220,217
72.byte	102,15,56,221,208
73.byte	102,15,56,221,216
# rep ret
74	.byte	0xf3,0xc3
75
76
# _aesni_decrypt2: file-local helper (no .globl) that runs two blocks
# through the AES decryption rounds side by side.
#   In:    %xmm2, %xmm3 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2, %xmm3 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
77.p2align	4
78_aesni_decrypt2:
79	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
80	shll	$4,%eax
81	movups	16(%rcx),%xmm1
# XOR round key 0 into both blocks
82	xorps	%xmm0,%xmm2
83	xorps	%xmm0,%xmm3
84	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
85	leaq	32(%rcx,%rax,1),%rcx
86	negq	%rax
87	addq	$16,%rax
88
89L$dec_loop2:
# aesdec %xmm1 into %xmm2, %xmm3
90.byte	102,15,56,222,209
91.byte	102,15,56,222,217
92	movups	(%rcx,%rax,1),%xmm1
93	addq	$32,%rax
# aesdec %xmm0 into %xmm2, %xmm3
94.byte	102,15,56,222,208
95.byte	102,15,56,222,216
# movups leaves the flags from addq intact for the jnz below
96	movups	-16(%rcx,%rax,1),%xmm0
97	jnz	L$dec_loop2
98
# final two rounds: aesdec with %xmm1, then aesdeclast with %xmm0
99.byte	102,15,56,222,209
100.byte	102,15,56,222,217
101.byte	102,15,56,223,208
102.byte	102,15,56,223,216
# rep ret
103	.byte	0xf3,0xc3
104
105
# _aesni_encrypt3: file-local helper (no .globl) that runs three blocks
# through the AES encryption rounds side by side.
#   In:    %xmm2, %xmm3, %xmm4 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2, %xmm3, %xmm4 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
106.p2align	4
107_aesni_encrypt3:
108	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
109	shll	$4,%eax
110	movups	16(%rcx),%xmm1
# XOR round key 0 into all three blocks
111	xorps	%xmm0,%xmm2
112	xorps	%xmm0,%xmm3
113	xorps	%xmm0,%xmm4
114	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
115	leaq	32(%rcx,%rax,1),%rcx
116	negq	%rax
117	addq	$16,%rax
118
119L$enc_loop3:
# aesenc %xmm1 into %xmm2, %xmm3, %xmm4
120.byte	102,15,56,220,209
121.byte	102,15,56,220,217
122.byte	102,15,56,220,225
123	movups	(%rcx,%rax,1),%xmm1
124	addq	$32,%rax
# aesenc %xmm0 into %xmm2, %xmm3, %xmm4
125.byte	102,15,56,220,208
126.byte	102,15,56,220,216
127.byte	102,15,56,220,224
# movups leaves the flags from addq intact for the jnz below
128	movups	-16(%rcx,%rax,1),%xmm0
129	jnz	L$enc_loop3
130
# final two rounds: aesenc with %xmm1, then aesenclast with %xmm0
131.byte	102,15,56,220,209
132.byte	102,15,56,220,217
133.byte	102,15,56,220,225
134.byte	102,15,56,221,208
135.byte	102,15,56,221,216
136.byte	102,15,56,221,224
# rep ret
137	.byte	0xf3,0xc3
138
139
# _aesni_decrypt3: file-local helper (no .globl) that runs three blocks
# through the AES decryption rounds side by side.
#   In:    %xmm2, %xmm3, %xmm4 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2, %xmm3, %xmm4 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
140.p2align	4
141_aesni_decrypt3:
142	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
143	shll	$4,%eax
144	movups	16(%rcx),%xmm1
# XOR round key 0 into all three blocks
145	xorps	%xmm0,%xmm2
146	xorps	%xmm0,%xmm3
147	xorps	%xmm0,%xmm4
148	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
149	leaq	32(%rcx,%rax,1),%rcx
150	negq	%rax
151	addq	$16,%rax
152
153L$dec_loop3:
# aesdec %xmm1 into %xmm2, %xmm3, %xmm4
154.byte	102,15,56,222,209
155.byte	102,15,56,222,217
156.byte	102,15,56,222,225
157	movups	(%rcx,%rax,1),%xmm1
158	addq	$32,%rax
# aesdec %xmm0 into %xmm2, %xmm3, %xmm4
159.byte	102,15,56,222,208
160.byte	102,15,56,222,216
161.byte	102,15,56,222,224
# movups leaves the flags from addq intact for the jnz below
162	movups	-16(%rcx,%rax,1),%xmm0
163	jnz	L$dec_loop3
164
# final two rounds: aesdec with %xmm1, then aesdeclast with %xmm0
165.byte	102,15,56,222,209
166.byte	102,15,56,222,217
167.byte	102,15,56,222,225
168.byte	102,15,56,223,208
169.byte	102,15,56,223,216
170.byte	102,15,56,223,224
# rep ret
171	.byte	0xf3,0xc3
172
173
# _aesni_encrypt4: file-local helper (no .globl) that runs four blocks
# through the AES encryption rounds side by side.
#   In:    %xmm2-%xmm5 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm5 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
174.p2align	4
175_aesni_encrypt4:
176	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
177	shll	$4,%eax
178	movups	16(%rcx),%xmm1
# XOR round key 0 into all four blocks
179	xorps	%xmm0,%xmm2
180	xorps	%xmm0,%xmm3
181	xorps	%xmm0,%xmm4
182	xorps	%xmm0,%xmm5
183	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
184	leaq	32(%rcx,%rax,1),%rcx
185	negq	%rax
# three-byte NOP, nopl (%rax); presumably padding so the loop lands on a
# favourable address
186.byte	0x0f,0x1f,0x00
187	addq	$16,%rax
188
189L$enc_loop4:
# aesenc %xmm1 into %xmm2-%xmm5
190.byte	102,15,56,220,209
191.byte	102,15,56,220,217
192.byte	102,15,56,220,225
193.byte	102,15,56,220,233
194	movups	(%rcx,%rax,1),%xmm1
195	addq	$32,%rax
# aesenc %xmm0 into %xmm2-%xmm5
196.byte	102,15,56,220,208
197.byte	102,15,56,220,216
198.byte	102,15,56,220,224
199.byte	102,15,56,220,232
# movups leaves the flags from addq intact for the jnz below
200	movups	-16(%rcx,%rax,1),%xmm0
201	jnz	L$enc_loop4
202
# final two rounds: aesenc with %xmm1, then aesenclast with %xmm0
203.byte	102,15,56,220,209
204.byte	102,15,56,220,217
205.byte	102,15,56,220,225
206.byte	102,15,56,220,233
207.byte	102,15,56,221,208
208.byte	102,15,56,221,216
209.byte	102,15,56,221,224
210.byte	102,15,56,221,232
# rep ret
211	.byte	0xf3,0xc3
212
213
# _aesni_decrypt4: file-local helper (no .globl) that runs four blocks
# through the AES decryption rounds side by side.
#   In:    %xmm2-%xmm5 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (the caller seen in this file passes the value
#                 read from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm5 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# Round keys alternate between %xmm1 and %xmm0; one is reloaded while the
# other is in use. The loop index in %rax starts at 16 - 16*count and
# advances by 32 until it is exactly zero, so the count must be odd and at
# least 3 for the loop to terminate as written.
214.p2align	4
215_aesni_decrypt4:
216	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
217	shll	$4,%eax
218	movups	16(%rcx),%xmm1
# XOR round key 0 into all four blocks
219	xorps	%xmm0,%xmm2
220	xorps	%xmm0,%xmm3
221	xorps	%xmm0,%xmm4
222	xorps	%xmm0,%xmm5
223	movups	32(%rcx),%xmm0
# %rcx = key + 32 + 16*count; round keys are then addressed from here with
# the negative index kept in %rax
224	leaq	32(%rcx,%rax,1),%rcx
225	negq	%rax
# three-byte NOP, nopl (%rax); presumably padding so the loop lands on a
# favourable address
226.byte	0x0f,0x1f,0x00
227	addq	$16,%rax
228
229L$dec_loop4:
# aesdec %xmm1 into %xmm2-%xmm5
230.byte	102,15,56,222,209
231.byte	102,15,56,222,217
232.byte	102,15,56,222,225
233.byte	102,15,56,222,233
234	movups	(%rcx,%rax,1),%xmm1
235	addq	$32,%rax
# aesdec %xmm0 into %xmm2-%xmm5
236.byte	102,15,56,222,208
237.byte	102,15,56,222,216
238.byte	102,15,56,222,224
239.byte	102,15,56,222,232
# movups leaves the flags from addq intact for the jnz below
240	movups	-16(%rcx,%rax,1),%xmm0
241	jnz	L$dec_loop4
242
# final two rounds: aesdec with %xmm1, then aesdeclast with %xmm0
243.byte	102,15,56,222,209
244.byte	102,15,56,222,217
245.byte	102,15,56,222,225
246.byte	102,15,56,222,233
247.byte	102,15,56,223,208
248.byte	102,15,56,223,216
249.byte	102,15,56,223,224
250.byte	102,15,56,223,232
# rep ret
251	.byte	0xf3,0xc3
252
253
# _aesni_encrypt6: file-local helper (no .globl) that runs six blocks
# through the AES encryption rounds side by side.
#   In:    %xmm2-%xmm7 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (callers seen in this file pass the value read
#                 from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm7 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# The XOR with round key 0 and the first aesenc round are interleaved in
# the prologue, which then jumps into the middle of the loop. The loop
# index in %rax starts at 16 - 16*count and advances by 32 until it is
# exactly zero, so the count must be odd and at least 3.
# L$enc_loop6 is also used as a call target by the CTR code later in this
# file, so the label and the register roles at that point must stay as is.
254.p2align	4
255_aesni_encrypt6:
256	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
257	shll	$4,%eax
258	movups	16(%rcx),%xmm1
259	xorps	%xmm0,%xmm2
260	pxor	%xmm0,%xmm3
261	pxor	%xmm0,%xmm4
# aesenc %xmm1,%xmm2
262.byte	102,15,56,220,209
# %rcx = key + 32 + 16*count; %rax becomes the negative index into it
263	leaq	32(%rcx,%rax,1),%rcx
264	negq	%rax
# aesenc %xmm1,%xmm3
265.byte	102,15,56,220,217
266	pxor	%xmm0,%xmm5
267	pxor	%xmm0,%xmm6
# aesenc %xmm1,%xmm4
268.byte	102,15,56,220,225
269	pxor	%xmm0,%xmm7
270	addq	$16,%rax
# aesenc %xmm1 into %xmm5, %xmm6, %xmm7
271.byte	102,15,56,220,233
272.byte	102,15,56,220,241
273.byte	102,15,56,220,249
# %xmm0 = round key at offset 32 (third entry of the schedule)
274	movups	-16(%rcx,%rax,1),%xmm0
275	jmp	L$enc_loop6_enter
276.p2align	4
277L$enc_loop6:
# aesenc %xmm1 into %xmm2-%xmm7
278.byte	102,15,56,220,209
279.byte	102,15,56,220,217
280.byte	102,15,56,220,225
281.byte	102,15,56,220,233
282.byte	102,15,56,220,241
283.byte	102,15,56,220,249
284L$enc_loop6_enter:
285	movups	(%rcx,%rax,1),%xmm1
286	addq	$32,%rax
# aesenc %xmm0 into %xmm2-%xmm7
287.byte	102,15,56,220,208
288.byte	102,15,56,220,216
289.byte	102,15,56,220,224
290.byte	102,15,56,220,232
291.byte	102,15,56,220,240
292.byte	102,15,56,220,248
# movups leaves the flags from addq intact for the jnz below
293	movups	-16(%rcx,%rax,1),%xmm0
294	jnz	L$enc_loop6
295
# final two rounds: aesenc with %xmm1, then aesenclast with %xmm0
296.byte	102,15,56,220,209
297.byte	102,15,56,220,217
298.byte	102,15,56,220,225
299.byte	102,15,56,220,233
300.byte	102,15,56,220,241
301.byte	102,15,56,220,249
302.byte	102,15,56,221,208
303.byte	102,15,56,221,216
304.byte	102,15,56,221,224
305.byte	102,15,56,221,232
306.byte	102,15,56,221,240
307.byte	102,15,56,221,248
# rep ret
308	.byte	0xf3,0xc3
309
310
# _aesni_decrypt6: file-local helper (no .globl) that runs six blocks
# through the AES decryption rounds side by side.
#   In:    %xmm2-%xmm7 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (callers seen in this file pass the value read
#                 from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm7 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# The XOR with round key 0 and the first aesdec round are interleaved in
# the prologue, which then jumps into the middle of the loop. The loop
# index in %rax starts at 16 - 16*count and advances by 32 until it is
# exactly zero, so the count must be odd and at least 3.
311.p2align	4
312_aesni_decrypt6:
313	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
314	shll	$4,%eax
315	movups	16(%rcx),%xmm1
316	xorps	%xmm0,%xmm2
317	pxor	%xmm0,%xmm3
318	pxor	%xmm0,%xmm4
# aesdec %xmm1,%xmm2
319.byte	102,15,56,222,209
# %rcx = key + 32 + 16*count; %rax becomes the negative index into it
320	leaq	32(%rcx,%rax,1),%rcx
321	negq	%rax
# aesdec %xmm1,%xmm3
322.byte	102,15,56,222,217
323	pxor	%xmm0,%xmm5
324	pxor	%xmm0,%xmm6
# aesdec %xmm1,%xmm4
325.byte	102,15,56,222,225
326	pxor	%xmm0,%xmm7
327	addq	$16,%rax
# aesdec %xmm1 into %xmm5, %xmm6, %xmm7
328.byte	102,15,56,222,233
329.byte	102,15,56,222,241
330.byte	102,15,56,222,249
# %xmm0 = round key at offset 32 (third entry of the schedule)
331	movups	-16(%rcx,%rax,1),%xmm0
332	jmp	L$dec_loop6_enter
333.p2align	4
334L$dec_loop6:
# aesdec %xmm1 into %xmm2-%xmm7
335.byte	102,15,56,222,209
336.byte	102,15,56,222,217
337.byte	102,15,56,222,225
338.byte	102,15,56,222,233
339.byte	102,15,56,222,241
340.byte	102,15,56,222,249
341L$dec_loop6_enter:
342	movups	(%rcx,%rax,1),%xmm1
343	addq	$32,%rax
# aesdec %xmm0 into %xmm2-%xmm7
344.byte	102,15,56,222,208
345.byte	102,15,56,222,216
346.byte	102,15,56,222,224
347.byte	102,15,56,222,232
348.byte	102,15,56,222,240
349.byte	102,15,56,222,248
# movups leaves the flags from addq intact for the jnz below
350	movups	-16(%rcx,%rax,1),%xmm0
351	jnz	L$dec_loop6
352
# final two rounds: aesdec with %xmm1, then aesdeclast with %xmm0
353.byte	102,15,56,222,209
354.byte	102,15,56,222,217
355.byte	102,15,56,222,225
356.byte	102,15,56,222,233
357.byte	102,15,56,222,241
358.byte	102,15,56,222,249
359.byte	102,15,56,223,208
360.byte	102,15,56,223,216
361.byte	102,15,56,223,224
362.byte	102,15,56,223,232
363.byte	102,15,56,223,240
364.byte	102,15,56,223,248
# rep ret
365	.byte	0xf3,0xc3
366
367
# _aesni_encrypt8: file-local helper (no .globl) that runs eight blocks
# through the AES encryption rounds side by side.
#   In:    %xmm2-%xmm9 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (callers seen in this file pass the value read
#                 from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm9 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# The XOR with round key 0 and the first aesenc round are interleaved in
# the prologue, which then jumps into the middle of the loop. The loop
# index in %rax starts at 16 - 16*count and advances by 32 until it is
# exactly zero, so the count must be odd and at least 3.
# L$enc_loop8_enter is also used as a call target by the CTR code later in
# this file, so the label and the register roles at that point must stay
# as is.
368.p2align	4
369_aesni_encrypt8:
370	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
371	shll	$4,%eax
372	movups	16(%rcx),%xmm1
373	xorps	%xmm0,%xmm2
374	xorps	%xmm0,%xmm3
375	pxor	%xmm0,%xmm4
376	pxor	%xmm0,%xmm5
377	pxor	%xmm0,%xmm6
# %rcx = key + 32 + 16*count; %rax becomes the negative index into it
378	leaq	32(%rcx,%rax,1),%rcx
379	negq	%rax
# aesenc %xmm1,%xmm2
380.byte	102,15,56,220,209
381	addq	$16,%rax
382	pxor	%xmm0,%xmm7
# aesenc %xmm1,%xmm3
383.byte	102,15,56,220,217
384	pxor	%xmm0,%xmm8
385	pxor	%xmm0,%xmm9
# aesenc %xmm1 into %xmm4-%xmm9 (the six-byte forms carry a REX prefix to
# reach %xmm8 and %xmm9)
386.byte	102,15,56,220,225
387.byte	102,15,56,220,233
388.byte	102,15,56,220,241
389.byte	102,15,56,220,249
390.byte	102,68,15,56,220,193
391.byte	102,68,15,56,220,201
# %xmm0 = round key at offset 32 (third entry of the schedule)
392	movups	-16(%rcx,%rax,1),%xmm0
393	jmp	L$enc_loop8_enter
394.p2align	4
395L$enc_loop8:
# aesenc %xmm1 into %xmm2-%xmm9
396.byte	102,15,56,220,209
397.byte	102,15,56,220,217
398.byte	102,15,56,220,225
399.byte	102,15,56,220,233
400.byte	102,15,56,220,241
401.byte	102,15,56,220,249
402.byte	102,68,15,56,220,193
403.byte	102,68,15,56,220,201
404L$enc_loop8_enter:
405	movups	(%rcx,%rax,1),%xmm1
406	addq	$32,%rax
# aesenc %xmm0 into %xmm2-%xmm9
407.byte	102,15,56,220,208
408.byte	102,15,56,220,216
409.byte	102,15,56,220,224
410.byte	102,15,56,220,232
411.byte	102,15,56,220,240
412.byte	102,15,56,220,248
413.byte	102,68,15,56,220,192
414.byte	102,68,15,56,220,200
# movups leaves the flags from addq intact for the jnz below
415	movups	-16(%rcx,%rax,1),%xmm0
416	jnz	L$enc_loop8
417
# final two rounds: aesenc with %xmm1, then aesenclast with %xmm0
418.byte	102,15,56,220,209
419.byte	102,15,56,220,217
420.byte	102,15,56,220,225
421.byte	102,15,56,220,233
422.byte	102,15,56,220,241
423.byte	102,15,56,220,249
424.byte	102,68,15,56,220,193
425.byte	102,68,15,56,220,201
426.byte	102,15,56,221,208
427.byte	102,15,56,221,216
428.byte	102,15,56,221,224
429.byte	102,15,56,221,232
430.byte	102,15,56,221,240
431.byte	102,15,56,221,248
432.byte	102,68,15,56,221,192
433.byte	102,68,15,56,221,200
# rep ret
434	.byte	0xf3,0xc3
435
436
# _aesni_decrypt8: file-local helper (no .globl) that runs eight blocks
# through the AES decryption rounds side by side.
#   In:    %xmm2-%xmm9 = input blocks
#          %rcx = key schedule address
#          %eax = loop count (callers seen in this file pass the value read
#                 from offset 240 of the key schedule)
#   Out:   %xmm2-%xmm9 = results
#   Clobb: %rax, %rcx, %xmm0, %xmm1, flags
# The XOR with round key 0 and the first aesdec round are interleaved in
# the prologue, which then jumps into the middle of the loop. The loop
# index in %rax starts at 16 - 16*count and advances by 32 until it is
# exactly zero, so the count must be odd and at least 3.
437.p2align	4
438_aesni_decrypt8:
439	movups	(%rcx),%xmm0
# %eax = count * 16 (a byte offset)
440	shll	$4,%eax
441	movups	16(%rcx),%xmm1
442	xorps	%xmm0,%xmm2
443	xorps	%xmm0,%xmm3
444	pxor	%xmm0,%xmm4
445	pxor	%xmm0,%xmm5
446	pxor	%xmm0,%xmm6
# %rcx = key + 32 + 16*count; %rax becomes the negative index into it
447	leaq	32(%rcx,%rax,1),%rcx
448	negq	%rax
# aesdec %xmm1,%xmm2
449.byte	102,15,56,222,209
450	addq	$16,%rax
451	pxor	%xmm0,%xmm7
# aesdec %xmm1,%xmm3
452.byte	102,15,56,222,217
453	pxor	%xmm0,%xmm8
454	pxor	%xmm0,%xmm9
# aesdec %xmm1 into %xmm4-%xmm9 (the six-byte forms carry a REX prefix to
# reach %xmm8 and %xmm9)
455.byte	102,15,56,222,225
456.byte	102,15,56,222,233
457.byte	102,15,56,222,241
458.byte	102,15,56,222,249
459.byte	102,68,15,56,222,193
460.byte	102,68,15,56,222,201
# %xmm0 = round key at offset 32 (third entry of the schedule)
461	movups	-16(%rcx,%rax,1),%xmm0
462	jmp	L$dec_loop8_enter
463.p2align	4
464L$dec_loop8:
# aesdec %xmm1 into %xmm2-%xmm9
465.byte	102,15,56,222,209
466.byte	102,15,56,222,217
467.byte	102,15,56,222,225
468.byte	102,15,56,222,233
469.byte	102,15,56,222,241
470.byte	102,15,56,222,249
471.byte	102,68,15,56,222,193
472.byte	102,68,15,56,222,201
473L$dec_loop8_enter:
474	movups	(%rcx,%rax,1),%xmm1
475	addq	$32,%rax
# aesdec %xmm0 into %xmm2-%xmm9
476.byte	102,15,56,222,208
477.byte	102,15,56,222,216
478.byte	102,15,56,222,224
479.byte	102,15,56,222,232
480.byte	102,15,56,222,240
481.byte	102,15,56,222,248
482.byte	102,68,15,56,222,192
483.byte	102,68,15,56,222,200
# movups leaves the flags from addq intact for the jnz below
484	movups	-16(%rcx,%rax,1),%xmm0
485	jnz	L$dec_loop8
486
# final two rounds: aesdec with %xmm1, then aesdeclast with %xmm0
487.byte	102,15,56,222,209
488.byte	102,15,56,222,217
489.byte	102,15,56,222,225
490.byte	102,15,56,222,233
491.byte	102,15,56,222,241
492.byte	102,15,56,222,249
493.byte	102,68,15,56,222,193
494.byte	102,68,15,56,222,201
495.byte	102,15,56,223,208
496.byte	102,15,56,223,216
497.byte	102,15,56,223,224
498.byte	102,15,56,223,232
499.byte	102,15,56,223,240
500.byte	102,15,56,223,248
501.byte	102,68,15,56,223,192
502.byte	102,68,15,56,223,200
# rep ret
503	.byte	0xf3,0xc3
504
# _aesni_ecb_encrypt: process a buffer one 16-byte block at a time with no
# chaining between blocks (ECB), either encrypting or decrypting.
#   In:    %rdi = input address
#          %rsi = output address
#          %rdx = length in bytes; rounded down to a multiple of 16, and
#                 nothing is done when that leaves zero
#          %rcx = key schedule address (loop count read from offset 240)
#          %r8d = direction: non-zero encrypts, zero decrypts
#   Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm9, flags
# Bulk data is handled eight blocks (128 bytes) per pass through
# _aesni_encrypt8 / _aesni_decrypt8; a tail of one to seven blocks is
# dispatched to the matching N-block helper.
505.globl	_aesni_ecb_encrypt
506.private_extern _aesni_ecb_encrypt
507
508.p2align	4
509_aesni_ecb_encrypt:
# drop any partial trailing block; return when no whole block remains
510	andq	$-16,%rdx
511	jz	L$ecb_ret
512
# keep copies of the key address (%r11) and loop count (%r10d): the N-block
# helpers clobber %rcx and %eax
513	movl	240(%rcx),%eax
514	movups	(%rcx),%xmm0
515	movq	%rcx,%r11
516	movl	%eax,%r10d
517	testl	%r8d,%r8d
518	jz	L$ecb_decrypt
519
# ---- encrypt direction ----
# fewer than eight blocks: go straight to the tail dispatch
520	cmpq	$128,%rdx
521	jb	L$ecb_enc_tail
522
# load the first group of eight blocks and enter the loop at the helper call
523	movdqu	(%rdi),%xmm2
524	movdqu	16(%rdi),%xmm3
525	movdqu	32(%rdi),%xmm4
526	movdqu	48(%rdi),%xmm5
527	movdqu	64(%rdi),%xmm6
528	movdqu	80(%rdi),%xmm7
529	movdqu	96(%rdi),%xmm8
530	movdqu	112(%rdi),%xmm9
531	leaq	128(%rdi),%rdi
532	subq	$128,%rdx
533	jmp	L$ecb_enc_loop8_enter
534.p2align	4
535L$ecb_enc_loop8:
# store the eight results of the previous pass while loading the next eight
# inputs, and restore %rcx / %eax for the helper
536	movups	%xmm2,(%rsi)
537	movq	%r11,%rcx
538	movdqu	(%rdi),%xmm2
539	movl	%r10d,%eax
540	movups	%xmm3,16(%rsi)
541	movdqu	16(%rdi),%xmm3
542	movups	%xmm4,32(%rsi)
543	movdqu	32(%rdi),%xmm4
544	movups	%xmm5,48(%rsi)
545	movdqu	48(%rdi),%xmm5
546	movups	%xmm6,64(%rsi)
547	movdqu	64(%rdi),%xmm6
548	movups	%xmm7,80(%rsi)
549	movdqu	80(%rdi),%xmm7
550	movups	%xmm8,96(%rsi)
551	movdqu	96(%rdi),%xmm8
552	movups	%xmm9,112(%rsi)
553	leaq	128(%rsi),%rsi
554	movdqu	112(%rdi),%xmm9
555	leaq	128(%rdi),%rdi
556L$ecb_enc_loop8_enter:
557
558	call	_aesni_encrypt8
559
# no borrow means another full 128-byte group is still available
560	subq	$128,%rdx
561	jnc	L$ecb_enc_loop8
562
# flush the last full group, restore %rcx / %eax for the tail code
563	movups	%xmm2,(%rsi)
564	movq	%r11,%rcx
565	movups	%xmm3,16(%rsi)
566	movl	%r10d,%eax
567	movups	%xmm4,32(%rsi)
568	movups	%xmm5,48(%rsi)
569	movups	%xmm6,64(%rsi)
570	movups	%xmm7,80(%rsi)
571	movups	%xmm8,96(%rsi)
572	movups	%xmm9,112(%rsi)
573	leaq	128(%rsi),%rsi
# undo the extra subtraction: %rdx = bytes left over (0 to 112)
574	addq	$128,%rdx
575	jz	L$ecb_ret
576
577L$ecb_enc_tail:
# one to seven blocks remain. Each cmpq feeds both the jb and the je that
# follow it; the movups in between does not change the flags.
578	movups	(%rdi),%xmm2
579	cmpq	$32,%rdx
580	jb	L$ecb_enc_one
581	movups	16(%rdi),%xmm3
582	je	L$ecb_enc_two
583	movups	32(%rdi),%xmm4
584	cmpq	$64,%rdx
585	jb	L$ecb_enc_three
586	movups	48(%rdi),%xmm5
587	je	L$ecb_enc_four
588	movups	64(%rdi),%xmm6
589	cmpq	$96,%rdx
590	jb	L$ecb_enc_five
591	movups	80(%rdi),%xmm7
592	je	L$ecb_enc_six
# seven blocks: use the eight-block helper; %xmm9 is not loaded on this path
# and its result is not stored
593	movdqu	96(%rdi),%xmm8
594	call	_aesni_encrypt8
595	movups	%xmm2,(%rsi)
596	movups	%xmm3,16(%rsi)
597	movups	%xmm4,32(%rsi)
598	movups	%xmm5,48(%rsi)
599	movups	%xmm6,64(%rsi)
600	movups	%xmm7,80(%rsi)
601	movups	%xmm8,96(%rsi)
602	jmp	L$ecb_ret
603.p2align	4
604L$ecb_enc_one:
# single block, done inline: XOR round key 0, %eax aesenc rounds, then
# aesenclast
605	movups	(%rcx),%xmm0
606	movups	16(%rcx),%xmm1
607	leaq	32(%rcx),%rcx
608	xorps	%xmm0,%xmm2
609L$oop_enc1_3:
# aesenc %xmm1,%xmm2
610.byte	102,15,56,220,209
611	decl	%eax
612	movups	(%rcx),%xmm1
613	leaq	16(%rcx),%rcx
614	jnz	L$oop_enc1_3
# aesenclast %xmm1,%xmm2
615.byte	102,15,56,221,209
616	movups	%xmm2,(%rsi)
617	jmp	L$ecb_ret
618.p2align	4
619L$ecb_enc_two:
620	call	_aesni_encrypt2
621	movups	%xmm2,(%rsi)
622	movups	%xmm3,16(%rsi)
623	jmp	L$ecb_ret
624.p2align	4
625L$ecb_enc_three:
626	call	_aesni_encrypt3
627	movups	%xmm2,(%rsi)
628	movups	%xmm3,16(%rsi)
629	movups	%xmm4,32(%rsi)
630	jmp	L$ecb_ret
631.p2align	4
632L$ecb_enc_four:
633	call	_aesni_encrypt4
634	movups	%xmm2,(%rsi)
635	movups	%xmm3,16(%rsi)
636	movups	%xmm4,32(%rsi)
637	movups	%xmm5,48(%rsi)
638	jmp	L$ecb_ret
639.p2align	4
640L$ecb_enc_five:
# five blocks: use the six-block helper with a zeroed sixth lane whose result
# is not stored
641	xorps	%xmm7,%xmm7
642	call	_aesni_encrypt6
643	movups	%xmm2,(%rsi)
644	movups	%xmm3,16(%rsi)
645	movups	%xmm4,32(%rsi)
646	movups	%xmm5,48(%rsi)
647	movups	%xmm6,64(%rsi)
648	jmp	L$ecb_ret
649.p2align	4
650L$ecb_enc_six:
651	call	_aesni_encrypt6
652	movups	%xmm2,(%rsi)
653	movups	%xmm3,16(%rsi)
654	movups	%xmm4,32(%rsi)
655	movups	%xmm5,48(%rsi)
656	movups	%xmm6,64(%rsi)
657	movups	%xmm7,80(%rsi)
658	jmp	L$ecb_ret
659
# ---- decrypt direction: same structure as above with the decrypt helpers ----
660.p2align	4
661L$ecb_decrypt:
662	cmpq	$128,%rdx
663	jb	L$ecb_dec_tail
664
# load the first group of eight blocks and enter the loop at the helper call
665	movdqu	(%rdi),%xmm2
666	movdqu	16(%rdi),%xmm3
667	movdqu	32(%rdi),%xmm4
668	movdqu	48(%rdi),%xmm5
669	movdqu	64(%rdi),%xmm6
670	movdqu	80(%rdi),%xmm7
671	movdqu	96(%rdi),%xmm8
672	movdqu	112(%rdi),%xmm9
673	leaq	128(%rdi),%rdi
674	subq	$128,%rdx
675	jmp	L$ecb_dec_loop8_enter
676.p2align	4
677L$ecb_dec_loop8:
# store the eight results of the previous pass while loading the next eight
# inputs, and restore %rcx / %eax for the helper
678	movups	%xmm2,(%rsi)
679	movq	%r11,%rcx
680	movdqu	(%rdi),%xmm2
681	movl	%r10d,%eax
682	movups	%xmm3,16(%rsi)
683	movdqu	16(%rdi),%xmm3
684	movups	%xmm4,32(%rsi)
685	movdqu	32(%rdi),%xmm4
686	movups	%xmm5,48(%rsi)
687	movdqu	48(%rdi),%xmm5
688	movups	%xmm6,64(%rsi)
689	movdqu	64(%rdi),%xmm6
690	movups	%xmm7,80(%rsi)
691	movdqu	80(%rdi),%xmm7
692	movups	%xmm8,96(%rsi)
693	movdqu	96(%rdi),%xmm8
694	movups	%xmm9,112(%rsi)
695	leaq	128(%rsi),%rsi
696	movdqu	112(%rdi),%xmm9
697	leaq	128(%rdi),%rdi
698L$ecb_dec_loop8_enter:
699
700	call	_aesni_decrypt8
701
# reload round key 0 into %xmm0.
# NOTE(review): the helpers reload %xmm0 themselves, so the purpose of this
# load is not evident from this file; left untouched.
702	movups	(%r11),%xmm0
# no borrow means another full 128-byte group is still available
703	subq	$128,%rdx
704	jnc	L$ecb_dec_loop8
705
# flush the last full group, restore %rcx / %eax for the tail code
706	movups	%xmm2,(%rsi)
707	movq	%r11,%rcx
708	movups	%xmm3,16(%rsi)
709	movl	%r10d,%eax
710	movups	%xmm4,32(%rsi)
711	movups	%xmm5,48(%rsi)
712	movups	%xmm6,64(%rsi)
713	movups	%xmm7,80(%rsi)
714	movups	%xmm8,96(%rsi)
715	movups	%xmm9,112(%rsi)
716	leaq	128(%rsi),%rsi
# undo the extra subtraction: %rdx = bytes left over (0 to 112)
717	addq	$128,%rdx
718	jz	L$ecb_ret
719
720L$ecb_dec_tail:
# one to seven blocks remain. Each cmpq feeds both the jb and the je that
# follow it; the movups in between does not change the flags.
721	movups	(%rdi),%xmm2
722	cmpq	$32,%rdx
723	jb	L$ecb_dec_one
724	movups	16(%rdi),%xmm3
725	je	L$ecb_dec_two
726	movups	32(%rdi),%xmm4
727	cmpq	$64,%rdx
728	jb	L$ecb_dec_three
729	movups	48(%rdi),%xmm5
730	je	L$ecb_dec_four
731	movups	64(%rdi),%xmm6
732	cmpq	$96,%rdx
733	jb	L$ecb_dec_five
734	movups	80(%rdi),%xmm7
735	je	L$ecb_dec_six
# seven blocks: use the eight-block helper; %xmm9 is not loaded on this path
# and its result is not stored
736	movups	96(%rdi),%xmm8
737	movups	(%rcx),%xmm0
738	call	_aesni_decrypt8
739	movups	%xmm2,(%rsi)
740	movups	%xmm3,16(%rsi)
741	movups	%xmm4,32(%rsi)
742	movups	%xmm5,48(%rsi)
743	movups	%xmm6,64(%rsi)
744	movups	%xmm7,80(%rsi)
745	movups	%xmm8,96(%rsi)
746	jmp	L$ecb_ret
747.p2align	4
748L$ecb_dec_one:
# single block, done inline: XOR round key 0, %eax aesdec rounds, then
# aesdeclast
749	movups	(%rcx),%xmm0
750	movups	16(%rcx),%xmm1
751	leaq	32(%rcx),%rcx
752	xorps	%xmm0,%xmm2
753L$oop_dec1_4:
# aesdec %xmm1,%xmm2
754.byte	102,15,56,222,209
755	decl	%eax
756	movups	(%rcx),%xmm1
757	leaq	16(%rcx),%rcx
758	jnz	L$oop_dec1_4
# aesdeclast %xmm1,%xmm2
759.byte	102,15,56,223,209
760	movups	%xmm2,(%rsi)
761	jmp	L$ecb_ret
762.p2align	4
763L$ecb_dec_two:
764	call	_aesni_decrypt2
765	movups	%xmm2,(%rsi)
766	movups	%xmm3,16(%rsi)
767	jmp	L$ecb_ret
768.p2align	4
769L$ecb_dec_three:
770	call	_aesni_decrypt3
771	movups	%xmm2,(%rsi)
772	movups	%xmm3,16(%rsi)
773	movups	%xmm4,32(%rsi)
774	jmp	L$ecb_ret
775.p2align	4
776L$ecb_dec_four:
777	call	_aesni_decrypt4
778	movups	%xmm2,(%rsi)
779	movups	%xmm3,16(%rsi)
780	movups	%xmm4,32(%rsi)
781	movups	%xmm5,48(%rsi)
782	jmp	L$ecb_ret
783.p2align	4
784L$ecb_dec_five:
# five blocks: use the six-block helper with a zeroed sixth lane whose result
# is not stored
785	xorps	%xmm7,%xmm7
786	call	_aesni_decrypt6
787	movups	%xmm2,(%rsi)
788	movups	%xmm3,16(%rsi)
789	movups	%xmm4,32(%rsi)
790	movups	%xmm5,48(%rsi)
791	movups	%xmm6,64(%rsi)
792	jmp	L$ecb_ret
793.p2align	4
794L$ecb_dec_six:
795	call	_aesni_decrypt6
796	movups	%xmm2,(%rsi)
797	movups	%xmm3,16(%rsi)
798	movups	%xmm4,32(%rsi)
799	movups	%xmm5,48(%rsi)
800	movups	%xmm6,64(%rsi)
801	movups	%xmm7,80(%rsi)
802
# common exit for every path; rep ret
803L$ecb_ret:
804	.byte	0xf3,0xc3
805
# _aesni_ccm64_encrypt_blocks: for each 16-byte input block, XOR the block
# with the encrypted counter to produce the output, and fold the input block
# into a running 16-byte value that is encrypted alongside the counter.
#   In:    %rdi = input address
#          %rsi = output address
#          %rdx = number of 16-byte blocks
#          %rcx = key schedule address (loop count read from offset 240)
#          %r8  = address of the 16-byte counter block (only read here)
#          %r9  = address of the 16-byte running value; read on entry and
#                 written back on exit
#   Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
#          %xmm6-%xmm9, flags
# L$increment64 and L$bswap_mask are constants defined elsewhere in the
# file; they are read with movdqa, which requires 16-byte alignment.
# NOTE(review): the name suggests the running value is the CCM CBC-MAC and
# that the counter field is 64 bits wide; confirm against the C caller.
# NOTE(review): %rdx is decremented before it is tested, so a block count of
# zero is not handled here; presumably callers always pass at least one.
806.globl	_aesni_ccm64_encrypt_blocks
807.private_extern _aesni_ccm64_encrypt_blocks
808
809.p2align	4
810_aesni_ccm64_encrypt_blocks:
811	movl	240(%rcx),%eax
# %xmm6 = counter block, %xmm9 = increment constant, %xmm7 = shuffle mask
812	movdqu	(%r8),%xmm6
813	movdqa	L$increment64(%rip),%xmm9
814	movdqa	L$bswap_mask(%rip),%xmm7
815
# %eax = count * 16; %r10 will hold 16 - 16*count, the starting loop index
# reused for every block; %r11 keeps the key address
816	shll	$4,%eax
817	movl	$16,%r10d
818	leaq	0(%rcx),%r11
# %xmm3 = running value, %xmm2 = counter block as stored in memory
819	movdqu	(%r9),%xmm3
820	movdqa	%xmm6,%xmm2
# %rcx = key + 32 + 16*count, the base for the negative-index key loads
821	leaq	32(%rcx,%rax,1),%rcx
# pshufb %xmm7,%xmm6: shuffle the working copy of the counter with the mask
# (presumably a byte reversal so that paddq can increment it)
822.byte	102,15,56,0,247
823	subq	%rax,%r10
824	jmp	L$ccm64_enc_outer
825.p2align	4
826L$ccm64_enc_outer:
# per block: %xmm2 = counter ^ key0, %xmm3 = running ^ input ^ key0
827	movups	(%r11),%xmm0
828	movq	%r10,%rax
829	movups	(%rdi),%xmm8
830
831	xorps	%xmm0,%xmm2
832	movups	16(%r11),%xmm1
833	xorps	%xmm8,%xmm0
834	xorps	%xmm0,%xmm3
835	movups	32(%r11),%xmm0
836
837L$ccm64_enc2_loop:
# aesenc %xmm1 into %xmm2, %xmm3
838.byte	102,15,56,220,209
839.byte	102,15,56,220,217
840	movups	(%rcx,%rax,1),%xmm1
841	addq	$32,%rax
# aesenc %xmm0 into %xmm2, %xmm3
842.byte	102,15,56,220,208
843.byte	102,15,56,220,216
# movups leaves the flags from addq intact for the jnz below
844	movups	-16(%rcx,%rax,1),%xmm0
845	jnz	L$ccm64_enc2_loop
# aesenc %xmm1 into %xmm2, %xmm3
846.byte	102,15,56,220,209
847.byte	102,15,56,220,217
# advance the shuffled counter by the increment constant
848	paddq	%xmm9,%xmm6
# the flags set here are consumed by the jnz at the bottom of the outer loop;
# none of the instructions in between modify them
849	decq	%rdx
# aesenclast %xmm0 into %xmm2, %xmm3
850.byte	102,15,56,221,208
851.byte	102,15,56,221,216
852
853	leaq	16(%rdi),%rdi
# output block = input ^ encrypted counter
854	xorps	%xmm2,%xmm8
# prepare the next counter block from the incremented working copy
855	movdqa	%xmm6,%xmm2
856	movups	%xmm8,(%rsi)
# pshufb %xmm7,%xmm2: shuffle the counter back to its stored byte order
857.byte	102,15,56,0,215
858	leaq	16(%rsi),%rsi
859	jnz	L$ccm64_enc_outer
860
# write the running value back through %r9
861	movups	%xmm3,(%r9)
# rep ret
862	.byte	0xf3,0xc3
863
# _aesni_ccm64_decrypt_blocks: for each 16-byte input block, XOR the block
# with the encrypted counter to produce the output, and fold that output
# block into a running 16-byte value that is encrypted with the same key.
#   In:    %rdi = input address
#          %rsi = output address
#          %rdx = number of 16-byte blocks
#          %rcx = key schedule address (loop count read from offset 240)
#          %r8  = address of the 16-byte counter block (only read here)
#          %r9  = address of the 16-byte running value; read on entry and
#                 written back on exit
#   Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0-%xmm3,
#          %xmm6-%xmm9, flags
# Only the encryption round instructions (aesenc / aesenclast) are used.
# The first counter block is encrypted on its own; after that the next
# counter and the running value are encrypted two at a time, and the final
# update of the running value is again done on its own.
# L$increment64 and L$bswap_mask are constants defined elsewhere in the
# file; they are read with movdqa, which requires 16-byte alignment.
# NOTE(review): the name suggests the running value is the CCM CBC-MAC and
# that the counter field is 64 bits wide; confirm against the C caller.
# NOTE(review): the first block is processed before %rdx is tested, so a
# block count of zero is not handled here; presumably callers always pass at
# least one.
864.globl	_aesni_ccm64_decrypt_blocks
865.private_extern _aesni_ccm64_decrypt_blocks
866
867.p2align	4
868_aesni_ccm64_decrypt_blocks:
869	movl	240(%rcx),%eax
# %xmm6 = counter block, %xmm3 = running value, %xmm9 = increment constant,
# %xmm7 = shuffle mask
870	movups	(%r8),%xmm6
871	movdqu	(%r9),%xmm3
872	movdqa	L$increment64(%rip),%xmm9
873	movdqa	L$bswap_mask(%rip),%xmm7
874
# %xmm2 = counter block as stored; keep the loop count and key address in
# %r10d / %r11 because %eax and %rcx are consumed by the loop below
875	movaps	%xmm6,%xmm2
876	movl	%eax,%r10d
877	movq	%rcx,%r11
# pshufb %xmm7,%xmm6: shuffle the working copy of the counter with the mask
# (presumably a byte reversal so that paddq can increment it)
878.byte	102,15,56,0,247
# encrypt the first counter block on its own
879	movups	(%rcx),%xmm0
880	movups	16(%rcx),%xmm1
881	leaq	32(%rcx),%rcx
882	xorps	%xmm0,%xmm2
883L$oop_enc1_5:
# aesenc %xmm1,%xmm2
884.byte	102,15,56,220,209
885	decl	%eax
886	movups	(%rcx),%xmm1
887	leaq	16(%rcx),%rcx
888	jnz	L$oop_enc1_5
# aesenclast %xmm1,%xmm2
889.byte	102,15,56,221,209
# set up the two-block loop: %r10 = 16 - 16*count (starting index),
# %rcx = key + 32 + 16*count; also fetch the first input block and advance
# the counter
890	shll	$4,%r10d
891	movl	$16,%eax
892	movups	(%rdi),%xmm8
893	paddq	%xmm9,%xmm6
894	leaq	16(%rdi),%rdi
895	subq	%r10,%rax
896	leaq	32(%r11,%r10,1),%rcx
897	movq	%rax,%r10
898	jmp	L$ccm64_dec_outer
899.p2align	4
900L$ccm64_dec_outer:
# output block = input ^ encrypted counter; then stage the next counter
901	xorps	%xmm2,%xmm8
902	movdqa	%xmm6,%xmm2
903	movups	%xmm8,(%rsi)
904	leaq	16(%rsi),%rsi
# pshufb %xmm7,%xmm2: shuffle the counter back to its stored byte order
905.byte	102,15,56,0,215
906
907	subq	$1,%rdx
908	jz	L$ccm64_dec_break
909
# %xmm2 = next counter ^ key0, %xmm3 = running ^ output block ^ key0
910	movups	(%r11),%xmm0
911	movq	%r10,%rax
912	movups	16(%r11),%xmm1
913	xorps	%xmm0,%xmm8
914	xorps	%xmm0,%xmm2
915	xorps	%xmm8,%xmm3
916	movups	32(%r11),%xmm0
917	jmp	L$ccm64_dec2_loop
918.p2align	4
919L$ccm64_dec2_loop:
# aesenc %xmm1 into %xmm2, %xmm3
920.byte	102,15,56,220,209
921.byte	102,15,56,220,217
922	movups	(%rcx,%rax,1),%xmm1
923	addq	$32,%rax
# aesenc %xmm0 into %xmm2, %xmm3
924.byte	102,15,56,220,208
925.byte	102,15,56,220,216
# movups leaves the flags from addq intact for the jnz below
926	movups	-16(%rcx,%rax,1),%xmm0
927	jnz	L$ccm64_dec2_loop
# fetch the next input block and advance the counter while the last two
# rounds run
928	movups	(%rdi),%xmm8
929	paddq	%xmm9,%xmm6
# aesenc %xmm1, then aesenclast %xmm0, into %xmm2, %xmm3
930.byte	102,15,56,220,209
931.byte	102,15,56,220,217
932.byte	102,15,56,221,208
933.byte	102,15,56,221,216
934	leaq	16(%rdi),%rdi
935	jmp	L$ccm64_dec_outer
936
937.p2align	4
938L$ccm64_dec_break:
939
# last block: fold the final output block into the running value and encrypt
# it on its own; the loop count is reloaded because %eax was reused
940	movl	240(%r11),%eax
941	movups	(%r11),%xmm0
942	movups	16(%r11),%xmm1
943	xorps	%xmm0,%xmm8
944	leaq	32(%r11),%r11
945	xorps	%xmm8,%xmm3
946L$oop_enc1_6:
# aesenc %xmm1,%xmm3
947.byte	102,15,56,220,217
948	decl	%eax
949	movups	(%r11),%xmm1
950	leaq	16(%r11),%r11
951	jnz	L$oop_enc1_6
# aesenclast %xmm1,%xmm3
952.byte	102,15,56,221,217
# write the running value back through %r9
953	movups	%xmm3,(%r9)
# rep ret
954	.byte	0xf3,0xc3
955
956.globl	_aesni_ctr32_encrypt_blocks
957.private_extern _aesni_ctr32_encrypt_blocks
958
959.p2align	4
960_aesni_ctr32_encrypt_blocks:
961	leaq	(%rsp),%rax
962	pushq	%rbp
963	subq	$128,%rsp
964	andq	$-16,%rsp
965	leaq	-8(%rax),%rbp
966
967	cmpq	$1,%rdx
968	je	L$ctr32_one_shortcut
969
970	movdqu	(%r8),%xmm2
971	movdqu	(%rcx),%xmm0
972	movl	12(%r8),%r8d
973	pxor	%xmm0,%xmm2
974	movl	12(%rcx),%r11d
975	movdqa	%xmm2,0(%rsp)
976	bswapl	%r8d
977	movdqa	%xmm2,%xmm3
978	movdqa	%xmm2,%xmm4
979	movdqa	%xmm2,%xmm5
980	movdqa	%xmm2,64(%rsp)
981	movdqa	%xmm2,80(%rsp)
982	movdqa	%xmm2,96(%rsp)
983	movq	%rdx,%r10
984	movdqa	%xmm2,112(%rsp)
985
986	leaq	1(%r8),%rax
987	leaq	2(%r8),%rdx
988	bswapl	%eax
989	bswapl	%edx
990	xorl	%r11d,%eax
991	xorl	%r11d,%edx
992.byte	102,15,58,34,216,3
993	leaq	3(%r8),%rax
994	movdqa	%xmm3,16(%rsp)
995.byte	102,15,58,34,226,3
996	bswapl	%eax
997	movq	%r10,%rdx
998	leaq	4(%r8),%r10
999	movdqa	%xmm4,32(%rsp)
1000	xorl	%r11d,%eax
1001	bswapl	%r10d
1002.byte	102,15,58,34,232,3
1003	xorl	%r11d,%r10d
1004	movdqa	%xmm5,48(%rsp)
1005	leaq	5(%r8),%r9
1006	movl	%r10d,64+12(%rsp)
1007	bswapl	%r9d
1008	leaq	6(%r8),%r10
1009	movl	240(%rcx),%eax
1010	xorl	%r11d,%r9d
1011	bswapl	%r10d
1012	movl	%r9d,80+12(%rsp)
1013	xorl	%r11d,%r10d
1014	leaq	7(%r8),%r9
1015	movl	%r10d,96+12(%rsp)
1016	bswapl	%r9d
1017	movl	_OPENSSL_ia32cap_P+4(%rip),%r10d
1018	xorl	%r11d,%r9d
1019	andl	$71303168,%r10d
1020	movl	%r9d,112+12(%rsp)
1021
1022	movups	16(%rcx),%xmm1
1023
1024	movdqa	64(%rsp),%xmm6
1025	movdqa	80(%rsp),%xmm7
1026
1027	cmpq	$8,%rdx
1028	jb	L$ctr32_tail
1029
1030	subq	$6,%rdx
1031	cmpl	$4194304,%r10d
1032	je	L$ctr32_6x
1033
1034	leaq	128(%rcx),%rcx
1035	subq	$2,%rdx
1036	jmp	L$ctr32_loop8
1037
1038.p2align	4
1039L$ctr32_6x:
1040	shll	$4,%eax
1041	movl	$48,%r10d
1042	bswapl	%r11d
1043	leaq	32(%rcx,%rax,1),%rcx
1044	subq	%rax,%r10
1045	jmp	L$ctr32_loop6
1046
1047.p2align	4
1048L$ctr32_loop6:
1049	addl	$6,%r8d
1050	movups	-48(%rcx,%r10,1),%xmm0
1051.byte	102,15,56,220,209
1052	movl	%r8d,%eax
1053	xorl	%r11d,%eax
1054.byte	102,15,56,220,217
1055.byte	0x0f,0x38,0xf1,0x44,0x24,12
1056	leal	1(%r8),%eax
1057.byte	102,15,56,220,225
1058	xorl	%r11d,%eax
1059.byte	0x0f,0x38,0xf1,0x44,0x24,28
1060.byte	102,15,56,220,233
1061	leal	2(%r8),%eax
1062	xorl	%r11d,%eax
1063.byte	102,15,56,220,241
1064.byte	0x0f,0x38,0xf1,0x44,0x24,44
1065	leal	3(%r8),%eax
1066.byte	102,15,56,220,249
1067	movups	-32(%rcx,%r10,1),%xmm1
1068	xorl	%r11d,%eax
1069
1070.byte	102,15,56,220,208
1071.byte	0x0f,0x38,0xf1,0x44,0x24,60
1072	leal	4(%r8),%eax
1073.byte	102,15,56,220,216
1074	xorl	%r11d,%eax
1075.byte	0x0f,0x38,0xf1,0x44,0x24,76
1076.byte	102,15,56,220,224
1077	leal	5(%r8),%eax
1078	xorl	%r11d,%eax
1079.byte	102,15,56,220,232
1080.byte	0x0f,0x38,0xf1,0x44,0x24,92
1081	movq	%r10,%rax
1082.byte	102,15,56,220,240
1083.byte	102,15,56,220,248
1084	movups	-16(%rcx,%r10,1),%xmm0
1085
1086	call	L$enc_loop6
1087
1088	movdqu	(%rdi),%xmm8
1089	movdqu	16(%rdi),%xmm9
1090	movdqu	32(%rdi),%xmm10
1091	movdqu	48(%rdi),%xmm11
1092	movdqu	64(%rdi),%xmm12
1093	movdqu	80(%rdi),%xmm13
1094	leaq	96(%rdi),%rdi
1095	movups	-64(%rcx,%r10,1),%xmm1
1096	pxor	%xmm2,%xmm8
1097	movaps	0(%rsp),%xmm2
1098	pxor	%xmm3,%xmm9
1099	movaps	16(%rsp),%xmm3
1100	pxor	%xmm4,%xmm10
1101	movaps	32(%rsp),%xmm4
1102	pxor	%xmm5,%xmm11
1103	movaps	48(%rsp),%xmm5
1104	pxor	%xmm6,%xmm12
1105	movaps	64(%rsp),%xmm6
1106	pxor	%xmm7,%xmm13
1107	movaps	80(%rsp),%xmm7
1108	movdqu	%xmm8,(%rsi)
1109	movdqu	%xmm9,16(%rsi)
1110	movdqu	%xmm10,32(%rsi)
1111	movdqu	%xmm11,48(%rsi)
1112	movdqu	%xmm12,64(%rsi)
1113	movdqu	%xmm13,80(%rsi)
1114	leaq	96(%rsi),%rsi
1115
1116	subq	$6,%rdx
1117	jnc	L$ctr32_loop6
1118
1119	addq	$6,%rdx
1120	jz	L$ctr32_done
1121
1122	leal	-48(%r10),%eax
1123	leaq	-80(%rcx,%r10,1),%rcx
1124	negl	%eax
1125	shrl	$4,%eax
1126	jmp	L$ctr32_tail
1127
1128.p2align	5
1129L$ctr32_loop8:
1130	addl	$8,%r8d
1131	movdqa	96(%rsp),%xmm8
1132.byte	102,15,56,220,209
1133	movl	%r8d,%r9d
1134	movdqa	112(%rsp),%xmm9
1135.byte	102,15,56,220,217
1136	bswapl	%r9d
1137	movups	32-128(%rcx),%xmm0
1138.byte	102,15,56,220,225
1139	xorl	%r11d,%r9d
1140	nop
1141.byte	102,15,56,220,233
1142	movl	%r9d,0+12(%rsp)
1143	leaq	1(%r8),%r9
1144.byte	102,15,56,220,241
1145.byte	102,15,56,220,249
1146.byte	102,68,15,56,220,193
1147.byte	102,68,15,56,220,201
1148	movups	48-128(%rcx),%xmm1
1149	bswapl	%r9d
1150.byte	102,15,56,220,208
1151.byte	102,15,56,220,216
1152	xorl	%r11d,%r9d
1153.byte	0x66,0x90
1154.byte	102,15,56,220,224
1155.byte	102,15,56,220,232
1156	movl	%r9d,16+12(%rsp)
1157	leaq	2(%r8),%r9
1158.byte	102,15,56,220,240
1159.byte	102,15,56,220,248
1160.byte	102,68,15,56,220,192
1161.byte	102,68,15,56,220,200
1162	movups	64-128(%rcx),%xmm0
1163	bswapl	%r9d
1164.byte	102,15,56,220,209
1165.byte	102,15,56,220,217
1166	xorl	%r11d,%r9d
1167.byte	0x66,0x90
1168.byte	102,15,56,220,225
1169.byte	102,15,56,220,233
1170	movl	%r9d,32+12(%rsp)
1171	leaq	3(%r8),%r9
1172.byte	102,15,56,220,241
1173.byte	102,15,56,220,249
1174.byte	102,68,15,56,220,193
1175.byte	102,68,15,56,220,201
1176	movups	80-128(%rcx),%xmm1
1177	bswapl	%r9d
1178.byte	102,15,56,220,208
1179.byte	102,15,56,220,216
1180	xorl	%r11d,%r9d
1181.byte	0x66,0x90
1182.byte	102,15,56,220,224
1183.byte	102,15,56,220,232
1184	movl	%r9d,48+12(%rsp)
1185	leaq	4(%r8),%r9
1186.byte	102,15,56,220,240
1187.byte	102,15,56,220,248
1188.byte	102,68,15,56,220,192
1189.byte	102,68,15,56,220,200
1190	movups	96-128(%rcx),%xmm0
1191	bswapl	%r9d
1192.byte	102,15,56,220,209
1193.byte	102,15,56,220,217
1194	xorl	%r11d,%r9d
1195.byte	0x66,0x90
1196.byte	102,15,56,220,225
1197.byte	102,15,56,220,233
1198	movl	%r9d,64+12(%rsp)
1199	leaq	5(%r8),%r9
1200.byte	102,15,56,220,241
1201.byte	102,15,56,220,249
1202.byte	102,68,15,56,220,193
1203.byte	102,68,15,56,220,201
1204	movups	112-128(%rcx),%xmm1
1205	bswapl	%r9d
1206.byte	102,15,56,220,208
1207.byte	102,15,56,220,216
1208	xorl	%r11d,%r9d
1209.byte	0x66,0x90
1210.byte	102,15,56,220,224
1211.byte	102,15,56,220,232
1212	movl	%r9d,80+12(%rsp)
1213	leaq	6(%r8),%r9
1214.byte	102,15,56,220,240
1215.byte	102,15,56,220,248
1216.byte	102,68,15,56,220,192
1217.byte	102,68,15,56,220,200
1218	movups	128-128(%rcx),%xmm0
1219	bswapl	%r9d
1220.byte	102,15,56,220,209
1221.byte	102,15,56,220,217
1222	xorl	%r11d,%r9d
1223.byte	0x66,0x90
1224.byte	102,15,56,220,225
1225.byte	102,15,56,220,233
1226	movl	%r9d,96+12(%rsp)
1227	leaq	7(%r8),%r9
1228.byte	102,15,56,220,241
1229.byte	102,15,56,220,249
1230.byte	102,68,15,56,220,193
1231.byte	102,68,15,56,220,201
1232	movups	144-128(%rcx),%xmm1
1233	bswapl	%r9d
1234.byte	102,15,56,220,208
1235.byte	102,15,56,220,216
1236.byte	102,15,56,220,224
1237	xorl	%r11d,%r9d
1238	movdqu	0(%rdi),%xmm10
1239.byte	102,15,56,220,232
1240	movl	%r9d,112+12(%rsp)
1241	cmpl	$11,%eax
1242.byte	102,15,56,220,240
1243.byte	102,15,56,220,248
1244.byte	102,68,15,56,220,192
1245.byte	102,68,15,56,220,200
1246	movups	160-128(%rcx),%xmm0
1247
1248	jb	L$ctr32_enc_done
1249
1250.byte	102,15,56,220,209
1251.byte	102,15,56,220,217
1252.byte	102,15,56,220,225
1253.byte	102,15,56,220,233
1254.byte	102,15,56,220,241
1255.byte	102,15,56,220,249
1256.byte	102,68,15,56,220,193
1257.byte	102,68,15,56,220,201
1258	movups	176-128(%rcx),%xmm1
1259
1260.byte	102,15,56,220,208
1261.byte	102,15,56,220,216
1262.byte	102,15,56,220,224
1263.byte	102,15,56,220,232
1264.byte	102,15,56,220,240
1265.byte	102,15,56,220,248
1266.byte	102,68,15,56,220,192
1267.byte	102,68,15,56,220,200
1268	movups	192-128(%rcx),%xmm0
1269	je	L$ctr32_enc_done
1270
1271.byte	102,15,56,220,209
1272.byte	102,15,56,220,217
1273.byte	102,15,56,220,225
1274.byte	102,15,56,220,233
1275.byte	102,15,56,220,241
1276.byte	102,15,56,220,249
1277.byte	102,68,15,56,220,193
1278.byte	102,68,15,56,220,201
1279	movups	208-128(%rcx),%xmm1
1280
1281.byte	102,15,56,220,208
1282.byte	102,15,56,220,216
1283.byte	102,15,56,220,224
1284.byte	102,15,56,220,232
1285.byte	102,15,56,220,240
1286.byte	102,15,56,220,248
1287.byte	102,68,15,56,220,192
1288.byte	102,68,15,56,220,200
1289	movups	224-128(%rcx),%xmm0
1290	jmp	L$ctr32_enc_done
1291
1292.p2align	4
1293L$ctr32_enc_done:
1294	movdqu	16(%rdi),%xmm11
1295	pxor	%xmm0,%xmm10
1296	movdqu	32(%rdi),%xmm12
1297	pxor	%xmm0,%xmm11
1298	movdqu	48(%rdi),%xmm13
1299	pxor	%xmm0,%xmm12
1300	movdqu	64(%rdi),%xmm14
1301	pxor	%xmm0,%xmm13
1302	movdqu	80(%rdi),%xmm15
1303	pxor	%xmm0,%xmm14
1304	pxor	%xmm0,%xmm15
1305.byte	102,15,56,220,209
1306.byte	102,15,56,220,217
1307.byte	102,15,56,220,225
1308.byte	102,15,56,220,233
1309.byte	102,15,56,220,241
1310.byte	102,15,56,220,249
1311.byte	102,68,15,56,220,193
1312.byte	102,68,15,56,220,201
1313	movdqu	96(%rdi),%xmm1
1314	leaq	128(%rdi),%rdi
1315
1316.byte	102,65,15,56,221,210
1317	pxor	%xmm0,%xmm1
1318	movdqu	112-128(%rdi),%xmm10
1319.byte	102,65,15,56,221,219
1320	pxor	%xmm0,%xmm10
1321	movdqa	0(%rsp),%xmm11
1322.byte	102,65,15,56,221,228
1323.byte	102,65,15,56,221,237
1324	movdqa	16(%rsp),%xmm12
1325	movdqa	32(%rsp),%xmm13
1326.byte	102,65,15,56,221,246
1327.byte	102,65,15,56,221,255
1328	movdqa	48(%rsp),%xmm14
1329	movdqa	64(%rsp),%xmm15
1330.byte	102,68,15,56,221,193
1331	movdqa	80(%rsp),%xmm0
1332	movups	16-128(%rcx),%xmm1
1333.byte	102,69,15,56,221,202
1334
1335	movups	%xmm2,(%rsi)
1336	movdqa	%xmm11,%xmm2
1337	movups	%xmm3,16(%rsi)
1338	movdqa	%xmm12,%xmm3
1339	movups	%xmm4,32(%rsi)
1340	movdqa	%xmm13,%xmm4
1341	movups	%xmm5,48(%rsi)
1342	movdqa	%xmm14,%xmm5
1343	movups	%xmm6,64(%rsi)
1344	movdqa	%xmm15,%xmm6
1345	movups	%xmm7,80(%rsi)
1346	movdqa	%xmm0,%xmm7
1347	movups	%xmm8,96(%rsi)
1348	movups	%xmm9,112(%rsi)
1349	leaq	128(%rsi),%rsi
1350
1351	subq	$8,%rdx
1352	jnc	L$ctr32_loop8
1353
1354	addq	$8,%rdx
1355	jz	L$ctr32_done
1356	leaq	-128(%rcx),%rcx
1357
1358L$ctr32_tail:
1359	leaq	16(%rcx),%rcx
1360	cmpq	$4,%rdx
1361	jb	L$ctr32_loop3
1362	je	L$ctr32_loop4
1363
1364	shll	$4,%eax
1365	movdqa	96(%rsp),%xmm8
1366	pxor	%xmm9,%xmm9
1367
1368	movups	16(%rcx),%xmm0
1369.byte	102,15,56,220,209
1370.byte	102,15,56,220,217
1371	leaq	32-16(%rcx,%rax,1),%rcx
1372	negq	%rax
1373.byte	102,15,56,220,225
1374	addq	$16,%rax
1375	movups	(%rdi),%xmm10
1376.byte	102,15,56,220,233
1377.byte	102,15,56,220,241
1378	movups	16(%rdi),%xmm11
1379	movups	32(%rdi),%xmm12
1380.byte	102,15,56,220,249
1381.byte	102,68,15,56,220,193
1382
1383	call	L$enc_loop8_enter
1384
1385	movdqu	48(%rdi),%xmm13
1386	pxor	%xmm10,%xmm2
1387	movdqu	64(%rdi),%xmm10
1388	pxor	%xmm11,%xmm3
1389	movdqu	%xmm2,(%rsi)
1390	pxor	%xmm12,%xmm4
1391	movdqu	%xmm3,16(%rsi)
1392	pxor	%xmm13,%xmm5
1393	movdqu	%xmm4,32(%rsi)
1394	pxor	%xmm10,%xmm6
1395	movdqu	%xmm5,48(%rsi)
1396	movdqu	%xmm6,64(%rsi)
1397	cmpq	$6,%rdx
1398	jb	L$ctr32_done
1399
1400	movups	80(%rdi),%xmm11
1401	xorps	%xmm11,%xmm7
1402	movups	%xmm7,80(%rsi)
1403	je	L$ctr32_done
1404
1405	movups	96(%rdi),%xmm12
1406	xorps	%xmm12,%xmm8
1407	movups	%xmm8,96(%rsi)
1408	jmp	L$ctr32_done
1409
1410.p2align	5
1411L$ctr32_loop4:
1412.byte	102,15,56,220,209
1413	leaq	16(%rcx),%rcx
1414	decl	%eax
1415.byte	102,15,56,220,217
1416.byte	102,15,56,220,225
1417.byte	102,15,56,220,233
1418	movups	(%rcx),%xmm1
1419	jnz	L$ctr32_loop4
1420.byte	102,15,56,221,209
1421.byte	102,15,56,221,217
1422	movups	(%rdi),%xmm10
1423	movups	16(%rdi),%xmm11
1424.byte	102,15,56,221,225
1425.byte	102,15,56,221,233
1426	movups	32(%rdi),%xmm12
1427	movups	48(%rdi),%xmm13
1428
1429	xorps	%xmm10,%xmm2
1430	movups	%xmm2,(%rsi)
1431	xorps	%xmm11,%xmm3
1432	movups	%xmm3,16(%rsi)
1433	pxor	%xmm12,%xmm4
1434	movdqu	%xmm4,32(%rsi)
1435	pxor	%xmm13,%xmm5
1436	movdqu	%xmm5,48(%rsi)
1437	jmp	L$ctr32_done
1438
1439.p2align	5
1440L$ctr32_loop3:
1441.byte	102,15,56,220,209
1442	leaq	16(%rcx),%rcx
1443	decl	%eax
1444.byte	102,15,56,220,217
1445.byte	102,15,56,220,225
1446	movups	(%rcx),%xmm1
1447	jnz	L$ctr32_loop3
1448.byte	102,15,56,221,209
1449.byte	102,15,56,221,217
1450.byte	102,15,56,221,225
1451
1452	movups	(%rdi),%xmm10
1453	xorps	%xmm10,%xmm2
1454	movups	%xmm2,(%rsi)
1455	cmpq	$2,%rdx
1456	jb	L$ctr32_done
1457
1458	movups	16(%rdi),%xmm11
1459	xorps	%xmm11,%xmm3
1460	movups	%xmm3,16(%rsi)
1461	je	L$ctr32_done
1462
1463	movups	32(%rdi),%xmm12
1464	xorps	%xmm12,%xmm4
1465	movups	%xmm4,32(%rsi)
1466	jmp	L$ctr32_done
1467
1468.p2align	4
1469L$ctr32_one_shortcut:
1470	movups	(%r8),%xmm2
1471	movups	(%rdi),%xmm10
1472	movl	240(%rcx),%eax
1473	movups	(%rcx),%xmm0
1474	movups	16(%rcx),%xmm1
1475	leaq	32(%rcx),%rcx
1476	xorps	%xmm0,%xmm2
1477L$oop_enc1_7:
1478.byte	102,15,56,220,209
1479	decl	%eax
1480	movups	(%rcx),%xmm1
1481	leaq	16(%rcx),%rcx
1482	jnz	L$oop_enc1_7
1483.byte	102,15,56,221,209
1484	xorps	%xmm10,%xmm2
1485	movups	%xmm2,(%rsi)
1486	jmp	L$ctr32_done
1487
1488.p2align	4
1489L$ctr32_done:
1490	leaq	(%rbp),%rsp
1491	popq	%rbp
1492L$ctr32_epilogue:
1493	.byte	0xf3,0xc3
1494
// ---------------------------------------------------------------------
// _aesni_xts_encrypt
//
// Tweaked block encryption with a partial-block ("stealing") tail.
// NOTE(review): the structure matches XTS-AES encryption; confirm against
// the C prototype, which is not visible in this part of the file.
//
// Registers on entry, as consumed by the code below:
//   %rdi  input pointer, read 16 bytes at a time and advanced
//   %rsi  output pointer, written 16 bytes at a time and advanced
//   %rdx  length in bytes; the low 4 bits select the stealing tail
//   %rcx  key schedule for the data blocks, round count at offset 240
//   %r8   key schedule used once to encrypt the initial tweak,
//         round count at offset 240
//   %r9   pointer to the 16-byte value that is encrypted into the tweak
//
// Long-lived registers after setup:
//   %r11        copy of the data key schedule pointer
//   %r10        loop index seed inside the six-block loop, round count
//               on the short and stealing paths
//   %r9         copy of the byte length (tail size = %r9 & 15)
//   %xmm10-14   tweaks for blocks 0..4 (xored with round key 0 while the
//               six-block loop is running)
//   %xmm15      the following tweak, never xored with a round key
//
// Stack frame: 112 bytes, 16-byte aligned.
//   0..95(%rsp)  six values (tweak ^ last round key), used as the memory
//                operand of the final aesenclast of each block
//   96(%rsp)     round key 0 ^ last round key
//
// The .byte sequences are hand-encoded AES-NI instructions:
//   102,15,56,220,M  = aesenc      (M is the ModRM byte)
//   102,15,56,221,M  = aesenclast
//   M = 209,217,225,233,241,249 selects %xmm1 as source and
//       %xmm2..%xmm7 as destination; M = 208,216,...,248 is the same
//       with %xmm0 as source; M,36,D is a D(%rsp) memory source.
//
// NOTE(review): the stealing tail rewrites the block at -16(%rsi), so it
// presumes at least one full block was produced; nothing here checks
// that the length is at least 16 - confirm callers guarantee it.
// ---------------------------------------------------------------------
.globl	_aesni_xts_encrypt
.private_extern _aesni_xts_encrypt

.p2align	4
_aesni_xts_encrypt:
	// Build an aligned 112-byte frame. %rbp ends up pointing at the
	// slot holding the saved %rbp, so the epilogue can undo the
	// alignment with leaq (%rbp),%rsp followed by popq %rbp.
	leaq	(%rsp),%rax
	pushq	%rbp
	subq	$112,%rsp
	andq	$-16,%rsp
	leaq	-8(%rax),%rbp
	// Encrypt the 16 bytes at (%r9) with the key schedule at %r8 to
	// obtain the initial tweak in %xmm2. %r10d = round count of the
	// data key schedule.
	movups	(%r9),%xmm2
	movl	240(%r8),%eax
	movl	240(%rcx),%r10d
	movups	(%r8),%xmm0
	movups	16(%r8),%xmm1
	leaq	32(%r8),%r8
	xorps	%xmm0,%xmm2
L$oop_enc1_8:
	// aesenc %xmm1,%xmm2; the flags from decl are still intact at the
	// jnz because movups and leaq do not modify them.
.byte	102,15,56,220,209
	decl	%eax
	movups	(%r8),%xmm1
	leaq	16(%r8),%r8
	jnz	L$oop_enc1_8
	// aesenclast %xmm1,%xmm2
.byte	102,15,56,221,209
	// %xmm0 = round key 0, %r11 = key schedule, %eax = rounds,
	// %r10 = 16 * rounds, %r9 = byte length, %rdx = length rounded
	// down to a multiple of 16.
	movups	(%rcx),%xmm0
	movq	%rcx,%r11
	movl	%r10d,%eax
	shll	$4,%r10d
	movq	%rdx,%r9
	andq	$-16,%rdx

	// %xmm1 = last round key
	movups	16(%rcx,%r10,1),%xmm1

	// Derive the tweaks for the first five blocks into %xmm10..%xmm14,
	// each xored with round key 0, leaving the sixth in %xmm15.
	// One step: paddq doubles both 64-bit halves of %xmm15, the sign
	// bits picked out through pshufd $95 / psrad $31 are masked with
	// L$xts_magic and xored back in as the carries.
	// NOTE(review): L$xts_magic is defined outside this block;
	// presumably it holds the GF(2^128) reduction constant so that a
	// step is a multiplication by x - confirm.
	// %xmm1 becomes round key 0 ^ last round key and is parked at
	// 96(%rsp).
	movdqa	L$xts_magic(%rip),%xmm8
	movdqa	%xmm2,%xmm15
	pshufd	$95,%xmm2,%xmm9
	pxor	%xmm0,%xmm1
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm10
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm10
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm11
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm11
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm12
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm12
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm13
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm13
	pxor	%xmm14,%xmm15
	movdqa	%xmm15,%xmm14
	psrad	$31,%xmm9
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm9
	pxor	%xmm0,%xmm14
	pxor	%xmm9,%xmm15
	movaps	%xmm1,96(%rsp)

	// Fewer than six whole blocks: skip the six-block loop.
	subq	$96,%rdx
	jc	L$xts_enc_short

	// Six-block loop setup: %rcx = key schedule + 32 + 16*rounds,
	// %rax = %r10 = 112 - 16*rounds (a negative index that the inner
	// loop steps up to zero), %xmm1 = round key 1,
	// %r8 = address of L$xts_magic.
	movl	$16+96,%eax
	leaq	32(%r11,%r10,1),%rcx
	subq	%r10,%rax
	movups	16(%r11),%xmm1
	movq	%rax,%r10
	leaq	L$xts_magic(%rip),%r8
	jmp	L$xts_enc_grandloop

.p2align	5
L$xts_enc_grandloop:
	// Load six input blocks into %xmm2..%xmm7 and xor each with its
	// (tweak ^ round key 0); the sixth value is built in %xmm8 from
	// %xmm0 and %xmm15. The first rounds (aesenc with %xmm1, then
	// %xmm0) are interleaved with the loads.
	movdqu	0(%rdi),%xmm2
	movdqa	%xmm0,%xmm8
	movdqu	16(%rdi),%xmm3
	pxor	%xmm10,%xmm2
	movdqu	32(%rdi),%xmm4
	pxor	%xmm11,%xmm3
.byte	102,15,56,220,209
	movdqu	48(%rdi),%xmm5
	pxor	%xmm12,%xmm4
.byte	102,15,56,220,217
	movdqu	64(%rdi),%xmm6
	pxor	%xmm13,%xmm5
.byte	102,15,56,220,225
	movdqu	80(%rdi),%xmm7
	pxor	%xmm15,%xmm8
	movdqa	96(%rsp),%xmm9
	pxor	%xmm14,%xmm6
.byte	102,15,56,220,233
	movups	32(%r11),%xmm0
	leaq	96(%rdi),%rdi
	pxor	%xmm8,%xmm7

	// Xoring (round key 0 ^ last round key) from %xmm9 into each
	// whitened tweak turns it into (tweak ^ last round key); these are
	// spilled to the frame for the final aesenclast. The %xmm13 slot at
	// 48(%rsp) is written later, after the inner loop.
	pxor	%xmm9,%xmm10
.byte	102,15,56,220,241
	pxor	%xmm9,%xmm11
	movdqa	%xmm10,0(%rsp)
.byte	102,15,56,220,249
	movups	48(%r11),%xmm1
	pxor	%xmm9,%xmm12

.byte	102,15,56,220,208
	pxor	%xmm9,%xmm13
	movdqa	%xmm11,16(%rsp)
.byte	102,15,56,220,216
	pxor	%xmm9,%xmm14
	movdqa	%xmm12,32(%rsp)
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	pxor	%xmm9,%xmm8
	movdqa	%xmm14,64(%rsp)
.byte	102,15,56,220,240
.byte	102,15,56,220,248
	movups	64(%r11),%xmm0
	movdqa	%xmm8,80(%rsp)
	// Restart the carry-mask source from the current %xmm15.
	pshufd	$95,%xmm15,%xmm9
	jmp	L$xts_enc_loop6
.p2align	5
L$xts_enc_loop6:
	// Two rounds per pass over all six blocks, alternating %xmm1 and
	// %xmm0 as the round key. addq sets the flags that jnz tests; the
	// aesenc and movups in between leave them untouched.
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,220,241
.byte	102,15,56,220,249
	movups	-64(%rcx,%rax,1),%xmm1
	addq	$32,%rax

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
.byte	102,15,56,220,240
.byte	102,15,56,220,248
	movups	-80(%rcx,%rax,1),%xmm0
	jnz	L$xts_enc_loop6

	// Remaining rounds, interleaved with generating the tweaks for the
	// next pass: %xmm10..%xmm14 are rebuilt as (round key 0 ^ tweak)
	// starting from round key 0 loaded via %r11, and %xmm15 keeps
	// advancing one step per block.
	movdqa	(%r8),%xmm8
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
.byte	102,15,56,220,209
	paddq	%xmm15,%xmm15
	psrad	$31,%xmm14
.byte	102,15,56,220,217
	pand	%xmm8,%xmm14
	movups	(%r11),%xmm10
.byte	102,15,56,220,225
.byte	102,15,56,220,233
.byte	102,15,56,220,241
	pxor	%xmm14,%xmm15
	movaps	%xmm10,%xmm11
.byte	102,15,56,220,249
	movups	-64(%rcx),%xmm1

	movdqa	%xmm9,%xmm14
.byte	102,15,56,220,208
	paddd	%xmm9,%xmm9
	pxor	%xmm15,%xmm10
.byte	102,15,56,220,216
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	pand	%xmm8,%xmm14
	movaps	%xmm11,%xmm12
.byte	102,15,56,220,240
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
.byte	102,15,56,220,248
	movups	-48(%rcx),%xmm0

	paddd	%xmm9,%xmm9
.byte	102,15,56,220,209
	pxor	%xmm15,%xmm11
	psrad	$31,%xmm14
.byte	102,15,56,220,217
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	// Spill the fourth (tweak ^ last round key) before %xmm13 is
	// reused for the next pass.
	movdqa	%xmm13,48(%rsp)
	pxor	%xmm14,%xmm15
.byte	102,15,56,220,241
	movaps	%xmm12,%xmm13
	movdqa	%xmm9,%xmm14
.byte	102,15,56,220,249
	movups	-32(%rcx),%xmm1

	paddd	%xmm9,%xmm9
.byte	102,15,56,220,208
	pxor	%xmm15,%xmm12
	psrad	$31,%xmm14
.byte	102,15,56,220,216
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
.byte	102,15,56,220,224
.byte	102,15,56,220,232
.byte	102,15,56,220,240
	pxor	%xmm14,%xmm15
	movaps	%xmm13,%xmm14
.byte	102,15,56,220,248

	// %xmm0 is free as a scratch carry mask here; it is reloaded with
	// round key 0, and %xmm1 with round key 1, for the next pass.
	movdqa	%xmm9,%xmm0
	paddd	%xmm9,%xmm9
.byte	102,15,56,220,209
	pxor	%xmm15,%xmm13
	psrad	$31,%xmm0
.byte	102,15,56,220,217
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm0
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	pxor	%xmm0,%xmm15
	movups	(%r11),%xmm0
.byte	102,15,56,220,241
.byte	102,15,56,220,249
	movups	16(%r11),%xmm1

	// Final round: aesenclast with the spilled (tweak ^ last round key)
	// values at 0,16,...,80(%rsp), which applies the last round key and
	// the output tweak xor in one instruction. %rax is reset from %r10
	// for the next pass.
	pxor	%xmm15,%xmm14
.byte	102,15,56,221,84,36,0
	psrad	$31,%xmm9
	paddq	%xmm15,%xmm15
.byte	102,15,56,221,92,36,16
.byte	102,15,56,221,100,36,32
	pand	%xmm8,%xmm9
	movq	%r10,%rax
.byte	102,15,56,221,108,36,48
.byte	102,15,56,221,116,36,64
.byte	102,15,56,221,124,36,80
	pxor	%xmm9,%xmm15

	// Store six output blocks and loop while at least 96 more bytes
	// of whole blocks remain.
	leaq	96(%rsi),%rsi
	movups	%xmm2,-96(%rsi)
	movups	%xmm3,-80(%rsi)
	movups	%xmm4,-64(%rsi)
	movups	%xmm5,-48(%rsi)
	movups	%xmm6,-32(%rsi)
	movups	%xmm7,-16(%rsi)
	subq	$96,%rdx
	jnc	L$xts_enc_grandloop

	// Recover %eax = round count (112 - %r10d is 16 * rounds) and
	// %rcx = key schedule for the short path.
	movl	$16+96,%eax
	subl	%r10d,%eax
	movq	%r11,%rcx
	shrl	$4,%eax

L$xts_enc_short:
	// 0..5 whole blocks remain. %r10d keeps the round count for the
	// stealing step. Each tweak is xored with %xmm0 (round key 0) again
	// to strip the whitening, because the helpers called below apply
	// round key 0 themselves. The flags from each cmpq survive the pxor
	// that sits between the two branches testing them.
	movl	%eax,%r10d
	pxor	%xmm0,%xmm10
	addq	$96,%rdx
	jz	L$xts_enc_done

	pxor	%xmm0,%xmm11
	cmpq	$32,%rdx
	jb	L$xts_enc_one
	pxor	%xmm0,%xmm12
	je	L$xts_enc_two

	pxor	%xmm0,%xmm13
	cmpq	$64,%rdx
	jb	L$xts_enc_three
	pxor	%xmm0,%xmm14
	je	L$xts_enc_four

	// Five blocks: xor with the tweaks, encrypt, xor again, store.
	// %xmm10 is then set to the next unused tweak (%xmm15) for a
	// possible stealing step.
	movdqu	(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqu	32(%rdi),%xmm4
	pxor	%xmm10,%xmm2
	movdqu	48(%rdi),%xmm5
	pxor	%xmm11,%xmm3
	movdqu	64(%rdi),%xmm6
	leaq	80(%rdi),%rdi
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6

	call	_aesni_encrypt6

	xorps	%xmm10,%xmm2
	movdqa	%xmm15,%xmm10
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4
	movdqu	%xmm2,(%rsi)
	xorps	%xmm13,%xmm5
	movdqu	%xmm3,16(%rsi)
	xorps	%xmm14,%xmm6
	movdqu	%xmm4,32(%rsi)
	movdqu	%xmm5,48(%rsi)
	movdqu	%xmm6,64(%rsi)
	leaq	80(%rsi),%rsi
	jmp	L$xts_enc_done

.p2align	4
L$xts_enc_one:
	// One block, encrypted inline; %xmm10 then becomes the next tweak.
	movups	(%rdi),%xmm2
	leaq	16(%rdi),%rdi
	xorps	%xmm10,%xmm2
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_enc1_9:
.byte	102,15,56,220,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_enc1_9
.byte	102,15,56,221,209
	xorps	%xmm10,%xmm2
	movdqa	%xmm11,%xmm10
	movups	%xmm2,(%rsi)
	leaq	16(%rsi),%rsi
	jmp	L$xts_enc_done

.p2align	4
L$xts_enc_two:
	// Two blocks; %xmm10 then becomes the next tweak (%xmm12).
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	leaq	32(%rdi),%rdi
	xorps	%xmm10,%xmm2
	xorps	%xmm11,%xmm3

	call	_aesni_encrypt2

	xorps	%xmm10,%xmm2
	movdqa	%xmm12,%xmm10
	xorps	%xmm11,%xmm3
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	leaq	32(%rsi),%rsi
	jmp	L$xts_enc_done

.p2align	4
L$xts_enc_three:
	// Three blocks; %xmm10 then becomes the next tweak (%xmm13).
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	movups	32(%rdi),%xmm4
	leaq	48(%rdi),%rdi
	xorps	%xmm10,%xmm2
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4

	call	_aesni_encrypt3

	xorps	%xmm10,%xmm2
	movdqa	%xmm13,%xmm10
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	leaq	48(%rsi),%rsi
	jmp	L$xts_enc_done

.p2align	4
L$xts_enc_four:
	// Four blocks; %xmm10 then becomes the next tweak (%xmm14).
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	movups	32(%rdi),%xmm4
	xorps	%xmm10,%xmm2
	movups	48(%rdi),%xmm5
	leaq	64(%rdi),%rdi
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4
	xorps	%xmm13,%xmm5

	call	_aesni_encrypt4

	pxor	%xmm10,%xmm2
	movdqa	%xmm14,%xmm10
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	movdqu	%xmm2,(%rsi)
	pxor	%xmm13,%xmm5
	movdqu	%xmm3,16(%rsi)
	movdqu	%xmm4,32(%rsi)
	movdqu	%xmm5,48(%rsi)
	leaq	64(%rsi),%rsi
	jmp	L$xts_enc_done

.p2align	4
L$xts_enc_done:
	// Tail size = length & 15. Zero means no partial block: return.
	andq	$15,%r9
	jz	L$xts_enc_ret
	movq	%r9,%rdx

L$xts_enc_steal:
	// For each tail byte i: the input byte replaces byte i of the last
	// output block at -16(%rsi), and the byte it displaces becomes
	// output tail byte i.
	movzbl	(%rdi),%eax
	movzbl	-16(%rsi),%ecx
	leaq	1(%rdi),%rdi
	movb	%al,-16(%rsi)
	movb	%cl,0(%rsi)
	leaq	1(%rsi),%rsi
	subq	$1,%rdx
	jnz	L$xts_enc_steal

	// Rewind %rsi by the tail size, restore key pointer and round
	// count, then encrypt the patched block in place using the tweak
	// left in %xmm10.
	subq	%r9,%rsi
	movq	%r11,%rcx
	movl	%r10d,%eax

	movups	-16(%rsi),%xmm2
	xorps	%xmm10,%xmm2
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_enc1_10:
.byte	102,15,56,220,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_enc1_10
.byte	102,15,56,221,209
	xorps	%xmm10,%xmm2
	movups	%xmm2,-16(%rsi)

L$xts_enc_ret:
	// Drop the aligned frame and restore %rbp.
	leaq	(%rbp),%rsp
	popq	%rbp
L$xts_enc_epilogue:
	// 0xf3,0xc3 encodes "rep ret".
	.byte	0xf3,0xc3
1935
// ---------------------------------------------------------------------
// _aesni_xts_decrypt
//
// Tweaked block decryption with a partial-block ("stealing") tail.
// NOTE(review): the structure matches XTS-AES decryption; confirm against
// the C prototype, which is not visible in this part of the file.
//
// Registers on entry, as consumed by the code below:
//   %rdi  input pointer, read 16 bytes at a time and advanced
//   %rsi  output pointer, written 16 bytes at a time and advanced
//   %rdx  length in bytes; the low 4 bits select the stealing tail
//   %rcx  key schedule for the data blocks (used with aesdec),
//         round count at offset 240
//   %r8   key schedule used once to ENCRYPT the initial tweak,
//         round count at offset 240
//   %r9   pointer to the 16-byte value that is encrypted into the tweak
//
// Long-lived registers after setup:
//   %r11        copy of the data key schedule pointer
//   %r10        loop index seed inside the six-block loop, round count
//               on the short and stealing paths
//   %r9         byte length after one block has been held back when a
//               tail exists (tail size = %r9 & 15)
//   %xmm10-14   tweaks for blocks 0..4 (xored with round key 0 while the
//               six-block loop is running)
//   %xmm15      the following tweak, never xored with a round key
//
// Stack frame: 112 bytes, 16-byte aligned.
//   0..95(%rsp)  six values (tweak ^ last round key), used as the memory
//                operand of the final aesdeclast of each block
//   96(%rsp)     round key 0 ^ last round key
//
// The .byte sequences are hand-encoded AES-NI instructions:
//   102,15,56,220,M  = aesenc      (tweak encryption only)
//   102,15,56,221,M  = aesenclast  (tweak encryption only)
//   102,15,56,222,M  = aesdec
//   102,15,56,223,M  = aesdeclast
//   M = 209,217,225,233,241,249 selects %xmm1 as source and
//       %xmm2..%xmm7 as destination; M = 208,216,...,248 is the same
//       with %xmm0 as source; M,36,D is a D(%rsp) memory source.
//
// NOTE(review): when a tail exists the stealing path reads a full block
// at (%rdi) plus the tail behind it; nothing here checks that the length
// is at least 16 - confirm callers guarantee it.
// ---------------------------------------------------------------------
.globl	_aesni_xts_decrypt
.private_extern _aesni_xts_decrypt

.p2align	4
_aesni_xts_decrypt:
	// Build an aligned 112-byte frame. %rbp ends up pointing at the
	// slot holding the saved %rbp, so the epilogue can undo the
	// alignment with leaq (%rbp),%rsp followed by popq %rbp.
	leaq	(%rsp),%rax
	pushq	%rbp
	subq	$112,%rsp
	andq	$-16,%rsp
	leaq	-8(%rax),%rbp
	// Encrypt the 16 bytes at (%r9) with the key schedule at %r8 to
	// obtain the initial tweak in %xmm2. %r10d = round count of the
	// data key schedule.
	movups	(%r9),%xmm2
	movl	240(%r8),%eax
	movl	240(%rcx),%r10d
	movups	(%r8),%xmm0
	movups	16(%r8),%xmm1
	leaq	32(%r8),%r8
	xorps	%xmm0,%xmm2
L$oop_enc1_11:
	// aesenc %xmm1,%xmm2; the flags from decl are still intact at the
	// jnz because movups and leaq do not modify them.
.byte	102,15,56,220,209
	decl	%eax
	movups	(%r8),%xmm1
	leaq	16(%r8),%r8
	jnz	L$oop_enc1_11
	// aesenclast %xmm1,%xmm2
.byte	102,15,56,221,209
	// If the length is not a multiple of 16, subtract 16 from it so
	// that the last whole block is held back for the stealing step.
	xorl	%eax,%eax
	testq	$15,%rdx
	setnz	%al
	shlq	$4,%rax
	subq	%rax,%rdx

	// %xmm0 = round key 0, %r11 = key schedule, %eax = rounds,
	// %r10 = 16 * rounds, %r9 = adjusted byte length, %rdx = that
	// length rounded down to a multiple of 16.
	movups	(%rcx),%xmm0
	movq	%rcx,%r11
	movl	%r10d,%eax
	shll	$4,%r10d
	movq	%rdx,%r9
	andq	$-16,%rdx

	// %xmm1 = last round key
	movups	16(%rcx,%r10,1),%xmm1

	// Derive the tweaks for the first five blocks into %xmm10..%xmm14,
	// each xored with round key 0, leaving the sixth in %xmm15.
	// One step: paddq doubles both 64-bit halves of %xmm15, the sign
	// bits picked out through pshufd $95 / psrad $31 are masked with
	// L$xts_magic and xored back in as the carries.
	// NOTE(review): L$xts_magic is defined outside this block;
	// presumably it holds the GF(2^128) reduction constant so that a
	// step is a multiplication by x - confirm.
	// %xmm1 becomes round key 0 ^ last round key and is parked at
	// 96(%rsp).
	movdqa	L$xts_magic(%rip),%xmm8
	movdqa	%xmm2,%xmm15
	pshufd	$95,%xmm2,%xmm9
	pxor	%xmm0,%xmm1
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm10
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm10
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm11
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm11
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm12
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm12
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
	movdqa	%xmm15,%xmm13
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
	pxor	%xmm0,%xmm13
	pxor	%xmm14,%xmm15
	movdqa	%xmm15,%xmm14
	psrad	$31,%xmm9
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm9
	pxor	%xmm0,%xmm14
	pxor	%xmm9,%xmm15
	movaps	%xmm1,96(%rsp)

	// Fewer than six whole blocks: skip the six-block loop.
	subq	$96,%rdx
	jc	L$xts_dec_short

	// Six-block loop setup: %rcx = key schedule + 32 + 16*rounds,
	// %rax = %r10 = 112 - 16*rounds (a negative index that the inner
	// loop steps up to zero), %xmm1 = round key 1,
	// %r8 = address of L$xts_magic.
	movl	$16+96,%eax
	leaq	32(%r11,%r10,1),%rcx
	subq	%r10,%rax
	movups	16(%r11),%xmm1
	movq	%rax,%r10
	leaq	L$xts_magic(%rip),%r8
	jmp	L$xts_dec_grandloop

.p2align	5
L$xts_dec_grandloop:
	// Load six input blocks into %xmm2..%xmm7 and xor each with its
	// (tweak ^ round key 0); the sixth value is built in %xmm8 from
	// %xmm0 and %xmm15. The first rounds (aesdec with %xmm1, then
	// %xmm0) are interleaved with the loads.
	movdqu	0(%rdi),%xmm2
	movdqa	%xmm0,%xmm8
	movdqu	16(%rdi),%xmm3
	pxor	%xmm10,%xmm2
	movdqu	32(%rdi),%xmm4
	pxor	%xmm11,%xmm3
.byte	102,15,56,222,209
	movdqu	48(%rdi),%xmm5
	pxor	%xmm12,%xmm4
.byte	102,15,56,222,217
	movdqu	64(%rdi),%xmm6
	pxor	%xmm13,%xmm5
.byte	102,15,56,222,225
	movdqu	80(%rdi),%xmm7
	pxor	%xmm15,%xmm8
	movdqa	96(%rsp),%xmm9
	pxor	%xmm14,%xmm6
.byte	102,15,56,222,233
	movups	32(%r11),%xmm0
	leaq	96(%rdi),%rdi
	pxor	%xmm8,%xmm7

	// Xoring (round key 0 ^ last round key) from %xmm9 into each
	// whitened tweak turns it into (tweak ^ last round key); these are
	// spilled to the frame for the final aesdeclast. The %xmm13 slot at
	// 48(%rsp) is written later, after the inner loop.
	pxor	%xmm9,%xmm10
.byte	102,15,56,222,241
	pxor	%xmm9,%xmm11
	movdqa	%xmm10,0(%rsp)
.byte	102,15,56,222,249
	movups	48(%r11),%xmm1
	pxor	%xmm9,%xmm12

.byte	102,15,56,222,208
	pxor	%xmm9,%xmm13
	movdqa	%xmm11,16(%rsp)
.byte	102,15,56,222,216
	pxor	%xmm9,%xmm14
	movdqa	%xmm12,32(%rsp)
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	pxor	%xmm9,%xmm8
	movdqa	%xmm14,64(%rsp)
.byte	102,15,56,222,240
.byte	102,15,56,222,248
	movups	64(%r11),%xmm0
	movdqa	%xmm8,80(%rsp)
	// Restart the carry-mask source from the current %xmm15.
	pshufd	$95,%xmm15,%xmm9
	jmp	L$xts_dec_loop6
.p2align	5
L$xts_dec_loop6:
	// Two rounds per pass over all six blocks, alternating %xmm1 and
	// %xmm0 as the round key. addq sets the flags that jnz tests; the
	// aesdec and movups in between leave them untouched.
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	movups	-64(%rcx,%rax,1),%xmm1
	addq	$32,%rax

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
.byte	102,15,56,222,248
	movups	-80(%rcx,%rax,1),%xmm0
	jnz	L$xts_dec_loop6

	// Remaining rounds, interleaved with generating the tweaks for the
	// next pass: %xmm10..%xmm14 are rebuilt as (round key 0 ^ tweak)
	// starting from round key 0 loaded via %r11, and %xmm15 keeps
	// advancing one step per block.
	movdqa	(%r8),%xmm8
	movdqa	%xmm9,%xmm14
	paddd	%xmm9,%xmm9
.byte	102,15,56,222,209
	paddq	%xmm15,%xmm15
	psrad	$31,%xmm14
.byte	102,15,56,222,217
	pand	%xmm8,%xmm14
	movups	(%r11),%xmm10
.byte	102,15,56,222,225
.byte	102,15,56,222,233
.byte	102,15,56,222,241
	pxor	%xmm14,%xmm15
	movaps	%xmm10,%xmm11
.byte	102,15,56,222,249
	movups	-64(%rcx),%xmm1

	movdqa	%xmm9,%xmm14
.byte	102,15,56,222,208
	paddd	%xmm9,%xmm9
	pxor	%xmm15,%xmm10
.byte	102,15,56,222,216
	psrad	$31,%xmm14
	paddq	%xmm15,%xmm15
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	pand	%xmm8,%xmm14
	movaps	%xmm11,%xmm12
.byte	102,15,56,222,240
	pxor	%xmm14,%xmm15
	movdqa	%xmm9,%xmm14
.byte	102,15,56,222,248
	movups	-48(%rcx),%xmm0

	paddd	%xmm9,%xmm9
.byte	102,15,56,222,209
	pxor	%xmm15,%xmm11
	psrad	$31,%xmm14
.byte	102,15,56,222,217
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	// Spill the fourth (tweak ^ last round key) before %xmm13 is
	// reused for the next pass.
	movdqa	%xmm13,48(%rsp)
	pxor	%xmm14,%xmm15
.byte	102,15,56,222,241
	movaps	%xmm12,%xmm13
	movdqa	%xmm9,%xmm14
.byte	102,15,56,222,249
	movups	-32(%rcx),%xmm1

	paddd	%xmm9,%xmm9
.byte	102,15,56,222,208
	pxor	%xmm15,%xmm12
	psrad	$31,%xmm14
.byte	102,15,56,222,216
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm14
.byte	102,15,56,222,224
.byte	102,15,56,222,232
.byte	102,15,56,222,240
	pxor	%xmm14,%xmm15
	movaps	%xmm13,%xmm14
.byte	102,15,56,222,248

	// %xmm0 is free as a scratch carry mask here; it is reloaded with
	// round key 0, and %xmm1 with round key 1, for the next pass.
	movdqa	%xmm9,%xmm0
	paddd	%xmm9,%xmm9
.byte	102,15,56,222,209
	pxor	%xmm15,%xmm13
	psrad	$31,%xmm0
.byte	102,15,56,222,217
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm0
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	pxor	%xmm0,%xmm15
	movups	(%r11),%xmm0
.byte	102,15,56,222,241
.byte	102,15,56,222,249
	movups	16(%r11),%xmm1

	// Final round: aesdeclast with the spilled (tweak ^ last round key)
	// values at 0,16,...,80(%rsp), which applies the last round key and
	// the output tweak xor in one instruction. %rax is reset from %r10
	// for the next pass.
	pxor	%xmm15,%xmm14
.byte	102,15,56,223,84,36,0
	psrad	$31,%xmm9
	paddq	%xmm15,%xmm15
.byte	102,15,56,223,92,36,16
.byte	102,15,56,223,100,36,32
	pand	%xmm8,%xmm9
	movq	%r10,%rax
.byte	102,15,56,223,108,36,48
.byte	102,15,56,223,116,36,64
.byte	102,15,56,223,124,36,80
	pxor	%xmm9,%xmm15

	// Store six output blocks and loop while at least 96 more bytes
	// of whole blocks remain.
	leaq	96(%rsi),%rsi
	movups	%xmm2,-96(%rsi)
	movups	%xmm3,-80(%rsi)
	movups	%xmm4,-64(%rsi)
	movups	%xmm5,-48(%rsi)
	movups	%xmm6,-32(%rsi)
	movups	%xmm7,-16(%rsi)
	subq	$96,%rdx
	jnc	L$xts_dec_grandloop

	// Recover %eax = round count (112 - %r10d is 16 * rounds) and
	// %rcx = key schedule for the short path.
	movl	$16+96,%eax
	subl	%r10d,%eax
	movq	%r11,%rcx
	shrl	$4,%eax

L$xts_dec_short:
	// 0..5 whole blocks remain. %r10d keeps the round count for the
	// stealing step. Each tweak is xored with %xmm0 (round key 0) again
	// to strip the whitening, because the helpers called below apply
	// round key 0 themselves. Unlike the encrypt side, one tweak more
	// than the block count is stripped, since the stealing step needs
	// two consecutive tweaks (%xmm10 and %xmm11). The flags from each
	// cmpq survive the pxor between the two branches testing them.
	movl	%eax,%r10d
	pxor	%xmm0,%xmm10
	pxor	%xmm0,%xmm11
	addq	$96,%rdx
	jz	L$xts_dec_done

	pxor	%xmm0,%xmm12
	cmpq	$32,%rdx
	jb	L$xts_dec_one
	pxor	%xmm0,%xmm13
	je	L$xts_dec_two

	pxor	%xmm0,%xmm14
	cmpq	$64,%rdx
	jb	L$xts_dec_three
	je	L$xts_dec_four

	// Five blocks: xor with the tweaks, decrypt, xor again, store.
	movdqu	(%rdi),%xmm2
	movdqu	16(%rdi),%xmm3
	movdqu	32(%rdi),%xmm4
	pxor	%xmm10,%xmm2
	movdqu	48(%rdi),%xmm5
	pxor	%xmm11,%xmm3
	movdqu	64(%rdi),%xmm6
	leaq	80(%rdi),%rdi
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6

	call	_aesni_decrypt6

	// While storing, start one more tweak step from %xmm15:
	// pcmpgtd against zero yields a mask for every negative dword and
	// pshufd $19 moves the masks into carry position in %xmm11.
	xorps	%xmm10,%xmm2
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4
	movdqu	%xmm2,(%rsi)
	xorps	%xmm13,%xmm5
	movdqu	%xmm3,16(%rsi)
	xorps	%xmm14,%xmm6
	movdqu	%xmm4,32(%rsi)
	pxor	%xmm14,%xmm14
	movdqu	%xmm5,48(%rsi)
	pcmpgtd	%xmm15,%xmm14
	movdqu	%xmm6,64(%rsi)
	leaq	80(%rsi),%rsi
	pshufd	$19,%xmm14,%xmm11
	andq	$15,%r9
	jz	L$xts_dec_ret

	// A tail exists: %xmm10 = current tweak, %xmm11 = the next one.
	// NOTE(review): this relies on %xmm8 still holding L$xts_magic
	// after the call to _aesni_decrypt6, which is defined outside this
	// block - confirm it leaves %xmm8 untouched.
	movdqa	%xmm15,%xmm10
	paddq	%xmm15,%xmm15
	pand	%xmm8,%xmm11
	pxor	%xmm15,%xmm11
	jmp	L$xts_dec_done2

.p2align	4
L$xts_dec_one:
	// One block, decrypted inline; afterwards %xmm10/%xmm11 hold the
	// next two tweaks.
	movups	(%rdi),%xmm2
	leaq	16(%rdi),%rdi
	xorps	%xmm10,%xmm2
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_12:
.byte	102,15,56,222,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_12
.byte	102,15,56,223,209
	xorps	%xmm10,%xmm2
	movdqa	%xmm11,%xmm10
	movups	%xmm2,(%rsi)
	movdqa	%xmm12,%xmm11
	leaq	16(%rsi),%rsi
	jmp	L$xts_dec_done

.p2align	4
L$xts_dec_two:
	// Two blocks; afterwards %xmm10/%xmm11 = %xmm12/%xmm13.
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	leaq	32(%rdi),%rdi
	xorps	%xmm10,%xmm2
	xorps	%xmm11,%xmm3

	call	_aesni_decrypt2

	xorps	%xmm10,%xmm2
	movdqa	%xmm12,%xmm10
	xorps	%xmm11,%xmm3
	movdqa	%xmm13,%xmm11
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	leaq	32(%rsi),%rsi
	jmp	L$xts_dec_done

.p2align	4
L$xts_dec_three:
	// Three blocks; afterwards %xmm10/%xmm11 = %xmm13/%xmm14.
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	movups	32(%rdi),%xmm4
	leaq	48(%rdi),%rdi
	xorps	%xmm10,%xmm2
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4

	call	_aesni_decrypt3

	xorps	%xmm10,%xmm2
	movdqa	%xmm13,%xmm10
	xorps	%xmm11,%xmm3
	movdqa	%xmm14,%xmm11
	xorps	%xmm12,%xmm4
	movups	%xmm2,(%rsi)
	movups	%xmm3,16(%rsi)
	movups	%xmm4,32(%rsi)
	leaq	48(%rsi),%rsi
	jmp	L$xts_dec_done

.p2align	4
L$xts_dec_four:
	// Four blocks; afterwards %xmm10/%xmm11 = %xmm14/%xmm15.
	movups	(%rdi),%xmm2
	movups	16(%rdi),%xmm3
	movups	32(%rdi),%xmm4
	xorps	%xmm10,%xmm2
	movups	48(%rdi),%xmm5
	leaq	64(%rdi),%rdi
	xorps	%xmm11,%xmm3
	xorps	%xmm12,%xmm4
	xorps	%xmm13,%xmm5

	call	_aesni_decrypt4

	pxor	%xmm10,%xmm2
	movdqa	%xmm14,%xmm10
	pxor	%xmm11,%xmm3
	movdqa	%xmm15,%xmm11
	pxor	%xmm12,%xmm4
	movdqu	%xmm2,(%rsi)
	pxor	%xmm13,%xmm5
	movdqu	%xmm3,16(%rsi)
	movdqu	%xmm4,32(%rsi)
	movdqu	%xmm5,48(%rsi)
	leaq	64(%rsi),%rsi
	jmp	L$xts_dec_done

.p2align	4
L$xts_dec_done:
	// Tail size = length & 15. Zero means no partial block: return.
	andq	$15,%r9
	jz	L$xts_dec_ret
L$xts_dec_done2:
	// Stealing, step 1: decrypt the held-back whole block at (%rdi)
	// with the LATER tweak (%xmm11) and store it at (%rsi).
	movq	%r9,%rdx
	movq	%r11,%rcx
	movl	%r10d,%eax

	movups	(%rdi),%xmm2
	xorps	%xmm11,%xmm2
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_13:
.byte	102,15,56,222,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_13
.byte	102,15,56,223,209
	xorps	%xmm11,%xmm2
	movups	%xmm2,(%rsi)

L$xts_dec_steal:
	// Step 2, for each tail byte i: input byte 16+i replaces byte i of
	// the block just written at (%rsi), and the byte it displaces
	// becomes output byte 16+i.
	movzbl	16(%rdi),%eax
	movzbl	(%rsi),%ecx
	leaq	1(%rdi),%rdi
	movb	%al,(%rsi)
	movb	%cl,16(%rsi)
	leaq	1(%rsi),%rsi
	subq	$1,%rdx
	jnz	L$xts_dec_steal

	// Step 3: rewind %rsi by the tail size, restore key pointer and
	// round count, then decrypt the patched block in place with the
	// EARLIER tweak (%xmm10).
	subq	%r9,%rsi
	movq	%r11,%rcx
	movl	%r10d,%eax

	movups	(%rsi),%xmm2
	xorps	%xmm10,%xmm2
	movups	(%rcx),%xmm0
	movups	16(%rcx),%xmm1
	leaq	32(%rcx),%rcx
	xorps	%xmm0,%xmm2
L$oop_dec1_14:
.byte	102,15,56,222,209
	decl	%eax
	movups	(%rcx),%xmm1
	leaq	16(%rcx),%rcx
	jnz	L$oop_dec1_14
.byte	102,15,56,223,209
	xorps	%xmm10,%xmm2
	movups	%xmm2,(%rsi)

L$xts_dec_ret:
	// Drop the aligned frame and restore %rbp.
	leaq	(%rbp),%rsp
	popq	%rbp
L$xts_dec_epilogue:
	// 0xf3,0xc3 encodes "rep ret".
	.byte	0xf3,0xc3
2414
2415.globl	_aesni_cbc_encrypt
2416.private_extern _aesni_cbc_encrypt
2417
2418.p2align	4
2419_aesni_cbc_encrypt:
2420	testq	%rdx,%rdx
2421	jz	L$cbc_ret
2422
2423	movl	240(%rcx),%r10d
2424	movq	%rcx,%r11
2425	testl	%r9d,%r9d
2426	jz	L$cbc_decrypt
2427
2428	movups	(%r8),%xmm2
2429	movl	%r10d,%eax
2430	cmpq	$16,%rdx
2431	jb	L$cbc_enc_tail
2432	subq	$16,%rdx
2433	jmp	L$cbc_enc_loop
2434.p2align	4
2435L$cbc_enc_loop:
2436	movups	(%rdi),%xmm3
2437	leaq	16(%rdi),%rdi
2438
2439	movups	(%rcx),%xmm0
2440	movups	16(%rcx),%xmm1
2441	xorps	%xmm0,%xmm3
2442	leaq	32(%rcx),%rcx
2443	xorps	%xmm3,%xmm2
2444L$oop_enc1_15:
2445.byte	102,15,56,220,209
2446	decl	%eax
2447	movups	(%rcx),%xmm1
2448	leaq	16(%rcx),%rcx
2449	jnz	L$oop_enc1_15
2450.byte	102,15,56,221,209
2451	movl	%r10d,%eax
2452	movq	%r11,%rcx
2453	movups	%xmm2,0(%rsi)
2454	leaq	16(%rsi),%rsi
2455	subq	$16,%rdx
2456	jnc	L$cbc_enc_loop
2457	addq	$16,%rdx
2458	jnz	L$cbc_enc_tail
2459	movups	%xmm2,(%r8)
2460	jmp	L$cbc_ret
2461
2462L$cbc_enc_tail:
2463	movq	%rdx,%rcx
2464	xchgq	%rdi,%rsi
2465.long	0x9066A4F3
2466	movl	$16,%ecx
2467	subq	%rdx,%rcx
2468	xorl	%eax,%eax
2469.long	0x9066AAF3
2470	leaq	-16(%rdi),%rdi
2471	movl	%r10d,%eax
2472	movq	%rdi,%rsi
2473	movq	%r11,%rcx
2474	xorq	%rdx,%rdx
2475	jmp	L$cbc_enc_loop
2476
2477.p2align	4
2478L$cbc_decrypt:
2479	leaq	(%rsp),%rax
2480	pushq	%rbp
2481	subq	$16,%rsp
2482	andq	$-16,%rsp
2483	leaq	-8(%rax),%rbp
2484	movups	(%r8),%xmm10
2485	movl	%r10d,%eax
2486	cmpq	$80,%rdx
2487	jbe	L$cbc_dec_tail
2488
2489	movups	(%rcx),%xmm0
2490	movdqu	0(%rdi),%xmm2
2491	movdqu	16(%rdi),%xmm3
2492	movdqa	%xmm2,%xmm11
2493	movdqu	32(%rdi),%xmm4
2494	movdqa	%xmm3,%xmm12
2495	movdqu	48(%rdi),%xmm5
2496	movdqa	%xmm4,%xmm13
2497	movdqu	64(%rdi),%xmm6
2498	movdqa	%xmm5,%xmm14
2499	movdqu	80(%rdi),%xmm7
2500	movdqa	%xmm6,%xmm15
2501	movl	_OPENSSL_ia32cap_P+4(%rip),%r9d
2502	cmpq	$112,%rdx
2503	jbe	L$cbc_dec_six_or_seven
2504
2505	andl	$71303168,%r9d
2506	subq	$80,%rdx
2507	cmpl	$4194304,%r9d
2508	je	L$cbc_dec_loop6_enter
2509	subq	$32,%rdx
2510	leaq	112(%rcx),%rcx
2511	jmp	L$cbc_dec_loop8_enter
2512.p2align	4
2513L$cbc_dec_loop8:
2514	movups	%xmm9,(%rsi)
2515	leaq	16(%rsi),%rsi
2516L$cbc_dec_loop8_enter:
2517	movdqu	96(%rdi),%xmm8
2518	pxor	%xmm0,%xmm2
2519	movdqu	112(%rdi),%xmm9
2520	pxor	%xmm0,%xmm3
2521	movups	16-112(%rcx),%xmm1
2522	pxor	%xmm0,%xmm4
2523	xorq	%r11,%r11
2524	cmpq	$112,%rdx
2525	pxor	%xmm0,%xmm5
2526	pxor	%xmm0,%xmm6
2527	pxor	%xmm0,%xmm7
2528	pxor	%xmm0,%xmm8
2529
2530.byte	102,15,56,222,209
2531	pxor	%xmm0,%xmm9
2532	movups	32-112(%rcx),%xmm0
2533.byte	102,15,56,222,217
2534.byte	102,15,56,222,225
2535.byte	102,15,56,222,233
2536.byte	102,15,56,222,241
2537.byte	102,15,56,222,249
2538.byte	102,68,15,56,222,193
2539	setnc	%r11b
2540	shlq	$7,%r11
2541.byte	102,68,15,56,222,201
2542	addq	%rdi,%r11
2543	movups	48-112(%rcx),%xmm1
2544.byte	102,15,56,222,208
2545.byte	102,15,56,222,216
2546.byte	102,15,56,222,224
2547.byte	102,15,56,222,232
2548.byte	102,15,56,222,240
2549.byte	102,15,56,222,248
2550.byte	102,68,15,56,222,192
2551.byte	102,68,15,56,222,200
2552	movups	64-112(%rcx),%xmm0
2553	nop
2554.byte	102,15,56,222,209
2555.byte	102,15,56,222,217
2556.byte	102,15,56,222,225
2557.byte	102,15,56,222,233
2558.byte	102,15,56,222,241
2559.byte	102,15,56,222,249
2560.byte	102,68,15,56,222,193
2561.byte	102,68,15,56,222,201
2562	movups	80-112(%rcx),%xmm1
2563	nop
2564.byte	102,15,56,222,208
2565.byte	102,15,56,222,216
2566.byte	102,15,56,222,224
2567.byte	102,15,56,222,232
2568.byte	102,15,56,222,240
2569.byte	102,15,56,222,248
2570.byte	102,68,15,56,222,192
2571.byte	102,68,15,56,222,200
2572	movups	96-112(%rcx),%xmm0
2573	nop
2574.byte	102,15,56,222,209
2575.byte	102,15,56,222,217
2576.byte	102,15,56,222,225
2577.byte	102,15,56,222,233
2578.byte	102,15,56,222,241
2579.byte	102,15,56,222,249
2580.byte	102,68,15,56,222,193
2581.byte	102,68,15,56,222,201
2582	movups	112-112(%rcx),%xmm1
2583	nop
2584.byte	102,15,56,222,208
2585.byte	102,15,56,222,216
2586.byte	102,15,56,222,224
2587.byte	102,15,56,222,232
2588.byte	102,15,56,222,240
2589.byte	102,15,56,222,248
2590.byte	102,68,15,56,222,192
2591.byte	102,68,15,56,222,200
2592	movups	128-112(%rcx),%xmm0
2593	nop
2594.byte	102,15,56,222,209
2595.byte	102,15,56,222,217
2596.byte	102,15,56,222,225
2597.byte	102,15,56,222,233
2598.byte	102,15,56,222,241
2599.byte	102,15,56,222,249
2600.byte	102,68,15,56,222,193
2601.byte	102,68,15,56,222,201
2602	movups	144-112(%rcx),%xmm1
2603	cmpl	$11,%eax
2604.byte	102,15,56,222,208
2605.byte	102,15,56,222,216
2606.byte	102,15,56,222,224
2607.byte	102,15,56,222,232
2608.byte	102,15,56,222,240
2609.byte	102,15,56,222,248
2610.byte	102,68,15,56,222,192
2611.byte	102,68,15,56,222,200
2612	movups	160-112(%rcx),%xmm0
2613	jb	L$cbc_dec_done
2614.byte	102,15,56,222,209
2615.byte	102,15,56,222,217
2616.byte	102,15,56,222,225
2617.byte	102,15,56,222,233
2618.byte	102,15,56,222,241
2619.byte	102,15,56,222,249
2620.byte	102,68,15,56,222,193
2621.byte	102,68,15,56,222,201
2622	movups	176-112(%rcx),%xmm1
2623	nop
2624.byte	102,15,56,222,208
2625.byte	102,15,56,222,216
2626.byte	102,15,56,222,224
2627.byte	102,15,56,222,232
2628.byte	102,15,56,222,240
2629.byte	102,15,56,222,248
2630.byte	102,68,15,56,222,192
2631.byte	102,68,15,56,222,200
2632	movups	192-112(%rcx),%xmm0
2633	je	L$cbc_dec_done
2634.byte	102,15,56,222,209
2635.byte	102,15,56,222,217
2636.byte	102,15,56,222,225
2637.byte	102,15,56,222,233
2638.byte	102,15,56,222,241
2639.byte	102,15,56,222,249
2640.byte	102,68,15,56,222,193
2641.byte	102,68,15,56,222,201
2642	movups	208-112(%rcx),%xmm1
2643	nop
2644.byte	102,15,56,222,208
2645.byte	102,15,56,222,216
2646.byte	102,15,56,222,224
2647.byte	102,15,56,222,232
2648.byte	102,15,56,222,240
2649.byte	102,15,56,222,248
2650.byte	102,68,15,56,222,192
2651.byte	102,68,15,56,222,200
2652	movups	224-112(%rcx),%xmm0
2653	jmp	L$cbc_dec_done
2654.p2align	4
2655L$cbc_dec_done:
2656.byte	102,15,56,222,209
2657.byte	102,15,56,222,217
2658	pxor	%xmm0,%xmm10
2659	pxor	%xmm0,%xmm11
2660.byte	102,15,56,222,225
2661.byte	102,15,56,222,233
2662	pxor	%xmm0,%xmm12
2663	pxor	%xmm0,%xmm13
2664.byte	102,15,56,222,241
2665.byte	102,15,56,222,249
2666	pxor	%xmm0,%xmm14
2667	pxor	%xmm0,%xmm15
2668.byte	102,68,15,56,222,193
2669.byte	102,68,15,56,222,201
2670	movdqu	80(%rdi),%xmm1
2671
2672.byte	102,65,15,56,223,210
2673	movdqu	96(%rdi),%xmm10
2674	pxor	%xmm0,%xmm1
2675.byte	102,65,15,56,223,219
2676	pxor	%xmm0,%xmm10
2677	movdqu	112(%rdi),%xmm0
2678.byte	102,65,15,56,223,228
2679	leaq	128(%rdi),%rdi
2680	movdqu	0(%r11),%xmm11
2681.byte	102,65,15,56,223,237
2682.byte	102,65,15,56,223,246
2683	movdqu	16(%r11),%xmm12
2684	movdqu	32(%r11),%xmm13
2685.byte	102,65,15,56,223,255
2686.byte	102,68,15,56,223,193
2687	movdqu	48(%r11),%xmm14
2688	movdqu	64(%r11),%xmm15
2689.byte	102,69,15,56,223,202
2690	movdqa	%xmm0,%xmm10
2691	movdqu	80(%r11),%xmm1
2692	movups	-112(%rcx),%xmm0
2693
2694	movups	%xmm2,(%rsi)
2695	movdqa	%xmm11,%xmm2
2696	movups	%xmm3,16(%rsi)
2697	movdqa	%xmm12,%xmm3
2698	movups	%xmm4,32(%rsi)
2699	movdqa	%xmm13,%xmm4
2700	movups	%xmm5,48(%rsi)
2701	movdqa	%xmm14,%xmm5
2702	movups	%xmm6,64(%rsi)
2703	movdqa	%xmm15,%xmm6
2704	movups	%xmm7,80(%rsi)
2705	movdqa	%xmm1,%xmm7
2706	movups	%xmm8,96(%rsi)
2707	leaq	112(%rsi),%rsi
2708
2709	subq	$128,%rdx
2710	ja	L$cbc_dec_loop8
2711
2712	movaps	%xmm9,%xmm2
2713	leaq	-112(%rcx),%rcx
2714	addq	$112,%rdx
2715	jle	L$cbc_dec_tail_collected
2716	movups	%xmm9,(%rsi)
2717	leaq	16(%rsi),%rsi
2718	cmpq	$80,%rdx
2719	jbe	L$cbc_dec_tail
2720
2721	movaps	%xmm11,%xmm2
2722L$cbc_dec_six_or_seven:
2723	cmpq	$96,%rdx
2724	ja	L$cbc_dec_seven
2725
2726	movaps	%xmm7,%xmm8
2727	call	_aesni_decrypt6
2728	pxor	%xmm10,%xmm2
2729	movaps	%xmm8,%xmm10
2730	pxor	%xmm11,%xmm3
2731	movdqu	%xmm2,(%rsi)
2732	pxor	%xmm12,%xmm4
2733	movdqu	%xmm3,16(%rsi)
2734	pxor	%xmm13,%xmm5
2735	movdqu	%xmm4,32(%rsi)
2736	pxor	%xmm14,%xmm6
2737	movdqu	%xmm5,48(%rsi)
2738	pxor	%xmm15,%xmm7
2739	movdqu	%xmm6,64(%rsi)
2740	leaq	80(%rsi),%rsi
2741	movdqa	%xmm7,%xmm2
2742	jmp	L$cbc_dec_tail_collected
2743
2744.p2align	4
2745L$cbc_dec_seven:
2746	movups	96(%rdi),%xmm8
2747	xorps	%xmm9,%xmm9
2748	call	_aesni_decrypt8
2749	movups	80(%rdi),%xmm9
2750	pxor	%xmm10,%xmm2
2751	movups	96(%rdi),%xmm10
2752	pxor	%xmm11,%xmm3
2753	movdqu	%xmm2,(%rsi)
2754	pxor	%xmm12,%xmm4
2755	movdqu	%xmm3,16(%rsi)
2756	pxor	%xmm13,%xmm5
2757	movdqu	%xmm4,32(%rsi)
2758	pxor	%xmm14,%xmm6
2759	movdqu	%xmm5,48(%rsi)
2760	pxor	%xmm15,%xmm7
2761	movdqu	%xmm6,64(%rsi)
2762	pxor	%xmm9,%xmm8
2763	movdqu	%xmm7,80(%rsi)
2764	leaq	96(%rsi),%rsi
2765	movdqa	%xmm8,%xmm2
2766	jmp	L$cbc_dec_tail_collected
2767
2768.p2align	4
2769L$cbc_dec_loop6:
2770	movups	%xmm7,(%rsi)
2771	leaq	16(%rsi),%rsi
2772	movdqu	0(%rdi),%xmm2
2773	movdqu	16(%rdi),%xmm3
2774	movdqa	%xmm2,%xmm11
2775	movdqu	32(%rdi),%xmm4
2776	movdqa	%xmm3,%xmm12
2777	movdqu	48(%rdi),%xmm5
2778	movdqa	%xmm4,%xmm13
2779	movdqu	64(%rdi),%xmm6
2780	movdqa	%xmm5,%xmm14
2781	movdqu	80(%rdi),%xmm7
2782	movdqa	%xmm6,%xmm15
2783L$cbc_dec_loop6_enter:
2784	leaq	96(%rdi),%rdi
2785	movdqa	%xmm7,%xmm8
2786
2787	call	_aesni_decrypt6
2788
2789	pxor	%xmm10,%xmm2
2790	movdqa	%xmm8,%xmm10
2791	pxor	%xmm11,%xmm3
2792	movdqu	%xmm2,(%rsi)
2793	pxor	%xmm12,%xmm4
2794	movdqu	%xmm3,16(%rsi)
2795	pxor	%xmm13,%xmm5
2796	movdqu	%xmm4,32(%rsi)
2797	pxor	%xmm14,%xmm6
2798	movq	%r11,%rcx
2799	movdqu	%xmm5,48(%rsi)
2800	pxor	%xmm15,%xmm7
2801	movl	%r10d,%eax
2802	movdqu	%xmm6,64(%rsi)
2803	leaq	80(%rsi),%rsi
2804	subq	$96,%rdx
2805	ja	L$cbc_dec_loop6
2806
2807	movdqa	%xmm7,%xmm2
2808	addq	$80,%rdx
2809	jle	L$cbc_dec_tail_collected
2810	movups	%xmm7,(%rsi)
2811	leaq	16(%rsi),%rsi
2812
2813L$cbc_dec_tail:
2814	movups	(%rdi),%xmm2
2815	subq	$16,%rdx
2816	jbe	L$cbc_dec_one
2817
2818	movups	16(%rdi),%xmm3
2819	movaps	%xmm2,%xmm11
2820	subq	$16,%rdx
2821	jbe	L$cbc_dec_two
2822
2823	movups	32(%rdi),%xmm4
2824	movaps	%xmm3,%xmm12
2825	subq	$16,%rdx
2826	jbe	L$cbc_dec_three
2827
2828	movups	48(%rdi),%xmm5
2829	movaps	%xmm4,%xmm13
2830	subq	$16,%rdx
2831	jbe	L$cbc_dec_four
2832
2833	movups	64(%rdi),%xmm6
2834	movaps	%xmm5,%xmm14
2835	movaps	%xmm6,%xmm15
2836	xorps	%xmm7,%xmm7
2837	call	_aesni_decrypt6
2838	pxor	%xmm10,%xmm2
2839	movaps	%xmm15,%xmm10
2840	pxor	%xmm11,%xmm3
2841	movdqu	%xmm2,(%rsi)
2842	pxor	%xmm12,%xmm4
2843	movdqu	%xmm3,16(%rsi)
2844	pxor	%xmm13,%xmm5
2845	movdqu	%xmm4,32(%rsi)
2846	pxor	%xmm14,%xmm6
2847	movdqu	%xmm5,48(%rsi)
2848	leaq	64(%rsi),%rsi
2849	movdqa	%xmm6,%xmm2
2850	subq	$16,%rdx
2851	jmp	L$cbc_dec_tail_collected
2852
2853.p2align	4
2854L$cbc_dec_one:
2855	movaps	%xmm2,%xmm11
2856	movups	(%rcx),%xmm0
2857	movups	16(%rcx),%xmm1
2858	leaq	32(%rcx),%rcx
2859	xorps	%xmm0,%xmm2
2860L$oop_dec1_16:
2861.byte	102,15,56,222,209
2862	decl	%eax
2863	movups	(%rcx),%xmm1
2864	leaq	16(%rcx),%rcx
2865	jnz	L$oop_dec1_16
2866.byte	102,15,56,223,209
2867	xorps	%xmm10,%xmm2
2868	movaps	%xmm11,%xmm10
2869	jmp	L$cbc_dec_tail_collected
2870.p2align	4
2871L$cbc_dec_two:
2872	movaps	%xmm3,%xmm12
2873	call	_aesni_decrypt2
2874	pxor	%xmm10,%xmm2
2875	movaps	%xmm12,%xmm10
2876	pxor	%xmm11,%xmm3
2877	movdqu	%xmm2,(%rsi)
2878	movdqa	%xmm3,%xmm2
2879	leaq	16(%rsi),%rsi
2880	jmp	L$cbc_dec_tail_collected
2881.p2align	4
2882L$cbc_dec_three:
2883	movaps	%xmm4,%xmm13
2884	call	_aesni_decrypt3
2885	pxor	%xmm10,%xmm2
2886	movaps	%xmm13,%xmm10
2887	pxor	%xmm11,%xmm3
2888	movdqu	%xmm2,(%rsi)
2889	pxor	%xmm12,%xmm4
2890	movdqu	%xmm3,16(%rsi)
2891	movdqa	%xmm4,%xmm2
2892	leaq	32(%rsi),%rsi
2893	jmp	L$cbc_dec_tail_collected
2894.p2align	4
2895L$cbc_dec_four:
2896	movaps	%xmm5,%xmm14
2897	call	_aesni_decrypt4
2898	pxor	%xmm10,%xmm2
2899	movaps	%xmm14,%xmm10
2900	pxor	%xmm11,%xmm3
2901	movdqu	%xmm2,(%rsi)
2902	pxor	%xmm12,%xmm4
2903	movdqu	%xmm3,16(%rsi)
2904	pxor	%xmm13,%xmm5
2905	movdqu	%xmm4,32(%rsi)
2906	movdqa	%xmm5,%xmm2
2907	leaq	48(%rsi),%rsi
2908	jmp	L$cbc_dec_tail_collected
2909
2910.p2align	4
2911L$cbc_dec_tail_collected:
2912	movups	%xmm10,(%r8)
2913	andq	$15,%rdx
2914	jnz	L$cbc_dec_tail_partial
2915	movups	%xmm2,(%rsi)
2916	jmp	L$cbc_dec_ret
2917.p2align	4
2918L$cbc_dec_tail_partial:
2919	movaps	%xmm2,(%rsp)
2920	movq	$16,%rcx
2921	movq	%rsi,%rdi
2922	subq	%rdx,%rcx
2923	leaq	(%rsp),%rsi
2924.long	0x9066A4F3
2925
2926L$cbc_dec_ret:
2927	leaq	(%rbp),%rsp
2928	popq	%rbp
2929L$cbc_ret:
2930	.byte	0xf3,0xc3
2931
/*
 * _aesni_set_decrypt_key -- build an AES decryption key schedule.
 *
 * In:   %rdi = user key bytes, %esi = key length in bits, %rdx = output
 *       key schedule.  All three are handed unchanged to
 *       __aesni_set_encrypt_key.
 * Out:  %eax = 0 on success; otherwise the non-zero status produced by
 *       __aesni_set_encrypt_key, in which case no conversion is attempted.
 * Clobbers: %rdx, %rsi, %rdi, %xmm0-%xmm5, flags.
 *
 * The encryption schedule is generated first and then converted in place:
 * the round keys are reversed end-for-end, and every round key except the
 * two outermost ones is passed through AESIMC.
 *
 * Relies on __aesni_set_encrypt_key leaving %rdx untouched and returning
 * the stored round count in %esi on success.
 */
.globl	_aesni_set_decrypt_key
.private_extern _aesni_set_decrypt_key

.p2align	4
_aesni_set_decrypt_key:
.byte	0x48,0x83,0xEC,0x08	/* subq $8,%rsp -- 8 bytes of padding around the call below */
	call	__aesni_set_encrypt_key
	shll	$4,%esi		/* %esi = stored round count * 16 (byte offset) */
	testl	%eax,%eax
	jnz	L$dec_key_ret	/* key expansion failed: propagate its status */
	leaq	16(%rdx,%rsi,1),%rdi	/* %rdi -> last round key, %rdx -> first */

	/* Swap the first and last round keys; these two are not transformed. */
	movups	(%rdx),%xmm0
	movups	(%rdi),%xmm1
	movups	%xmm0,(%rdi)
	movups	%xmm1,(%rdx)
	leaq	16(%rdx),%rdx
	leaq	-16(%rdi),%rdi

	/*
	 * Walk both pointers towards the middle, applying AESIMC to each pair
	 * of round keys and storing them back in swapped positions.
	 */
L$dec_key_inverse:
	movups	(%rdx),%xmm0
	movups	(%rdi),%xmm1
.byte	102,15,56,219,192	/* aesimc %xmm0,%xmm0 */
.byte	102,15,56,219,201	/* aesimc %xmm1,%xmm1 */
	leaq	16(%rdx),%rdx
	leaq	-16(%rdi),%rdi
	movups	%xmm0,16(%rdi)	/* 16(%rdi) is the slot %rdi pointed at before the step */
	movups	%xmm1,-16(%rdx)	/* -16(%rdx) is the slot %rdx pointed at before the step */
	cmpq	%rdx,%rdi
	ja	L$dec_key_inverse	/* unsigned compare: continue while %rdi is above %rdx */

	/* The pointers have met on the middle round key: transform it in place. */
	movups	(%rdx),%xmm0
.byte	102,15,56,219,192	/* aesimc %xmm0,%xmm0 */
	movups	%xmm0,(%rdi)
L$dec_key_ret:
	/*
	 * Scrub %xmm0-%xmm5 on every exit: this routine and the key-expansion
	 * code it calls keep user-key / round-key material in them.  pxor does
	 * not touch %eax or the flags, so the return value is unaffected.
	 */
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	addq	$8,%rsp
	.byte	0xf3,0xc3	/* rep ret */
L$SEH_end_set_decrypt_key:
2970
/*
 * _aesni_set_encrypt_key / __aesni_set_encrypt_key -- expand a user key
 * into an AES encryption key schedule.  (__aesni_set_encrypt_key is the
 * entry used by _aesni_set_decrypt_key; both labels name the same code.)
 *
 * In:   %rdi = user key bytes
 *       %esi = key length in bits: 128, 192 or 256
 *       %rdx = output key schedule
 * Out:  %rax =  0  success
 *              -1  %rdi or %rdx is NULL
 *              -2  %esi is not 128, 192 or 256
 *       On success the round keys are stored contiguously from (%rdx) and
 *       the value 9 / 11 / 13 (for 128 / 192 / 256 bits) is stored as a
 *       32-bit integer at byte offset 240 of the schedule.  The single-block
 *       routines at the top of this file load that field as their loop count.
 *       %esi also holds that value on return and %rdx is left unchanged;
 *       _aesni_set_decrypt_key depends on both.
 * Clobbers: %rax, %esi, %xmm0-%xmm5 (zeroed on return), flags.
 *
 * Each step is an AESKEYGENASSIST (emitted as raw bytes) with the round
 * constant as its immediate, followed by a call to one of the
 * L$key_expansion_* helpers.  The helpers store the previously computed
 * round key through %rax (the running output pointer) and derive the next
 * one; the "_cold" entry points skip the store.
 */
.globl	_aesni_set_encrypt_key
.private_extern _aesni_set_encrypt_key

.p2align	4
_aesni_set_encrypt_key:
__aesni_set_encrypt_key:
.byte	0x48,0x83,0xEC,0x08	/* subq $8,%rsp -- 8 bytes of padding around the helper calls */
	movq	$-1,%rax	/* status returned if either pointer is NULL */
	testq	%rdi,%rdi
	jz	L$enc_key_ret
	testq	%rdx,%rdx
	jz	L$enc_key_ret

	movups	(%rdi),%xmm0	/* first 16 bytes of the user key */
	xorps	%xmm4,%xmm4	/* the helpers expect %xmm4 to start out zero */
	leaq	16(%rdx),%rax	/* %rax = output cursor, initially the second slot */
	cmpl	$256,%esi
	je	L$14rounds
	cmpl	$192,%esi
	je	L$12rounds
	cmpl	$128,%esi
	jne	L$bad_keybits

	/* 128-bit key: %esi = 9 is the value stored at offset 240. */
L$10rounds:
	movl	$9,%esi
	movups	%xmm0,(%rdx)	/* round key 0 is the user key itself */
.byte	102,15,58,223,200,1	/* aeskeygenassist $0x01,%xmm0,%xmm1 */
	call	L$key_expansion_128_cold
.byte	102,15,58,223,200,2	/* aeskeygenassist $0x02,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,4	/* aeskeygenassist $0x04,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,8	/* aeskeygenassist $0x08,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,16	/* aeskeygenassist $0x10,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,32	/* aeskeygenassist $0x20,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,64	/* aeskeygenassist $0x40,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,128	/* aeskeygenassist $0x80,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,27	/* aeskeygenassist $0x1b,%xmm0,%xmm1 */
	call	L$key_expansion_128
.byte	102,15,58,223,200,54	/* aeskeygenassist $0x36,%xmm0,%xmm1 */
	call	L$key_expansion_128
	movups	%xmm0,(%rax)	/* final round key; %rax = schedule + 160 here */
	movl	%esi,80(%rax)	/* 160 + 80 = offset 240 */
	xorl	%eax,%eax	/* return 0 */
	jmp	L$enc_key_ret

	/* 192-bit key: %esi = 11 is the value stored at offset 240. */
.p2align	4
L$12rounds:
	movq	16(%rdi),%xmm2	/* remaining 8 key bytes into the low half of %xmm2 */
	movl	$11,%esi
	movups	%xmm0,(%rdx)
.byte	102,15,58,223,202,1	/* aeskeygenassist $0x01,%xmm2,%xmm1 */
	call	L$key_expansion_192a_cold
.byte	102,15,58,223,202,2	/* aeskeygenassist $0x02,%xmm2,%xmm1 */
	call	L$key_expansion_192b
.byte	102,15,58,223,202,4	/* aeskeygenassist $0x04,%xmm2,%xmm1 */
	call	L$key_expansion_192a
.byte	102,15,58,223,202,8	/* aeskeygenassist $0x08,%xmm2,%xmm1 */
	call	L$key_expansion_192b
.byte	102,15,58,223,202,16	/* aeskeygenassist $0x10,%xmm2,%xmm1 */
	call	L$key_expansion_192a
.byte	102,15,58,223,202,32	/* aeskeygenassist $0x20,%xmm2,%xmm1 */
	call	L$key_expansion_192b
.byte	102,15,58,223,202,64	/* aeskeygenassist $0x40,%xmm2,%xmm1 */
	call	L$key_expansion_192a
.byte	102,15,58,223,202,128	/* aeskeygenassist $0x80,%xmm2,%xmm1 */
	call	L$key_expansion_192b
	movups	%xmm0,(%rax)	/* final round key; %rax = schedule + 192 here */
	movl	%esi,48(%rax)	/* 192 + 48 = offset 240 */
	xorl	%eax,%eax	/* return 0 (32-bit write also clears the upper half of %rax) */
	jmp	L$enc_key_ret

	/* 256-bit key: %esi = 13 is the value stored at offset 240. */
.p2align	4
L$14rounds:
	movups	16(%rdi),%xmm2	/* second 16 bytes of the user key */
	movl	$13,%esi
	leaq	16(%rax),%rax	/* two slots are filled directly, so start at the third */
	movups	%xmm0,(%rdx)
	movups	%xmm2,16(%rdx)
.byte	102,15,58,223,202,1	/* aeskeygenassist $0x01,%xmm2,%xmm1 */
	call	L$key_expansion_256a_cold
.byte	102,15,58,223,200,1	/* aeskeygenassist $0x01,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,2	/* aeskeygenassist $0x02,%xmm2,%xmm1 */
	call	L$key_expansion_256a
.byte	102,15,58,223,200,2	/* aeskeygenassist $0x02,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,4	/* aeskeygenassist $0x04,%xmm2,%xmm1 */
	call	L$key_expansion_256a
.byte	102,15,58,223,200,4	/* aeskeygenassist $0x04,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,8	/* aeskeygenassist $0x08,%xmm2,%xmm1 */
	call	L$key_expansion_256a
.byte	102,15,58,223,200,8	/* aeskeygenassist $0x08,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,16	/* aeskeygenassist $0x10,%xmm2,%xmm1 */
	call	L$key_expansion_256a
.byte	102,15,58,223,200,16	/* aeskeygenassist $0x10,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,32	/* aeskeygenassist $0x20,%xmm2,%xmm1 */
	call	L$key_expansion_256a
.byte	102,15,58,223,200,32	/* aeskeygenassist $0x20,%xmm0,%xmm1 */
	call	L$key_expansion_256b
.byte	102,15,58,223,202,64	/* aeskeygenassist $0x40,%xmm2,%xmm1 */
	call	L$key_expansion_256a
	movups	%xmm0,(%rax)	/* final round key; %rax = schedule + 224 here */
	movl	%esi,16(%rax)	/* 224 + 16 = offset 240 */
	xorl	%eax,%eax	/* return 0 (32-bit write also clears the upper half of %rax) */
	jmp	L$enc_key_ret

.p2align	4
L$bad_keybits:
	movq	$-2,%rax	/* unsupported key length */
L$enc_key_ret:
	/*
	 * Scrub %xmm0-%xmm5 on every exit so no user-key or round-key bytes are
	 * left behind in registers (the bad-keybits path has already loaded 16
	 * key bytes into %xmm0).  pxor leaves %rax, %rdx and %esi untouched.
	 */
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	addq	$8,%rsp
	.byte	0xf3,0xc3	/* rep ret */
L$SEH_end_set_encrypt_key:
3093
/*
 * L$key_expansion_128 / L$key_expansion_128_cold -- one step of the
 * 128-bit key schedule; called only from the L$10rounds sequence.
 *
 * In:   %xmm0 = most recently computed round key
 *       %xmm1 = AESKEYGENASSIST result for this step
 *       %xmm4 = scratch carried between calls (zeroed by the caller before
 *               the first call)
 *       %rax  = output cursor
 * Out:  %xmm0 = next round key; %rax advanced by 16 (non-cold entry only)
 *
 * The non-cold entry first stores the incoming %xmm0 at (%rax); the
 * "_cold" entry skips the store because the caller has already written
 * that key itself.
 */
.p2align	4
L$key_expansion_128:
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax
L$key_expansion_128_cold:
	/* Two shuffle/xor rounds fold lane-shifted copies of the key into %xmm0. */
	shufps	$16,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$255,%xmm1,%xmm1	/* broadcast dword 3 of the keygenassist result */
	xorps	%xmm1,%xmm0
	.byte	0xf3,0xc3	/* rep ret */
3106
/*
 * L$key_expansion_192a / _192a_cold / _192b -- steps of the 192-bit key
 * schedule; called alternately from the L$12rounds sequence.
 *
 * Register roles (all three entries):
 *   %xmm0 = low 128 bits of the current key state (updated)
 *   %xmm2 = remaining key-state bits, in its low qword (updated)
 *   %xmm1 = AESKEYGENASSIST result for this step
 *   %xmm3, %xmm4 = scratch;  %xmm5 = copy of %xmm2 saved by the "a" step
 *   %rax  = output cursor
 *
 * 192a      stores %xmm0 at (%rax) and advances %rax by 16, then falls
 *           into 192a_cold.
 * 192a_cold saves %xmm2 in %xmm5 for the following "b" step, then runs the
 *           shared update code at L$key_expansion_192b_warm.
 * 192b      stores two 16-byte round keys assembled from %xmm5, %xmm0 and
 *           %xmm2, advances %rax by 32, then jumps to the shared update.
 */
.p2align	4
L$key_expansion_192a:
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax
L$key_expansion_192a_cold:
	movaps	%xmm2,%xmm5
L$key_expansion_192b_warm:
	/* Shared update: derives the next %xmm0 and %xmm2 from the current state. */
	shufps	$16,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	pslldq	$4,%xmm3	/* %xmm3 = %xmm2 shifted left by one dword */
	xorps	%xmm4,%xmm0
	pshufd	$85,%xmm1,%xmm1	/* broadcast dword 1 of the keygenassist result */
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	$255,%xmm0,%xmm3	/* broadcast dword 3 of the updated %xmm0 */
	pxor	%xmm3,%xmm2
	.byte	0xf3,0xc3	/* rep ret */

.p2align	4
L$key_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	$68,%xmm0,%xmm5	/* %xmm5 = { low qword of %xmm5, low qword of %xmm0 } */
	movups	%xmm5,(%rax)
	shufps	$78,%xmm2,%xmm3	/* %xmm3 = { high qword of %xmm0, low qword of %xmm2 } */
	movups	%xmm3,16(%rax)
	leaq	32(%rax),%rax
	jmp	L$key_expansion_192b_warm
3136
/*
 * L$key_expansion_256a / L$key_expansion_256a_cold -- "even" step of the
 * 256-bit key schedule; called from the L$14rounds sequence.
 *
 * In:   %xmm0 = key-state half to be updated
 *       %xmm2 = other key-state half (stored by the non-cold entry)
 *       %xmm1 = AESKEYGENASSIST result for this step
 *       %xmm4 = scratch carried between calls
 *       %rax  = output cursor
 * Out:  %xmm0 updated; %rax advanced by 16 (non-cold entry only)
 *
 * The non-cold entry first stores %xmm2 at (%rax); the "_cold" entry skips
 * the store because the caller has already written the first two round
 * keys itself.
 */
.p2align	4
L$key_expansion_256a:
	movups	%xmm2,(%rax)
	leaq	16(%rax),%rax
L$key_expansion_256a_cold:
	/* Two shuffle/xor rounds fold lane-shifted copies of the key into %xmm0. */
	shufps	$16,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$140,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	$255,%xmm1,%xmm1	/* broadcast dword 3 of the keygenassist result */
	xorps	%xmm1,%xmm0
	.byte	0xf3,0xc3	/* rep ret */
3149
/*
 * L$key_expansion_256b -- "odd" step of the 256-bit key schedule; called
 * from the L$14rounds sequence, alternating with L$key_expansion_256a.
 *
 * In:   %xmm0 = key-state half computed by the preceding "a" step
 *       %xmm2 = key-state half to be updated
 *       %xmm1 = AESKEYGENASSIST result for this step
 *       %xmm4 = scratch carried between calls
 *       %rax  = output cursor
 * Out:  %xmm0 stored at (%rax); %rax advanced by 16; %xmm2 updated
 *
 * Same shuffle/xor pattern as the "a" step, but applied to %xmm2 and
 * using dword 2 (rather than dword 3) of the keygenassist result.
 */
.p2align	4
L$key_expansion_256b:
	movups	%xmm0,(%rax)
	leaq	16(%rax),%rax

	shufps	$16,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	$140,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	$170,%xmm1,%xmm1	/* broadcast dword 2 of the keygenassist result */
	xorps	%xmm1,%xmm2
	.byte	0xf3,0xc3	/* rep ret */
3162
3163
/*
 * Constant pool, aligned to 64 bytes.  None of these labels is referenced
 * in the key-setup routines above; the notes on intended use below are
 * inferred from the label names and values only -- confirm against the
 * routines that load them.
 */
.p2align	6
/* Byte indices 15..0: as a byte-shuffle control this reverses a 16-byte vector. */
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* Three 32-bit lanes of 6, top lane 0 (presumably a per-lane counter step -- confirm). */
L$increment32:
.long	6,6,6,0
/* 128-bit little-endian value 1 (presumably a 64-bit counter increment -- confirm). */
L$increment64:
.long	1,0,0,0
/* Dwords 0x87,0,1,0 (0x87 presumably the XTS/GF(2^128) reduction constant -- confirm). */
L$xts_magic:
.long	0x87,0,1,0
/* Sixteen bytes, only the last one set to 1 (i.e. +1 on a big-endian 128-bit value). */
L$increment1:
.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1

/* NUL-terminated ASCII: "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>" */
.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6
3178#endif
3179