1#!/usr/bin/env perl
2
3# Copyright (c) 2015, CloudFlare Ltd.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
16
17##############################################################################
18#                                                                            #
19# Author:  Vlad Krasnov                                                      #
20#                                                                            #
21##############################################################################
22
23$flavour = shift;
24$output  = shift;
25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
26
27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
28
29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
32die "can't locate x86_64-xlate.pl";
33
34open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
35*STDOUT=*OUT;
36
37$avx = 2;
38
39$code.=<<___;
40.text
41.extern OPENSSL_ia32cap_P
42
43chacha20_poly1305_constants:
44
45.align 64
46.chacha20_consts:
47.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
48.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
49.rol8:
50.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
51.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
52.rol16:
53.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
54.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
55.avx2_init:
56.long 0,0,0,0
57.sse_inc:
58.long 1,0,0,0
59.avx2_inc:
60.long 2,0,0,0,2,0,0,0
61.clamp:
62.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
63.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
64.align 16
65.and_masks:
66.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
67.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
68.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
69.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
70.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
71.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
72.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
73.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
74.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
75.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
76.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
77.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
78.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
79.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
80.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
81___
82
83my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
84my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
85my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
86my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
87my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
88my $r_store="0*16(%rbp)";
89my $s_store="1*16(%rbp)";
90my $len_store="2*16(%rbp)";
91my $state1_store="3*16(%rbp)";
92my $state2_store="4*16(%rbp)";
93my $tmp_store="5*16(%rbp)";
94my $ctr0_store="6*16(%rbp)";
95my $ctr1_store="7*16(%rbp)";
96my $ctr2_store="8*16(%rbp)";
97my $ctr3_store="9*16(%rbp)";
98
99sub chacha_qr {
100my ($a,$b,$c,$d,$t,$dir)=@_;
101$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
102$code.="paddd $b, $a
103        pxor $a, $d
104        pshufb .rol16(%rip), $d
105        paddd $d, $c
106        pxor $c, $b
107        movdqa $b, $t
108        pslld \$12, $t
109        psrld \$20, $b
110        pxor $t, $b
111        paddd $b, $a
112        pxor $a, $d
113        pshufb .rol8(%rip), $d
114        paddd $d, $c
115        pxor $c, $b
116        movdqa $b, $t
117        pslld \$7, $t
118        psrld \$25, $b
119        pxor $t, $b\n";
120$code.="palignr \$4, $b, $b
121        palignr \$8, $c, $c
122        palignr \$12, $d, $d\n" if ($dir =~ /left/);
123$code.="palignr \$12, $b, $b
124        palignr \$8, $c, $c
125        palignr \$4, $d, $d\n" if ($dir =~ /right/);
126$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
127}
128
129sub poly_add {
130my ($src)=@_;
131$code.="add $src, $acc0
132        adc 8+$src, $acc1
133        adc \$1, $acc2\n";
134}
135
136sub poly_stage1 {
137$code.="mov 0+$r_store, %rax
138        mov %rax, $t2
139        mul $acc0
140        mov %rax, $t0
141        mov %rdx, $t1
142        mov 0+$r_store, %rax
143        mul $acc1
144        imulq $acc2, $t2
145        add %rax, $t1
146        adc %rdx, $t2\n";
147}
148
149sub poly_stage2 {
150$code.="mov 8+$r_store, %rax
151        mov %rax, $t3
152        mul $acc0
153        add %rax, $t1
154        adc \$0, %rdx
155        mov %rdx, $acc0
156        mov 8+$r_store, %rax
157        mul $acc1
158        add %rax, $t2
159        adc \$0, %rdx\n";
160}
161
162sub poly_stage3 {
163$code.="imulq $acc2, $t3
164        add $acc0, $t2
165        adc %rdx, $t3\n";
166}
167
168sub poly_reduce_stage {
169$code.="mov $t0, $acc0
170        mov $t1, $acc1
171        mov $t2, $acc2
172        and \$3, $acc2
173        mov $t2, $t0
174        and \$-4, $t0
175        mov $t3, $t1
176        shrd \$2, $t3, $t2
177        shr \$2, $t3
178        add $t0, $acc0
179        adc $t1, $acc1
180        adc \$0, $acc2
181        add $t2, $acc0
182        adc $t3, $acc1
183        adc \$0, $acc2\n";
184}
185
186sub poly_mul {
187    &poly_stage1();
188    &poly_stage2();
189    &poly_stage3();
190    &poly_reduce_stage();
191}
192
193sub prep_state {
194my ($n)=@_;
195$code.="movdqa .chacha20_consts(%rip), $A0
196        movdqa $state1_store, $B0
197        movdqa $state2_store, $C0\n";
198$code.="movdqa $A0, $A1
199        movdqa $B0, $B1
200        movdqa $C0, $C1\n" if ($n ge 2);
201$code.="movdqa $A0, $A2
202        movdqa $B0, $B2
203        movdqa $C0, $C2\n" if ($n ge 3);
204$code.="movdqa $A0, $A3
205        movdqa $B0, $B3
206        movdqa $C0, $C3\n" if ($n ge 4);
207$code.="movdqa $ctr0_store, $D0
208        paddd .sse_inc(%rip), $D0
209        movdqa $D0, $ctr0_store\n" if ($n eq 1);
210$code.="movdqa $ctr0_store, $D1
211        paddd .sse_inc(%rip), $D1
212        movdqa $D1, $D0
213        paddd .sse_inc(%rip), $D0
214        movdqa $D0, $ctr0_store
215        movdqa $D1, $ctr1_store\n" if ($n eq 2);
216$code.="movdqa $ctr0_store, $D2
217        paddd .sse_inc(%rip), $D2
218        movdqa $D2, $D1
219        paddd .sse_inc(%rip), $D1
220        movdqa $D1, $D0
221        paddd .sse_inc(%rip), $D0
222        movdqa $D0, $ctr0_store
223        movdqa $D1, $ctr1_store
224        movdqa $D2, $ctr2_store\n" if ($n eq 3);
225$code.="movdqa $ctr0_store, $D3
226        paddd .sse_inc(%rip), $D3
227        movdqa $D3, $D2
228        paddd .sse_inc(%rip), $D2
229        movdqa $D2, $D1
230        paddd .sse_inc(%rip), $D1
231        movdqa $D1, $D0
232        paddd .sse_inc(%rip), $D0
233        movdqa $D0, $ctr0_store
234        movdqa $D1, $ctr1_store
235        movdqa $D2, $ctr2_store
236        movdqa $D3, $ctr3_store\n" if ($n eq 4);
237}
238
239sub finalize_state {
240my ($n)=@_;
241$code.="paddd .chacha20_consts(%rip), $A3
242        paddd $state1_store, $B3
243        paddd $state2_store, $C3
244        paddd $ctr3_store, $D3\n" if ($n eq 4);
245$code.="paddd .chacha20_consts(%rip), $A2
246        paddd $state1_store, $B2
247        paddd $state2_store, $C2
248        paddd $ctr2_store, $D2\n" if ($n ge 3);
249$code.="paddd .chacha20_consts(%rip), $A1
250        paddd $state1_store, $B1
251        paddd $state2_store, $C1
252        paddd $ctr1_store, $D1\n" if ($n ge 2);
253$code.="paddd .chacha20_consts(%rip), $A0
254        paddd $state1_store, $B0
255        paddd $state2_store, $C0
256        paddd $ctr0_store, $D0\n";
257}
258
259sub xor_stream {
260my ($A, $B, $C, $D, $offset)=@_;
261$code.="movdqu 0*16 + $offset($inp), $A3
262        movdqu 1*16 + $offset($inp), $B3
263        movdqu 2*16 + $offset($inp), $C3
264        movdqu 3*16 + $offset($inp), $D3
265        pxor $A3, $A
266        pxor $B3, $B
267        pxor $C3, $C
268        pxor $D, $D3
269        movdqu $A, 0*16 + $offset($oup)
270        movdqu $B, 1*16 + $offset($oup)
271        movdqu $C, 2*16 + $offset($oup)
272        movdqu $D3, 3*16 + $offset($oup)\n";
273}
274
275sub xor_stream_using_temp {
276my ($A, $B, $C, $D, $offset, $temp)=@_;
277$code.="movdqa $temp, $tmp_store
278        movdqu 0*16 + $offset($inp), $temp
279        pxor $A, $temp
280        movdqu $temp, 0*16 + $offset($oup)
281        movdqu 1*16 + $offset($inp), $temp
282        pxor $B, $temp
283        movdqu $temp, 1*16 + $offset($oup)
284        movdqu 2*16 + $offset($inp), $temp
285        pxor $C, $temp
286        movdqu $temp, 2*16 + $offset($oup)
287        movdqu 3*16 + $offset($inp), $temp
288        pxor $D, $temp
289        movdqu $temp, 3*16 + $offset($oup)\n";
290}
291
292sub gen_chacha_round {
293my ($rot1, $rot2, $shift)=@_;
294my $round="";
295$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
296$round.="movdqa $rot2, $C0
297         paddd $B3, $A3
298         paddd $B2, $A2
299         paddd $B1, $A1
300         paddd $B0, $A0
301         pxor $A3, $D3
302         pxor $A2, $D2
303         pxor $A1, $D1
304         pxor $A0, $D0
305         pshufb $C0, $D3
306         pshufb $C0, $D2
307         pshufb $C0, $D1
308         pshufb $C0, $D0
309         movdqa $tmp_store, $C0
310         paddd $D3, $C3
311         paddd $D2, $C2
312         paddd $D1, $C1
313         paddd $D0, $C0
314         pxor $C3, $B3
315         pxor $C2, $B2
316         pxor $C1, $B1
317         pxor $C0, $B0
318         movdqa $C0, $tmp_store
319         movdqa $B3, $C0
320         psrld \$$rot1, $C0
321         pslld \$32-$rot1, $B3
322         pxor $C0, $B3
323         movdqa $B2, $C0
324         psrld \$$rot1, $C0
325         pslld \$32-$rot1, $B2
326         pxor $C0, $B2
327         movdqa $B1, $C0
328         psrld \$$rot1, $C0
329         pslld \$32-$rot1, $B1
330         pxor $C0, $B1
331         movdqa $B0, $C0
332         psrld \$$rot1, $C0
333         pslld \$32-$rot1, $B0
334         pxor $C0, $B0\n";
335($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
336($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
337$round.="movdqa $tmp_store, $C0
338         palignr \$$s1, $B3, $B3
339         palignr \$$s2, $C3, $C3
340         palignr \$$s3, $D3, $D3
341         palignr \$$s1, $B2, $B2
342         palignr \$$s2, $C2, $C2
343         palignr \$$s3, $D2, $D2
344         palignr \$$s1, $B1, $B1
345         palignr \$$s2, $C1, $C1
346         palignr \$$s3, $D1, $D1
347         palignr \$$s1, $B0, $B0
348         palignr \$$s2, $C0, $C0
349         palignr \$$s3, $D0, $D0\n"
350if (($shift =~ /left/) || ($shift =~ /right/));
351return $round;
352};
353
354$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
355               &gen_chacha_round(25, ".rol8(%rip)", "left") .
356               &gen_chacha_round(20, ".rol16(%rip)") .
357               &gen_chacha_round(25, ".rol8(%rip)", "right");
358
359my @loop_body = split /\n/, $chacha_body;
360
361sub emit_body {
362my ($n)=@_;
363    for (my $i=0; $i < $n; $i++) {
364        $code=$code.shift(@loop_body)."\n";
365    };
366}
367
368{
369################################################################################
370# void poly_hash_ad_internal();
371$code.="
372.type poly_hash_ad_internal,\@function,2
373.align 64
374poly_hash_ad_internal:
375.cfi_startproc
376    xor $acc0, $acc0
377    xor $acc1, $acc1
378    xor $acc2, $acc2
379    cmp \$13,  $itr2
380    jne hash_ad_loop
381poly_fast_tls_ad:
382    # Special treatment for the TLS case of 13 bytes
383    mov ($adp), $acc0
384    mov 5($adp), $acc1
385    shr \$24, $acc1
386    mov \$1, $acc2\n";
387    &poly_mul(); $code.="
388    ret
389hash_ad_loop:
390        # Hash in 16 byte chunk
391        cmp \$16, $itr2
392        jb hash_ad_tail\n";
393        &poly_add("0($adp)");
394        &poly_mul(); $code.="
395        lea 1*16($adp), $adp
396        sub \$16, $itr2
397    jmp hash_ad_loop
398hash_ad_tail:
399    cmp \$0, $itr2
400    je 1f
401    # Hash last < 16 byte tail
402    xor $t0, $t0
403    xor $t1, $t1
404    xor $t2, $t2
405    add $itr2, $adp
406hash_ad_tail_loop:
407        shld \$8, $t0, $t1
408        shl \$8, $t0
409        movzxb -1($adp), $t2
410        xor $t2, $t0
411        dec $adp
412        dec $itr2
413    jne hash_ad_tail_loop
414
415    add $t0, $acc0
416    adc $t1, $acc1
417    adc \$1, $acc2\n";
418    &poly_mul(); $code.="
419    # Finished AD
4201:
421    ret
422.cfi_endproc
423.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
424}
425
426{
427################################################################################
428# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
429$code.="
430.globl chacha20_poly1305_open
431.type chacha20_poly1305_open,\@function,2
432.align 64
433chacha20_poly1305_open:
434.cfi_startproc
435    push %rbp
436.cfi_adjust_cfa_offset 8
437    push %rbx
438.cfi_adjust_cfa_offset 8
439    push %r12
440.cfi_adjust_cfa_offset 8
441    push %r13
442.cfi_adjust_cfa_offset 8
443    push %r14
444.cfi_adjust_cfa_offset 8
445    push %r15
446.cfi_adjust_cfa_offset 8
447    # We write the calculated authenticator back to keyp at the end, so save
448    # the pointer on the stack too.
449    push $keyp
450.cfi_adjust_cfa_offset 8
451    sub \$288 + 32, %rsp
452.cfi_adjust_cfa_offset 288 + 32
453.cfi_offset rbp, -16
454.cfi_offset rbx, -24
455.cfi_offset r12, -32
456.cfi_offset r13, -40
457.cfi_offset r14, -48
458.cfi_offset r15, -56
459    lea 32(%rsp), %rbp
460    and \$-32, %rbp
461    mov %rdx, 8+$len_store
462    mov %r8, 0+$len_store
463    mov %rdx, $inl\n"; $code.="
464    mov OPENSSL_ia32cap_P+8(%rip), %eax
465    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
466    xor \$`(1<<5) + (1<<8)`, %eax
467    jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
468$code.="
4691:
470    cmp \$128, $inl
471    jbe open_sse_128
472    # For long buffers, prepare the poly key first
473    movdqa .chacha20_consts(%rip), $A0
474    movdqu 0*16($keyp), $B0
475    movdqu 1*16($keyp), $C0
476    movdqu 2*16($keyp), $D0
477    movdqa $D0, $T1
478    # Store on stack, to free keyp
479    movdqa $B0, $state1_store
480    movdqa $C0, $state2_store
481    movdqa $D0, $ctr0_store
482    mov \$10, $acc0
4831:  \n";
484        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
485        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
486        dec $acc0
487    jne 1b
488    # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
489    paddd .chacha20_consts(%rip), $A0
490    paddd $state1_store, $B0
491    # Clamp and store the key
492    pand .clamp(%rip), $A0
493    movdqa $A0, $r_store
494    movdqa $B0, $s_store
495    # Hash
496    mov %r8, $itr2
497    call poly_hash_ad_internal
498open_sse_main_loop:
499        cmp \$16*16, $inl
500        jb 2f
501        # Load state, increment counter blocks\n";
502        &prep_state(4); $code.="
503        # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
504        # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
505        mov \$4, $itr1
506        mov $inp, $itr2
5071:  \n";
508            &emit_body(20);
509            &poly_add("0($itr2)"); $code.="
510            lea 2*8($itr2), $itr2\n";
511            &emit_body(20);
512            &poly_stage1();
513            &emit_body(20);
514            &poly_stage2();
515            &emit_body(20);
516            &poly_stage3();
517            &emit_body(20);
518            &poly_reduce_stage();
519            foreach $l (@loop_body) {$code.=$l."\n";}
520            @loop_body = split /\n/, $chacha_body; $code.="
521            dec $itr1
522        jge 1b\n";
523            &poly_add("0($itr2)");
524            &poly_mul(); $code.="
525            lea 2*8($itr2), $itr2
526            cmp \$-6, $itr1
527        jg 1b\n";
528        &finalize_state(4);
529        &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
530        &xor_stream($A2, $B2, $C2, $D2, "4*16");
531        &xor_stream($A1, $B1, $C1, $D1, "8*16");
532        &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
533        lea 16*16($inp), $inp
534        lea 16*16($oup), $oup
535        sub \$16*16, $inl
536    jmp open_sse_main_loop
5372:
538    # Handle the various tail sizes efficiently
539    test $inl, $inl
540    jz open_sse_finalize
541    cmp \$4*16, $inl
542    ja 3f\n";
543###############################################################################
544    # At most 64 bytes are left
545    &prep_state(1); $code.="
546    xor $itr2, $itr2
547    mov $inl, $itr1
548    cmp \$16, $itr1
549    jb 2f
5501:  \n";
551        &poly_add("0($inp, $itr2)");
552        &poly_mul(); $code.="
553        sub \$16, $itr1
5542:
555        add \$16, $itr2\n";
556        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
557        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
558        cmp \$16, $itr1
559    jae 1b
560        cmp \$10*16, $itr2
561    jne 2b\n";
562    &finalize_state(1); $code.="
563    jmp open_sse_tail_64_dec_loop
5643:
565    cmp \$8*16, $inl
566    ja 3f\n";
567###############################################################################
568    # 65 - 128 bytes are left
569    &prep_state(2); $code.="
570    mov $inl, $itr1
571    and \$-16, $itr1
572    xor $itr2, $itr2
5731:  \n";
574        &poly_add("0($inp, $itr2)");
575        &poly_mul(); $code.="
5762:
577        add \$16, $itr2\n";
578        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
579        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
580        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
581        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
582        cmp $itr1, $itr2
583    jb 1b
584        cmp \$10*16, $itr2
585    jne 2b\n";
586    &finalize_state(2);
587    &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
588    sub \$4*16, $inl
589    lea 4*16($inp), $inp
590    lea 4*16($oup), $oup
591    jmp open_sse_tail_64_dec_loop
5923:
593    cmp \$12*16, $inl
594    ja 3f\n";
595###############################################################################
596    # 129 - 192 bytes are left
597    &prep_state(3); $code.="
598    mov $inl, $itr1
599    mov \$10*16, $itr2
600    cmp \$10*16, $itr1
601    cmovg $itr2, $itr1
602    and \$-16, $itr1
603    xor $itr2, $itr2
6041:  \n";
605        &poly_add("0($inp, $itr2)");
606        &poly_mul(); $code.="
6072:
608        add \$16, $itr2\n";
609        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
610        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
611        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
612        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
613        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
614        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
615        cmp $itr1, $itr2
616    jb 1b
617        cmp \$10*16, $itr2
618    jne 2b
619    cmp \$11*16, $inl
620    jb 1f\n";
621    &poly_add("10*16($inp)");
622    &poly_mul(); $code.="
623    cmp \$12*16, $inl
624    jb 1f\n";
625    &poly_add("11*16($inp)");
626    &poly_mul(); $code.="
6271:  \n";
628    &finalize_state(3);
629    &xor_stream($A2, $B2, $C2, $D2, "0*16");
630    &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
631    sub \$8*16, $inl
632    lea 8*16($inp), $inp
633    lea 8*16($oup), $oup
634    jmp open_sse_tail_64_dec_loop
6353:
636###############################################################################\n";
637    # 193 - 255 bytes are left
638    &prep_state(4); $code.="
639    xor $itr2, $itr2
6401:  \n";
641        &poly_add("0($inp, $itr2)");
642        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
643        &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
644        &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
645        &poly_stage1();
646        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
647        &poly_stage2();
648        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
649        &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
650        &poly_stage3();
651        &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
652        &poly_reduce_stage();
653        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
654        add \$16, $itr2
655        cmp \$10*16, $itr2
656    jb 1b
657    mov $inl, $itr1
658    and \$-16, $itr1
6591:  \n";
660        &poly_add("0($inp, $itr2)");
661        &poly_mul(); $code.="
662        add \$16, $itr2
663        cmp $itr1, $itr2
664    jb 1b\n";
665    &finalize_state(4);
666    &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
667    &xor_stream($A2, $B2, $C2, $D2, "4*16");
668    &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
669    movdqa $tmp_store, $D0
670    sub \$12*16, $inl
671    lea 12*16($inp), $inp
672    lea 12*16($oup), $oup
673###############################################################################
674    # Decrypt the remaining data, 16B at a time, using existing stream
675open_sse_tail_64_dec_loop:
676    cmp \$16, $inl
677    jb 1f
678        sub \$16, $inl
679        movdqu ($inp), $T0
680        pxor $T0, $A0
681        movdqu $A0, ($oup)
682        lea 16($inp), $inp
683        lea 16($oup), $oup
684        movdqa $B0, $A0
685        movdqa $C0, $B0
686        movdqa $D0, $C0
687    jmp open_sse_tail_64_dec_loop
6881:
689    movdqa $A0, $A1
690
691    # Decrypt up to 16 bytes at the end.
692open_sse_tail_16:
693    test $inl, $inl
694    jz open_sse_finalize
695
696    # Read the final bytes into $T0. They need to be read in reverse order so
697    # that they end up in the correct order in $T0.
698    pxor $T0, $T0
699    lea -1($inp, $inl), $inp
700    movq $inl, $itr2
7012:
702        pslldq \$1, $T0
703        pinsrb \$0, ($inp), $T0
704        sub \$1, $inp
705        sub \$1, $itr2
706        jnz 2b
707
7083:
709    movq $T0, $t0
710    pextrq \$1, $T0, $t1
711    # The final bytes of keystream are in $A1.
712    pxor $A1, $T0
713
714    # Copy the plaintext bytes out.
7152:
716        pextrb \$0, $T0, ($oup)
717        psrldq \$1, $T0
718        add \$1, $oup
719        sub \$1, $inl
720    jne 2b
721
722    add $t0, $acc0
723    adc $t1, $acc1
724    adc \$1, $acc2\n";
725    &poly_mul(); $code.="
726
727open_sse_finalize:\n";
728    &poly_add($len_store);
729    &poly_mul(); $code.="
730    # Final reduce
731    mov $acc0, $t0
732    mov $acc1, $t1
733    mov $acc2, $t2
734    sub \$-5, $acc0
735    sbb \$-1, $acc1
736    sbb \$3, $acc2
737    cmovc $t0, $acc0
738    cmovc $t1, $acc1
739    cmovc $t2, $acc2
740    # Add in s part of the key
741    add 0+$s_store, $acc0
742    adc 8+$s_store, $acc1
743
744    add \$288 + 32, %rsp
745.cfi_adjust_cfa_offset -(288 + 32)
746    pop $keyp
747.cfi_adjust_cfa_offset -8
748    movq $acc0, ($keyp)
749    movq $acc1, 8($keyp)
750
751    pop %r15
752.cfi_adjust_cfa_offset -8
753    pop %r14
754.cfi_adjust_cfa_offset -8
755    pop %r13
756.cfi_adjust_cfa_offset -8
757    pop %r12
758.cfi_adjust_cfa_offset -8
759    pop %rbx
760.cfi_adjust_cfa_offset -8
761    pop %rbp
762.cfi_adjust_cfa_offset -8
763    ret
764.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
765###############################################################################
766open_sse_128:
767    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
768    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
769    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
770    movdqu 2*16($keyp), $D0
771    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
772    movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
773    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
774    mov \$10, $acc0
7751:  \n";
776        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
777        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
778        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
779        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
780        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
781        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
782    dec $acc0
783    jnz 1b
784    paddd .chacha20_consts(%rip), $A0
785    paddd .chacha20_consts(%rip), $A1
786    paddd .chacha20_consts(%rip), $A2
787    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
788    paddd $T2, $C1\npaddd $T2, $C2
789    paddd $T3, $D1
790    paddd .sse_inc(%rip), $T3
791    paddd $T3, $D2
792    # Clamp and store the key
793    pand .clamp(%rip), $A0
794    movdqa $A0, $r_store
795    movdqa $B0, $s_store
796    # Hash
797    mov %r8, $itr2
798    call poly_hash_ad_internal
7991:
800        cmp \$16, $inl
801        jb open_sse_tail_16
802        sub \$16, $inl\n";
803        # Load for hashing
804        &poly_add("0*8($inp)"); $code.="
805        # Load for decryption
806        movdqu 0*16($inp), $T0
807        pxor $T0, $A1
808        movdqu $A1, 0*16($oup)
809        lea 1*16($inp), $inp
810        lea 1*16($oup), $oup\n";
811        &poly_mul(); $code.="
812        # Shift the stream left
813        movdqa $B1, $A1
814        movdqa $C1, $B1
815        movdqa $D1, $C1
816        movdqa $A2, $D1
817        movdqa $B2, $A2
818        movdqa $C2, $B2
819        movdqa $D2, $C2
820    jmp 1b
821    jmp open_sse_tail_16
822.size chacha20_poly1305_open, .-chacha20_poly1305_open
823.cfi_endproc
824
825################################################################################
826################################################################################
827# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
828.globl  chacha20_poly1305_seal
829.type chacha20_poly1305_seal,\@function,2
830.align 64
831chacha20_poly1305_seal:
832.cfi_startproc
833    push %rbp
834.cfi_adjust_cfa_offset 8
835    push %rbx
836.cfi_adjust_cfa_offset 8
837    push %r12
838.cfi_adjust_cfa_offset 8
839    push %r13
840.cfi_adjust_cfa_offset 8
841    push %r14
842.cfi_adjust_cfa_offset 8
843    push %r15
844.cfi_adjust_cfa_offset 8
845    # We write the calculated authenticator back to keyp at the end, so save
846    # the pointer on the stack too.
847    push $keyp
848.cfi_adjust_cfa_offset 8
849    sub \$288 + 32, %rsp
850.cfi_adjust_cfa_offset 288 + 32
851.cfi_offset rbp, -16
852.cfi_offset rbx, -24
853.cfi_offset r12, -32
854.cfi_offset r13, -40
855.cfi_offset r14, -48
856.cfi_offset r15, -56
857    lea 32(%rsp), %rbp
858    and \$-32, %rbp
859    mov %rdx, 8+$len_store
860    mov %r8, 0+$len_store
861    mov %rdx, $inl\n"; $code.="
862    mov OPENSSL_ia32cap_P+8(%rip), %eax
863    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
864    xor \$`(1<<5) + (1<<8)`, %eax
865    jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
866$code.="
867    cmp \$128, $inl
868    jbe seal_sse_128
869    # For longer buffers, prepare the poly key + some stream
870    movdqa .chacha20_consts(%rip), $A0
871    movdqu 0*16($keyp), $B0
872    movdqu 1*16($keyp), $C0
873    movdqu 2*16($keyp), $D0
874    movdqa $A0, $A1
875    movdqa $A0, $A2
876    movdqa $A0, $A3
877    movdqa $B0, $B1
878    movdqa $B0, $B2
879    movdqa $B0, $B3
880    movdqa $C0, $C1
881    movdqa $C0, $C2
882    movdqa $C0, $C3
883    movdqa $D0, $D3
884    paddd .sse_inc(%rip), $D0
885    movdqa $D0, $D2
886    paddd .sse_inc(%rip), $D0
887    movdqa $D0, $D1
888    paddd .sse_inc(%rip), $D0
889    # Store on stack
890    movdqa $B0, $state1_store
891    movdqa $C0, $state2_store
892    movdqa $D0, $ctr0_store
893    movdqa $D1, $ctr1_store
894    movdqa $D2, $ctr2_store
895    movdqa $D3, $ctr3_store
896    mov \$10, $acc0
8971:  \n";
898        foreach $l (@loop_body) {$code.=$l."\n";}
899        @loop_body = split /\n/, $chacha_body; $code.="
900        dec $acc0
901    jnz 1b\n";
902    &finalize_state(4); $code.="
903    # Clamp and store the key
904    pand .clamp(%rip), $A3
905    movdqa $A3, $r_store
906    movdqa $B3, $s_store
907    # Hash
908    mov %r8, $itr2
909    call poly_hash_ad_internal\n";
910    &xor_stream($A2,$B2,$C2,$D2,"0*16");
911    &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
912    cmp \$12*16, $inl
913    ja 1f
914    mov \$8*16, $itr1
915    sub \$8*16, $inl
916    lea 8*16($inp), $inp
917    jmp seal_sse_128_seal_hash
9181:  \n";
919    &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
920    mov \$12*16, $itr1
921    sub \$12*16, $inl
922    lea 12*16($inp), $inp
923    mov \$2, $itr1
924    mov \$8, $itr2
925    cmp \$4*16, $inl
926    jbe seal_sse_tail_64
927    cmp \$8*16, $inl
928    jbe seal_sse_tail_128
929    cmp \$12*16, $inl
930    jbe seal_sse_tail_192
931
9321:  \n";
933    # The main loop
934        &prep_state(4); $code.="
9352:  \n";
936            &emit_body(20);
937            &poly_add("0($oup)");
938            &emit_body(20);
939            &poly_stage1();
940            &emit_body(20);
941            &poly_stage2();
942            &emit_body(20);
943            &poly_stage3();
944            &emit_body(20);
945            &poly_reduce_stage();
946            foreach $l (@loop_body) {$code.=$l."\n";}
947            @loop_body = split /\n/, $chacha_body; $code.="
948            lea 16($oup), $oup
949            dec $itr2
950        jge 2b\n";
951            &poly_add("0*8($oup)");
952            &poly_mul(); $code.="
953            lea 16($oup), $oup
954            dec $itr1
955        jg 2b\n";
956
957        &finalize_state(4);$code.="
958        movdqa $D2, $tmp_store\n";
959        &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
960        movdqa $tmp_store, $D2\n";
961        &xor_stream($A2,$B2,$C2,$D2, 4*16);
962        &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
963        cmp \$16*16, $inl
964        ja 3f
965
966        mov \$12*16, $itr1
967        sub \$12*16, $inl
968        lea 12*16($inp), $inp
969        jmp seal_sse_128_seal_hash
9703:  \n";
971        &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
972        lea 16*16($inp), $inp
973        sub \$16*16, $inl
974        mov \$6, $itr1
975        mov \$4, $itr2
976        cmp \$12*16, $inl
977    jg 1b
978    mov $inl, $itr1
979    test $inl, $inl
980    je seal_sse_128_seal_hash
981    mov \$6, $itr1
982    cmp \$4*16, $inl
983    jg 3f
984###############################################################################
985seal_sse_tail_64:\n";
986    &prep_state(1); $code.="
9871:  \n";
988        &poly_add("0($oup)");
989        &poly_mul(); $code.="
990        lea 16($oup), $oup
9912:  \n";
992        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
993        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
994        &poly_add("0($oup)");
995        &poly_mul(); $code.="
996        lea 16($oup), $oup
997    dec $itr1
998    jg 1b
999    dec $itr2
1000    jge 2b\n";
1001    &finalize_state(1); $code.="
1002    jmp seal_sse_128_seal
10033:
1004    cmp \$8*16, $inl
1005    jg 3f
1006###############################################################################
1007seal_sse_tail_128:\n";
1008    &prep_state(2); $code.="
10091:  \n";
1010        &poly_add("0($oup)");
1011        &poly_mul(); $code.="
1012        lea 16($oup), $oup
10132:  \n";
1014        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1015        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1016        &poly_add("0($oup)");
1017        &poly_mul();
1018        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1019        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1020        lea 16($oup), $oup
1021    dec $itr1
1022    jg 1b
1023    dec $itr2
1024    jge 2b\n";
1025    &finalize_state(2);
1026    &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
1027    mov \$4*16, $itr1
1028    sub \$4*16, $inl
1029    lea 4*16($inp), $inp
1030    jmp seal_sse_128_seal_hash
10313:
1032###############################################################################
1033seal_sse_tail_192:\n";
1034    &prep_state(3); $code.="
10351:  \n";
1036        &poly_add("0($oup)");
1037        &poly_mul(); $code.="
1038        lea 16($oup), $oup
10392:  \n";
1040        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1041        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1042        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1043        &poly_add("0($oup)");
1044        &poly_mul();
1045        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1046        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1047        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1048        lea 16($oup), $oup
1049    dec $itr1
1050    jg 1b
1051    dec $itr2
1052    jge 2b\n";
1053    &finalize_state(3);
1054    &xor_stream($A2,$B2,$C2,$D2,0*16);
1055    &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
1056    mov \$8*16, $itr1
1057    sub \$8*16, $inl
1058    lea 8*16($inp), $inp
1059###############################################################################
1060seal_sse_128_seal_hash:
1061        cmp \$16, $itr1
1062        jb seal_sse_128_seal\n";
1063        &poly_add("0($oup)");
1064        &poly_mul(); $code.="
1065        sub \$16, $itr1
1066        lea 16($oup), $oup
1067    jmp seal_sse_128_seal_hash
1068
1069seal_sse_128_seal:
1070        cmp \$16, $inl
1071        jb seal_sse_tail_16
1072        sub \$16, $inl
1073        # Load for decryption
1074        movdqu 0*16($inp), $T0
1075        pxor $T0, $A0
1076        movdqu $A0, 0*16($oup)
1077        # Then hash
1078        add 0*8($oup), $acc0
1079        adc 1*8($oup), $acc1
1080        adc \$1, $acc2
1081        lea 1*16($inp), $inp
1082        lea 1*16($oup), $oup\n";
1083        &poly_mul(); $code.="
1084        # Shift the stream left
1085        movdqa $B0, $A0
1086        movdqa $C0, $B0
1087        movdqa $D0, $C0
1088        movdqa $A1, $D0
1089        movdqa $B1, $A1
1090        movdqa $C1, $B1
1091        movdqa $D1, $C1
1092    jmp seal_sse_128_seal
1093
1094seal_sse_tail_16:
1095    test $inl, $inl
1096    jz seal_sse_finalize
1097    # We can only load the PT one byte at a time to avoid buffer overread
1098    mov $inl, $itr2
1099    shl \$4, $itr2
1100    lea .and_masks(%rip), $t0
1101    mov $inl, $itr1
1102    lea -1($inp, $inl), $inp
1103    pxor $T3, $T3
11041:
1105        pslldq \$1, $T3
1106        pinsrb \$0, ($inp), $T3
1107        lea -1($inp), $inp
1108        dec $itr1
1109    jne 1b
1110
1111    # XOR the keystream with the plaintext.
1112    pxor $A0, $T3
1113
1114    # Write ciphertext out, byte-by-byte.
1115    movq $inl, $itr1
1116    movdqu $T3, $A0
11172:
1118        pextrb \$0, $A0, ($oup)
1119        psrldq \$1, $A0
1120        add \$1, $oup
1121        sub \$1, $itr1
1122        jnz 2b
1123
1124    pand -16($t0, $itr2), $T3
1125    movq $T3, $t0
1126    pextrq \$1, $T3, $t1
1127    add $t0, $acc0
1128    adc $t1, $acc1
1129    adc \$1, $acc2\n";
1130    &poly_mul(); $code.="
1131seal_sse_finalize:\n";
1132    &poly_add($len_store);
1133    &poly_mul(); $code.="
1134    # Final reduce
1135    mov $acc0, $t0
1136    mov $acc1, $t1
1137    mov $acc2, $t2
1138    sub \$-5, $acc0
1139    sbb \$-1, $acc1
1140    sbb \$3, $acc2
1141    cmovc $t0, $acc0
1142    cmovc $t1, $acc1
1143    cmovc $t2, $acc2
1144    # Add in s part of the key
1145    add 0+$s_store, $acc0
1146    adc 8+$s_store, $acc1
1147
1148    add \$288 + 32, %rsp
1149.cfi_adjust_cfa_offset -(288 + 32)
1150    pop $keyp
1151.cfi_adjust_cfa_offset -8
1152    mov $acc0, 0*8($keyp)
1153    mov $acc1, 1*8($keyp)
1154
1155    pop %r15
1156.cfi_adjust_cfa_offset -8
1157    pop %r14
1158.cfi_adjust_cfa_offset -8
1159    pop %r13
1160.cfi_adjust_cfa_offset -8
1161    pop %r12
1162.cfi_adjust_cfa_offset -8
1163    pop %rbx
1164.cfi_adjust_cfa_offset -8
1165    pop %rbp
1166.cfi_adjust_cfa_offset -8
1167    ret
1168.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
1169################################################################################
1170seal_sse_128:
1171    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
1172    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
1173    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
1174    movdqu 2*16($keyp), $D2
1175    movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
1176    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
1177    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
1178    mov \$10, $acc0
11791:\n";
1180        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1181        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1182        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1183        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1184        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1185        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1186        dec $acc0
1187    jnz 1b
1188    paddd .chacha20_consts(%rip), $A0
1189    paddd .chacha20_consts(%rip), $A1
1190    paddd .chacha20_consts(%rip), $A2
1191    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
1192    paddd $T2, $C0\npaddd $T2, $C1
1193    paddd $T3, $D0
1194    paddd .sse_inc(%rip), $T3
1195    paddd $T3, $D1
1196    # Clamp and store the key
1197    pand .clamp(%rip), $A2
1198    movdqa $A2, $r_store
1199    movdqa $B2, $s_store
1200    # Hash
1201    mov %r8, $itr2
1202    call poly_hash_ad_internal
1203    jmp seal_sse_128_seal
1204.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
1205}
1206
1207# There should have been a cfi_endproc at the end of that function, but the two
1208# following blocks of code are jumped to without a stack frame and the CFI
1209# context which they are used in happens to match the CFI context at the end of
1210# the previous function. So the CFI table is just extended to the end of them.
1211
1212if ($avx>1) {
1213
1214($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
1215my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
1216($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
1217$state1_store="2*32(%rbp)";
1218$state2_store="3*32(%rbp)";
1219$tmp_store="4*32(%rbp)";
1220$ctr0_store="5*32(%rbp)";
1221$ctr1_store="6*32(%rbp)";
1222$ctr2_store="7*32(%rbp)";
1223$ctr3_store="8*32(%rbp)";
1224
1225sub chacha_qr_avx2 {
1226my ($a,$b,$c,$d,$t,$dir)=@_;
1227$code.=<<___ if ($dir =~ /store/);
1228    vmovdqa $t, $tmp_store
1229___
1230$code.=<<___;
1231    vpaddd $b, $a, $a
1232    vpxor $a, $d, $d
1233    vpshufb .rol16(%rip), $d, $d
1234    vpaddd $d, $c, $c
1235    vpxor $c, $b, $b
1236    vpsrld \$20, $b, $t
1237    vpslld \$12, $b, $b
1238    vpxor $t, $b, $b
1239    vpaddd $b, $a, $a
1240    vpxor $a, $d, $d
1241    vpshufb .rol8(%rip), $d, $d
1242    vpaddd $d, $c, $c
1243    vpxor $c, $b, $b
1244    vpslld \$7, $b, $t
1245    vpsrld \$25, $b, $b
1246    vpxor $t, $b, $b
1247___
1248$code.=<<___ if ($dir =~ /left/);
1249    vpalignr \$12, $d, $d, $d
1250    vpalignr \$8, $c, $c, $c
1251    vpalignr \$4, $b, $b, $b
1252___
1253$code.=<<___ if ($dir =~ /right/);
1254    vpalignr \$4, $d, $d, $d
1255    vpalignr \$8, $c, $c, $c
1256    vpalignr \$12, $b, $b, $b
1257___
1258$code.=<<___ if ($dir =~ /load/);
1259    vmovdqa $tmp_store, $t
1260___
1261}
1262
1263sub prep_state_avx2 {
1264my ($n)=@_;
1265$code.=<<___;
1266    vmovdqa .chacha20_consts(%rip), $A0
1267    vmovdqa $state1_store, $B0
1268    vmovdqa $state2_store, $C0
1269___
1270$code.=<<___ if ($n ge 2);
1271    vmovdqa $A0, $A1
1272    vmovdqa $B0, $B1
1273    vmovdqa $C0, $C1
1274___
1275$code.=<<___ if ($n ge 3);
1276    vmovdqa $A0, $A2
1277    vmovdqa $B0, $B2
1278    vmovdqa $C0, $C2
1279___
1280$code.=<<___ if ($n ge 4);
1281    vmovdqa $A0, $A3
1282    vmovdqa $B0, $B3
1283    vmovdqa $C0, $C3
1284___
1285$code.=<<___ if ($n eq 1);
1286    vmovdqa .avx2_inc(%rip), $D0
1287    vpaddd $ctr0_store, $D0, $D0
1288    vmovdqa $D0, $ctr0_store
1289___
1290$code.=<<___ if ($n eq 2);
1291    vmovdqa .avx2_inc(%rip), $D0
1292    vpaddd $ctr0_store, $D0, $D1
1293    vpaddd $D1, $D0, $D0
1294    vmovdqa $D0, $ctr0_store
1295    vmovdqa $D1, $ctr1_store
1296___
1297$code.=<<___ if ($n eq 3);
1298    vmovdqa .avx2_inc(%rip), $D0
1299    vpaddd $ctr0_store, $D0, $D2
1300    vpaddd $D2, $D0, $D1
1301    vpaddd $D1, $D0, $D0
1302    vmovdqa $D0, $ctr0_store
1303    vmovdqa $D1, $ctr1_store
1304    vmovdqa $D2, $ctr2_store
1305___
1306$code.=<<___ if ($n eq 4);
1307    vmovdqa .avx2_inc(%rip), $D0
1308    vpaddd $ctr0_store, $D0, $D3
1309    vpaddd $D3, $D0, $D2
1310    vpaddd $D2, $D0, $D1
1311    vpaddd $D1, $D0, $D0
1312    vmovdqa $D3, $ctr3_store
1313    vmovdqa $D2, $ctr2_store
1314    vmovdqa $D1, $ctr1_store
1315    vmovdqa $D0, $ctr0_store
1316___
1317}
1318
1319sub finalize_state_avx2 {
1320my ($n)=@_;
1321$code.=<<___ if ($n eq 4);
1322    vpaddd .chacha20_consts(%rip), $A3, $A3
1323    vpaddd $state1_store, $B3, $B3
1324    vpaddd $state2_store, $C3, $C3
1325    vpaddd $ctr3_store, $D3, $D3
1326___
1327$code.=<<___ if ($n ge 3);
1328    vpaddd .chacha20_consts(%rip), $A2, $A2
1329    vpaddd $state1_store, $B2, $B2
1330    vpaddd $state2_store, $C2, $C2
1331    vpaddd $ctr2_store, $D2, $D2
1332___
1333$code.=<<___ if ($n ge 2);
1334    vpaddd .chacha20_consts(%rip), $A1, $A1
1335    vpaddd $state1_store, $B1, $B1
1336    vpaddd $state2_store, $C1, $C1
1337    vpaddd $ctr1_store, $D1, $D1
1338___
1339$code.=<<___;
1340    vpaddd .chacha20_consts(%rip), $A0, $A0
1341    vpaddd $state1_store, $B0, $B0
1342    vpaddd $state2_store, $C0, $C0
1343    vpaddd $ctr0_store, $D0, $D0
1344___
1345}
1346
1347sub xor_stream_avx2 {
1348my ($A, $B, $C, $D, $offset, $hlp)=@_;
1349$code.=<<___;
1350    vperm2i128 \$0x02, $A, $B, $hlp
1351    vperm2i128 \$0x13, $A, $B, $B
1352    vperm2i128 \$0x02, $C, $D, $A
1353    vperm2i128 \$0x13, $C, $D, $C
1354    vpxor 0*32+$offset($inp), $hlp, $hlp
1355    vpxor 1*32+$offset($inp), $A, $A
1356    vpxor 2*32+$offset($inp), $B, $B
1357    vpxor 3*32+$offset($inp), $C, $C
1358    vmovdqu $hlp, 0*32+$offset($oup)
1359    vmovdqu $A, 1*32+$offset($oup)
1360    vmovdqu $B, 2*32+$offset($oup)
1361    vmovdqu $C, 3*32+$offset($oup)
1362___
1363}
1364
1365sub finish_stream_avx2 {
1366my ($A, $B, $C, $D, $hlp)=@_;
1367$code.=<<___;
1368    vperm2i128 \$0x13, $A, $B, $hlp
1369    vperm2i128 \$0x02, $A, $B, $A
1370    vperm2i128 \$0x02, $C, $D, $B
1371    vperm2i128 \$0x13, $C, $D, $D
1372    vmovdqa $hlp, $C
1373___
1374}
1375
1376sub poly_stage1_mulx {
1377$code.=<<___;
1378    mov 0+$r_store, %rdx
1379    mov %rdx, $t2
1380    mulx $acc0, $t0, $t1
1381    mulx $acc1, %rax, %rdx
1382    imulq $acc2, $t2
1383    add %rax, $t1
1384    adc %rdx, $t2
1385___
1386}
1387
1388sub poly_stage2_mulx {
1389$code.=<<___;
1390    mov 8+$r_store, %rdx
1391    mulx $acc0, $acc0, %rax
1392    add $acc0, $t1
1393    mulx $acc1, $acc1, $t3
1394    adc $acc1, $t2
1395    adc \$0, $t3
1396    imulq $acc2, %rdx
1397___
1398}
1399
1400sub poly_stage3_mulx {
1401$code.=<<___;
1402    add %rax, $t2
1403    adc %rdx, $t3
1404___
1405}
1406
1407sub poly_mul_mulx {
1408    &poly_stage1_mulx();
1409    &poly_stage2_mulx();
1410    &poly_stage3_mulx();
1411    &poly_reduce_stage();
1412}
1413
1414sub gen_chacha_round_avx2 {
1415my ($rot1, $rot2, $shift)=@_;
1416my $round="";
1417$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
1418$round=$round ."vmovdqa $rot2, $C0
1419                vpaddd $B3, $A3, $A3
1420                vpaddd $B2, $A2, $A2
1421                vpaddd $B1, $A1, $A1
1422                vpaddd $B0, $A0, $A0
1423                vpxor $A3, $D3, $D3
1424                vpxor $A2, $D2, $D2
1425                vpxor $A1, $D1, $D1
1426                vpxor $A0, $D0, $D0
1427                vpshufb $C0, $D3, $D3
1428                vpshufb $C0, $D2, $D2
1429                vpshufb $C0, $D1, $D1
1430                vpshufb $C0, $D0, $D0
1431                vmovdqa $tmp_store, $C0
1432                vpaddd $D3, $C3, $C3
1433                vpaddd $D2, $C2, $C2
1434                vpaddd $D1, $C1, $C1
1435                vpaddd $D0, $C0, $C0
1436                vpxor $C3, $B3, $B3
1437                vpxor $C2, $B2, $B2
1438                vpxor $C1, $B1, $B1
1439                vpxor $C0, $B0, $B0
1440                vmovdqa $C0, $tmp_store
1441                vpsrld \$$rot1, $B3, $C0
1442                vpslld \$32-$rot1, $B3, $B3
1443                vpxor $C0, $B3, $B3
1444                vpsrld \$$rot1, $B2, $C0
1445                vpslld \$32-$rot1, $B2, $B2
1446                vpxor $C0, $B2, $B2
1447                vpsrld \$$rot1, $B1, $C0
1448                vpslld \$32-$rot1, $B1, $B1
1449                vpxor $C0, $B1, $B1
1450                vpsrld \$$rot1, $B0, $C0
1451                vpslld \$32-$rot1, $B0, $B0
1452                vpxor $C0, $B0, $B0\n";
1453($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
1454($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
1455$round=$round ."vmovdqa $tmp_store, $C0
1456                vpalignr \$$s1, $B3, $B3, $B3
1457                vpalignr \$$s2, $C3, $C3, $C3
1458                vpalignr \$$s3, $D3, $D3, $D3
1459                vpalignr \$$s1, $B2, $B2, $B2
1460                vpalignr \$$s2, $C2, $C2, $C2
1461                vpalignr \$$s3, $D2, $D2, $D2
1462                vpalignr \$$s1, $B1, $B1, $B1
1463                vpalignr \$$s2, $C1, $C1, $C1
1464                vpalignr \$$s3, $D1, $D1, $D1
1465                vpalignr \$$s1, $B0, $B0, $B0
1466                vpalignr \$$s2, $C0, $C0, $C0
1467                vpalignr \$$s3, $D0, $D0, $D0\n"
1468if (($shift =~ /left/) || ($shift =~ /right/));
1469return $round;
1470};
1471
1472$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1473               &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
1474               &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1475               &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
1476
1477@loop_body = split /\n/, $chacha_body;
1478
1479$code.="
1480###############################################################################
1481.type chacha20_poly1305_open_avx2,\@function,2
1482.align 64
1483chacha20_poly1305_open_avx2:
1484    vzeroupper
1485    vmovdqa .chacha20_consts(%rip), $A0
1486    vbroadcasti128 0*16($keyp), $B0
1487    vbroadcasti128 1*16($keyp), $C0
1488    vbroadcasti128 2*16($keyp), $D0
1489    vpaddd .avx2_init(%rip), $D0, $D0
1490    cmp \$6*32, $inl
1491    jbe open_avx2_192
1492    cmp \$10*32, $inl
1493    jbe open_avx2_320
1494
1495    vmovdqa $B0, $state1_store
1496    vmovdqa $C0, $state2_store
1497    vmovdqa $D0, $ctr0_store
1498    mov \$10, $acc0
14991:  \n";
1500        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1501        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1502        dec $acc0
1503    jne 1b
1504    vpaddd .chacha20_consts(%rip), $A0, $A0
1505    vpaddd $state1_store, $B0, $B0
1506    vpaddd $state2_store, $C0, $C0
1507    vpaddd $ctr0_store, $D0, $D0
1508
1509    vperm2i128 \$0x02, $A0, $B0, $T0
1510    # Clamp and store key
1511    vpand .clamp(%rip), $T0, $T0
1512    vmovdqa $T0, $r_store
1513    # Stream for the first 64 bytes
1514    vperm2i128 \$0x13, $A0, $B0, $A0
1515    vperm2i128 \$0x13, $C0, $D0, $B0
1516    # Hash AD + first 64 bytes
1517    mov %r8, $itr2
1518    call poly_hash_ad_internal
1519    xor $itr1, $itr1
1520    # Hash first 64 bytes
15211:  \n";
1522       &poly_add("0($inp, $itr1)");
1523       &poly_mul(); $code.="
1524       add \$16, $itr1
1525       cmp \$2*32, $itr1
1526    jne 1b
1527    # Decrypt first 64 bytes
1528    vpxor 0*32($inp), $A0, $A0
1529    vpxor 1*32($inp), $B0, $B0
1530    vmovdqu $A0, 0*32($oup)
1531    vmovdqu $B0, 1*32($oup)
1532    lea 2*32($inp), $inp
1533    lea 2*32($oup), $oup
1534    sub \$2*32, $inl
15351:
1536        # Hash and decrypt 512 bytes each iteration
1537        cmp \$16*32, $inl
1538        jb 3f\n";
1539        &prep_state_avx2(4); $code.="
1540        xor $itr1, $itr1
15412:  \n";
1542            &poly_add("0*8($inp, $itr1)");
1543            &emit_body(10);
1544            &poly_stage1_mulx();
1545            &emit_body(9);
1546            &poly_stage2_mulx();
1547            &emit_body(12);
1548            &poly_stage3_mulx();
1549            &emit_body(10);
1550            &poly_reduce_stage();
1551            &emit_body(9);
1552            &poly_add("2*8($inp, $itr1)");
1553            &emit_body(8);
1554            &poly_stage1_mulx();
1555            &emit_body(18);
1556            &poly_stage2_mulx();
1557            &emit_body(18);
1558            &poly_stage3_mulx();
1559            &emit_body(9);
1560            &poly_reduce_stage();
1561            &emit_body(8);
1562            &poly_add("4*8($inp, $itr1)"); $code.="
1563            lea 6*8($itr1), $itr1\n";
1564            &emit_body(18);
1565            &poly_stage1_mulx();
1566            &emit_body(8);
1567            &poly_stage2_mulx();
1568            &emit_body(8);
1569            &poly_stage3_mulx();
1570            &emit_body(18);
1571            &poly_reduce_stage();
1572            foreach $l (@loop_body) {$code.=$l."\n";}
1573            @loop_body = split /\n/, $chacha_body; $code.="
1574            cmp \$10*6*8, $itr1
1575        jne 2b\n";
1576        &finalize_state_avx2(4); $code.="
1577        vmovdqa $A0, $tmp_store\n";
1578        &poly_add("10*6*8($inp)");
1579        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1580        vmovdqa $tmp_store, $A0\n";
1581        &poly_mul();
1582        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1583        &poly_add("10*6*8+2*8($inp)");
1584        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1585        &poly_mul();
1586        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
1587        lea 16*32($inp), $inp
1588        lea 16*32($oup), $oup
1589        sub \$16*32, $inl
1590    jmp 1b
15913:
1592    test $inl, $inl
1593    vzeroupper
1594    je open_sse_finalize
15953:
1596    cmp \$4*32, $inl
1597    ja 3f\n";
1598###############################################################################
1599    # 1-128 bytes left
1600    &prep_state_avx2(1); $code.="
1601    xor $itr2, $itr2
1602    mov $inl, $itr1
1603    and \$-16, $itr1
1604    test $itr1, $itr1
1605    je 2f
16061:  \n";
1607        &poly_add("0*8($inp, $itr2)");
1608        &poly_mul(); $code.="
16092:
1610        add \$16, $itr2\n";
1611        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1612        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1613        cmp $itr1, $itr2
1614    jb 1b
1615        cmp \$160, $itr2
1616    jne 2b\n";
1617    &finalize_state_avx2(1);
1618    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
1619    jmp open_avx2_tail_loop
16203:
1621    cmp \$8*32, $inl
1622    ja 3f\n";
1623###############################################################################
1624    # 129-256 bytes left
1625    &prep_state_avx2(2); $code.="
1626    mov $inl, $tmp_store
1627    mov $inl, $itr1
1628    sub \$4*32, $itr1
1629    shr \$4, $itr1
1630    mov \$10, $itr2
1631    cmp \$10, $itr1
1632    cmovg $itr2, $itr1
1633    mov $inp, $inl
1634    xor $itr2, $itr2
16351:  \n";
1636        &poly_add("0*8($inl)");
1637        &poly_mul_mulx(); $code.="
1638        lea 16($inl), $inl
16392:  \n";
1640        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1641        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
1642        inc $itr2\n";
1643        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1644        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1645        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1646        cmp $itr1, $itr2
1647    jb 1b
1648        cmp \$10, $itr2
1649    jne 2b
1650    mov $inl, $itr2
1651    sub $inp, $inl
1652    mov $inl, $itr1
1653    mov $tmp_store, $inl
16541:
1655        add \$16, $itr1
1656        cmp $inl, $itr1
1657        jg 1f\n";
1658        &poly_add("0*8($itr2)");
1659        &poly_mul_mulx(); $code.="
1660        lea 16($itr2), $itr2
1661    jmp 1b
16621:  \n";
1663    &finalize_state_avx2(2);
1664    &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
1665    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1666    lea 4*32($inp), $inp
1667    lea 4*32($oup), $oup
1668    sub \$4*32, $inl
1669    jmp open_avx2_tail_loop
16703:
1671    cmp \$12*32, $inl
1672    ja 3f\n";
1673###############################################################################
1674    # 257-383 bytes left
1675    &prep_state_avx2(3); $code.="
1676    mov $inl, $tmp_store
1677    mov $inl, $itr1
1678    sub \$8*32, $itr1
1679    shr \$4, $itr1
1680    add \$6, $itr1
1681    mov \$10, $itr2
1682    cmp \$10, $itr1
1683    cmovg $itr2, $itr1
1684    mov $inp, $inl
1685    xor $itr2, $itr2
16861:  \n";
1687        &poly_add("0*8($inl)");
1688        &poly_mul_mulx(); $code.="
1689        lea 16($inl), $inl
16902:  \n";
1691        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
1692        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1693        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1694        &poly_add("0*8($inl)");
1695        &poly_mul(); $code.="
1696        lea 16($inl), $inl
1697        inc $itr2\n";
1698        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
1699        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1700        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1701        cmp $itr1, $itr2
1702    jb 1b
1703        cmp \$10, $itr2
1704    jne 2b
1705    mov $inl, $itr2
1706    sub $inp, $inl
1707    mov $inl, $itr1
1708    mov $tmp_store, $inl
17091:
1710        add \$16, $itr1
1711        cmp $inl, $itr1
1712        jg 1f\n";
1713        &poly_add("0*8($itr2)");
1714        &poly_mul_mulx(); $code.="
1715        lea 16($itr2), $itr2
1716    jmp 1b
17171:  \n";
1718    &finalize_state_avx2(3);
1719    &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
1720    &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
1721    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1722    lea 8*32($inp), $inp
1723    lea 8*32($oup), $oup
1724    sub \$8*32, $inl
1725    jmp open_avx2_tail_loop
17263:  \n";
1727###############################################################################
1728    # 384-512 bytes left
1729    &prep_state_avx2(4); $code.="
1730    xor $itr1, $itr1
1731    mov $inp, $itr2
17321:  \n";
1733        &poly_add("0*8($itr2)");
1734        &poly_mul(); $code.="
1735        lea 2*8($itr2), $itr2
17362:  \n";
1737        &emit_body(37);
1738        &poly_add("0*8($itr2)");
1739        &poly_mul_mulx();
1740        &emit_body(48);
1741        &poly_add("2*8($itr2)");
1742        &poly_mul_mulx(); $code.="
1743        lea 4*8($itr2), $itr2\n";
1744        foreach $l (@loop_body) {$code.=$l."\n";}
1745        @loop_body = split /\n/, $chacha_body; $code.="
1746        inc $itr1
1747        cmp \$4, $itr1
1748    jl  1b
1749        cmp \$10, $itr1
1750    jne 2b
1751    mov $inl, $itr1
1752    sub \$12*32, $itr1
1753    and \$-16, $itr1
17541:
1755        test $itr1, $itr1
1756        je 1f\n";
1757        &poly_add("0*8($itr2)");
1758        &poly_mul_mulx(); $code.="
1759        lea 2*8($itr2), $itr2
1760        sub \$2*8, $itr1
1761    jmp 1b
17621:  \n";
1763    &finalize_state_avx2(4); $code.="
1764    vmovdqa $A0, $tmp_store\n";
1765    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1766    vmovdqa $tmp_store, $A0\n";
1767    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1768    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1769    &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
1770    lea 12*32($inp), $inp
1771    lea 12*32($oup), $oup
1772    sub \$12*32, $inl
1773open_avx2_tail_loop:
1774    cmp \$32, $inl
1775    jb open_avx2_tail
1776        sub \$32, $inl
1777        vpxor ($inp), $A0, $A0
1778        vmovdqu $A0, ($oup)
1779        lea 1*32($inp), $inp
1780        lea 1*32($oup), $oup
1781        vmovdqa $B0, $A0
1782        vmovdqa $C0, $B0
1783        vmovdqa $D0, $C0
1784    jmp open_avx2_tail_loop
1785open_avx2_tail:
1786    cmp \$16, $inl
1787    vmovdqa $A0x, $A1x
1788    jb 1f
1789    sub \$16, $inl
1790    #load for decryption
1791    vpxor ($inp), $A0x, $A1x
1792    vmovdqu $A1x, ($oup)
1793    lea 1*16($inp), $inp
1794    lea 1*16($oup), $oup
1795    vperm2i128 \$0x11, $A0, $A0, $A0
1796    vmovdqa $A0x, $A1x
17971:
1798    vzeroupper
1799    jmp open_sse_tail_16
1800###############################################################################
1801open_avx2_192:
1802    vmovdqa $A0, $A1
1803    vmovdqa $A0, $A2
1804    vmovdqa $B0, $B1
1805    vmovdqa $B0, $B2
1806    vmovdqa $C0, $C1
1807    vmovdqa $C0, $C2
1808    vpaddd .avx2_inc(%rip), $D0, $D1
1809    vmovdqa $D0, $T2
1810    vmovdqa $D1, $T3
1811    mov \$10, $acc0
18121:  \n";
1813        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1814        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1815        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1816        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1817        dec $acc0
1818    jne 1b
1819    vpaddd $A2, $A0, $A0
1820    vpaddd $A2, $A1, $A1
1821    vpaddd $B2, $B0, $B0
1822    vpaddd $B2, $B1, $B1
1823    vpaddd $C2, $C0, $C0
1824    vpaddd $C2, $C1, $C1
1825    vpaddd $T2, $D0, $D0
1826    vpaddd $T3, $D1, $D1
1827    vperm2i128 \$0x02, $A0, $B0, $T0
1828    # Clamp and store the key
1829    vpand .clamp(%rip), $T0, $T0
1830    vmovdqa $T0, $r_store
1831    # Stream for up to 192 bytes
1832    vperm2i128 \$0x13, $A0, $B0, $A0
1833    vperm2i128 \$0x13, $C0, $D0, $B0
1834    vperm2i128 \$0x02, $A1, $B1, $C0
1835    vperm2i128 \$0x02, $C1, $D1, $D0
1836    vperm2i128 \$0x13, $A1, $B1, $A1
1837    vperm2i128 \$0x13, $C1, $D1, $B1
1838open_avx2_short:
1839    mov %r8, $itr2
1840    call poly_hash_ad_internal
1841open_avx2_hash_and_xor_loop:
1842        cmp \$32, $inl
1843        jb open_avx2_short_tail_32
1844        sub \$32, $inl\n";
1845        # Load + hash
1846        &poly_add("0*8($inp)");
1847        &poly_mul();
1848        &poly_add("2*8($inp)");
1849        &poly_mul(); $code.="
1850        # Load + decrypt
1851        vpxor ($inp), $A0, $A0
1852        vmovdqu $A0, ($oup)
1853        lea 1*32($inp), $inp
1854        lea 1*32($oup), $oup
1855        # Shift stream
1856        vmovdqa $B0, $A0
1857        vmovdqa $C0, $B0
1858        vmovdqa $D0, $C0
1859        vmovdqa $A1, $D0
1860        vmovdqa $B1, $A1
1861        vmovdqa $C1, $B1
1862        vmovdqa $D1, $C1
1863        vmovdqa $A2, $D1
1864        vmovdqa $B2, $A2
1865    jmp open_avx2_hash_and_xor_loop
1866open_avx2_short_tail_32:
1867    cmp \$16, $inl
1868    vmovdqa $A0x, $A1x
1869    jb 1f
1870    sub \$16, $inl\n";
1871    &poly_add("0*8($inp)");
1872    &poly_mul(); $code.="
1873    vpxor ($inp), $A0x, $A3x
1874    vmovdqu $A3x, ($oup)
1875    lea 1*16($inp), $inp
1876    lea 1*16($oup), $oup
1877    vextracti128 \$1, $A0, $A1x
18781:
1879    vzeroupper
1880    jmp open_sse_tail_16
1881###############################################################################
1882open_avx2_320:
1883    vmovdqa $A0, $A1
1884    vmovdqa $A0, $A2
1885    vmovdqa $B0, $B1
1886    vmovdqa $B0, $B2
1887    vmovdqa $C0, $C1
1888    vmovdqa $C0, $C2
1889    vpaddd .avx2_inc(%rip), $D0, $D1
1890    vpaddd .avx2_inc(%rip), $D1, $D2
1891    vmovdqa $B0, $T1
1892    vmovdqa $C0, $T2
1893    vmovdqa $D0, $ctr0_store
1894    vmovdqa $D1, $ctr1_store
1895    vmovdqa $D2, $ctr2_store
1896    mov \$10, $acc0
18971:  \n";
1898        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1899        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1900        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
1901        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1902        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1903        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1904        dec $acc0
1905    jne 1b
1906    vpaddd .chacha20_consts(%rip), $A0, $A0
1907    vpaddd .chacha20_consts(%rip), $A1, $A1
1908    vpaddd .chacha20_consts(%rip), $A2, $A2
1909    vpaddd $T1, $B0, $B0
1910    vpaddd $T1, $B1, $B1
1911    vpaddd $T1, $B2, $B2
1912    vpaddd $T2, $C0, $C0
1913    vpaddd $T2, $C1, $C1
1914    vpaddd $T2, $C2, $C2
1915    vpaddd $ctr0_store, $D0, $D0
1916    vpaddd $ctr1_store, $D1, $D1
1917    vpaddd $ctr2_store, $D2, $D2
1918    vperm2i128 \$0x02, $A0, $B0, $T0
1919    # Clamp and store the key
1920    vpand .clamp(%rip), $T0, $T0
1921    vmovdqa $T0, $r_store
1922    # Stream for up to 320 bytes
1923    vperm2i128 \$0x13, $A0, $B0, $A0
1924    vperm2i128 \$0x13, $C0, $D0, $B0
1925    vperm2i128 \$0x02, $A1, $B1, $C0
1926    vperm2i128 \$0x02, $C1, $D1, $D0
1927    vperm2i128 \$0x13, $A1, $B1, $A1
1928    vperm2i128 \$0x13, $C1, $D1, $B1
1929    vperm2i128 \$0x02, $A2, $B2, $C1
1930    vperm2i128 \$0x02, $C2, $D2, $D1
1931    vperm2i128 \$0x13, $A2, $B2, $A2
1932    vperm2i128 \$0x13, $C2, $D2, $B2
1933    jmp open_avx2_short
1934.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
1935###############################################################################
1936###############################################################################
1937.type chacha20_poly1305_seal_avx2,\@function,2
1938.align 64
1939chacha20_poly1305_seal_avx2:
1940    vzeroupper
1941    vmovdqa .chacha20_consts(%rip), $A0
1942    vbroadcasti128 0*16($keyp), $B0
1943    vbroadcasti128 1*16($keyp), $C0
1944    vbroadcasti128 2*16($keyp), $D0
1945    vpaddd .avx2_init(%rip), $D0, $D0
1946    cmp \$6*32, $inl
1947    jbe seal_avx2_192
1948    cmp \$10*32, $inl
1949    jbe seal_avx2_320
1950    vmovdqa $A0, $A1
1951    vmovdqa $A0, $A2
1952    vmovdqa $A0, $A3
1953    vmovdqa $B0, $B1
1954    vmovdqa $B0, $B2
1955    vmovdqa $B0, $B3
1956    vmovdqa $B0, $state1_store
1957    vmovdqa $C0, $C1
1958    vmovdqa $C0, $C2
1959    vmovdqa $C0, $C3
1960    vmovdqa $C0, $state2_store
1961    vmovdqa $D0, $D3
1962    vpaddd .avx2_inc(%rip), $D3, $D2
1963    vpaddd .avx2_inc(%rip), $D2, $D1
1964    vpaddd .avx2_inc(%rip), $D1, $D0
1965    vmovdqa $D0, $ctr0_store
1966    vmovdqa $D1, $ctr1_store
1967    vmovdqa $D2, $ctr2_store
1968    vmovdqa $D3, $ctr3_store
1969    mov \$10, $acc0
19701:  \n";
1971        foreach $l (@loop_body) {$code.=$l."\n";}
1972        @loop_body = split /\n/, $chacha_body; $code.="
1973        dec $acc0
1974        jnz 1b\n";
1975    &finalize_state_avx2(4); $code.="
1976    vperm2i128 \$0x13, $C3, $D3, $C3
1977    vperm2i128 \$0x02, $A3, $B3, $D3
1978    vperm2i128 \$0x13, $A3, $B3, $A3
1979    vpand .clamp(%rip), $D3, $D3
1980    vmovdqa $D3, $r_store
1981    mov %r8, $itr2
1982    call poly_hash_ad_internal
1983    # Safely store 320 bytes (otherwise would handle with optimized call)
1984    vpxor 0*32($inp), $A3, $A3
1985    vpxor 1*32($inp), $C3, $C3
1986    vmovdqu $A3, 0*32($oup)
1987    vmovdqu $C3, 1*32($oup)\n";
1988    &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
1989    &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
1990    &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
1991    lea 10*32($inp), $inp
1992    sub \$10*32, $inl
1993    mov \$10*32, $itr1
1994    cmp \$4*32, $inl
1995    jbe seal_avx2_hash
1996    vpxor 0*32($inp), $A0, $A0
1997    vpxor 1*32($inp), $B0, $B0
1998    vpxor 2*32($inp), $C0, $C0
1999    vpxor 3*32($inp), $D0, $D0
2000    vmovdqu $A0, 10*32($oup)
2001    vmovdqu $B0, 11*32($oup)
2002    vmovdqu $C0, 12*32($oup)
2003    vmovdqu $D0, 13*32($oup)
2004    lea 4*32($inp), $inp
2005    sub \$4*32, $inl
2006    mov \$8, $itr1
2007    mov \$2, $itr2
2008    cmp \$4*32, $inl
2009    jbe seal_avx2_tail_128
2010    cmp \$8*32, $inl
2011    jbe seal_avx2_tail_256
2012    cmp \$12*32, $inl
2013    jbe seal_avx2_tail_384
2014    cmp \$16*32, $inl
2015    jbe seal_avx2_tail_512\n";
2016    # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
2017    &prep_state_avx2(4);
2018    foreach $l (@loop_body) {$code.=$l."\n";}
2019    @loop_body = split /\n/, $chacha_body;
2020    &emit_body(41);
2021    @loop_body = split /\n/, $chacha_body; $code.="
2022    sub \$16, $oup
2023    mov \$9, $itr1
2024    jmp 4f
20251:  \n";
2026        &prep_state_avx2(4); $code.="
2027        mov \$10, $itr1
20282:  \n";
2029            &poly_add("0*8($oup)");
2030            &emit_body(10);
2031            &poly_stage1_mulx();
2032            &emit_body(9);
2033            &poly_stage2_mulx();
2034            &emit_body(12);
2035            &poly_stage3_mulx();
2036            &emit_body(10);
2037            &poly_reduce_stage(); $code.="
20384:  \n";
2039            &emit_body(9);
2040            &poly_add("2*8($oup)");
2041            &emit_body(8);
2042            &poly_stage1_mulx();
2043            &emit_body(18);
2044            &poly_stage2_mulx();
2045            &emit_body(18);
2046            &poly_stage3_mulx();
2047            &emit_body(9);
2048            &poly_reduce_stage();
2049            &emit_body(8);
2050            &poly_add("4*8($oup)"); $code.="
2051            lea 6*8($oup), $oup\n";
2052            &emit_body(18);
2053            &poly_stage1_mulx();
2054            &emit_body(8);
2055            &poly_stage2_mulx();
2056            &emit_body(8);
2057            &poly_stage3_mulx();
2058            &emit_body(18);
2059            &poly_reduce_stage();
2060            foreach $l (@loop_body) {$code.=$l."\n";}
2061            @loop_body = split /\n/, $chacha_body; $code.="
2062            dec $itr1
2063        jne 2b\n";
2064        &finalize_state_avx2(4); $code.="
2065        lea 4*8($oup), $oup
2066        vmovdqa $A0, $tmp_store\n";
2067        &poly_add("-4*8($oup)");
2068        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
2069        vmovdqa $tmp_store, $A0\n";
2070        &poly_mul();
2071        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
2072        &poly_add("-2*8($oup)");
2073        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
2074        &poly_mul();
2075        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
2076        lea 16*32($inp), $inp
2077        sub \$16*32, $inl
2078        cmp \$16*32, $inl
2079    jg 1b\n";
2080    &poly_add("0*8($oup)");
2081    &poly_mul();
2082    &poly_add("2*8($oup)");
2083    &poly_mul(); $code.="
2084    lea 4*8($oup), $oup
2085    mov \$10, $itr1
2086    xor $itr2, $itr2
2087    cmp \$4*32, $inl
2088    ja 3f
2089###############################################################################
2090seal_avx2_tail_128:\n";
2091    &prep_state_avx2(1); $code.="
20921:  \n";
2093        &poly_add("0($oup)");
2094        &poly_mul(); $code.="
2095        lea 2*8($oup), $oup
20962:  \n";
2097        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2098        &poly_add("0*8($oup)");
2099        &poly_mul();
2100        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2101        &poly_add("2*8($oup)");
2102        &poly_mul(); $code.="
2103        lea 4*8($oup), $oup
2104        dec $itr1
2105    jg 1b
2106        dec $itr2
2107    jge 2b\n";
2108    &finalize_state_avx2(1);
2109    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2110    jmp seal_avx2_short_loop
21113:
2112    cmp \$8*32, $inl
2113    ja 3f
2114###############################################################################
2115seal_avx2_tail_256:\n";
2116    &prep_state_avx2(2); $code.="
21171:  \n";
2118        &poly_add("0($oup)");
2119        &poly_mul(); $code.="
2120        lea 2*8($oup), $oup
21212:  \n";
2122        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2123        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2124        &poly_add("0*8($oup)");
2125        &poly_mul();
2126        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2127        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2128        &poly_add("2*8($oup)");
2129        &poly_mul(); $code.="
2130        lea 4*8($oup), $oup
2131        dec $itr1
2132    jg 1b
2133        dec $itr2
2134    jge 2b\n";
2135    &finalize_state_avx2(2);
2136    &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
2137    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2138    mov \$4*32, $itr1
2139    lea 4*32($inp), $inp
2140    sub \$4*32, $inl
2141    jmp seal_avx2_hash
21423:
2143    cmp \$12*32, $inl
2144    ja seal_avx2_tail_512
2145###############################################################################
2146seal_avx2_tail_384:\n";
2147    &prep_state_avx2(3); $code.="
21481:  \n";
2149        &poly_add("0($oup)");
2150        &poly_mul(); $code.="
2151        lea 2*8($oup), $oup
21522:  \n";
2153        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2154        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2155        &poly_add("0*8($oup)");
2156        &poly_mul();
2157        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
2158        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2159        &poly_add("2*8($oup)");
2160        &poly_mul();
2161        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2162        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
2163        lea 4*8($oup), $oup
2164        dec $itr1
2165    jg 1b
2166        dec $itr2
2167    jge 2b\n";
2168    &finalize_state_avx2(3);
2169    &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
2170    &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
2171    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2172    mov \$8*32, $itr1
2173    lea 8*32($inp), $inp
2174    sub \$8*32, $inl
2175    jmp seal_avx2_hash
2176###############################################################################
2177seal_avx2_tail_512:\n";
2178    &prep_state_avx2(4); $code.="
21791:  \n";
2180        &poly_add("0($oup)");
2181        &poly_mul_mulx(); $code.="
2182        lea 2*8($oup), $oup
21832:  \n";
2184        &emit_body(20);
2185        &poly_add("0*8($oup)");
2186        &emit_body(20);
2187        &poly_stage1_mulx();
2188        &emit_body(20);
2189        &poly_stage2_mulx();
2190        &emit_body(20);
2191        &poly_stage3_mulx();
2192        &emit_body(20);
2193        &poly_reduce_stage();
2194        &emit_body(20);
2195        &poly_add("2*8($oup)");
2196        &emit_body(20);
2197        &poly_stage1_mulx();
2198        &emit_body(20);
2199        &poly_stage2_mulx();
2200        &emit_body(20);
2201        &poly_stage3_mulx();
2202        &emit_body(20);
2203        &poly_reduce_stage();
2204        foreach $l (@loop_body) {$code.=$l."\n";}
2205        @loop_body = split /\n/, $chacha_body; $code.="
2206        lea 4*8($oup), $oup
2207        dec $itr1
2208    jg 1b
2209        dec $itr2
2210    jge 2b\n";
2211    &finalize_state_avx2(4); $code.="
2212    vmovdqa $A0, $tmp_store\n";
2213    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
2214    vmovdqa $tmp_store, $A0\n";
2215    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
2216    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
2217    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2218    mov \$12*32, $itr1
2219    lea 12*32($inp), $inp
2220    sub \$12*32, $inl
2221    jmp seal_avx2_hash
2222################################################################################
2223seal_avx2_320:
2224    vmovdqa $A0, $A1
2225    vmovdqa $A0, $A2
2226    vmovdqa $B0, $B1
2227    vmovdqa $B0, $B2
2228    vmovdqa $C0, $C1
2229    vmovdqa $C0, $C2
2230    vpaddd .avx2_inc(%rip), $D0, $D1
2231    vpaddd .avx2_inc(%rip), $D1, $D2
2232    vmovdqa $B0, $T1
2233    vmovdqa $C0, $T2
2234    vmovdqa $D0, $ctr0_store
2235    vmovdqa $D1, $ctr1_store
2236    vmovdqa $D2, $ctr2_store
2237    mov \$10, $acc0
22381:  \n";
2239        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2240        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2241        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
2242        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2243        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2244        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
2245        dec $acc0
2246    jne 1b
2247    vpaddd .chacha20_consts(%rip), $A0, $A0
2248    vpaddd .chacha20_consts(%rip), $A1, $A1
2249    vpaddd .chacha20_consts(%rip), $A2, $A2
2250    vpaddd $T1, $B0, $B0
2251    vpaddd $T1, $B1, $B1
2252    vpaddd $T1, $B2, $B2
2253    vpaddd $T2, $C0, $C0
2254    vpaddd $T2, $C1, $C1
2255    vpaddd $T2, $C2, $C2
2256    vpaddd $ctr0_store, $D0, $D0
2257    vpaddd $ctr1_store, $D1, $D1
2258    vpaddd $ctr2_store, $D2, $D2
2259    vperm2i128 \$0x02, $A0, $B0, $T0
2260    # Clamp and store the key
2261    vpand .clamp(%rip), $T0, $T0
2262    vmovdqa $T0, $r_store
2263    # Stream for up to 320 bytes
2264    vperm2i128 \$0x13, $A0, $B0, $A0
2265    vperm2i128 \$0x13, $C0, $D0, $B0
2266    vperm2i128 \$0x02, $A1, $B1, $C0
2267    vperm2i128 \$0x02, $C1, $D1, $D0
2268    vperm2i128 \$0x13, $A1, $B1, $A1
2269    vperm2i128 \$0x13, $C1, $D1, $B1
2270    vperm2i128 \$0x02, $A2, $B2, $C1
2271    vperm2i128 \$0x02, $C2, $D2, $D1
2272    vperm2i128 \$0x13, $A2, $B2, $A2
2273    vperm2i128 \$0x13, $C2, $D2, $B2
2274    jmp seal_avx2_short
2275################################################################################
2276seal_avx2_192:
2277    vmovdqa $A0, $A1
2278    vmovdqa $A0, $A2
2279    vmovdqa $B0, $B1
2280    vmovdqa $B0, $B2
2281    vmovdqa $C0, $C1
2282    vmovdqa $C0, $C2
2283    vpaddd .avx2_inc(%rip), $D0, $D1
2284    vmovdqa $D0, $T2
2285    vmovdqa $D1, $T3
2286    mov \$10, $acc0
22871:  \n";
2288        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2289        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2290        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2291        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
2292        dec $acc0
2293    jne 1b
2294    vpaddd $A2, $A0, $A0
2295    vpaddd $A2, $A1, $A1
2296    vpaddd $B2, $B0, $B0
2297    vpaddd $B2, $B1, $B1
2298    vpaddd $C2, $C0, $C0
2299    vpaddd $C2, $C1, $C1
2300    vpaddd $T2, $D0, $D0
2301    vpaddd $T3, $D1, $D1
2302    vperm2i128 \$0x02, $A0, $B0, $T0
2303    # Clamp and store the key
2304    vpand .clamp(%rip), $T0, $T0
2305    vmovdqa $T0, $r_store
2306    # Stream for up to 192 bytes
2307    vperm2i128 \$0x13, $A0, $B0, $A0
2308    vperm2i128 \$0x13, $C0, $D0, $B0
2309    vperm2i128 \$0x02, $A1, $B1, $C0
2310    vperm2i128 \$0x02, $C1, $D1, $D0
2311    vperm2i128 \$0x13, $A1, $B1, $A1
2312    vperm2i128 \$0x13, $C1, $D1, $B1
2313seal_avx2_short:
2314    mov %r8, $itr2
2315    call poly_hash_ad_internal
2316    xor $itr1, $itr1
2317seal_avx2_hash:
2318        cmp \$16, $itr1
2319        jb seal_avx2_short_loop\n";
2320        &poly_add("0($oup)");
2321        &poly_mul(); $code.="
2322        sub \$16, $itr1
2323        add \$16, $oup
2324    jmp seal_avx2_hash
2325seal_avx2_short_loop:
2326        cmp \$32, $inl
2327        jb seal_avx2_short_tail
2328        sub \$32, $inl
2329        # Encrypt
2330        vpxor ($inp), $A0, $A0
2331        vmovdqu $A0, ($oup)
2332        lea 1*32($inp), $inp
2333        # Load + hash\n";
2334        &poly_add("0*8($oup)");
2335        &poly_mul();
2336        &poly_add("2*8($oup)");
2337        &poly_mul(); $code.="
2338        lea 1*32($oup), $oup
2339        # Shift stream
2340        vmovdqa $B0, $A0
2341        vmovdqa $C0, $B0
2342        vmovdqa $D0, $C0
2343        vmovdqa $A1, $D0
2344        vmovdqa $B1, $A1
2345        vmovdqa $C1, $B1
2346        vmovdqa $D1, $C1
2347        vmovdqa $A2, $D1
2348        vmovdqa $B2, $A2
2349    jmp seal_avx2_short_loop
2350seal_avx2_short_tail:
2351    cmp \$16, $inl
2352    jb 1f
2353    sub \$16, $inl
2354    vpxor ($inp), $A0x, $A3x
2355    vmovdqu $A3x, ($oup)
2356    lea 1*16($inp), $inp\n";
2357    &poly_add("0*8($oup)");
2358    &poly_mul(); $code.="
2359    lea 1*16($oup), $oup
2360    vextracti128 \$1, $A0, $A0x
23611:
2362    vzeroupper
2363    jmp seal_sse_tail_16
2364.cfi_endproc
2365";
2366}
2367
2368if (!$win64) {
2369  $code =~ s/\`([^\`]*)\`/eval $1/gem;
2370  print $code;
2371} else {
2372  print <<___;
2373.globl dummy_chacha20_poly1305_asm
2374.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
2375dummy_chacha20_poly1305_asm:
2376    ret
2377___
2378}
2379
2380close STDOUT;
2381