1#!/usr/bin/env perl 2 3# Copyright (c) 2015, CloudFlare Ltd. 4# 5# Permission to use, copy, modify, and/or distribute this software for any 6# purpose with or without fee is hereby granted, provided that the above 7# copyright notice and this permission notice appear in all copies. 8# 9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 16 17############################################################################## 18# # 19# Author: Vlad Krasnov # 20# # 21############################################################################## 22 23$flavour = shift; 24$output = shift; 25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 26 27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 28 29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 32die "can't locate x86_64-xlate.pl"; 33 34open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 35*STDOUT=*OUT; 36 37$avx = 2; 38 39$code.=<<___; 40.text 41.extern OPENSSL_ia32cap_P 42 43chacha20_poly1305_constants: 44 45.align 64 46.chacha20_consts: 47.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 48.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 49.rol8: 50.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 51.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 52.rol16: 53.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 54.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 55.avx2_init: 56.long 0,0,0,0 57.sse_inc: 58.long 1,0,0,0 59.avx2_inc: 60.long 2,0,0,0,2,0,0,0 61.clamp: 62.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 63.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF 64.align 16 65.and_masks: 66.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 67.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 68.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 69.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 70.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 71.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 72.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 73.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 74.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 75.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 76.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 77.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 78.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 79.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 80.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 81___ 82 83my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8"); 84my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); 85my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); 86my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); 87my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 88my $r_store="0*16(%rbp)"; 89my $s_store="1*16(%rbp)"; 90my $len_store="2*16(%rbp)"; 91my $state1_store="3*16(%rbp)"; 92my $state2_store="4*16(%rbp)"; 93my $tmp_store="5*16(%rbp)"; 94my $ctr0_store="6*16(%rbp)"; 95my $ctr1_store="7*16(%rbp)"; 96my $ctr2_store="8*16(%rbp)"; 97my $ctr3_store="9*16(%rbp)"; 98 99sub chacha_qr { 100my ($a,$b,$c,$d,$t,$dir)=@_; 101$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); 102$code.="paddd $b, $a 103 pxor $a, $d 104 pshufb .rol16(%rip), $d 105 paddd $d, $c 106 pxor $c, $b 107 movdqa $b, $t 108 pslld \$12, $t 109 psrld \$20, $b 110 pxor $t, $b 111 paddd $b, $a 112 pxor $a, $d 113 pshufb .rol8(%rip), $d 114 paddd $d, $c 115 pxor $c, $b 116 movdqa $b, $t 117 pslld \$7, $t 118 psrld \$25, $b 119 pxor $t, $b\n"; 120$code.="palignr \$4, $b, $b 121 palignr \$8, $c, $c 122 palignr \$12, $d, $d\n" if ($dir =~ /left/); 123$code.="palignr \$12, $b, $b 124 palignr \$8, $c, $c 125 palignr \$4, $d, $d\n" if ($dir =~ /right/); 126$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); 127} 128 129sub poly_add { 130my ($src)=@_; 131$code.="add $src, $acc0 132 adc 8+$src, $acc1 133 adc \$1, $acc2\n"; 134} 135 136sub poly_stage1 { 137$code.="mov 0+$r_store, %rax 138 mov %rax, $t2 139 mul $acc0 140 mov %rax, $t0 141 mov %rdx, $t1 142 mov 0+$r_store, %rax 143 mul $acc1 144 imulq $acc2, $t2 145 add %rax, $t1 146 adc %rdx, $t2\n"; 147} 148 149sub poly_stage2 { 150$code.="mov 8+$r_store, %rax 151 mov %rax, $t3 152 mul $acc0 153 add %rax, $t1 154 adc \$0, %rdx 155 mov %rdx, $acc0 156 mov 8+$r_store, %rax 157 mul $acc1 158 add %rax, $t2 159 adc \$0, %rdx\n"; 160} 161 162sub poly_stage3 { 163$code.="imulq $acc2, $t3 164 add $acc0, $t2 165 adc %rdx, $t3\n"; 166} 167 168sub poly_reduce_stage { 169$code.="mov $t0, $acc0 170 mov $t1, $acc1 171 mov $t2, $acc2 172 and \$3, $acc2 173 mov $t2, $t0 174 and \$-4, $t0 175 mov $t3, $t1 176 shrd \$2, $t3, $t2 177 shr \$2, $t3 178 add $t0, $acc0 179 adc $t1, $acc1 180 adc \$0, $acc2 181 add $t2, $acc0 182 adc $t3, $acc1 183 adc \$0, $acc2\n"; 184} 185 186sub poly_mul { 187 &poly_stage1(); 188 &poly_stage2(); 189 &poly_stage3(); 190 &poly_reduce_stage(); 191} 192 193sub prep_state { 194my ($n)=@_; 195$code.="movdqa .chacha20_consts(%rip), $A0 196 movdqa $state1_store, $B0 197 movdqa $state2_store, $C0\n"; 198$code.="movdqa $A0, $A1 199 movdqa $B0, $B1 200 movdqa $C0, $C1\n" if ($n ge 2); 201$code.="movdqa $A0, $A2 202 movdqa $B0, $B2 203 movdqa $C0, $C2\n" if ($n ge 3); 204$code.="movdqa $A0, $A3 205 movdqa $B0, $B3 206 movdqa $C0, $C3\n" if ($n ge 4); 207$code.="movdqa $ctr0_store, $D0 208 paddd .sse_inc(%rip), $D0 209 movdqa $D0, $ctr0_store\n" if ($n eq 1); 210$code.="movdqa $ctr0_store, $D1 211 paddd .sse_inc(%rip), $D1 212 movdqa $D1, $D0 213 paddd .sse_inc(%rip), $D0 214 movdqa $D0, $ctr0_store 215 movdqa $D1, $ctr1_store\n" if ($n eq 2); 216$code.="movdqa $ctr0_store, $D2 217 paddd .sse_inc(%rip), $D2 218 movdqa $D2, $D1 219 paddd .sse_inc(%rip), $D1 220 movdqa $D1, $D0 221 paddd .sse_inc(%rip), $D0 222 movdqa $D0, $ctr0_store 223 movdqa $D1, $ctr1_store 224 movdqa $D2, $ctr2_store\n" if ($n eq 3); 225$code.="movdqa $ctr0_store, $D3 226 paddd .sse_inc(%rip), $D3 227 movdqa $D3, $D2 228 paddd .sse_inc(%rip), $D2 229 movdqa $D2, $D1 230 paddd .sse_inc(%rip), $D1 231 movdqa $D1, $D0 232 paddd .sse_inc(%rip), $D0 233 movdqa $D0, $ctr0_store 234 movdqa $D1, $ctr1_store 235 movdqa $D2, $ctr2_store 236 movdqa $D3, $ctr3_store\n" if ($n eq 4); 237} 238 239sub finalize_state { 240my ($n)=@_; 241$code.="paddd .chacha20_consts(%rip), $A3 242 paddd $state1_store, $B3 243 paddd $state2_store, $C3 244 paddd $ctr3_store, $D3\n" if ($n eq 4); 245$code.="paddd .chacha20_consts(%rip), $A2 246 paddd $state1_store, $B2 247 paddd $state2_store, $C2 248 paddd $ctr2_store, $D2\n" if ($n ge 3); 249$code.="paddd .chacha20_consts(%rip), $A1 250 paddd $state1_store, $B1 251 paddd $state2_store, $C1 252 paddd $ctr1_store, $D1\n" if ($n ge 2); 253$code.="paddd .chacha20_consts(%rip), $A0 254 paddd $state1_store, $B0 255 paddd $state2_store, $C0 256 paddd $ctr0_store, $D0\n"; 257} 258 259sub xor_stream { 260my ($A, $B, $C, $D, $offset)=@_; 261$code.="movdqu 0*16 + $offset($inp), $A3 262 movdqu 1*16 + $offset($inp), $B3 263 movdqu 2*16 + $offset($inp), $C3 264 movdqu 3*16 + $offset($inp), $D3 265 pxor $A3, $A 266 pxor $B3, $B 267 pxor $C3, $C 268 pxor $D, $D3 269 movdqu $A, 0*16 + $offset($oup) 270 movdqu $B, 1*16 + $offset($oup) 271 movdqu $C, 2*16 + $offset($oup) 272 movdqu $D3, 3*16 + $offset($oup)\n"; 273} 274 275sub xor_stream_using_temp { 276my ($A, $B, $C, $D, $offset, $temp)=@_; 277$code.="movdqa $temp, $tmp_store 278 movdqu 0*16 + $offset($inp), $temp 279 pxor $A, $temp 280 movdqu $temp, 0*16 + $offset($oup) 281 movdqu 1*16 + $offset($inp), $temp 282 pxor $B, $temp 283 movdqu $temp, 1*16 + $offset($oup) 284 movdqu 2*16 + $offset($inp), $temp 285 pxor $C, $temp 286 movdqu $temp, 2*16 + $offset($oup) 287 movdqu 3*16 + $offset($inp), $temp 288 pxor $D, $temp 289 movdqu $temp, 3*16 + $offset($oup)\n"; 290} 291 292sub gen_chacha_round { 293my ($rot1, $rot2, $shift)=@_; 294my $round=""; 295$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); 296$round.="movdqa $rot2, $C0 297 paddd $B3, $A3 298 paddd $B2, $A2 299 paddd $B1, $A1 300 paddd $B0, $A0 301 pxor $A3, $D3 302 pxor $A2, $D2 303 pxor $A1, $D1 304 pxor $A0, $D0 305 pshufb $C0, $D3 306 pshufb $C0, $D2 307 pshufb $C0, $D1 308 pshufb $C0, $D0 309 movdqa $tmp_store, $C0 310 paddd $D3, $C3 311 paddd $D2, $C2 312 paddd $D1, $C1 313 paddd $D0, $C0 314 pxor $C3, $B3 315 pxor $C2, $B2 316 pxor $C1, $B1 317 pxor $C0, $B0 318 movdqa $C0, $tmp_store 319 movdqa $B3, $C0 320 psrld \$$rot1, $C0 321 pslld \$32-$rot1, $B3 322 pxor $C0, $B3 323 movdqa $B2, $C0 324 psrld \$$rot1, $C0 325 pslld \$32-$rot1, $B2 326 pxor $C0, $B2 327 movdqa $B1, $C0 328 psrld \$$rot1, $C0 329 pslld \$32-$rot1, $B1 330 pxor $C0, $B1 331 movdqa $B0, $C0 332 psrld \$$rot1, $C0 333 pslld \$32-$rot1, $B0 334 pxor $C0, $B0\n"; 335($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 336($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 337$round.="movdqa $tmp_store, $C0 338 palignr \$$s1, $B3, $B3 339 palignr \$$s2, $C3, $C3 340 palignr \$$s3, $D3, $D3 341 palignr \$$s1, $B2, $B2 342 palignr \$$s2, $C2, $C2 343 palignr \$$s3, $D2, $D2 344 palignr \$$s1, $B1, $B1 345 palignr \$$s2, $C1, $C1 346 palignr \$$s3, $D1, $D1 347 palignr \$$s1, $B0, $B0 348 palignr \$$s2, $C0, $C0 349 palignr \$$s3, $D0, $D0\n" 350if (($shift =~ /left/) || ($shift =~ /right/)); 351return $round; 352}; 353 354$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") . 355 &gen_chacha_round(25, ".rol8(%rip)", "left") . 356 &gen_chacha_round(20, ".rol16(%rip)") . 357 &gen_chacha_round(25, ".rol8(%rip)", "right"); 358 359my @loop_body = split /\n/, $chacha_body; 360 361sub emit_body { 362my ($n)=@_; 363 for (my $i=0; $i < $n; $i++) { 364 $code=$code.shift(@loop_body)."\n"; 365 }; 366} 367 368{ 369################################################################################ 370# void poly_hash_ad_internal(); 371$code.=" 372.type poly_hash_ad_internal,\@function,2 373.align 64 374poly_hash_ad_internal: 375.cfi_startproc 376 xor $acc0, $acc0 377 xor $acc1, $acc1 378 xor $acc2, $acc2 379 cmp \$13, $itr2 380 jne hash_ad_loop 381poly_fast_tls_ad: 382 # Special treatment for the TLS case of 13 bytes 383 mov ($adp), $acc0 384 mov 5($adp), $acc1 385 shr \$24, $acc1 386 mov \$1, $acc2\n"; 387 &poly_mul(); $code.=" 388 ret 389hash_ad_loop: 390 # Hash in 16 byte chunk 391 cmp \$16, $itr2 392 jb hash_ad_tail\n"; 393 &poly_add("0($adp)"); 394 &poly_mul(); $code.=" 395 lea 1*16($adp), $adp 396 sub \$16, $itr2 397 jmp hash_ad_loop 398hash_ad_tail: 399 cmp \$0, $itr2 400 je 1f 401 # Hash last < 16 byte tail 402 xor $t0, $t0 403 xor $t1, $t1 404 xor $t2, $t2 405 add $itr2, $adp 406hash_ad_tail_loop: 407 shld \$8, $t0, $t1 408 shl \$8, $t0 409 movzxb -1($adp), $t2 410 xor $t2, $t0 411 dec $adp 412 dec $itr2 413 jne hash_ad_tail_loop 414 415 add $t0, $acc0 416 adc $t1, $acc1 417 adc \$1, $acc2\n"; 418 &poly_mul(); $code.=" 419 # Finished AD 4201: 421 ret 422.cfi_endproc 423.size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; 424} 425 426{ 427################################################################################ 428# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); 429$code.=" 430.globl chacha20_poly1305_open 431.type chacha20_poly1305_open,\@function,2 432.align 64 433chacha20_poly1305_open: 434.cfi_startproc 435 push %rbp 436.cfi_adjust_cfa_offset 8 437 push %rbx 438.cfi_adjust_cfa_offset 8 439 push %r12 440.cfi_adjust_cfa_offset 8 441 push %r13 442.cfi_adjust_cfa_offset 8 443 push %r14 444.cfi_adjust_cfa_offset 8 445 push %r15 446.cfi_adjust_cfa_offset 8 447 # We write the calculated authenticator back to keyp at the end, so save 448 # the pointer on the stack too. 449 push $keyp 450.cfi_adjust_cfa_offset 8 451 sub \$288 + 32, %rsp 452.cfi_adjust_cfa_offset 288 + 32 453.cfi_offset rbp, -16 454.cfi_offset rbx, -24 455.cfi_offset r12, -32 456.cfi_offset r13, -40 457.cfi_offset r14, -48 458.cfi_offset r15, -56 459 lea 32(%rsp), %rbp 460 and \$-32, %rbp 461 mov %rdx, 8+$len_store 462 mov %r8, 0+$len_store 463 mov %rdx, $inl\n"; $code.=" 464 mov OPENSSL_ia32cap_P+8(%rip), %eax 465 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 466 xor \$`(1<<5) + (1<<8)`, %eax 467 jz chacha20_poly1305_open_avx2\n" if ($avx>1); 468$code.=" 4691: 470 cmp \$128, $inl 471 jbe open_sse_128 472 # For long buffers, prepare the poly key first 473 movdqa .chacha20_consts(%rip), $A0 474 movdqu 0*16($keyp), $B0 475 movdqu 1*16($keyp), $C0 476 movdqu 2*16($keyp), $D0 477 movdqa $D0, $T1 478 # Store on stack, to free keyp 479 movdqa $B0, $state1_store 480 movdqa $C0, $state2_store 481 movdqa $D0, $ctr0_store 482 mov \$10, $acc0 4831: \n"; 484 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 485 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 486 dec $acc0 487 jne 1b 488 # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 489 paddd .chacha20_consts(%rip), $A0 490 paddd $state1_store, $B0 491 # Clamp and store the key 492 pand .clamp(%rip), $A0 493 movdqa $A0, $r_store 494 movdqa $B0, $s_store 495 # Hash 496 mov %r8, $itr2 497 call poly_hash_ad_internal 498open_sse_main_loop: 499 cmp \$16*16, $inl 500 jb 2f 501 # Load state, increment counter blocks\n"; 502 &prep_state(4); $code.=" 503 # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we 504 # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 505 mov \$4, $itr1 506 mov $inp, $itr2 5071: \n"; 508 &emit_body(20); 509 &poly_add("0($itr2)"); $code.=" 510 lea 2*8($itr2), $itr2\n"; 511 &emit_body(20); 512 &poly_stage1(); 513 &emit_body(20); 514 &poly_stage2(); 515 &emit_body(20); 516 &poly_stage3(); 517 &emit_body(20); 518 &poly_reduce_stage(); 519 foreach $l (@loop_body) {$code.=$l."\n";} 520 @loop_body = split /\n/, $chacha_body; $code.=" 521 dec $itr1 522 jge 1b\n"; 523 &poly_add("0($itr2)"); 524 &poly_mul(); $code.=" 525 lea 2*8($itr2), $itr2 526 cmp \$-6, $itr1 527 jg 1b\n"; 528 &finalize_state(4); 529 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 530 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 531 &xor_stream($A1, $B1, $C1, $D1, "8*16"); 532 &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" 533 lea 16*16($inp), $inp 534 lea 16*16($oup), $oup 535 sub \$16*16, $inl 536 jmp open_sse_main_loop 5372: 538 # Handle the various tail sizes efficiently 539 test $inl, $inl 540 jz open_sse_finalize 541 cmp \$4*16, $inl 542 ja 3f\n"; 543############################################################################### 544 # At most 64 bytes are left 545 &prep_state(1); $code.=" 546 xor $itr2, $itr2 547 mov $inl, $itr1 548 cmp \$16, $itr1 549 jb 2f 5501: \n"; 551 &poly_add("0($inp, $itr2)"); 552 &poly_mul(); $code.=" 553 sub \$16, $itr1 5542: 555 add \$16, $itr2\n"; 556 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 557 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 558 cmp \$16, $itr1 559 jae 1b 560 cmp \$10*16, $itr2 561 jne 2b\n"; 562 &finalize_state(1); $code.=" 563 jmp open_sse_tail_64_dec_loop 5643: 565 cmp \$8*16, $inl 566 ja 3f\n"; 567############################################################################### 568 # 65 - 128 bytes are left 569 &prep_state(2); $code.=" 570 mov $inl, $itr1 571 and \$-16, $itr1 572 xor $itr2, $itr2 5731: \n"; 574 &poly_add("0($inp, $itr2)"); 575 &poly_mul(); $code.=" 5762: 577 add \$16, $itr2\n"; 578 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 579 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 580 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 581 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" 582 cmp $itr1, $itr2 583 jb 1b 584 cmp \$10*16, $itr2 585 jne 2b\n"; 586 &finalize_state(2); 587 &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" 588 sub \$4*16, $inl 589 lea 4*16($inp), $inp 590 lea 4*16($oup), $oup 591 jmp open_sse_tail_64_dec_loop 5923: 593 cmp \$12*16, $inl 594 ja 3f\n"; 595############################################################################### 596 # 129 - 192 bytes are left 597 &prep_state(3); $code.=" 598 mov $inl, $itr1 599 mov \$10*16, $itr2 600 cmp \$10*16, $itr1 601 cmovg $itr2, $itr1 602 and \$-16, $itr1 603 xor $itr2, $itr2 6041: \n"; 605 &poly_add("0($inp, $itr2)"); 606 &poly_mul(); $code.=" 6072: 608 add \$16, $itr2\n"; 609 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 610 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 611 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 612 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 613 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 614 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 615 cmp $itr1, $itr2 616 jb 1b 617 cmp \$10*16, $itr2 618 jne 2b 619 cmp \$11*16, $inl 620 jb 1f\n"; 621 &poly_add("10*16($inp)"); 622 &poly_mul(); $code.=" 623 cmp \$12*16, $inl 624 jb 1f\n"; 625 &poly_add("11*16($inp)"); 626 &poly_mul(); $code.=" 6271: \n"; 628 &finalize_state(3); 629 &xor_stream($A2, $B2, $C2, $D2, "0*16"); 630 &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" 631 sub \$8*16, $inl 632 lea 8*16($inp), $inp 633 lea 8*16($oup), $oup 634 jmp open_sse_tail_64_dec_loop 6353: 636###############################################################################\n"; 637 # 193 - 255 bytes are left 638 &prep_state(4); $code.=" 639 xor $itr2, $itr2 6401: \n"; 641 &poly_add("0($inp, $itr2)"); 642 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); 643 &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); 644 &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); 645 &poly_stage1(); 646 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); 647 &poly_stage2(); 648 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); 649 &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); 650 &poly_stage3(); 651 &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); 652 &poly_reduce_stage(); 653 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" 654 add \$16, $itr2 655 cmp \$10*16, $itr2 656 jb 1b 657 mov $inl, $itr1 658 and \$-16, $itr1 6591: \n"; 660 &poly_add("0($inp, $itr2)"); 661 &poly_mul(); $code.=" 662 add \$16, $itr2 663 cmp $itr1, $itr2 664 jb 1b\n"; 665 &finalize_state(4); 666 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 667 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 668 &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" 669 movdqa $tmp_store, $D0 670 sub \$12*16, $inl 671 lea 12*16($inp), $inp 672 lea 12*16($oup), $oup 673############################################################################### 674 # Decrypt the remaining data, 16B at a time, using existing stream 675open_sse_tail_64_dec_loop: 676 cmp \$16, $inl 677 jb 1f 678 sub \$16, $inl 679 movdqu ($inp), $T0 680 pxor $T0, $A0 681 movdqu $A0, ($oup) 682 lea 16($inp), $inp 683 lea 16($oup), $oup 684 movdqa $B0, $A0 685 movdqa $C0, $B0 686 movdqa $D0, $C0 687 jmp open_sse_tail_64_dec_loop 6881: 689 movdqa $A0, $A1 690 691 # Decrypt up to 16 bytes at the end. 692open_sse_tail_16: 693 test $inl, $inl 694 jz open_sse_finalize 695 696 # Read the final bytes into $T0. They need to be read in reverse order so 697 # that they end up in the correct order in $T0. 698 pxor $T0, $T0 699 lea -1($inp, $inl), $inp 700 movq $inl, $itr2 7012: 702 pslldq \$1, $T0 703 pinsrb \$0, ($inp), $T0 704 sub \$1, $inp 705 sub \$1, $itr2 706 jnz 2b 707 7083: 709 movq $T0, $t0 710 pextrq \$1, $T0, $t1 711 # The final bytes of keystream are in $A1. 712 pxor $A1, $T0 713 714 # Copy the plaintext bytes out. 7152: 716 pextrb \$0, $T0, ($oup) 717 psrldq \$1, $T0 718 add \$1, $oup 719 sub \$1, $inl 720 jne 2b 721 722 add $t0, $acc0 723 adc $t1, $acc1 724 adc \$1, $acc2\n"; 725 &poly_mul(); $code.=" 726 727open_sse_finalize:\n"; 728 &poly_add($len_store); 729 &poly_mul(); $code.=" 730 # Final reduce 731 mov $acc0, $t0 732 mov $acc1, $t1 733 mov $acc2, $t2 734 sub \$-5, $acc0 735 sbb \$-1, $acc1 736 sbb \$3, $acc2 737 cmovc $t0, $acc0 738 cmovc $t1, $acc1 739 cmovc $t2, $acc2 740 # Add in s part of the key 741 add 0+$s_store, $acc0 742 adc 8+$s_store, $acc1 743 744 add \$288 + 32, %rsp 745.cfi_adjust_cfa_offset -(288 + 32) 746 pop $keyp 747.cfi_adjust_cfa_offset -8 748 movq $acc0, ($keyp) 749 movq $acc1, 8($keyp) 750 751 pop %r15 752.cfi_adjust_cfa_offset -8 753 pop %r14 754.cfi_adjust_cfa_offset -8 755 pop %r13 756.cfi_adjust_cfa_offset -8 757 pop %r12 758.cfi_adjust_cfa_offset -8 759 pop %rbx 760.cfi_adjust_cfa_offset -8 761 pop %rbp 762.cfi_adjust_cfa_offset -8 763 ret 764.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 765############################################################################### 766open_sse_128: 767 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 768 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 769 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 770 movdqu 2*16($keyp), $D0 771 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 772 movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2 773 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 774 mov \$10, $acc0 7751: \n"; 776 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 777 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 778 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 779 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 780 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 781 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 782 dec $acc0 783 jnz 1b 784 paddd .chacha20_consts(%rip), $A0 785 paddd .chacha20_consts(%rip), $A1 786 paddd .chacha20_consts(%rip), $A2 787 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 788 paddd $T2, $C1\npaddd $T2, $C2 789 paddd $T3, $D1 790 paddd .sse_inc(%rip), $T3 791 paddd $T3, $D2 792 # Clamp and store the key 793 pand .clamp(%rip), $A0 794 movdqa $A0, $r_store 795 movdqa $B0, $s_store 796 # Hash 797 mov %r8, $itr2 798 call poly_hash_ad_internal 7991: 800 cmp \$16, $inl 801 jb open_sse_tail_16 802 sub \$16, $inl\n"; 803 # Load for hashing 804 &poly_add("0*8($inp)"); $code.=" 805 # Load for decryption 806 movdqu 0*16($inp), $T0 807 pxor $T0, $A1 808 movdqu $A1, 0*16($oup) 809 lea 1*16($inp), $inp 810 lea 1*16($oup), $oup\n"; 811 &poly_mul(); $code.=" 812 # Shift the stream left 813 movdqa $B1, $A1 814 movdqa $C1, $B1 815 movdqa $D1, $C1 816 movdqa $A2, $D1 817 movdqa $B2, $A2 818 movdqa $C2, $B2 819 movdqa $D2, $C2 820 jmp 1b 821 jmp open_sse_tail_16 822.size chacha20_poly1305_open, .-chacha20_poly1305_open 823.cfi_endproc 824 825################################################################################ 826################################################################################ 827# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); 828.globl chacha20_poly1305_seal 829.type chacha20_poly1305_seal,\@function,2 830.align 64 831chacha20_poly1305_seal: 832.cfi_startproc 833 push %rbp 834.cfi_adjust_cfa_offset 8 835 push %rbx 836.cfi_adjust_cfa_offset 8 837 push %r12 838.cfi_adjust_cfa_offset 8 839 push %r13 840.cfi_adjust_cfa_offset 8 841 push %r14 842.cfi_adjust_cfa_offset 8 843 push %r15 844.cfi_adjust_cfa_offset 8 845 # We write the calculated authenticator back to keyp at the end, so save 846 # the pointer on the stack too. 847 push $keyp 848.cfi_adjust_cfa_offset 8 849 sub \$288 + 32, %rsp 850.cfi_adjust_cfa_offset 288 + 32 851.cfi_offset rbp, -16 852.cfi_offset rbx, -24 853.cfi_offset r12, -32 854.cfi_offset r13, -40 855.cfi_offset r14, -48 856.cfi_offset r15, -56 857 lea 32(%rsp), %rbp 858 and \$-32, %rbp 859 mov %rdx, 8+$len_store 860 mov %r8, 0+$len_store 861 mov %rdx, $inl\n"; $code.=" 862 mov OPENSSL_ia32cap_P+8(%rip), %eax 863 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 864 xor \$`(1<<5) + (1<<8)`, %eax 865 jz chacha20_poly1305_seal_avx2\n" if ($avx>1); 866$code.=" 867 cmp \$128, $inl 868 jbe seal_sse_128 869 # For longer buffers, prepare the poly key + some stream 870 movdqa .chacha20_consts(%rip), $A0 871 movdqu 0*16($keyp), $B0 872 movdqu 1*16($keyp), $C0 873 movdqu 2*16($keyp), $D0 874 movdqa $A0, $A1 875 movdqa $A0, $A2 876 movdqa $A0, $A3 877 movdqa $B0, $B1 878 movdqa $B0, $B2 879 movdqa $B0, $B3 880 movdqa $C0, $C1 881 movdqa $C0, $C2 882 movdqa $C0, $C3 883 movdqa $D0, $D3 884 paddd .sse_inc(%rip), $D0 885 movdqa $D0, $D2 886 paddd .sse_inc(%rip), $D0 887 movdqa $D0, $D1 888 paddd .sse_inc(%rip), $D0 889 # Store on stack 890 movdqa $B0, $state1_store 891 movdqa $C0, $state2_store 892 movdqa $D0, $ctr0_store 893 movdqa $D1, $ctr1_store 894 movdqa $D2, $ctr2_store 895 movdqa $D3, $ctr3_store 896 mov \$10, $acc0 8971: \n"; 898 foreach $l (@loop_body) {$code.=$l."\n";} 899 @loop_body = split /\n/, $chacha_body; $code.=" 900 dec $acc0 901 jnz 1b\n"; 902 &finalize_state(4); $code.=" 903 # Clamp and store the key 904 pand .clamp(%rip), $A3 905 movdqa $A3, $r_store 906 movdqa $B3, $s_store 907 # Hash 908 mov %r8, $itr2 909 call poly_hash_ad_internal\n"; 910 &xor_stream($A2,$B2,$C2,$D2,"0*16"); 911 &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" 912 cmp \$12*16, $inl 913 ja 1f 914 mov \$8*16, $itr1 915 sub \$8*16, $inl 916 lea 8*16($inp), $inp 917 jmp seal_sse_128_seal_hash 9181: \n"; 919 &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" 920 mov \$12*16, $itr1 921 sub \$12*16, $inl 922 lea 12*16($inp), $inp 923 mov \$2, $itr1 924 mov \$8, $itr2 925 cmp \$4*16, $inl 926 jbe seal_sse_tail_64 927 cmp \$8*16, $inl 928 jbe seal_sse_tail_128 929 cmp \$12*16, $inl 930 jbe seal_sse_tail_192 931 9321: \n"; 933 # The main loop 934 &prep_state(4); $code.=" 9352: \n"; 936 &emit_body(20); 937 &poly_add("0($oup)"); 938 &emit_body(20); 939 &poly_stage1(); 940 &emit_body(20); 941 &poly_stage2(); 942 &emit_body(20); 943 &poly_stage3(); 944 &emit_body(20); 945 &poly_reduce_stage(); 946 foreach $l (@loop_body) {$code.=$l."\n";} 947 @loop_body = split /\n/, $chacha_body; $code.=" 948 lea 16($oup), $oup 949 dec $itr2 950 jge 2b\n"; 951 &poly_add("0*8($oup)"); 952 &poly_mul(); $code.=" 953 lea 16($oup), $oup 954 dec $itr1 955 jg 2b\n"; 956 957 &finalize_state(4);$code.=" 958 movdqa $D2, $tmp_store\n"; 959 &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" 960 movdqa $tmp_store, $D2\n"; 961 &xor_stream($A2,$B2,$C2,$D2, 4*16); 962 &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" 963 cmp \$16*16, $inl 964 ja 3f 965 966 mov \$12*16, $itr1 967 sub \$12*16, $inl 968 lea 12*16($inp), $inp 969 jmp seal_sse_128_seal_hash 9703: \n"; 971 &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" 972 lea 16*16($inp), $inp 973 sub \$16*16, $inl 974 mov \$6, $itr1 975 mov \$4, $itr2 976 cmp \$12*16, $inl 977 jg 1b 978 mov $inl, $itr1 979 test $inl, $inl 980 je seal_sse_128_seal_hash 981 mov \$6, $itr1 982 cmp \$4*16, $inl 983 jg 3f 984############################################################################### 985seal_sse_tail_64:\n"; 986 &prep_state(1); $code.=" 9871: \n"; 988 &poly_add("0($oup)"); 989 &poly_mul(); $code.=" 990 lea 16($oup), $oup 9912: \n"; 992 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 993 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 994 &poly_add("0($oup)"); 995 &poly_mul(); $code.=" 996 lea 16($oup), $oup 997 dec $itr1 998 jg 1b 999 dec $itr2 1000 jge 2b\n"; 1001 &finalize_state(1); $code.=" 1002 jmp seal_sse_128_seal 10033: 1004 cmp \$8*16, $inl 1005 jg 3f 1006############################################################################### 1007seal_sse_tail_128:\n"; 1008 &prep_state(2); $code.=" 10091: \n"; 1010 &poly_add("0($oup)"); 1011 &poly_mul(); $code.=" 1012 lea 16($oup), $oup 10132: \n"; 1014 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1015 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1016 &poly_add("0($oup)"); 1017 &poly_mul(); 1018 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1019 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1020 lea 16($oup), $oup 1021 dec $itr1 1022 jg 1b 1023 dec $itr2 1024 jge 2b\n"; 1025 &finalize_state(2); 1026 &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" 1027 mov \$4*16, $itr1 1028 sub \$4*16, $inl 1029 lea 4*16($inp), $inp 1030 jmp seal_sse_128_seal_hash 10313: 1032############################################################################### 1033seal_sse_tail_192:\n"; 1034 &prep_state(3); $code.=" 10351: \n"; 1036 &poly_add("0($oup)"); 1037 &poly_mul(); $code.=" 1038 lea 16($oup), $oup 10392: \n"; 1040 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1041 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1042 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1043 &poly_add("0($oup)"); 1044 &poly_mul(); 1045 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1046 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1047 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1048 lea 16($oup), $oup 1049 dec $itr1 1050 jg 1b 1051 dec $itr2 1052 jge 2b\n"; 1053 &finalize_state(3); 1054 &xor_stream($A2,$B2,$C2,$D2,0*16); 1055 &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" 1056 mov \$8*16, $itr1 1057 sub \$8*16, $inl 1058 lea 8*16($inp), $inp 1059############################################################################### 1060seal_sse_128_seal_hash: 1061 cmp \$16, $itr1 1062 jb seal_sse_128_seal\n"; 1063 &poly_add("0($oup)"); 1064 &poly_mul(); $code.=" 1065 sub \$16, $itr1 1066 lea 16($oup), $oup 1067 jmp seal_sse_128_seal_hash 1068 1069seal_sse_128_seal: 1070 cmp \$16, $inl 1071 jb seal_sse_tail_16 1072 sub \$16, $inl 1073 # Load for decryption 1074 movdqu 0*16($inp), $T0 1075 pxor $T0, $A0 1076 movdqu $A0, 0*16($oup) 1077 # Then hash 1078 add 0*8($oup), $acc0 1079 adc 1*8($oup), $acc1 1080 adc \$1, $acc2 1081 lea 1*16($inp), $inp 1082 lea 1*16($oup), $oup\n"; 1083 &poly_mul(); $code.=" 1084 # Shift the stream left 1085 movdqa $B0, $A0 1086 movdqa $C0, $B0 1087 movdqa $D0, $C0 1088 movdqa $A1, $D0 1089 movdqa $B1, $A1 1090 movdqa $C1, $B1 1091 movdqa $D1, $C1 1092 jmp seal_sse_128_seal 1093 1094seal_sse_tail_16: 1095 test $inl, $inl 1096 jz seal_sse_finalize 1097 # We can only load the PT one byte at a time to avoid buffer overread 1098 mov $inl, $itr2 1099 shl \$4, $itr2 1100 lea .and_masks(%rip), $t0 1101 mov $inl, $itr1 1102 lea -1($inp, $inl), $inp 1103 pxor $T3, $T3 11041: 1105 pslldq \$1, $T3 1106 pinsrb \$0, ($inp), $T3 1107 lea -1($inp), $inp 1108 dec $itr1 1109 jne 1b 1110 1111 # XOR the keystream with the plaintext. 1112 pxor $A0, $T3 1113 1114 # Write ciphertext out, byte-by-byte. 1115 movq $inl, $itr1 1116 movdqu $T3, $A0 11172: 1118 pextrb \$0, $A0, ($oup) 1119 psrldq \$1, $A0 1120 add \$1, $oup 1121 sub \$1, $itr1 1122 jnz 2b 1123 1124 pand -16($t0, $itr2), $T3 1125 movq $T3, $t0 1126 pextrq \$1, $T3, $t1 1127 add $t0, $acc0 1128 adc $t1, $acc1 1129 adc \$1, $acc2\n"; 1130 &poly_mul(); $code.=" 1131seal_sse_finalize:\n"; 1132 &poly_add($len_store); 1133 &poly_mul(); $code.=" 1134 # Final reduce 1135 mov $acc0, $t0 1136 mov $acc1, $t1 1137 mov $acc2, $t2 1138 sub \$-5, $acc0 1139 sbb \$-1, $acc1 1140 sbb \$3, $acc2 1141 cmovc $t0, $acc0 1142 cmovc $t1, $acc1 1143 cmovc $t2, $acc2 1144 # Add in s part of the key 1145 add 0+$s_store, $acc0 1146 adc 8+$s_store, $acc1 1147 1148 add \$288 + 32, %rsp 1149.cfi_adjust_cfa_offset -(288 + 32) 1150 pop $keyp 1151.cfi_adjust_cfa_offset -8 1152 mov $acc0, 0*8($keyp) 1153 mov $acc1, 1*8($keyp) 1154 1155 pop %r15 1156.cfi_adjust_cfa_offset -8 1157 pop %r14 1158.cfi_adjust_cfa_offset -8 1159 pop %r13 1160.cfi_adjust_cfa_offset -8 1161 pop %r12 1162.cfi_adjust_cfa_offset -8 1163 pop %rbx 1164.cfi_adjust_cfa_offset -8 1165 pop %rbp 1166.cfi_adjust_cfa_offset -8 1167 ret 1168.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 1169################################################################################ 1170seal_sse_128: 1171 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 1172 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 1173 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 1174 movdqu 2*16($keyp), $D2 1175 movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0 1176 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 1177 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 1178 mov \$10, $acc0 11791:\n"; 1180 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1181 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1182 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1183 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1184 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1185 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1186 dec $acc0 1187 jnz 1b 1188 paddd .chacha20_consts(%rip), $A0 1189 paddd .chacha20_consts(%rip), $A1 1190 paddd .chacha20_consts(%rip), $A2 1191 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 1192 paddd $T2, $C0\npaddd $T2, $C1 1193 paddd $T3, $D0 1194 paddd .sse_inc(%rip), $T3 1195 paddd $T3, $D1 1196 # Clamp and store the key 1197 pand .clamp(%rip), $A2 1198 movdqa $A2, $r_store 1199 movdqa $B2, $s_store 1200 # Hash 1201 mov %r8, $itr2 1202 call poly_hash_ad_internal 1203 jmp seal_sse_128_seal 1204.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n"; 1205} 1206 1207# There should have been a cfi_endproc at the end of that function, but the two 1208# following blocks of code are jumped to without a stack frame and the CFI 1209# context which they are used in happens to match the CFI context at the end of 1210# the previous function. So the CFI table is just extended to the end of them. 1211 1212if ($avx>1) { 1213 1214($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); 1215my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); 1216($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 1217$state1_store="2*32(%rbp)"; 1218$state2_store="3*32(%rbp)"; 1219$tmp_store="4*32(%rbp)"; 1220$ctr0_store="5*32(%rbp)"; 1221$ctr1_store="6*32(%rbp)"; 1222$ctr2_store="7*32(%rbp)"; 1223$ctr3_store="8*32(%rbp)"; 1224 1225sub chacha_qr_avx2 { 1226my ($a,$b,$c,$d,$t,$dir)=@_; 1227$code.=<<___ if ($dir =~ /store/); 1228 vmovdqa $t, $tmp_store 1229___ 1230$code.=<<___; 1231 vpaddd $b, $a, $a 1232 vpxor $a, $d, $d 1233 vpshufb .rol16(%rip), $d, $d 1234 vpaddd $d, $c, $c 1235 vpxor $c, $b, $b 1236 vpsrld \$20, $b, $t 1237 vpslld \$12, $b, $b 1238 vpxor $t, $b, $b 1239 vpaddd $b, $a, $a 1240 vpxor $a, $d, $d 1241 vpshufb .rol8(%rip), $d, $d 1242 vpaddd $d, $c, $c 1243 vpxor $c, $b, $b 1244 vpslld \$7, $b, $t 1245 vpsrld \$25, $b, $b 1246 vpxor $t, $b, $b 1247___ 1248$code.=<<___ if ($dir =~ /left/); 1249 vpalignr \$12, $d, $d, $d 1250 vpalignr \$8, $c, $c, $c 1251 vpalignr \$4, $b, $b, $b 1252___ 1253$code.=<<___ if ($dir =~ /right/); 1254 vpalignr \$4, $d, $d, $d 1255 vpalignr \$8, $c, $c, $c 1256 vpalignr \$12, $b, $b, $b 1257___ 1258$code.=<<___ if ($dir =~ /load/); 1259 vmovdqa $tmp_store, $t 1260___ 1261} 1262 1263sub prep_state_avx2 { 1264my ($n)=@_; 1265$code.=<<___; 1266 vmovdqa .chacha20_consts(%rip), $A0 1267 vmovdqa $state1_store, $B0 1268 vmovdqa $state2_store, $C0 1269___ 1270$code.=<<___ if ($n ge 2); 1271 vmovdqa $A0, $A1 1272 vmovdqa $B0, $B1 1273 vmovdqa $C0, $C1 1274___ 1275$code.=<<___ if ($n ge 3); 1276 vmovdqa $A0, $A2 1277 vmovdqa $B0, $B2 1278 vmovdqa $C0, $C2 1279___ 1280$code.=<<___ if ($n ge 4); 1281 vmovdqa $A0, $A3 1282 vmovdqa $B0, $B3 1283 vmovdqa $C0, $C3 1284___ 1285$code.=<<___ if ($n eq 1); 1286 vmovdqa .avx2_inc(%rip), $D0 1287 vpaddd $ctr0_store, $D0, $D0 1288 vmovdqa $D0, $ctr0_store 1289___ 1290$code.=<<___ if ($n eq 2); 1291 vmovdqa .avx2_inc(%rip), $D0 1292 vpaddd $ctr0_store, $D0, $D1 1293 vpaddd $D1, $D0, $D0 1294 vmovdqa $D0, $ctr0_store 1295 vmovdqa $D1, $ctr1_store 1296___ 1297$code.=<<___ if ($n eq 3); 1298 vmovdqa .avx2_inc(%rip), $D0 1299 vpaddd $ctr0_store, $D0, $D2 1300 vpaddd $D2, $D0, $D1 1301 vpaddd $D1, $D0, $D0 1302 vmovdqa $D0, $ctr0_store 1303 vmovdqa $D1, $ctr1_store 1304 vmovdqa $D2, $ctr2_store 1305___ 1306$code.=<<___ if ($n eq 4); 1307 vmovdqa .avx2_inc(%rip), $D0 1308 vpaddd $ctr0_store, $D0, $D3 1309 vpaddd $D3, $D0, $D2 1310 vpaddd $D2, $D0, $D1 1311 vpaddd $D1, $D0, $D0 1312 vmovdqa $D3, $ctr3_store 1313 vmovdqa $D2, $ctr2_store 1314 vmovdqa $D1, $ctr1_store 1315 vmovdqa $D0, $ctr0_store 1316___ 1317} 1318 1319sub finalize_state_avx2 { 1320my ($n)=@_; 1321$code.=<<___ if ($n eq 4); 1322 vpaddd .chacha20_consts(%rip), $A3, $A3 1323 vpaddd $state1_store, $B3, $B3 1324 vpaddd $state2_store, $C3, $C3 1325 vpaddd $ctr3_store, $D3, $D3 1326___ 1327$code.=<<___ if ($n ge 3); 1328 vpaddd .chacha20_consts(%rip), $A2, $A2 1329 vpaddd $state1_store, $B2, $B2 1330 vpaddd $state2_store, $C2, $C2 1331 vpaddd $ctr2_store, $D2, $D2 1332___ 1333$code.=<<___ if ($n ge 2); 1334 vpaddd .chacha20_consts(%rip), $A1, $A1 1335 vpaddd $state1_store, $B1, $B1 1336 vpaddd $state2_store, $C1, $C1 1337 vpaddd $ctr1_store, $D1, $D1 1338___ 1339$code.=<<___; 1340 vpaddd .chacha20_consts(%rip), $A0, $A0 1341 vpaddd $state1_store, $B0, $B0 1342 vpaddd $state2_store, $C0, $C0 1343 vpaddd $ctr0_store, $D0, $D0 1344___ 1345} 1346 1347sub xor_stream_avx2 { 1348my ($A, $B, $C, $D, $offset, $hlp)=@_; 1349$code.=<<___; 1350 vperm2i128 \$0x02, $A, $B, $hlp 1351 vperm2i128 \$0x13, $A, $B, $B 1352 vperm2i128 \$0x02, $C, $D, $A 1353 vperm2i128 \$0x13, $C, $D, $C 1354 vpxor 0*32+$offset($inp), $hlp, $hlp 1355 vpxor 1*32+$offset($inp), $A, $A 1356 vpxor 2*32+$offset($inp), $B, $B 1357 vpxor 3*32+$offset($inp), $C, $C 1358 vmovdqu $hlp, 0*32+$offset($oup) 1359 vmovdqu $A, 1*32+$offset($oup) 1360 vmovdqu $B, 2*32+$offset($oup) 1361 vmovdqu $C, 3*32+$offset($oup) 1362___ 1363} 1364 1365sub finish_stream_avx2 { 1366my ($A, $B, $C, $D, $hlp)=@_; 1367$code.=<<___; 1368 vperm2i128 \$0x13, $A, $B, $hlp 1369 vperm2i128 \$0x02, $A, $B, $A 1370 vperm2i128 \$0x02, $C, $D, $B 1371 vperm2i128 \$0x13, $C, $D, $D 1372 vmovdqa $hlp, $C 1373___ 1374} 1375 1376sub poly_stage1_mulx { 1377$code.=<<___; 1378 mov 0+$r_store, %rdx 1379 mov %rdx, $t2 1380 mulx $acc0, $t0, $t1 1381 mulx $acc1, %rax, %rdx 1382 imulq $acc2, $t2 1383 add %rax, $t1 1384 adc %rdx, $t2 1385___ 1386} 1387 1388sub poly_stage2_mulx { 1389$code.=<<___; 1390 mov 8+$r_store, %rdx 1391 mulx $acc0, $acc0, %rax 1392 add $acc0, $t1 1393 mulx $acc1, $acc1, $t3 1394 adc $acc1, $t2 1395 adc \$0, $t3 1396 imulq $acc2, %rdx 1397___ 1398} 1399 1400sub poly_stage3_mulx { 1401$code.=<<___; 1402 add %rax, $t2 1403 adc %rdx, $t3 1404___ 1405} 1406 1407sub poly_mul_mulx { 1408 &poly_stage1_mulx(); 1409 &poly_stage2_mulx(); 1410 &poly_stage3_mulx(); 1411 &poly_reduce_stage(); 1412} 1413 1414sub gen_chacha_round_avx2 { 1415my ($rot1, $rot2, $shift)=@_; 1416my $round=""; 1417$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); 1418$round=$round ."vmovdqa $rot2, $C0 1419 vpaddd $B3, $A3, $A3 1420 vpaddd $B2, $A2, $A2 1421 vpaddd $B1, $A1, $A1 1422 vpaddd $B0, $A0, $A0 1423 vpxor $A3, $D3, $D3 1424 vpxor $A2, $D2, $D2 1425 vpxor $A1, $D1, $D1 1426 vpxor $A0, $D0, $D0 1427 vpshufb $C0, $D3, $D3 1428 vpshufb $C0, $D2, $D2 1429 vpshufb $C0, $D1, $D1 1430 vpshufb $C0, $D0, $D0 1431 vmovdqa $tmp_store, $C0 1432 vpaddd $D3, $C3, $C3 1433 vpaddd $D2, $C2, $C2 1434 vpaddd $D1, $C1, $C1 1435 vpaddd $D0, $C0, $C0 1436 vpxor $C3, $B3, $B3 1437 vpxor $C2, $B2, $B2 1438 vpxor $C1, $B1, $B1 1439 vpxor $C0, $B0, $B0 1440 vmovdqa $C0, $tmp_store 1441 vpsrld \$$rot1, $B3, $C0 1442 vpslld \$32-$rot1, $B3, $B3 1443 vpxor $C0, $B3, $B3 1444 vpsrld \$$rot1, $B2, $C0 1445 vpslld \$32-$rot1, $B2, $B2 1446 vpxor $C0, $B2, $B2 1447 vpsrld \$$rot1, $B1, $C0 1448 vpslld \$32-$rot1, $B1, $B1 1449 vpxor $C0, $B1, $B1 1450 vpsrld \$$rot1, $B0, $C0 1451 vpslld \$32-$rot1, $B0, $B0 1452 vpxor $C0, $B0, $B0\n"; 1453($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 1454($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 1455$round=$round ."vmovdqa $tmp_store, $C0 1456 vpalignr \$$s1, $B3, $B3, $B3 1457 vpalignr \$$s2, $C3, $C3, $C3 1458 vpalignr \$$s3, $D3, $D3, $D3 1459 vpalignr \$$s1, $B2, $B2, $B2 1460 vpalignr \$$s2, $C2, $C2, $C2 1461 vpalignr \$$s3, $D2, $D2, $D2 1462 vpalignr \$$s1, $B1, $B1, $B1 1463 vpalignr \$$s2, $C1, $C1, $C1 1464 vpalignr \$$s3, $D1, $D1, $D1 1465 vpalignr \$$s1, $B0, $B0, $B0 1466 vpalignr \$$s2, $C0, $C0, $C0 1467 vpalignr \$$s3, $D0, $D0, $D0\n" 1468if (($shift =~ /left/) || ($shift =~ /right/)); 1469return $round; 1470}; 1471 1472$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1473 &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") . 1474 &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1475 &gen_chacha_round_avx2(25, ".rol8(%rip)", "right"); 1476 1477@loop_body = split /\n/, $chacha_body; 1478 1479$code.=" 1480############################################################################### 1481.type chacha20_poly1305_open_avx2,\@function,2 1482.align 64 1483chacha20_poly1305_open_avx2: 1484 vzeroupper 1485 vmovdqa .chacha20_consts(%rip), $A0 1486 vbroadcasti128 0*16($keyp), $B0 1487 vbroadcasti128 1*16($keyp), $C0 1488 vbroadcasti128 2*16($keyp), $D0 1489 vpaddd .avx2_init(%rip), $D0, $D0 1490 cmp \$6*32, $inl 1491 jbe open_avx2_192 1492 cmp \$10*32, $inl 1493 jbe open_avx2_320 1494 1495 vmovdqa $B0, $state1_store 1496 vmovdqa $C0, $state2_store 1497 vmovdqa $D0, $ctr0_store 1498 mov \$10, $acc0 14991: \n"; 1500 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1501 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1502 dec $acc0 1503 jne 1b 1504 vpaddd .chacha20_consts(%rip), $A0, $A0 1505 vpaddd $state1_store, $B0, $B0 1506 vpaddd $state2_store, $C0, $C0 1507 vpaddd $ctr0_store, $D0, $D0 1508 1509 vperm2i128 \$0x02, $A0, $B0, $T0 1510 # Clamp and store key 1511 vpand .clamp(%rip), $T0, $T0 1512 vmovdqa $T0, $r_store 1513 # Stream for the first 64 bytes 1514 vperm2i128 \$0x13, $A0, $B0, $A0 1515 vperm2i128 \$0x13, $C0, $D0, $B0 1516 # Hash AD + first 64 bytes 1517 mov %r8, $itr2 1518 call poly_hash_ad_internal 1519 xor $itr1, $itr1 1520 # Hash first 64 bytes 15211: \n"; 1522 &poly_add("0($inp, $itr1)"); 1523 &poly_mul(); $code.=" 1524 add \$16, $itr1 1525 cmp \$2*32, $itr1 1526 jne 1b 1527 # Decrypt first 64 bytes 1528 vpxor 0*32($inp), $A0, $A0 1529 vpxor 1*32($inp), $B0, $B0 1530 vmovdqu $A0, 0*32($oup) 1531 vmovdqu $B0, 1*32($oup) 1532 lea 2*32($inp), $inp 1533 lea 2*32($oup), $oup 1534 sub \$2*32, $inl 15351: 1536 # Hash and decrypt 512 bytes each iteration 1537 cmp \$16*32, $inl 1538 jb 3f\n"; 1539 &prep_state_avx2(4); $code.=" 1540 xor $itr1, $itr1 15412: \n"; 1542 &poly_add("0*8($inp, $itr1)"); 1543 &emit_body(10); 1544 &poly_stage1_mulx(); 1545 &emit_body(9); 1546 &poly_stage2_mulx(); 1547 &emit_body(12); 1548 &poly_stage3_mulx(); 1549 &emit_body(10); 1550 &poly_reduce_stage(); 1551 &emit_body(9); 1552 &poly_add("2*8($inp, $itr1)"); 1553 &emit_body(8); 1554 &poly_stage1_mulx(); 1555 &emit_body(18); 1556 &poly_stage2_mulx(); 1557 &emit_body(18); 1558 &poly_stage3_mulx(); 1559 &emit_body(9); 1560 &poly_reduce_stage(); 1561 &emit_body(8); 1562 &poly_add("4*8($inp, $itr1)"); $code.=" 1563 lea 6*8($itr1), $itr1\n"; 1564 &emit_body(18); 1565 &poly_stage1_mulx(); 1566 &emit_body(8); 1567 &poly_stage2_mulx(); 1568 &emit_body(8); 1569 &poly_stage3_mulx(); 1570 &emit_body(18); 1571 &poly_reduce_stage(); 1572 foreach $l (@loop_body) {$code.=$l."\n";} 1573 @loop_body = split /\n/, $chacha_body; $code.=" 1574 cmp \$10*6*8, $itr1 1575 jne 2b\n"; 1576 &finalize_state_avx2(4); $code.=" 1577 vmovdqa $A0, $tmp_store\n"; 1578 &poly_add("10*6*8($inp)"); 1579 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1580 vmovdqa $tmp_store, $A0\n"; 1581 &poly_mul(); 1582 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1583 &poly_add("10*6*8+2*8($inp)"); 1584 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1585 &poly_mul(); 1586 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 1587 lea 16*32($inp), $inp 1588 lea 16*32($oup), $oup 1589 sub \$16*32, $inl 1590 jmp 1b 15913: 1592 test $inl, $inl 1593 vzeroupper 1594 je open_sse_finalize 15953: 1596 cmp \$4*32, $inl 1597 ja 3f\n"; 1598############################################################################### 1599 # 1-128 bytes left 1600 &prep_state_avx2(1); $code.=" 1601 xor $itr2, $itr2 1602 mov $inl, $itr1 1603 and \$-16, $itr1 1604 test $itr1, $itr1 1605 je 2f 16061: \n"; 1607 &poly_add("0*8($inp, $itr2)"); 1608 &poly_mul(); $code.=" 16092: 1610 add \$16, $itr2\n"; 1611 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1612 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1613 cmp $itr1, $itr2 1614 jb 1b 1615 cmp \$160, $itr2 1616 jne 2b\n"; 1617 &finalize_state_avx2(1); 1618 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 1619 jmp open_avx2_tail_loop 16203: 1621 cmp \$8*32, $inl 1622 ja 3f\n"; 1623############################################################################### 1624 # 129-256 bytes left 1625 &prep_state_avx2(2); $code.=" 1626 mov $inl, $tmp_store 1627 mov $inl, $itr1 1628 sub \$4*32, $itr1 1629 shr \$4, $itr1 1630 mov \$10, $itr2 1631 cmp \$10, $itr1 1632 cmovg $itr2, $itr1 1633 mov $inp, $inl 1634 xor $itr2, $itr2 16351: \n"; 1636 &poly_add("0*8($inl)"); 1637 &poly_mul_mulx(); $code.=" 1638 lea 16($inl), $inl 16392: \n"; 1640 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1641 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" 1642 inc $itr2\n"; 1643 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1644 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1645 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1646 cmp $itr1, $itr2 1647 jb 1b 1648 cmp \$10, $itr2 1649 jne 2b 1650 mov $inl, $itr2 1651 sub $inp, $inl 1652 mov $inl, $itr1 1653 mov $tmp_store, $inl 16541: 1655 add \$16, $itr1 1656 cmp $inl, $itr1 1657 jg 1f\n"; 1658 &poly_add("0*8($itr2)"); 1659 &poly_mul_mulx(); $code.=" 1660 lea 16($itr2), $itr2 1661 jmp 1b 16621: \n"; 1663 &finalize_state_avx2(2); 1664 &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); 1665 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1666 lea 4*32($inp), $inp 1667 lea 4*32($oup), $oup 1668 sub \$4*32, $inl 1669 jmp open_avx2_tail_loop 16703: 1671 cmp \$12*32, $inl 1672 ja 3f\n"; 1673############################################################################### 1674 # 257-383 bytes left 1675 &prep_state_avx2(3); $code.=" 1676 mov $inl, $tmp_store 1677 mov $inl, $itr1 1678 sub \$8*32, $itr1 1679 shr \$4, $itr1 1680 add \$6, $itr1 1681 mov \$10, $itr2 1682 cmp \$10, $itr1 1683 cmovg $itr2, $itr1 1684 mov $inp, $inl 1685 xor $itr2, $itr2 16861: \n"; 1687 &poly_add("0*8($inl)"); 1688 &poly_mul_mulx(); $code.=" 1689 lea 16($inl), $inl 16902: \n"; 1691 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 1692 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1693 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1694 &poly_add("0*8($inl)"); 1695 &poly_mul(); $code.=" 1696 lea 16($inl), $inl 1697 inc $itr2\n"; 1698 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); 1699 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1700 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1701 cmp $itr1, $itr2 1702 jb 1b 1703 cmp \$10, $itr2 1704 jne 2b 1705 mov $inl, $itr2 1706 sub $inp, $inl 1707 mov $inl, $itr1 1708 mov $tmp_store, $inl 17091: 1710 add \$16, $itr1 1711 cmp $inl, $itr1 1712 jg 1f\n"; 1713 &poly_add("0*8($itr2)"); 1714 &poly_mul_mulx(); $code.=" 1715 lea 16($itr2), $itr2 1716 jmp 1b 17171: \n"; 1718 &finalize_state_avx2(3); 1719 &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); 1720 &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); 1721 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1722 lea 8*32($inp), $inp 1723 lea 8*32($oup), $oup 1724 sub \$8*32, $inl 1725 jmp open_avx2_tail_loop 17263: \n"; 1727############################################################################### 1728 # 384-512 bytes left 1729 &prep_state_avx2(4); $code.=" 1730 xor $itr1, $itr1 1731 mov $inp, $itr2 17321: \n"; 1733 &poly_add("0*8($itr2)"); 1734 &poly_mul(); $code.=" 1735 lea 2*8($itr2), $itr2 17362: \n"; 1737 &emit_body(37); 1738 &poly_add("0*8($itr2)"); 1739 &poly_mul_mulx(); 1740 &emit_body(48); 1741 &poly_add("2*8($itr2)"); 1742 &poly_mul_mulx(); $code.=" 1743 lea 4*8($itr2), $itr2\n"; 1744 foreach $l (@loop_body) {$code.=$l."\n";} 1745 @loop_body = split /\n/, $chacha_body; $code.=" 1746 inc $itr1 1747 cmp \$4, $itr1 1748 jl 1b 1749 cmp \$10, $itr1 1750 jne 2b 1751 mov $inl, $itr1 1752 sub \$12*32, $itr1 1753 and \$-16, $itr1 17541: 1755 test $itr1, $itr1 1756 je 1f\n"; 1757 &poly_add("0*8($itr2)"); 1758 &poly_mul_mulx(); $code.=" 1759 lea 2*8($itr2), $itr2 1760 sub \$2*8, $itr1 1761 jmp 1b 17621: \n"; 1763 &finalize_state_avx2(4); $code.=" 1764 vmovdqa $A0, $tmp_store\n"; 1765 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1766 vmovdqa $tmp_store, $A0\n"; 1767 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1768 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1769 &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" 1770 lea 12*32($inp), $inp 1771 lea 12*32($oup), $oup 1772 sub \$12*32, $inl 1773open_avx2_tail_loop: 1774 cmp \$32, $inl 1775 jb open_avx2_tail 1776 sub \$32, $inl 1777 vpxor ($inp), $A0, $A0 1778 vmovdqu $A0, ($oup) 1779 lea 1*32($inp), $inp 1780 lea 1*32($oup), $oup 1781 vmovdqa $B0, $A0 1782 vmovdqa $C0, $B0 1783 vmovdqa $D0, $C0 1784 jmp open_avx2_tail_loop 1785open_avx2_tail: 1786 cmp \$16, $inl 1787 vmovdqa $A0x, $A1x 1788 jb 1f 1789 sub \$16, $inl 1790 #load for decryption 1791 vpxor ($inp), $A0x, $A1x 1792 vmovdqu $A1x, ($oup) 1793 lea 1*16($inp), $inp 1794 lea 1*16($oup), $oup 1795 vperm2i128 \$0x11, $A0, $A0, $A0 1796 vmovdqa $A0x, $A1x 17971: 1798 vzeroupper 1799 jmp open_sse_tail_16 1800############################################################################### 1801open_avx2_192: 1802 vmovdqa $A0, $A1 1803 vmovdqa $A0, $A2 1804 vmovdqa $B0, $B1 1805 vmovdqa $B0, $B2 1806 vmovdqa $C0, $C1 1807 vmovdqa $C0, $C2 1808 vpaddd .avx2_inc(%rip), $D0, $D1 1809 vmovdqa $D0, $T2 1810 vmovdqa $D1, $T3 1811 mov \$10, $acc0 18121: \n"; 1813 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1814 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1815 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1816 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1817 dec $acc0 1818 jne 1b 1819 vpaddd $A2, $A0, $A0 1820 vpaddd $A2, $A1, $A1 1821 vpaddd $B2, $B0, $B0 1822 vpaddd $B2, $B1, $B1 1823 vpaddd $C2, $C0, $C0 1824 vpaddd $C2, $C1, $C1 1825 vpaddd $T2, $D0, $D0 1826 vpaddd $T3, $D1, $D1 1827 vperm2i128 \$0x02, $A0, $B0, $T0 1828 # Clamp and store the key 1829 vpand .clamp(%rip), $T0, $T0 1830 vmovdqa $T0, $r_store 1831 # Stream for up to 192 bytes 1832 vperm2i128 \$0x13, $A0, $B0, $A0 1833 vperm2i128 \$0x13, $C0, $D0, $B0 1834 vperm2i128 \$0x02, $A1, $B1, $C0 1835 vperm2i128 \$0x02, $C1, $D1, $D0 1836 vperm2i128 \$0x13, $A1, $B1, $A1 1837 vperm2i128 \$0x13, $C1, $D1, $B1 1838open_avx2_short: 1839 mov %r8, $itr2 1840 call poly_hash_ad_internal 1841open_avx2_hash_and_xor_loop: 1842 cmp \$32, $inl 1843 jb open_avx2_short_tail_32 1844 sub \$32, $inl\n"; 1845 # Load + hash 1846 &poly_add("0*8($inp)"); 1847 &poly_mul(); 1848 &poly_add("2*8($inp)"); 1849 &poly_mul(); $code.=" 1850 # Load + decrypt 1851 vpxor ($inp), $A0, $A0 1852 vmovdqu $A0, ($oup) 1853 lea 1*32($inp), $inp 1854 lea 1*32($oup), $oup 1855 # Shift stream 1856 vmovdqa $B0, $A0 1857 vmovdqa $C0, $B0 1858 vmovdqa $D0, $C0 1859 vmovdqa $A1, $D0 1860 vmovdqa $B1, $A1 1861 vmovdqa $C1, $B1 1862 vmovdqa $D1, $C1 1863 vmovdqa $A2, $D1 1864 vmovdqa $B2, $A2 1865 jmp open_avx2_hash_and_xor_loop 1866open_avx2_short_tail_32: 1867 cmp \$16, $inl 1868 vmovdqa $A0x, $A1x 1869 jb 1f 1870 sub \$16, $inl\n"; 1871 &poly_add("0*8($inp)"); 1872 &poly_mul(); $code.=" 1873 vpxor ($inp), $A0x, $A3x 1874 vmovdqu $A3x, ($oup) 1875 lea 1*16($inp), $inp 1876 lea 1*16($oup), $oup 1877 vextracti128 \$1, $A0, $A1x 18781: 1879 vzeroupper 1880 jmp open_sse_tail_16 1881############################################################################### 1882open_avx2_320: 1883 vmovdqa $A0, $A1 1884 vmovdqa $A0, $A2 1885 vmovdqa $B0, $B1 1886 vmovdqa $B0, $B2 1887 vmovdqa $C0, $C1 1888 vmovdqa $C0, $C2 1889 vpaddd .avx2_inc(%rip), $D0, $D1 1890 vpaddd .avx2_inc(%rip), $D1, $D2 1891 vmovdqa $B0, $T1 1892 vmovdqa $C0, $T2 1893 vmovdqa $D0, $ctr0_store 1894 vmovdqa $D1, $ctr1_store 1895 vmovdqa $D2, $ctr2_store 1896 mov \$10, $acc0 18971: \n"; 1898 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1899 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1900 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 1901 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1902 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1903 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1904 dec $acc0 1905 jne 1b 1906 vpaddd .chacha20_consts(%rip), $A0, $A0 1907 vpaddd .chacha20_consts(%rip), $A1, $A1 1908 vpaddd .chacha20_consts(%rip), $A2, $A2 1909 vpaddd $T1, $B0, $B0 1910 vpaddd $T1, $B1, $B1 1911 vpaddd $T1, $B2, $B2 1912 vpaddd $T2, $C0, $C0 1913 vpaddd $T2, $C1, $C1 1914 vpaddd $T2, $C2, $C2 1915 vpaddd $ctr0_store, $D0, $D0 1916 vpaddd $ctr1_store, $D1, $D1 1917 vpaddd $ctr2_store, $D2, $D2 1918 vperm2i128 \$0x02, $A0, $B0, $T0 1919 # Clamp and store the key 1920 vpand .clamp(%rip), $T0, $T0 1921 vmovdqa $T0, $r_store 1922 # Stream for up to 320 bytes 1923 vperm2i128 \$0x13, $A0, $B0, $A0 1924 vperm2i128 \$0x13, $C0, $D0, $B0 1925 vperm2i128 \$0x02, $A1, $B1, $C0 1926 vperm2i128 \$0x02, $C1, $D1, $D0 1927 vperm2i128 \$0x13, $A1, $B1, $A1 1928 vperm2i128 \$0x13, $C1, $D1, $B1 1929 vperm2i128 \$0x02, $A2, $B2, $C1 1930 vperm2i128 \$0x02, $C2, $D2, $D1 1931 vperm2i128 \$0x13, $A2, $B2, $A2 1932 vperm2i128 \$0x13, $C2, $D2, $B2 1933 jmp open_avx2_short 1934.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 1935############################################################################### 1936############################################################################### 1937.type chacha20_poly1305_seal_avx2,\@function,2 1938.align 64 1939chacha20_poly1305_seal_avx2: 1940 vzeroupper 1941 vmovdqa .chacha20_consts(%rip), $A0 1942 vbroadcasti128 0*16($keyp), $B0 1943 vbroadcasti128 1*16($keyp), $C0 1944 vbroadcasti128 2*16($keyp), $D0 1945 vpaddd .avx2_init(%rip), $D0, $D0 1946 cmp \$6*32, $inl 1947 jbe seal_avx2_192 1948 cmp \$10*32, $inl 1949 jbe seal_avx2_320 1950 vmovdqa $A0, $A1 1951 vmovdqa $A0, $A2 1952 vmovdqa $A0, $A3 1953 vmovdqa $B0, $B1 1954 vmovdqa $B0, $B2 1955 vmovdqa $B0, $B3 1956 vmovdqa $B0, $state1_store 1957 vmovdqa $C0, $C1 1958 vmovdqa $C0, $C2 1959 vmovdqa $C0, $C3 1960 vmovdqa $C0, $state2_store 1961 vmovdqa $D0, $D3 1962 vpaddd .avx2_inc(%rip), $D3, $D2 1963 vpaddd .avx2_inc(%rip), $D2, $D1 1964 vpaddd .avx2_inc(%rip), $D1, $D0 1965 vmovdqa $D0, $ctr0_store 1966 vmovdqa $D1, $ctr1_store 1967 vmovdqa $D2, $ctr2_store 1968 vmovdqa $D3, $ctr3_store 1969 mov \$10, $acc0 19701: \n"; 1971 foreach $l (@loop_body) {$code.=$l."\n";} 1972 @loop_body = split /\n/, $chacha_body; $code.=" 1973 dec $acc0 1974 jnz 1b\n"; 1975 &finalize_state_avx2(4); $code.=" 1976 vperm2i128 \$0x13, $C3, $D3, $C3 1977 vperm2i128 \$0x02, $A3, $B3, $D3 1978 vperm2i128 \$0x13, $A3, $B3, $A3 1979 vpand .clamp(%rip), $D3, $D3 1980 vmovdqa $D3, $r_store 1981 mov %r8, $itr2 1982 call poly_hash_ad_internal 1983 # Safely store 320 bytes (otherwise would handle with optimized call) 1984 vpxor 0*32($inp), $A3, $A3 1985 vpxor 1*32($inp), $C3, $C3 1986 vmovdqu $A3, 0*32($oup) 1987 vmovdqu $C3, 1*32($oup)\n"; 1988 &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); 1989 &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); 1990 &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" 1991 lea 10*32($inp), $inp 1992 sub \$10*32, $inl 1993 mov \$10*32, $itr1 1994 cmp \$4*32, $inl 1995 jbe seal_avx2_hash 1996 vpxor 0*32($inp), $A0, $A0 1997 vpxor 1*32($inp), $B0, $B0 1998 vpxor 2*32($inp), $C0, $C0 1999 vpxor 3*32($inp), $D0, $D0 2000 vmovdqu $A0, 10*32($oup) 2001 vmovdqu $B0, 11*32($oup) 2002 vmovdqu $C0, 12*32($oup) 2003 vmovdqu $D0, 13*32($oup) 2004 lea 4*32($inp), $inp 2005 sub \$4*32, $inl 2006 mov \$8, $itr1 2007 mov \$2, $itr2 2008 cmp \$4*32, $inl 2009 jbe seal_avx2_tail_128 2010 cmp \$8*32, $inl 2011 jbe seal_avx2_tail_256 2012 cmp \$12*32, $inl 2013 jbe seal_avx2_tail_384 2014 cmp \$16*32, $inl 2015 jbe seal_avx2_tail_512\n"; 2016 # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop 2017 &prep_state_avx2(4); 2018 foreach $l (@loop_body) {$code.=$l."\n";} 2019 @loop_body = split /\n/, $chacha_body; 2020 &emit_body(41); 2021 @loop_body = split /\n/, $chacha_body; $code.=" 2022 sub \$16, $oup 2023 mov \$9, $itr1 2024 jmp 4f 20251: \n"; 2026 &prep_state_avx2(4); $code.=" 2027 mov \$10, $itr1 20282: \n"; 2029 &poly_add("0*8($oup)"); 2030 &emit_body(10); 2031 &poly_stage1_mulx(); 2032 &emit_body(9); 2033 &poly_stage2_mulx(); 2034 &emit_body(12); 2035 &poly_stage3_mulx(); 2036 &emit_body(10); 2037 &poly_reduce_stage(); $code.=" 20384: \n"; 2039 &emit_body(9); 2040 &poly_add("2*8($oup)"); 2041 &emit_body(8); 2042 &poly_stage1_mulx(); 2043 &emit_body(18); 2044 &poly_stage2_mulx(); 2045 &emit_body(18); 2046 &poly_stage3_mulx(); 2047 &emit_body(9); 2048 &poly_reduce_stage(); 2049 &emit_body(8); 2050 &poly_add("4*8($oup)"); $code.=" 2051 lea 6*8($oup), $oup\n"; 2052 &emit_body(18); 2053 &poly_stage1_mulx(); 2054 &emit_body(8); 2055 &poly_stage2_mulx(); 2056 &emit_body(8); 2057 &poly_stage3_mulx(); 2058 &emit_body(18); 2059 &poly_reduce_stage(); 2060 foreach $l (@loop_body) {$code.=$l."\n";} 2061 @loop_body = split /\n/, $chacha_body; $code.=" 2062 dec $itr1 2063 jne 2b\n"; 2064 &finalize_state_avx2(4); $code.=" 2065 lea 4*8($oup), $oup 2066 vmovdqa $A0, $tmp_store\n"; 2067 &poly_add("-4*8($oup)"); 2068 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 2069 vmovdqa $tmp_store, $A0\n"; 2070 &poly_mul(); 2071 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 2072 &poly_add("-2*8($oup)"); 2073 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 2074 &poly_mul(); 2075 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 2076 lea 16*32($inp), $inp 2077 sub \$16*32, $inl 2078 cmp \$16*32, $inl 2079 jg 1b\n"; 2080 &poly_add("0*8($oup)"); 2081 &poly_mul(); 2082 &poly_add("2*8($oup)"); 2083 &poly_mul(); $code.=" 2084 lea 4*8($oup), $oup 2085 mov \$10, $itr1 2086 xor $itr2, $itr2 2087 cmp \$4*32, $inl 2088 ja 3f 2089############################################################################### 2090seal_avx2_tail_128:\n"; 2091 &prep_state_avx2(1); $code.=" 20921: \n"; 2093 &poly_add("0($oup)"); 2094 &poly_mul(); $code.=" 2095 lea 2*8($oup), $oup 20962: \n"; 2097 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2098 &poly_add("0*8($oup)"); 2099 &poly_mul(); 2100 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2101 &poly_add("2*8($oup)"); 2102 &poly_mul(); $code.=" 2103 lea 4*8($oup), $oup 2104 dec $itr1 2105 jg 1b 2106 dec $itr2 2107 jge 2b\n"; 2108 &finalize_state_avx2(1); 2109 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2110 jmp seal_avx2_short_loop 21113: 2112 cmp \$8*32, $inl 2113 ja 3f 2114############################################################################### 2115seal_avx2_tail_256:\n"; 2116 &prep_state_avx2(2); $code.=" 21171: \n"; 2118 &poly_add("0($oup)"); 2119 &poly_mul(); $code.=" 2120 lea 2*8($oup), $oup 21212: \n"; 2122 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2123 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2124 &poly_add("0*8($oup)"); 2125 &poly_mul(); 2126 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2127 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2128 &poly_add("2*8($oup)"); 2129 &poly_mul(); $code.=" 2130 lea 4*8($oup), $oup 2131 dec $itr1 2132 jg 1b 2133 dec $itr2 2134 jge 2b\n"; 2135 &finalize_state_avx2(2); 2136 &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); 2137 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2138 mov \$4*32, $itr1 2139 lea 4*32($inp), $inp 2140 sub \$4*32, $inl 2141 jmp seal_avx2_hash 21423: 2143 cmp \$12*32, $inl 2144 ja seal_avx2_tail_512 2145############################################################################### 2146seal_avx2_tail_384:\n"; 2147 &prep_state_avx2(3); $code.=" 21481: \n"; 2149 &poly_add("0($oup)"); 2150 &poly_mul(); $code.=" 2151 lea 2*8($oup), $oup 21522: \n"; 2153 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2154 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2155 &poly_add("0*8($oup)"); 2156 &poly_mul(); 2157 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2158 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2159 &poly_add("2*8($oup)"); 2160 &poly_mul(); 2161 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2162 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2163 lea 4*8($oup), $oup 2164 dec $itr1 2165 jg 1b 2166 dec $itr2 2167 jge 2b\n"; 2168 &finalize_state_avx2(3); 2169 &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); 2170 &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); 2171 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2172 mov \$8*32, $itr1 2173 lea 8*32($inp), $inp 2174 sub \$8*32, $inl 2175 jmp seal_avx2_hash 2176############################################################################### 2177seal_avx2_tail_512:\n"; 2178 &prep_state_avx2(4); $code.=" 21791: \n"; 2180 &poly_add("0($oup)"); 2181 &poly_mul_mulx(); $code.=" 2182 lea 2*8($oup), $oup 21832: \n"; 2184 &emit_body(20); 2185 &poly_add("0*8($oup)"); 2186 &emit_body(20); 2187 &poly_stage1_mulx(); 2188 &emit_body(20); 2189 &poly_stage2_mulx(); 2190 &emit_body(20); 2191 &poly_stage3_mulx(); 2192 &emit_body(20); 2193 &poly_reduce_stage(); 2194 &emit_body(20); 2195 &poly_add("2*8($oup)"); 2196 &emit_body(20); 2197 &poly_stage1_mulx(); 2198 &emit_body(20); 2199 &poly_stage2_mulx(); 2200 &emit_body(20); 2201 &poly_stage3_mulx(); 2202 &emit_body(20); 2203 &poly_reduce_stage(); 2204 foreach $l (@loop_body) {$code.=$l."\n";} 2205 @loop_body = split /\n/, $chacha_body; $code.=" 2206 lea 4*8($oup), $oup 2207 dec $itr1 2208 jg 1b 2209 dec $itr2 2210 jge 2b\n"; 2211 &finalize_state_avx2(4); $code.=" 2212 vmovdqa $A0, $tmp_store\n"; 2213 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 2214 vmovdqa $tmp_store, $A0\n"; 2215 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 2216 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 2217 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2218 mov \$12*32, $itr1 2219 lea 12*32($inp), $inp 2220 sub \$12*32, $inl 2221 jmp seal_avx2_hash 2222################################################################################ 2223seal_avx2_320: 2224 vmovdqa $A0, $A1 2225 vmovdqa $A0, $A2 2226 vmovdqa $B0, $B1 2227 vmovdqa $B0, $B2 2228 vmovdqa $C0, $C1 2229 vmovdqa $C0, $C2 2230 vpaddd .avx2_inc(%rip), $D0, $D1 2231 vpaddd .avx2_inc(%rip), $D1, $D2 2232 vmovdqa $B0, $T1 2233 vmovdqa $C0, $T2 2234 vmovdqa $D0, $ctr0_store 2235 vmovdqa $D1, $ctr1_store 2236 vmovdqa $D2, $ctr2_store 2237 mov \$10, $acc0 22381: \n"; 2239 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2240 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2241 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2242 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2243 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2244 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2245 dec $acc0 2246 jne 1b 2247 vpaddd .chacha20_consts(%rip), $A0, $A0 2248 vpaddd .chacha20_consts(%rip), $A1, $A1 2249 vpaddd .chacha20_consts(%rip), $A2, $A2 2250 vpaddd $T1, $B0, $B0 2251 vpaddd $T1, $B1, $B1 2252 vpaddd $T1, $B2, $B2 2253 vpaddd $T2, $C0, $C0 2254 vpaddd $T2, $C1, $C1 2255 vpaddd $T2, $C2, $C2 2256 vpaddd $ctr0_store, $D0, $D0 2257 vpaddd $ctr1_store, $D1, $D1 2258 vpaddd $ctr2_store, $D2, $D2 2259 vperm2i128 \$0x02, $A0, $B0, $T0 2260 # Clamp and store the key 2261 vpand .clamp(%rip), $T0, $T0 2262 vmovdqa $T0, $r_store 2263 # Stream for up to 320 bytes 2264 vperm2i128 \$0x13, $A0, $B0, $A0 2265 vperm2i128 \$0x13, $C0, $D0, $B0 2266 vperm2i128 \$0x02, $A1, $B1, $C0 2267 vperm2i128 \$0x02, $C1, $D1, $D0 2268 vperm2i128 \$0x13, $A1, $B1, $A1 2269 vperm2i128 \$0x13, $C1, $D1, $B1 2270 vperm2i128 \$0x02, $A2, $B2, $C1 2271 vperm2i128 \$0x02, $C2, $D2, $D1 2272 vperm2i128 \$0x13, $A2, $B2, $A2 2273 vperm2i128 \$0x13, $C2, $D2, $B2 2274 jmp seal_avx2_short 2275################################################################################ 2276seal_avx2_192: 2277 vmovdqa $A0, $A1 2278 vmovdqa $A0, $A2 2279 vmovdqa $B0, $B1 2280 vmovdqa $B0, $B2 2281 vmovdqa $C0, $C1 2282 vmovdqa $C0, $C2 2283 vpaddd .avx2_inc(%rip), $D0, $D1 2284 vmovdqa $D0, $T2 2285 vmovdqa $D1, $T3 2286 mov \$10, $acc0 22871: \n"; 2288 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2289 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2290 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2291 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 2292 dec $acc0 2293 jne 1b 2294 vpaddd $A2, $A0, $A0 2295 vpaddd $A2, $A1, $A1 2296 vpaddd $B2, $B0, $B0 2297 vpaddd $B2, $B1, $B1 2298 vpaddd $C2, $C0, $C0 2299 vpaddd $C2, $C1, $C1 2300 vpaddd $T2, $D0, $D0 2301 vpaddd $T3, $D1, $D1 2302 vperm2i128 \$0x02, $A0, $B0, $T0 2303 # Clamp and store the key 2304 vpand .clamp(%rip), $T0, $T0 2305 vmovdqa $T0, $r_store 2306 # Stream for up to 192 bytes 2307 vperm2i128 \$0x13, $A0, $B0, $A0 2308 vperm2i128 \$0x13, $C0, $D0, $B0 2309 vperm2i128 \$0x02, $A1, $B1, $C0 2310 vperm2i128 \$0x02, $C1, $D1, $D0 2311 vperm2i128 \$0x13, $A1, $B1, $A1 2312 vperm2i128 \$0x13, $C1, $D1, $B1 2313seal_avx2_short: 2314 mov %r8, $itr2 2315 call poly_hash_ad_internal 2316 xor $itr1, $itr1 2317seal_avx2_hash: 2318 cmp \$16, $itr1 2319 jb seal_avx2_short_loop\n"; 2320 &poly_add("0($oup)"); 2321 &poly_mul(); $code.=" 2322 sub \$16, $itr1 2323 add \$16, $oup 2324 jmp seal_avx2_hash 2325seal_avx2_short_loop: 2326 cmp \$32, $inl 2327 jb seal_avx2_short_tail 2328 sub \$32, $inl 2329 # Encrypt 2330 vpxor ($inp), $A0, $A0 2331 vmovdqu $A0, ($oup) 2332 lea 1*32($inp), $inp 2333 # Load + hash\n"; 2334 &poly_add("0*8($oup)"); 2335 &poly_mul(); 2336 &poly_add("2*8($oup)"); 2337 &poly_mul(); $code.=" 2338 lea 1*32($oup), $oup 2339 # Shift stream 2340 vmovdqa $B0, $A0 2341 vmovdqa $C0, $B0 2342 vmovdqa $D0, $C0 2343 vmovdqa $A1, $D0 2344 vmovdqa $B1, $A1 2345 vmovdqa $C1, $B1 2346 vmovdqa $D1, $C1 2347 vmovdqa $A2, $D1 2348 vmovdqa $B2, $A2 2349 jmp seal_avx2_short_loop 2350seal_avx2_short_tail: 2351 cmp \$16, $inl 2352 jb 1f 2353 sub \$16, $inl 2354 vpxor ($inp), $A0x, $A3x 2355 vmovdqu $A3x, ($oup) 2356 lea 1*16($inp), $inp\n"; 2357 &poly_add("0*8($oup)"); 2358 &poly_mul(); $code.=" 2359 lea 1*16($oup), $oup 2360 vextracti128 \$1, $A0, $A0x 23611: 2362 vzeroupper 2363 jmp seal_sse_tail_16 2364.cfi_endproc 2365"; 2366} 2367 2368if (!$win64) { 2369 $code =~ s/\`([^\`]*)\`/eval $1/gem; 2370 print $code; 2371} else { 2372 print <<___; 2373.globl dummy_chacha20_poly1305_asm 2374.type dummy_chacha20_poly1305_asm,\@abi-omnipotent 2375dummy_chacha20_poly1305_asm: 2376 ret 2377___ 2378} 2379 2380close STDOUT; 2381