;-----------------------------------------------------------------------
; ChaCha20 keystream/XOR routines, x86-64, NASM syntax, Win64 entry points.
;
; Layout restored to one statement per line: the previous text had every
; newline collapsed and stray line-number residue fused onto the mnemonics,
; which made it impossible to assemble.  The instruction stream, operands,
; labels and data tables below are unchanged.
;
; XMMWORD/YMMWORD/ZMMWORD are defined empty so that "XMMWORD[mem]" is just
; "[mem]" to NASM.
;-----------------------------------------------------------------------
default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section .text code align=64


EXTERN  OPENSSL_ia32cap_P

;-----------------------------------------------------------------------
; Constant tables (kept in .text, addressed RIP-relative via "default rel").
; All 128-bit tables referenced with movdqa sit at 16-byte multiples from
; the ALIGN 64 below.
;-----------------------------------------------------------------------
ALIGN   64
$L$zero:
        DD      0,0,0,0
$L$one:                                 ; +1 on dword 0 (block counter step, 1x paths)
        DD      1,0,0,0
$L$inc:                                 ; per-lane counter offsets for the 4x path
        DD      0,1,2,3
$L$four:                                ; counter step per 4-block iteration
        DD      4,4,4,4
$L$incy:
        DD      0,2,4,6,1,3,5,7
$L$eight:
        DD      8,8,8,8,8,8,8,8
$L$rot16:                               ; pshufb mask: rotate every dword left by 16
DB      0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
$L$rot24:                               ; pshufb mask: rotate every dword left by 8
DB      0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
$L$sigma:                               ; ASCII "expand 32-byte k", NUL-terminated
DB      101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
DB      0
ALIGN   64
$L$zeroz:
        DD      0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
$L$fourz:
        DD      4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
$L$incz:
        DD      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
$L$sixteen:
        DD      16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; ASCII ident: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL
DB      67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
DB      95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
DB      98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
DB      108,46,111,114,103,62,0
global  ChaCha20_ctr32

;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- scalar (integer-register) path and CPU dispatcher.
;
; ABI:  Win64.  Incoming rcx, rdx, r8, r9, [rsp+40] are remapped to
;       rdi, rsi, rdx, rcx, r8; caller's rdi/rsi are parked in the home
;       space at [rsp+8]/[rsp+16] and reloaded in the epilogue.
; After the remap:
;       rdi = output pointer (written)
;       rsi = input pointer  (read, XORed with the keystream)
;       rdx = length in bytes; 0 returns immediately
;       rcx = 32 bytes loaded as state words 4..11 (the key)
;       r8  = 16 bytes loaded as state words 12..15 (counter block)
; Dispatch: if bit 9 of the dword at OPENSSL_ia32cap_P+4 is set, control
;       goes to $L$ChaCha20_ssse3 with r10 still holding the qword read
;       from OPENSSL_ia32cap_P+4 (the 4x path consumes it).
;       NOTE(review): bit 9 is presumably the SSSE3 feature flag -- confirm
;       against the definition of OPENSSL_ia32cap_P.
; Frame (after six pushes and sub rsp,64+24; rsp is 16-byte aligned):
;       [rsp+ 0..15]  written only by $L$tail (row 0 of the final block)
;       [rsp+16..31]  state words 4..7
;       [rsp+32..47]  state words 8..11 (also round scratch)
;       [rsp+48..63]  state words 12..15
;       [rsp+64]      saved remaining length
;       [rsp+72]      saved input pointer
;       [rsp+80]      saved output pointer
; Saves/restores: rbx, rbp, r12-r15 (and rdi, rsi via home space).
;-----------------------------------------------------------------------
ALIGN   64
ChaCha20_ctr32:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ctr32:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


        cmp     rdx,0
        je      NEAR $L$no_data         ; len == 0: nothing pushed yet, plain return
        mov     r10,QWORD[((OPENSSL_ia32cap_P+4))]
        test    r10d,512
        jnz     NEAR $L$ChaCha20_ssse3

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        sub     rsp,64+24
$L$ctr32_body:


        movdqu  xmm1,XMMWORD[rcx]
        movdqu  xmm2,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        movdqa  xmm4,XMMWORD[$L$one]


        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     rbp,rdx                 ; rbp = bytes remaining
        jmp     NEAR $L$oop_outer

; One 64-byte block per outer iteration.
; Working state: eax,ebx,ecx,edx = words 0..3; r8d..r11d = words 4..7;
; esi,edi = two of words 8..11 at a time (the other two live at
; [rsp+32..47]); r12d..r15d = words 12..15.
ALIGN   32
$L$oop_outer:
        mov     eax,0x61707865
        mov     ebx,0x3320646e
        mov     ecx,0x79622d32
        mov     edx,0x6b206574
        mov     r8d,DWORD[16+rsp]
        mov     r9d,DWORD[20+rsp]
        mov     r10d,DWORD[24+rsp]
        mov     r11d,DWORD[28+rsp]
        movd    r12d,xmm3               ; word 12 comes from xmm3 (the live counter)
        mov     r13d,DWORD[52+rsp]
        mov     r14d,DWORD[56+rsp]
        mov     r15d,DWORD[60+rsp]

        mov     QWORD[((64+0))+rsp],rbp
        mov     ebp,10                  ; 10 iterations of the double-round loop
        mov     QWORD[((64+8))+rsp],rsi
DB      102,72,15,126,214               ; movq rsi,xmm2 (words 8,9)
        mov     QWORD[((64+16))+rsp],rdi
        mov     rdi,rsi
        shr     rdi,32                  ; esi = word 8, edi = word 9
        jmp     NEAR $L$oop

ALIGN   32
$L$oop:
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,16
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,16
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,12
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,12
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,8
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,8
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,7
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,7
        mov     DWORD[32+rsp],esi       ; swap words 8,9 out, words 10,11 in
        mov     DWORD[36+rsp],edi
        mov     esi,DWORD[40+rsp]
        mov     edi,DWORD[44+rsp]
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,16
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,16
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,12
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,12
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,8
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,8
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,7
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,7
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,16
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,16
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,12
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,12
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,8
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,8
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,7
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,7
        mov     DWORD[40+rsp],esi       ; swap words 10,11 out, words 8,9 back in
        mov     DWORD[44+rsp],edi
        mov     esi,DWORD[32+rsp]
        mov     edi,DWORD[36+rsp]
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,16
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,16
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,12
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,12
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,8
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,8
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,7
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,7
        dec     ebp
        jnz     NEAR $L$oop
        mov     DWORD[36+rsp],edi
        mov     DWORD[32+rsp],esi
        mov     rbp,QWORD[64+rsp]
        movdqa  xmm1,xmm2
        mov     rsi,QWORD[((64+8))+rsp]
        paddd   xmm3,xmm4               ; advance the counter for the next block
        mov     rdi,QWORD[((64+16))+rsp]

        ; Feed-forward: add the input state to the permuted state.
        add     eax,0x61707865
        add     ebx,0x3320646e
        add     ecx,0x79622d32
        add     edx,0x6b206574
        add     r8d,DWORD[16+rsp]
        add     r9d,DWORD[20+rsp]
        add     r10d,DWORD[24+rsp]
        add     r11d,DWORD[28+rsp]
        add     r12d,DWORD[48+rsp]
        add     r13d,DWORD[52+rsp]
        add     r14d,DWORD[56+rsp]
        add     r15d,DWORD[60+rsp]
        paddd   xmm1,XMMWORD[32+rsp]    ; words 8..11: original (xmm2) + permuted

        cmp     rbp,64
        jb      NEAR $L$tail            ; fewer than 64 bytes left

        xor     eax,DWORD[rsi]
        xor     ebx,DWORD[4+rsi]
        xor     ecx,DWORD[8+rsi]
        xor     edx,DWORD[12+rsi]
        xor     r8d,DWORD[16+rsi]
        xor     r9d,DWORD[20+rsi]
        xor     r10d,DWORD[24+rsi]
        xor     r11d,DWORD[28+rsi]
        movdqu  xmm0,XMMWORD[32+rsi]
        xor     r12d,DWORD[48+rsi]
        xor     r13d,DWORD[52+rsi]
        xor     r14d,DWORD[56+rsi]
        xor     r15d,DWORD[60+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm0,xmm1

        movdqa  XMMWORD[32+rsp],xmm2    ; restore words 8..11 clobbered by the rounds
        movd    DWORD[48+rsp],xmm3      ; persist the incremented counter word

        mov     DWORD[rdi],eax
        mov     DWORD[4+rdi],ebx
        mov     DWORD[8+rdi],ecx
        mov     DWORD[12+rdi],edx
        mov     DWORD[16+rdi],r8d
        mov     DWORD[20+rdi],r9d
        mov     DWORD[24+rdi],r10d
        mov     DWORD[28+rdi],r11d
        movdqu  XMMWORD[32+rdi],xmm0
        mov     DWORD[48+rdi],r12d
        mov     DWORD[52+rdi],r13d
        mov     DWORD[56+rdi],r14d
        mov     DWORD[60+rdi],r15d
        lea     rdi,[64+rdi]

        sub     rbp,64
        jnz     NEAR $L$oop_outer

        jmp     NEAR $L$done

; Partial final block: spill the 64 keystream bytes to [rsp+0..63] and XOR
; them byte by byte; rbx is the byte index, rbp the remaining count (>0 here).
ALIGN   16
$L$tail:
        mov     DWORD[rsp],eax
        mov     DWORD[4+rsp],ebx
        xor     rbx,rbx
        mov     DWORD[8+rsp],ecx
        mov     DWORD[12+rsp],edx
        mov     DWORD[16+rsp],r8d
        mov     DWORD[20+rsp],r9d
        mov     DWORD[24+rsp],r10d
        mov     DWORD[28+rsp],r11d
        movdqa  XMMWORD[32+rsp],xmm1
        mov     DWORD[48+rsp],r12d
        mov     DWORD[52+rsp],r13d
        mov     DWORD[56+rsp],r14d
        mov     DWORD[60+rsp],r15d

$L$oop_tail:
        movzx   eax,BYTE[rbx*1+rsi]
        movzx   edx,BYTE[rbx*1+rsp]
        lea     rbx,[1+rbx]
        xor     eax,edx
        mov     BYTE[((-1))+rbx*1+rdi],al
        dec     rbp
        jnz     NEAR $L$oop_tail

$L$done:
        lea     rsi,[((64+24+48))+rsp]  ; rsi = rsp as it was before the six pushes
        mov     r15,QWORD[((-48))+rsi]
        mov     r14,QWORD[((-40))+rsi]
        mov     r13,QWORD[((-32))+rsi]
        mov     r12,QWORD[((-24))+rsi]
        mov     rbp,QWORD[((-16))+rsi]
        mov     rbx,QWORD[((-8))+rsi]
        lea     rsp,[rsi]
$L$no_data:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_ctr32:

;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- one block at a time in xmm registers.
;
; The label ChaCha20_ssse3 carries the same Win64 argument remap as
; ChaCha20_ctr32; the dispatcher instead jumps to $L$ChaCha20_ssse3 with
; rdi/rsi/rdx/rcx/r8 already remapped and r10 holding the capability qword.
; Lengths above 128 bytes are forwarded to $L$ChaCha20_4x.
; r9 keeps the entry rsp.  xmm6/xmm7 are saved at [r9-40]/[r9-24] and hold
; the rot16/rot24 shuffle masks inside the loop.
; Frame: [rsp+0..63] = the four 16-byte state rows (row 3 carries the
; running counter); reused as the keystream buffer for a partial block.
; xmm0..xmm3 = working rows, xmm4/xmm5 = scratch, r8 = round counter.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_ssse3:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ssse3:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_ssse3:
        mov     r9,rsp
        cmp     rdx,128
        ja      NEAR $L$ChaCha20_4x

$L$do_sse3_after_all:
        sub     rsp,64+40
        movaps  XMMWORD[(-40)+r9],xmm6
        movaps  XMMWORD[(-24)+r9],xmm7
$L$ssse3_body:
        movdqa  xmm0,XMMWORD[$L$sigma]
        movdqu  xmm1,XMMWORD[rcx]
        movdqu  xmm2,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        movdqa  xmm6,XMMWORD[$L$rot16]
        movdqa  xmm7,XMMWORD[$L$rot24]

        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     r8,10
        jmp     NEAR $L$oop_ssse3

ALIGN   32
$L$oop_outer_ssse3:
        movdqa  xmm3,XMMWORD[$L$one]
        movdqa  xmm0,XMMWORD[rsp]
        movdqa  xmm1,XMMWORD[16+rsp]
        movdqa  xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]    ; row 3 with dword 0 incremented
        mov     r8,10
        movdqa  XMMWORD[48+rsp],xmm3
        jmp     NEAR $L$oop_ssse3

ALIGN   32
$L$oop_ssse3:
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rotl 16)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rotl 8)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        pshufd  xmm2,xmm2,78            ; rotate rows 1..3 for the second half-round
        pshufd  xmm1,xmm1,57
        pshufd  xmm3,xmm3,147
        nop
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rotl 16)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rotl 8)
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        pshufd  xmm2,xmm2,78            ; undo the row rotation
        pshufd  xmm1,xmm1,147
        pshufd  xmm3,xmm3,57
        dec     r8
        jnz     NEAR $L$oop_ssse3
        paddd   xmm0,XMMWORD[rsp]       ; feed-forward with the saved input rows
        paddd   xmm1,XMMWORD[16+rsp]
        paddd   xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]

        cmp     rdx,64
        jb      NEAR $L$tail_ssse3

        movdqu  xmm4,XMMWORD[rsi]
        movdqu  xmm5,XMMWORD[16+rsi]
        pxor    xmm0,xmm4
        movdqu  xmm4,XMMWORD[32+rsi]
        pxor    xmm1,xmm5
        movdqu  xmm5,XMMWORD[48+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm2,xmm4
        pxor    xmm3,xmm5

        movdqu  XMMWORD[rdi],xmm0
        movdqu  XMMWORD[16+rdi],xmm1
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm3
        lea     rdi,[64+rdi]

        sub     rdx,64
        jnz     NEAR $L$oop_outer_ssse3

        jmp     NEAR $L$done_ssse3

; Partial final block: keystream goes to [rsp+0..63], then a byte-wise XOR
; with r8 as the index and rdx as the remaining count (>0 here).
ALIGN   16
$L$tail_ssse3:
        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        xor     r8,r8

$L$oop_tail_ssse3:
        movzx   eax,BYTE[r8*1+rsi]
        movzx   ecx,BYTE[r8*1+rsp]
        lea     r8,[1+r8]
        xor     eax,ecx
        mov     BYTE[((-1))+r8*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail_ssse3

$L$done_ssse3:
        movaps  xmm6,XMMWORD[((-40))+r9]
        movaps  xmm7,XMMWORD[((-24))+r9]
        lea     rsp,[r9]
$L$ssse3_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_ssse3:

;-----------------------------------------------------------------------
; ChaCha20_4x -- four blocks per iteration, one state word per xmm register
; (each register holds that word for four consecutive counters).
;
; Internal entry is $L$ChaCha20_4x (from $L$ChaCha20_ssse3) with arguments
; already remapped and r10 = qword read from OPENSSL_ia32cap_P+4.
; Dispatch performed here:
;   * bit 5 of the upper dword of r10 set -> $L$ChaCha20_8x
;     (NOTE(review): presumably the AVX2 flag -- confirm).
;   * len <= 192 and (r10 & 0x04400000) == 0x00400000 -> back to the
;     single-block path at $L$do_sse3_after_all
;     (NOTE(review): looks like a specific-CPU heuristic -- confirm which
;     feature bits 22 and 26 represent).
; r9 keeps the entry rsp; xmm6..xmm15 are saved at [r9-168 .. r9-8].
; Frame (rcx = rsp+256 is used as a biased base pointer):
;   [rsp+  0.. 63]  scratch: spilled words 8..11 during rounds, transposed
;                   rows afterwards, keystream buffer in the tail
;   [rsp+ 64..127]  words 0..3  (sigma), each broadcast to 4 lanes
;   [rsp+128..191]  words 4..7,   broadcast
;   [rsp+192..255]  words 8..11,  broadcast
;   [rsp+256..319]  words 12..15; word 12 holds counter + {0,1,2,3}
; In the round loop r10 -> $L$rot16 and r11 -> $L$rot24; xmm6/xmm7 double
; as shift temporaries and shuffle-mask registers; eax = round counter.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_4x:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_4x:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_4x:
        mov     r9,rsp
        mov     r11,r10
        shr     r10,32
        test    r10,32
        jnz     NEAR $L$ChaCha20_8x
        cmp     rdx,192
        ja      NEAR $L$proceed4x

        and     r11,71303168            ; 0x04400000
        cmp     r11,4194304             ; 0x00400000
        je      NEAR $L$do_sse3_after_all

$L$proceed4x:
        sub     rsp,0x140+168
        movaps  XMMWORD[(-168)+r9],xmm6
        movaps  XMMWORD[(-152)+r9],xmm7
        movaps  XMMWORD[(-136)+r9],xmm8
        movaps  XMMWORD[(-120)+r9],xmm9
        movaps  XMMWORD[(-104)+r9],xmm10
        movaps  XMMWORD[(-88)+r9],xmm11
        movaps  XMMWORD[(-72)+r9],xmm12
        movaps  XMMWORD[(-56)+r9],xmm13
        movaps  XMMWORD[(-40)+r9],xmm14
        movaps  XMMWORD[(-24)+r9],xmm15
$L$4x_body:
        movdqa  xmm11,XMMWORD[$L$sigma]
        movdqu  xmm15,XMMWORD[rcx]
        movdqu  xmm7,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        lea     rcx,[256+rsp]
        lea     r10,[$L$rot16]
        lea     r11,[$L$rot24]

        ; Broadcast each state word across a register and park it in the frame.
        pshufd  xmm8,xmm11,0x00
        pshufd  xmm9,xmm11,0x55
        movdqa  XMMWORD[64+rsp],xmm8
        pshufd  xmm10,xmm11,0xaa
        movdqa  XMMWORD[80+rsp],xmm9
        pshufd  xmm11,xmm11,0xff
        movdqa  XMMWORD[96+rsp],xmm10
        movdqa  XMMWORD[112+rsp],xmm11

        pshufd  xmm12,xmm15,0x00
        pshufd  xmm13,xmm15,0x55
        movdqa  XMMWORD[(128-256)+rcx],xmm12
        pshufd  xmm14,xmm15,0xaa
        movdqa  XMMWORD[(144-256)+rcx],xmm13
        pshufd  xmm15,xmm15,0xff
        movdqa  XMMWORD[(160-256)+rcx],xmm14
        movdqa  XMMWORD[(176-256)+rcx],xmm15

        pshufd  xmm4,xmm7,0x00
        pshufd  xmm5,xmm7,0x55
        movdqa  XMMWORD[(192-256)+rcx],xmm4
        pshufd  xmm6,xmm7,0xaa
        movdqa  XMMWORD[(208-256)+rcx],xmm5
        pshufd  xmm7,xmm7,0xff
        movdqa  XMMWORD[(224-256)+rcx],xmm6
        movdqa  XMMWORD[(240-256)+rcx],xmm7

        pshufd  xmm0,xmm3,0x00
        pshufd  xmm1,xmm3,0x55
        paddd   xmm0,XMMWORD[$L$inc]    ; lanes get counter+0..counter+3
        pshufd  xmm2,xmm3,0xaa
        movdqa  XMMWORD[(272-256)+rcx],xmm1
        pshufd  xmm3,xmm3,0xff
        movdqa  XMMWORD[(288-256)+rcx],xmm2
        movdqa  XMMWORD[(304-256)+rcx],xmm3

        jmp     NEAR $L$oop_enter4x     ; word 12 is stored at $L$oop_enter4x

ALIGN   32
$L$oop_outer4x:
        movdqa  xmm8,XMMWORD[64+rsp]
        movdqa  xmm9,XMMWORD[80+rsp]
        movdqa  xmm10,XMMWORD[96+rsp]
        movdqa  xmm11,XMMWORD[112+rsp]
        movdqa  xmm12,XMMWORD[((128-256))+rcx]
        movdqa  xmm13,XMMWORD[((144-256))+rcx]
        movdqa  xmm14,XMMWORD[((160-256))+rcx]
        movdqa  xmm15,XMMWORD[((176-256))+rcx]
        movdqa  xmm4,XMMWORD[((192-256))+rcx]
        movdqa  xmm5,XMMWORD[((208-256))+rcx]
        movdqa  xmm6,XMMWORD[((224-256))+rcx]
        movdqa  xmm7,XMMWORD[((240-256))+rcx]
        movdqa  xmm0,XMMWORD[((256-256))+rcx]
        movdqa  xmm1,XMMWORD[((272-256))+rcx]
        movdqa  xmm2,XMMWORD[((288-256))+rcx]
        movdqa  xmm3,XMMWORD[((304-256))+rcx]
        paddd   xmm0,XMMWORD[$L$four]   ; advance all four lane counters by 4

$L$oop_enter4x:
        movdqa  XMMWORD[32+rsp],xmm6    ; spill words 10,11; xmm6/xmm7 become temps
        movdqa  XMMWORD[48+rsp],xmm7
        movdqa  xmm7,XMMWORD[r10]       ; xmm7 = rot16 mask
        mov     eax,10
        movdqa  XMMWORD[(256-256)+rcx],xmm0
        jmp     NEAR $L$oop4x

ALIGN   32
$L$oop4x:
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        movdqa  xmm6,xmm12
        pslld   xmm12,12
        psrld   xmm6,20
        movdqa  xmm7,xmm13
        pslld   xmm13,12
        por     xmm12,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; xmm6 = rot24 mask
        por     xmm13,xmm7
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        movdqa  xmm7,xmm12
        pslld   xmm12,7
        psrld   xmm7,25
        movdqa  xmm6,xmm13
        pslld   xmm13,7
        por     xmm12,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; xmm7 = rot16 mask
        por     xmm13,xmm6
        movdqa  XMMWORD[rsp],xmm4       ; swap words 8,9 out, words 10,11 in
        movdqa  XMMWORD[16+rsp],xmm5
        movdqa  xmm4,XMMWORD[32+rsp]
        movdqa  xmm5,XMMWORD[48+rsp]
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm6,xmm14
        pslld   xmm14,12
        psrld   xmm6,20
        movdqa  xmm7,xmm15
        pslld   xmm15,12
        por     xmm14,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; xmm6 = rot24 mask
        por     xmm15,xmm7
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm7,xmm14
        pslld   xmm14,7
        psrld   xmm7,25
        movdqa  xmm6,xmm15
        pslld   xmm15,7
        por     xmm14,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; xmm7 = rot16 mask
        por     xmm15,xmm6
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm6,xmm13
        pslld   xmm13,12
        psrld   xmm6,20
        movdqa  xmm7,xmm14
        pslld   xmm14,12
        por     xmm13,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; xmm6 = rot24 mask
        por     xmm14,xmm7
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm7,xmm13
        pslld   xmm13,7
        psrld   xmm7,25
        movdqa  xmm6,xmm14
        pslld   xmm14,7
        por     xmm13,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; xmm7 = rot16 mask
        por     xmm14,xmm6
        movdqa  XMMWORD[32+rsp],xmm4    ; swap words 10,11 out, words 8,9 back in
        movdqa  XMMWORD[48+rsp],xmm5
        movdqa  xmm4,XMMWORD[rsp]
        movdqa  xmm5,XMMWORD[16+rsp]
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm6,xmm15
        pslld   xmm15,12
        psrld   xmm6,20
        movdqa  xmm7,xmm12
        pslld   xmm12,12
        por     xmm15,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]       ; xmm6 = rot24 mask
        por     xmm12,xmm7
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm7,xmm15
        pslld   xmm15,7
        psrld   xmm7,25
        movdqa  xmm6,xmm12
        pslld   xmm12,7
        por     xmm15,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]       ; xmm7 = rot16 mask
        por     xmm12,xmm6
        dec     eax
        jnz     NEAR $L$oop4x

        ; Feed-forward, then transpose each group of four word-registers so
        ; that every register holds 16 consecutive keystream bytes of one block.
        paddd   xmm8,XMMWORD[64+rsp]
        paddd   xmm9,XMMWORD[80+rsp]
        paddd   xmm10,XMMWORD[96+rsp]
        paddd   xmm11,XMMWORD[112+rsp]

        movdqa  xmm6,xmm8
        punpckldq       xmm8,xmm9
        movdqa  xmm7,xmm10
        punpckldq       xmm10,xmm11
        punpckhdq       xmm6,xmm9
        punpckhdq       xmm7,xmm11
        movdqa  xmm9,xmm8
        punpcklqdq      xmm8,xmm10
        movdqa  xmm11,xmm6
        punpcklqdq      xmm6,xmm7
        punpckhqdq      xmm9,xmm10
        punpckhqdq      xmm11,xmm7
        paddd   xmm12,XMMWORD[((128-256))+rcx]
        paddd   xmm13,XMMWORD[((144-256))+rcx]
        paddd   xmm14,XMMWORD[((160-256))+rcx]
        paddd   xmm15,XMMWORD[((176-256))+rcx]

        movdqa  XMMWORD[rsp],xmm8       ; row 0 of blocks 0,1 -> scratch
        movdqa  XMMWORD[16+rsp],xmm9
        movdqa  xmm8,XMMWORD[32+rsp]    ; reload spilled words 10,11
        movdqa  xmm9,XMMWORD[48+rsp]

        movdqa  xmm10,xmm12
        punpckldq       xmm12,xmm13
        movdqa  xmm7,xmm14
        punpckldq       xmm14,xmm15
        punpckhdq       xmm10,xmm13
        punpckhdq       xmm7,xmm15
        movdqa  xmm13,xmm12
        punpcklqdq      xmm12,xmm14
        movdqa  xmm15,xmm10
        punpcklqdq      xmm10,xmm7
        punpckhqdq      xmm13,xmm14
        punpckhqdq      xmm15,xmm7
        paddd   xmm4,XMMWORD[((192-256))+rcx]
        paddd   xmm5,XMMWORD[((208-256))+rcx]
        paddd   xmm8,XMMWORD[((224-256))+rcx]
        paddd   xmm9,XMMWORD[((240-256))+rcx]

        movdqa  XMMWORD[32+rsp],xmm6    ; row 0 of blocks 2,3 -> scratch
        movdqa  XMMWORD[48+rsp],xmm11

        movdqa  xmm14,xmm4
        punpckldq       xmm4,xmm5
        movdqa  xmm7,xmm8
        punpckldq       xmm8,xmm9
        punpckhdq       xmm14,xmm5
        punpckhdq       xmm7,xmm9
        movdqa  xmm5,xmm4
        punpcklqdq      xmm4,xmm8
        movdqa  xmm9,xmm14
        punpcklqdq      xmm14,xmm7
        punpckhqdq      xmm5,xmm8
        punpckhqdq      xmm9,xmm7
        paddd   xmm0,XMMWORD[((256-256))+rcx]
        paddd   xmm1,XMMWORD[((272-256))+rcx]
        paddd   xmm2,XMMWORD[((288-256))+rcx]
        paddd   xmm3,XMMWORD[((304-256))+rcx]

        movdqa  xmm8,xmm0
        punpckldq       xmm0,xmm1
        movdqa  xmm7,xmm2
        punpckldq       xmm2,xmm3
        punpckhdq       xmm8,xmm1
        punpckhdq       xmm7,xmm3
        movdqa  xmm1,xmm0
        punpcklqdq      xmm0,xmm2
        movdqa  xmm3,xmm8
        punpcklqdq      xmm8,xmm7
        punpckhqdq      xmm1,xmm2
        punpckhqdq      xmm3,xmm7
        ; Keystream now lives in (row0,row1,row2,row3):
        ;   block 0: [rsp],    xmm12, xmm4,  xmm0
        ;   block 1: [rsp+16], xmm13, xmm5,  xmm1
        ;   block 2: [rsp+32], xmm10, xmm14, xmm8
        ;   block 3: [rsp+48], xmm15, xmm9,  xmm3
        cmp     rdx,64*4
        jb      NEAR $L$tail4x

        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[48+rsp]
        pxor    xmm11,xmm15
        pxor    xmm2,xmm9
        pxor    xmm7,xmm3
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]

        sub     rdx,64*4
        jnz     NEAR $L$oop_outer4x

        jmp     NEAR $L$done4x

; Fewer than 256 bytes remain.  Whole 64-byte blocks are XORed directly;
; the keystream of the first unused block is laid out at [rsp+0..63] for
; the byte loop at $L$oop_tail4x (r10 = byte index, rdx = bytes left).
$L$tail4x:
        cmp     rdx,192
        jae     NEAR $L$192_or_more4x
        cmp     rdx,128
        jae     NEAR $L$128_or_more4x
        cmp     rdx,64
        jae     NEAR $L$64_or_more4x


        xor     r10,r10

        ; [rsp+0..15] already holds row 0 of block 0
        movdqa  XMMWORD[16+rsp],xmm12
        movdqa  XMMWORD[32+rsp],xmm4
        movdqa  XMMWORD[48+rsp],xmm0
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$64_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        je      NEAR $L$done4x          ; flags still from "cmp rdx,64": exactly 64 bytes

        movdqa  xmm6,XMMWORD[16+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm13
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm5
        sub     rdx,64
        movdqa  XMMWORD[48+rsp],xmm1
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$128_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        je      NEAR $L$done4x          ; flags still from "cmp rdx,128": exactly 128 bytes

        movdqa  xmm6,XMMWORD[32+rsp]
        lea     rsi,[128+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm10
        lea     rdi,[128+rdi]
        movdqa  XMMWORD[32+rsp],xmm14
        sub     rdx,128
        movdqa  XMMWORD[48+rsp],xmm8
        jmp     NEAR $L$oop_tail4x

ALIGN   32
$L$192_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        je      NEAR $L$done4x          ; flags still from "cmp rdx,192": exactly 192 bytes

        movdqa  xmm6,XMMWORD[48+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm15
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm9
        sub     rdx,192
        movdqa  XMMWORD[48+rsp],xmm3

$L$oop_tail4x:
        movzx   eax,BYTE[r10*1+rsi]
        movzx   ecx,BYTE[r10*1+rsp]
        lea     r10,[1+r10]
        xor     eax,ecx
        mov     BYTE[((-1))+r10*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail4x

$L$done4x:
        movaps  xmm6,XMMWORD[((-168))+r9]
        movaps  xmm7,XMMWORD[((-152))+r9]
        movaps  xmm8,XMMWORD[((-136))+r9]
        movaps  xmm9,XMMWORD[((-120))+r9]
        movaps  xmm10,XMMWORD[((-104))+r9]
        movaps  xmm11,XMMWORD[((-88))+r9]
        movaps  xmm12,XMMWORD[((-72))+r9]
        movaps  xmm13,XMMWORD[((-56))+r9]
        movaps  xmm14,XMMWORD[((-40))+r9]
        movaps  xmm15,XMMWORD[((-24))+r9]
        lea     rsp,[r9]
$L$4x_epilogue:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_4x:

ALIGN   32
1057ChaCha20_8x: 1058 mov QWORD[8+rsp],rdi ;WIN64 prologue 1059 mov QWORD[16+rsp],rsi 1060 mov rax,rsp 1061$L$SEH_begin_ChaCha20_8x: 1062 mov rdi,rcx 1063 mov rsi,rdx 1064 mov rdx,r8 1065 mov rcx,r9 1066 mov r8,QWORD[40+rsp] 1067 1068 1069$L$ChaCha20_8x: 1070 mov r9,rsp 1071 sub rsp,0x280+168 1072 and rsp,-32 1073 movaps XMMWORD[(-168)+r9],xmm6 1074 movaps XMMWORD[(-152)+r9],xmm7 1075 movaps XMMWORD[(-136)+r9],xmm8 1076 movaps XMMWORD[(-120)+r9],xmm9 1077 movaps XMMWORD[(-104)+r9],xmm10 1078 movaps XMMWORD[(-88)+r9],xmm11 1079 movaps XMMWORD[(-72)+r9],xmm12 1080 movaps XMMWORD[(-56)+r9],xmm13 1081 movaps XMMWORD[(-40)+r9],xmm14 1082 movaps XMMWORD[(-24)+r9],xmm15 1083$L$8x_body: 1084 vzeroupper 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 vbroadcasti128 ymm11,XMMWORD[$L$sigma] 1096 vbroadcasti128 ymm3,XMMWORD[rcx] 1097 vbroadcasti128 ymm15,XMMWORD[16+rcx] 1098 vbroadcasti128 ymm7,XMMWORD[r8] 1099 lea rcx,[256+rsp] 1100 lea rax,[512+rsp] 1101 lea r10,[$L$rot16] 1102 lea r11,[$L$rot24] 1103 1104 vpshufd ymm8,ymm11,0x00 1105 vpshufd ymm9,ymm11,0x55 1106 vmovdqa YMMWORD[(128-256)+rcx],ymm8 1107 vpshufd ymm10,ymm11,0xaa 1108 vmovdqa YMMWORD[(160-256)+rcx],ymm9 1109 vpshufd ymm11,ymm11,0xff 1110 vmovdqa YMMWORD[(192-256)+rcx],ymm10 1111 vmovdqa YMMWORD[(224-256)+rcx],ymm11 1112 1113 vpshufd ymm0,ymm3,0x00 1114 vpshufd ymm1,ymm3,0x55 1115 vmovdqa YMMWORD[(256-256)+rcx],ymm0 1116 vpshufd ymm2,ymm3,0xaa 1117 vmovdqa YMMWORD[(288-256)+rcx],ymm1 1118 vpshufd ymm3,ymm3,0xff 1119 vmovdqa YMMWORD[(320-256)+rcx],ymm2 1120 vmovdqa YMMWORD[(352-256)+rcx],ymm3 1121 1122 vpshufd ymm12,ymm15,0x00 1123 vpshufd ymm13,ymm15,0x55 1124 vmovdqa YMMWORD[(384-512)+rax],ymm12 1125 vpshufd ymm14,ymm15,0xaa 1126 vmovdqa YMMWORD[(416-512)+rax],ymm13 1127 vpshufd ymm15,ymm15,0xff 1128 vmovdqa YMMWORD[(448-512)+rax],ymm14 1129 vmovdqa YMMWORD[(480-512)+rax],ymm15 1130 1131 vpshufd ymm4,ymm7,0x00 1132 vpshufd ymm5,ymm7,0x55 1133 vpaddd ymm4,ymm4,YMMWORD[$L$incy] 1134 vpshufd ymm6,ymm7,0xaa 
1135 vmovdqa YMMWORD[(544-512)+rax],ymm5 1136 vpshufd ymm7,ymm7,0xff 1137 vmovdqa YMMWORD[(576-512)+rax],ymm6 1138 vmovdqa YMMWORD[(608-512)+rax],ymm7 1139 1140 jmp NEAR $L$oop_enter8x 1141 1142ALIGN 32 1143$L$oop_outer8x: 1144 vmovdqa ymm8,YMMWORD[((128-256))+rcx] 1145 vmovdqa ymm9,YMMWORD[((160-256))+rcx] 1146 vmovdqa ymm10,YMMWORD[((192-256))+rcx] 1147 vmovdqa ymm11,YMMWORD[((224-256))+rcx] 1148 vmovdqa ymm0,YMMWORD[((256-256))+rcx] 1149 vmovdqa ymm1,YMMWORD[((288-256))+rcx] 1150 vmovdqa ymm2,YMMWORD[((320-256))+rcx] 1151 vmovdqa ymm3,YMMWORD[((352-256))+rcx] 1152 vmovdqa ymm12,YMMWORD[((384-512))+rax] 1153 vmovdqa ymm13,YMMWORD[((416-512))+rax] 1154 vmovdqa ymm14,YMMWORD[((448-512))+rax] 1155 vmovdqa ymm15,YMMWORD[((480-512))+rax] 1156 vmovdqa ymm4,YMMWORD[((512-512))+rax] 1157 vmovdqa ymm5,YMMWORD[((544-512))+rax] 1158 vmovdqa ymm6,YMMWORD[((576-512))+rax] 1159 vmovdqa ymm7,YMMWORD[((608-512))+rax] 1160 vpaddd ymm4,ymm4,YMMWORD[$L$eight] 1161 1162$L$oop_enter8x: 1163 vmovdqa YMMWORD[64+rsp],ymm14 1164 vmovdqa YMMWORD[96+rsp],ymm15 1165 vbroadcasti128 ymm15,XMMWORD[r10] 1166 vmovdqa YMMWORD[(512-512)+rax],ymm4 1167 mov eax,10 1168 jmp NEAR $L$oop8x 1169 1170ALIGN 32 1171$L$oop8x: 1172 vpaddd ymm8,ymm8,ymm0 1173 vpxor ymm4,ymm8,ymm4 1174 vpshufb ymm4,ymm4,ymm15 1175 vpaddd ymm9,ymm9,ymm1 1176 vpxor ymm5,ymm9,ymm5 1177 vpshufb ymm5,ymm5,ymm15 1178 vpaddd ymm12,ymm12,ymm4 1179 vpxor ymm0,ymm12,ymm0 1180 vpslld ymm14,ymm0,12 1181 vpsrld ymm0,ymm0,20 1182 vpor ymm0,ymm14,ymm0 1183 vbroadcasti128 ymm14,XMMWORD[r11] 1184 vpaddd ymm13,ymm13,ymm5 1185 vpxor ymm1,ymm13,ymm1 1186 vpslld ymm15,ymm1,12 1187 vpsrld ymm1,ymm1,20 1188 vpor ymm1,ymm15,ymm1 1189 vpaddd ymm8,ymm8,ymm0 1190 vpxor ymm4,ymm8,ymm4 1191 vpshufb ymm4,ymm4,ymm14 1192 vpaddd ymm9,ymm9,ymm1 1193 vpxor ymm5,ymm9,ymm5 1194 vpshufb ymm5,ymm5,ymm14 1195 vpaddd ymm12,ymm12,ymm4 1196 vpxor ymm0,ymm12,ymm0 1197 vpslld ymm15,ymm0,7 1198 vpsrld ymm0,ymm0,25 1199 vpor ymm0,ymm15,ymm0 1200 vbroadcasti128 
ymm15,XMMWORD[r10] 1201 vpaddd ymm13,ymm13,ymm5 1202 vpxor ymm1,ymm13,ymm1 1203 vpslld ymm14,ymm1,7 1204 vpsrld ymm1,ymm1,25 1205 vpor ymm1,ymm14,ymm1 1206 vmovdqa YMMWORD[rsp],ymm12 1207 vmovdqa YMMWORD[32+rsp],ymm13 1208 vmovdqa ymm12,YMMWORD[64+rsp] 1209 vmovdqa ymm13,YMMWORD[96+rsp] 1210 vpaddd ymm10,ymm10,ymm2 1211 vpxor ymm6,ymm10,ymm6 1212 vpshufb ymm6,ymm6,ymm15 1213 vpaddd ymm11,ymm11,ymm3 1214 vpxor ymm7,ymm11,ymm7 1215 vpshufb ymm7,ymm7,ymm15 1216 vpaddd ymm12,ymm12,ymm6 1217 vpxor ymm2,ymm12,ymm2 1218 vpslld ymm14,ymm2,12 1219 vpsrld ymm2,ymm2,20 1220 vpor ymm2,ymm14,ymm2 1221 vbroadcasti128 ymm14,XMMWORD[r11] 1222 vpaddd ymm13,ymm13,ymm7 1223 vpxor ymm3,ymm13,ymm3 1224 vpslld ymm15,ymm3,12 1225 vpsrld ymm3,ymm3,20 1226 vpor ymm3,ymm15,ymm3 1227 vpaddd ymm10,ymm10,ymm2 1228 vpxor ymm6,ymm10,ymm6 1229 vpshufb ymm6,ymm6,ymm14 1230 vpaddd ymm11,ymm11,ymm3 1231 vpxor ymm7,ymm11,ymm7 1232 vpshufb ymm7,ymm7,ymm14 1233 vpaddd ymm12,ymm12,ymm6 1234 vpxor ymm2,ymm12,ymm2 1235 vpslld ymm15,ymm2,7 1236 vpsrld ymm2,ymm2,25 1237 vpor ymm2,ymm15,ymm2 1238 vbroadcasti128 ymm15,XMMWORD[r10] 1239 vpaddd ymm13,ymm13,ymm7 1240 vpxor ymm3,ymm13,ymm3 1241 vpslld ymm14,ymm3,7 1242 vpsrld ymm3,ymm3,25 1243 vpor ymm3,ymm14,ymm3 1244 vpaddd ymm8,ymm8,ymm1 1245 vpxor ymm7,ymm8,ymm7 1246 vpshufb ymm7,ymm7,ymm15 1247 vpaddd ymm9,ymm9,ymm2 1248 vpxor ymm4,ymm9,ymm4 1249 vpshufb ymm4,ymm4,ymm15 1250 vpaddd ymm12,ymm12,ymm7 1251 vpxor ymm1,ymm12,ymm1 1252 vpslld ymm14,ymm1,12 1253 vpsrld ymm1,ymm1,20 1254 vpor ymm1,ymm14,ymm1 1255 vbroadcasti128 ymm14,XMMWORD[r11] 1256 vpaddd ymm13,ymm13,ymm4 1257 vpxor ymm2,ymm13,ymm2 1258 vpslld ymm15,ymm2,12 1259 vpsrld ymm2,ymm2,20 1260 vpor ymm2,ymm15,ymm2 1261 vpaddd ymm8,ymm8,ymm1 1262 vpxor ymm7,ymm8,ymm7 1263 vpshufb ymm7,ymm7,ymm14 1264 vpaddd ymm9,ymm9,ymm2 1265 vpxor ymm4,ymm9,ymm4 1266 vpshufb ymm4,ymm4,ymm14 1267 vpaddd ymm12,ymm12,ymm7 1268 vpxor ymm1,ymm12,ymm1 1269 vpslld ymm15,ymm1,7 1270 vpsrld ymm1,ymm1,25 1271 vpor 
ymm1,ymm15,ymm1 1272 vbroadcasti128 ymm15,XMMWORD[r10] 1273 vpaddd ymm13,ymm13,ymm4 1274 vpxor ymm2,ymm13,ymm2 1275 vpslld ymm14,ymm2,7 1276 vpsrld ymm2,ymm2,25 1277 vpor ymm2,ymm14,ymm2 1278 vmovdqa YMMWORD[64+rsp],ymm12 1279 vmovdqa YMMWORD[96+rsp],ymm13 1280 vmovdqa ymm12,YMMWORD[rsp] 1281 vmovdqa ymm13,YMMWORD[32+rsp] 1282 vpaddd ymm10,ymm10,ymm3 1283 vpxor ymm5,ymm10,ymm5 1284 vpshufb ymm5,ymm5,ymm15 1285 vpaddd ymm11,ymm11,ymm0 1286 vpxor ymm6,ymm11,ymm6 1287 vpshufb ymm6,ymm6,ymm15 1288 vpaddd ymm12,ymm12,ymm5 1289 vpxor ymm3,ymm12,ymm3 1290 vpslld ymm14,ymm3,12 1291 vpsrld ymm3,ymm3,20 1292 vpor ymm3,ymm14,ymm3 1293 vbroadcasti128 ymm14,XMMWORD[r11] 1294 vpaddd ymm13,ymm13,ymm6 1295 vpxor ymm0,ymm13,ymm0 1296 vpslld ymm15,ymm0,12 1297 vpsrld ymm0,ymm0,20 1298 vpor ymm0,ymm15,ymm0 1299 vpaddd ymm10,ymm10,ymm3 1300 vpxor ymm5,ymm10,ymm5 1301 vpshufb ymm5,ymm5,ymm14 1302 vpaddd ymm11,ymm11,ymm0 1303 vpxor ymm6,ymm11,ymm6 1304 vpshufb ymm6,ymm6,ymm14 1305 vpaddd ymm12,ymm12,ymm5 1306 vpxor ymm3,ymm12,ymm3 1307 vpslld ymm15,ymm3,7 1308 vpsrld ymm3,ymm3,25 1309 vpor ymm3,ymm15,ymm3 1310 vbroadcasti128 ymm15,XMMWORD[r10] 1311 vpaddd ymm13,ymm13,ymm6 1312 vpxor ymm0,ymm13,ymm0 1313 vpslld ymm14,ymm0,7 1314 vpsrld ymm0,ymm0,25 1315 vpor ymm0,ymm14,ymm0 1316 dec eax 1317 jnz NEAR $L$oop8x 1318 1319 lea rax,[512+rsp] 1320 vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] 1321 vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] 1322 vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] 1323 vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] 1324 1325 vpunpckldq ymm14,ymm8,ymm9 1326 vpunpckldq ymm15,ymm10,ymm11 1327 vpunpckhdq ymm8,ymm8,ymm9 1328 vpunpckhdq ymm10,ymm10,ymm11 1329 vpunpcklqdq ymm9,ymm14,ymm15 1330 vpunpckhqdq ymm14,ymm14,ymm15 1331 vpunpcklqdq ymm11,ymm8,ymm10 1332 vpunpckhqdq ymm8,ymm8,ymm10 1333 vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] 1334 vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] 1335 vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] 1336 vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] 1337 1338 
vpunpckldq ymm10,ymm0,ymm1 1339 vpunpckldq ymm15,ymm2,ymm3 1340 vpunpckhdq ymm0,ymm0,ymm1 1341 vpunpckhdq ymm2,ymm2,ymm3 1342 vpunpcklqdq ymm1,ymm10,ymm15 1343 vpunpckhqdq ymm10,ymm10,ymm15 1344 vpunpcklqdq ymm3,ymm0,ymm2 1345 vpunpckhqdq ymm0,ymm0,ymm2 1346 vperm2i128 ymm15,ymm9,ymm1,0x20 1347 vperm2i128 ymm1,ymm9,ymm1,0x31 1348 vperm2i128 ymm9,ymm14,ymm10,0x20 1349 vperm2i128 ymm10,ymm14,ymm10,0x31 1350 vperm2i128 ymm14,ymm11,ymm3,0x20 1351 vperm2i128 ymm3,ymm11,ymm3,0x31 1352 vperm2i128 ymm11,ymm8,ymm0,0x20 1353 vperm2i128 ymm0,ymm8,ymm0,0x31 1354 vmovdqa YMMWORD[rsp],ymm15 1355 vmovdqa YMMWORD[32+rsp],ymm9 1356 vmovdqa ymm15,YMMWORD[64+rsp] 1357 vmovdqa ymm9,YMMWORD[96+rsp] 1358 1359 vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] 1360 vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] 1361 vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] 1362 vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] 1363 1364 vpunpckldq ymm2,ymm12,ymm13 1365 vpunpckldq ymm8,ymm15,ymm9 1366 vpunpckhdq ymm12,ymm12,ymm13 1367 vpunpckhdq ymm15,ymm15,ymm9 1368 vpunpcklqdq ymm13,ymm2,ymm8 1369 vpunpckhqdq ymm2,ymm2,ymm8 1370 vpunpcklqdq ymm9,ymm12,ymm15 1371 vpunpckhqdq ymm12,ymm12,ymm15 1372 vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] 1373 vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] 1374 vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] 1375 vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] 1376 1377 vpunpckldq ymm15,ymm4,ymm5 1378 vpunpckldq ymm8,ymm6,ymm7 1379 vpunpckhdq ymm4,ymm4,ymm5 1380 vpunpckhdq ymm6,ymm6,ymm7 1381 vpunpcklqdq ymm5,ymm15,ymm8 1382 vpunpckhqdq ymm15,ymm15,ymm8 1383 vpunpcklqdq ymm7,ymm4,ymm6 1384 vpunpckhqdq ymm4,ymm4,ymm6 1385 vperm2i128 ymm8,ymm13,ymm5,0x20 1386 vperm2i128 ymm5,ymm13,ymm5,0x31 1387 vperm2i128 ymm13,ymm2,ymm15,0x20 1388 vperm2i128 ymm15,ymm2,ymm15,0x31 1389 vperm2i128 ymm2,ymm9,ymm7,0x20 1390 vperm2i128 ymm7,ymm9,ymm7,0x31 1391 vperm2i128 ymm9,ymm12,ymm4,0x20 1392 vperm2i128 ymm4,ymm12,ymm4,0x31 1393 vmovdqa ymm6,YMMWORD[rsp] 1394 vmovdqa ymm12,YMMWORD[32+rsp] 1395 1396 cmp rdx,64*8 1397 jb 
NEAR $L$tail8x 1398 1399 vpxor ymm6,ymm6,YMMWORD[rsi] 1400 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1401 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1402 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1403 lea rsi,[128+rsi] 1404 vmovdqu YMMWORD[rdi],ymm6 1405 vmovdqu YMMWORD[32+rdi],ymm8 1406 vmovdqu YMMWORD[64+rdi],ymm1 1407 vmovdqu YMMWORD[96+rdi],ymm5 1408 lea rdi,[128+rdi] 1409 1410 vpxor ymm12,ymm12,YMMWORD[rsi] 1411 vpxor ymm13,ymm13,YMMWORD[32+rsi] 1412 vpxor ymm10,ymm10,YMMWORD[64+rsi] 1413 vpxor ymm15,ymm15,YMMWORD[96+rsi] 1414 lea rsi,[128+rsi] 1415 vmovdqu YMMWORD[rdi],ymm12 1416 vmovdqu YMMWORD[32+rdi],ymm13 1417 vmovdqu YMMWORD[64+rdi],ymm10 1418 vmovdqu YMMWORD[96+rdi],ymm15 1419 lea rdi,[128+rdi] 1420 1421 vpxor ymm14,ymm14,YMMWORD[rsi] 1422 vpxor ymm2,ymm2,YMMWORD[32+rsi] 1423 vpxor ymm3,ymm3,YMMWORD[64+rsi] 1424 vpxor ymm7,ymm7,YMMWORD[96+rsi] 1425 lea rsi,[128+rsi] 1426 vmovdqu YMMWORD[rdi],ymm14 1427 vmovdqu YMMWORD[32+rdi],ymm2 1428 vmovdqu YMMWORD[64+rdi],ymm3 1429 vmovdqu YMMWORD[96+rdi],ymm7 1430 lea rdi,[128+rdi] 1431 1432 vpxor ymm11,ymm11,YMMWORD[rsi] 1433 vpxor ymm9,ymm9,YMMWORD[32+rsi] 1434 vpxor ymm0,ymm0,YMMWORD[64+rsi] 1435 vpxor ymm4,ymm4,YMMWORD[96+rsi] 1436 lea rsi,[128+rsi] 1437 vmovdqu YMMWORD[rdi],ymm11 1438 vmovdqu YMMWORD[32+rdi],ymm9 1439 vmovdqu YMMWORD[64+rdi],ymm0 1440 vmovdqu YMMWORD[96+rdi],ymm4 1441 lea rdi,[128+rdi] 1442 1443 sub rdx,64*8 1444 jnz NEAR $L$oop_outer8x 1445 1446 jmp NEAR $L$done8x 1447 1448$L$tail8x: 1449 cmp rdx,448 1450 jae NEAR $L$448_or_more8x 1451 cmp rdx,384 1452 jae NEAR $L$384_or_more8x 1453 cmp rdx,320 1454 jae NEAR $L$320_or_more8x 1455 cmp rdx,256 1456 jae NEAR $L$256_or_more8x 1457 cmp rdx,192 1458 jae NEAR $L$192_or_more8x 1459 cmp rdx,128 1460 jae NEAR $L$128_or_more8x 1461 cmp rdx,64 1462 jae NEAR $L$64_or_more8x 1463 1464 xor r10,r10 1465 vmovdqa YMMWORD[rsp],ymm6 1466 vmovdqa YMMWORD[32+rsp],ymm8 1467 jmp NEAR $L$oop_tail8x 1468 1469ALIGN 32 1470$L$64_or_more8x: 1471 vpxor ymm6,ymm6,YMMWORD[rsi] 1472 vpxor 
ymm8,ymm8,YMMWORD[32+rsi] 1473 vmovdqu YMMWORD[rdi],ymm6 1474 vmovdqu YMMWORD[32+rdi],ymm8 1475 je NEAR $L$done8x 1476 1477 lea rsi,[64+rsi] 1478 xor r10,r10 1479 vmovdqa YMMWORD[rsp],ymm1 1480 lea rdi,[64+rdi] 1481 sub rdx,64 1482 vmovdqa YMMWORD[32+rsp],ymm5 1483 jmp NEAR $L$oop_tail8x 1484 1485ALIGN 32 1486$L$128_or_more8x: 1487 vpxor ymm6,ymm6,YMMWORD[rsi] 1488 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1489 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1490 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1491 vmovdqu YMMWORD[rdi],ymm6 1492 vmovdqu YMMWORD[32+rdi],ymm8 1493 vmovdqu YMMWORD[64+rdi],ymm1 1494 vmovdqu YMMWORD[96+rdi],ymm5 1495 je NEAR $L$done8x 1496 1497 lea rsi,[128+rsi] 1498 xor r10,r10 1499 vmovdqa YMMWORD[rsp],ymm12 1500 lea rdi,[128+rdi] 1501 sub rdx,128 1502 vmovdqa YMMWORD[32+rsp],ymm13 1503 jmp NEAR $L$oop_tail8x 1504 1505ALIGN 32 1506$L$192_or_more8x: 1507 vpxor ymm6,ymm6,YMMWORD[rsi] 1508 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1509 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1510 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1511 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1512 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1513 vmovdqu YMMWORD[rdi],ymm6 1514 vmovdqu YMMWORD[32+rdi],ymm8 1515 vmovdqu YMMWORD[64+rdi],ymm1 1516 vmovdqu YMMWORD[96+rdi],ymm5 1517 vmovdqu YMMWORD[128+rdi],ymm12 1518 vmovdqu YMMWORD[160+rdi],ymm13 1519 je NEAR $L$done8x 1520 1521 lea rsi,[192+rsi] 1522 xor r10,r10 1523 vmovdqa YMMWORD[rsp],ymm10 1524 lea rdi,[192+rdi] 1525 sub rdx,192 1526 vmovdqa YMMWORD[32+rsp],ymm15 1527 jmp NEAR $L$oop_tail8x 1528 1529ALIGN 32 1530$L$256_or_more8x: 1531 vpxor ymm6,ymm6,YMMWORD[rsi] 1532 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1533 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1534 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1535 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1536 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1537 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1538 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1539 vmovdqu YMMWORD[rdi],ymm6 1540 vmovdqu YMMWORD[32+rdi],ymm8 1541 vmovdqu YMMWORD[64+rdi],ymm1 1542 vmovdqu YMMWORD[96+rdi],ymm5 1543 vmovdqu YMMWORD[128+rdi],ymm12 
1544 vmovdqu YMMWORD[160+rdi],ymm13 1545 vmovdqu YMMWORD[192+rdi],ymm10 1546 vmovdqu YMMWORD[224+rdi],ymm15 1547 je NEAR $L$done8x 1548 1549 lea rsi,[256+rsi] 1550 xor r10,r10 1551 vmovdqa YMMWORD[rsp],ymm14 1552 lea rdi,[256+rdi] 1553 sub rdx,256 1554 vmovdqa YMMWORD[32+rsp],ymm2 1555 jmp NEAR $L$oop_tail8x 1556 1557ALIGN 32 1558$L$320_or_more8x: 1559 vpxor ymm6,ymm6,YMMWORD[rsi] 1560 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1561 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1562 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1563 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1564 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1565 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1566 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1567 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1568 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1569 vmovdqu YMMWORD[rdi],ymm6 1570 vmovdqu YMMWORD[32+rdi],ymm8 1571 vmovdqu YMMWORD[64+rdi],ymm1 1572 vmovdqu YMMWORD[96+rdi],ymm5 1573 vmovdqu YMMWORD[128+rdi],ymm12 1574 vmovdqu YMMWORD[160+rdi],ymm13 1575 vmovdqu YMMWORD[192+rdi],ymm10 1576 vmovdqu YMMWORD[224+rdi],ymm15 1577 vmovdqu YMMWORD[256+rdi],ymm14 1578 vmovdqu YMMWORD[288+rdi],ymm2 1579 je NEAR $L$done8x 1580 1581 lea rsi,[320+rsi] 1582 xor r10,r10 1583 vmovdqa YMMWORD[rsp],ymm3 1584 lea rdi,[320+rdi] 1585 sub rdx,320 1586 vmovdqa YMMWORD[32+rsp],ymm7 1587 jmp NEAR $L$oop_tail8x 1588 1589ALIGN 32 1590$L$384_or_more8x: 1591 vpxor ymm6,ymm6,YMMWORD[rsi] 1592 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1593 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1594 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1595 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1596 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1597 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1598 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1599 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1600 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1601 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1602 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1603 vmovdqu YMMWORD[rdi],ymm6 1604 vmovdqu YMMWORD[32+rdi],ymm8 1605 vmovdqu YMMWORD[64+rdi],ymm1 1606 vmovdqu YMMWORD[96+rdi],ymm5 1607 vmovdqu YMMWORD[128+rdi],ymm12 1608 vmovdqu YMMWORD[160+rdi],ymm13 1609 vmovdqu 
YMMWORD[192+rdi],ymm10 1610 vmovdqu YMMWORD[224+rdi],ymm15 1611 vmovdqu YMMWORD[256+rdi],ymm14 1612 vmovdqu YMMWORD[288+rdi],ymm2 1613 vmovdqu YMMWORD[320+rdi],ymm3 1614 vmovdqu YMMWORD[352+rdi],ymm7 1615 je NEAR $L$done8x 1616 1617 lea rsi,[384+rsi] 1618 xor r10,r10 1619 vmovdqa YMMWORD[rsp],ymm11 1620 lea rdi,[384+rdi] 1621 sub rdx,384 1622 vmovdqa YMMWORD[32+rsp],ymm9 1623 jmp NEAR $L$oop_tail8x 1624 1625ALIGN 32 1626$L$448_or_more8x: 1627 vpxor ymm6,ymm6,YMMWORD[rsi] 1628 vpxor ymm8,ymm8,YMMWORD[32+rsi] 1629 vpxor ymm1,ymm1,YMMWORD[64+rsi] 1630 vpxor ymm5,ymm5,YMMWORD[96+rsi] 1631 vpxor ymm12,ymm12,YMMWORD[128+rsi] 1632 vpxor ymm13,ymm13,YMMWORD[160+rsi] 1633 vpxor ymm10,ymm10,YMMWORD[192+rsi] 1634 vpxor ymm15,ymm15,YMMWORD[224+rsi] 1635 vpxor ymm14,ymm14,YMMWORD[256+rsi] 1636 vpxor ymm2,ymm2,YMMWORD[288+rsi] 1637 vpxor ymm3,ymm3,YMMWORD[320+rsi] 1638 vpxor ymm7,ymm7,YMMWORD[352+rsi] 1639 vpxor ymm11,ymm11,YMMWORD[384+rsi] 1640 vpxor ymm9,ymm9,YMMWORD[416+rsi] 1641 vmovdqu YMMWORD[rdi],ymm6 1642 vmovdqu YMMWORD[32+rdi],ymm8 1643 vmovdqu YMMWORD[64+rdi],ymm1 1644 vmovdqu YMMWORD[96+rdi],ymm5 1645 vmovdqu YMMWORD[128+rdi],ymm12 1646 vmovdqu YMMWORD[160+rdi],ymm13 1647 vmovdqu YMMWORD[192+rdi],ymm10 1648 vmovdqu YMMWORD[224+rdi],ymm15 1649 vmovdqu YMMWORD[256+rdi],ymm14 1650 vmovdqu YMMWORD[288+rdi],ymm2 1651 vmovdqu YMMWORD[320+rdi],ymm3 1652 vmovdqu YMMWORD[352+rdi],ymm7 1653 vmovdqu YMMWORD[384+rdi],ymm11 1654 vmovdqu YMMWORD[416+rdi],ymm9 1655 je NEAR $L$done8x 1656 1657 lea rsi,[448+rsi] 1658 xor r10,r10 1659 vmovdqa YMMWORD[rsp],ymm0 1660 lea rdi,[448+rdi] 1661 sub rdx,448 1662 vmovdqa YMMWORD[32+rsp],ymm4 1663 1664$L$oop_tail8x: 1665 movzx eax,BYTE[r10*1+rsi] 1666 movzx ecx,BYTE[r10*1+rsp] 1667 lea r10,[1+r10] 1668 xor eax,ecx 1669 mov BYTE[((-1))+r10*1+rdi],al 1670 dec rdx 1671 jnz NEAR $L$oop_tail8x 1672 1673$L$done8x: 1674 vzeroall 1675 movaps xmm6,XMMWORD[((-168))+r9] 1676 movaps xmm7,XMMWORD[((-152))+r9] 1677 movaps xmm8,XMMWORD[((-136))+r9] 1678 
; ---- remainder of the ChaCha20_8x exit path (the routine starts earlier) ----
; Reload xmm9..xmm15 from their slots below the address held in r9, then
; make r9 the stack pointer again.
        movaps  xmm9,XMMWORD[r9-120]
        movaps  xmm10,XMMWORD[r9-104]
        movaps  xmm11,XMMWORD[r9-88]
        movaps  xmm12,XMMWORD[r9-72]
        movaps  xmm13,XMMWORD[r9-56]
        movaps  xmm14,XMMWORD[r9-40]
        movaps  xmm15,XMMWORD[r9-24]
        lea     rsp,[r9]
$L$8x_epilogue:
        mov     rdi,QWORD[rsp+8]        ;WIN64 epilogue
        mov     rsi,QWORD[rsp+16]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_8x:
EXTERN  __imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; se_handler
;   Win64 SEH language handler registered for ChaCha20_ctr32 (see
;   $L$SEH_info_ChaCha20_ctr32 in .xdata below).
;
;   In:   r8 = CONTEXT of the interrupted frame
;         r9 = DISPATCHER_CONTEXT
;         (numeric offsets are the Windows x64 CONTEXT /
;          DISPATCHER_CONTEXT field offsets; field names in the comments
;          follow winnt.h)
;   Out:  eax = 1 after RtlVirtualUnwind has been called.
;
;   If the faulting Rip lies in [$L$ctr32_body, $L$no_data) the six
;   pushed GPRs are read back from the frame and written into CONTEXT.
;   $L$common_seh_tail is shared: ssse3_handler and full_handler jump
;   into it after doing their own fix-ups.
;-----------------------------------------------------------------------
ALIGN   16
se_handler:
        ; Preserve every register this handler touches, plus flags.
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64                  ; home space + 4 stack args for the call below

        mov     rax,QWORD[r8+120]       ; context->Rax
        mov     rbx,QWORD[r8+248]       ; context->Rip

        mov     rsi,QWORD[r9+8]         ; disp->ImageBase   (not consulted on this path)
        mov     r11,QWORD[r9+56]        ; disp->HandlerData (not consulted on this path)

        lea     r10,[$L$ctr32_body]
        cmp     rbx,r10                 ; Rip below the body label?
        jb      NEAR $L$common_seh_tail ;   yes: keep rax = context->Rax

        mov     rax,QWORD[r8+152]       ; context->Rsp

        lea     r10,[$L$no_data]
        cmp     rbx,r10                 ; Rip at or past $L$no_data?
        jae     NEAR $L$common_seh_tail ;   yes: keep rax = context->Rsp

        ; Inside the body: step over the 64+24 bytes of locals and the
        ; six 8-byte pushes made by ChaCha20_ctr32's prologue.
        lea     rax,[rax+64+24+48]

        mov     rbx,QWORD[rax-8]
        mov     rbp,QWORD[rax-16]
        mov     r12,QWORD[rax-24]
        mov     r13,QWORD[rax-32]
        mov     r14,QWORD[rax-40]
        mov     r15,QWORD[rax-48]
        mov     QWORD[r8+144],rbx       ; context->Rbx
        mov     QWORD[r8+160],rbp       ; context->Rbp
        mov     QWORD[r8+216],r12       ; context->R12
        mov     QWORD[r8+224],r13       ; context->R13
        mov     QWORD[r8+232],r14       ; context->R14
        mov     QWORD[r8+240],r15       ; context->R15

$L$common_seh_tail:
        ; rax = frame address chosen above; the routines in this file
        ; park rdi/rsi at [8+rsp]/[16+rsp] on entry (";WIN64 prologue").
        mov     rdi,QWORD[rax+8]
        mov     rsi,QWORD[rax+16]
        mov     QWORD[r8+152],rax       ; context->Rsp
        mov     QWORD[r8+168],rsi       ; context->Rsi
        mov     QWORD[r8+176],rdi       ; context->Rdi

        ; Copy 154 qwords from the CONTEXT at r8 to disp->ContextRecord.
        mov     rdi,QWORD[r9+40]        ; disp->ContextRecord
        mov     rsi,r8
        mov     ecx,154
        cld                             ; cld + rep movsq encode as FC F3 48 A5
        rep movsq

        ; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
        ;                  ContextRecord, &HandlerData, &EstablisherFrame, 0)
        mov     rsi,r9                  ; rsi = DISPATCHER_CONTEXT
        xor     rcx,rcx                 ; arg1 = 0
        mov     rdx,QWORD[rsi+8]        ; arg2 = disp->ImageBase
        mov     r8,QWORD[rsi]           ; arg3 = disp->ControlPc
        mov     r9,QWORD[rsi+16]        ; arg4 = disp->FunctionEntry
        mov     r10,QWORD[rsi+40]       ; disp->ContextRecord
        lea     r11,[rsi+56]            ; &disp->HandlerData
        lea     r12,[rsi+24]            ; &disp->EstablisherFrame
        mov     QWORD[rsp+32],r10       ; arg5
        mov     QWORD[rsp+40],r11       ; arg6
        mov     QWORD[rsp+48],r12       ; arg7
        mov     QWORD[rsp+56],rcx       ; arg8 = 0
        call    QWORD[__imp_RtlVirtualUnwind]

        mov     eax,1                   ; handler return value
        add     rsp,64
        popfq
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB      0F3h,0C3h               ;repret


;-----------------------------------------------------------------------
; ssse3_handler
;   SEH language handler registered for ChaCha20_ssse3.  HandlerData
;   holds two image-relative offsets (body label, epilogue label; see
;   $L$SEH_info_ChaCha20_ssse3).  When Rip falls between them, 4 qwords
;   starting 40 bytes below the address in context->R9 are copied to
;   CONTEXT+512 (the Xmm6 slot).  Always finishes in se_handler's
;   $L$common_seh_tail.
;
;   In:   r8 = CONTEXT, r9 = DISPATCHER_CONTEXT
;-----------------------------------------------------------------------
ALIGN   16
ssse3_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64

        mov     rax,QWORD[r8+120]       ; context->Rax
        mov     rbx,QWORD[r8+248]       ; context->Rip

        mov     rsi,QWORD[r9+8]         ; disp->ImageBase
        mov     r11,QWORD[r9+56]        ; disp->HandlerData

        mov     r10d,DWORD[r11]         ; HandlerData[0]: body label offset
        lea     r10,[rsi+r10*1]         ; ... made absolute
        cmp     rbx,r10
        jb      NEAR $L$common_seh_tail ; Rip before the body

        mov     rax,QWORD[r8+192]       ; context->R9

        mov     r10d,DWORD[r11+4]       ; HandlerData[1]: epilogue label offset
        lea     r10,[rsi+r10*1]
        cmp     rbx,r10
        jae     NEAR $L$common_seh_tail ; Rip at or past the epilogue

        lea     rsi,[rax-40]            ; source: 40 bytes below the R9 value
        lea     rdi,[r8+512]            ; destination: CONTEXT+512
        mov     ecx,4                   ; 4 qwords = 32 bytes
        cld                             ; cld + rep movsq encode as FC F3 48 A5
        rep movsq

        jmp     NEAR $L$common_seh_tail


;-----------------------------------------------------------------------
; full_handler
;   SEH language handler shared by ChaCha20_4x and ChaCha20_8x (see
;   their .xdata records).  Same flow as ssse3_handler, but it copies
;   20 qwords starting 168 bytes below the address in context->R9 --
;   the region the 8x exit path above reloads xmm6..xmm15 from -- to
;   CONTEXT+512.
;
;   In:   r8 = CONTEXT, r9 = DISPATCHER_CONTEXT
;-----------------------------------------------------------------------
ALIGN   16
full_handler:
        push    rsi
        push    rdi
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        pushfq
        sub     rsp,64

        mov     rax,QWORD[r8+120]       ; context->Rax
        mov     rbx,QWORD[r8+248]       ; context->Rip

        mov     rsi,QWORD[r9+8]         ; disp->ImageBase
        mov     r11,QWORD[r9+56]        ; disp->HandlerData

        mov     r10d,DWORD[r11]         ; HandlerData[0]: body label offset
        lea     r10,[rsi+r10*1]
        cmp     rbx,r10
        jb      NEAR $L$common_seh_tail ; Rip before the body

        mov     rax,QWORD[r8+192]       ; context->R9

        mov     r10d,DWORD[r11+4]       ; HandlerData[1]: epilogue label offset
        lea     r10,[rsi+r10*1]
        cmp     rbx,r10
        jae     NEAR $L$common_seh_tail ; Rip at or past the epilogue

        lea     rsi,[rax-168]           ; source: 168 bytes below the R9 value
        lea     rdi,[r8+512]            ; destination: CONTEXT+512
        mov     ecx,20                  ; 20 qwords = 160 bytes
        cld                             ; cld + rep movsq encode as FC F3 48 A5
        rep movsq

        jmp     NEAR $L$common_seh_tail


; Function table: one (begin, end, unwind-info) triple of image-relative
; addresses per routine.
section .pdata rdata align=4
ALIGN   4
        DD      $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
        DD      $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
        DD      $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase

        DD      $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
        DD      $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
        DD      $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase

        DD      $L$SEH_begin_ChaCha20_4x wrt ..imagebase
        DD      $L$SEH_end_ChaCha20_4x wrt ..imagebase
        DD      $L$SEH_info_ChaCha20_4x wrt ..imagebase

        DD      $L$SEH_begin_ChaCha20_8x wrt ..imagebase
        DD      $L$SEH_end_ChaCha20_8x wrt ..imagebase
        DD      $L$SEH_info_ChaCha20_8x wrt ..imagebase

; Unwind-info records: a 4-byte header (9,0,0,0), the handler's
; image-relative address, then -- for all but ctr32 -- the body/epilogue
; label pair that the handler reads through disp->HandlerData.
section .xdata rdata align=8
ALIGN   8
$L$SEH_info_ChaCha20_ctr32:
        DB      9,0,0,0
        DD      se_handler wrt ..imagebase

$L$SEH_info_ChaCha20_ssse3:
        DB      9,0,0,0
        DD      ssse3_handler wrt ..imagebase
        DD      $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase

$L$SEH_info_ChaCha20_4x:
        DB      9,0,0,0
        DD      full_handler wrt ..imagebase
        DD      $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase

$L$SEH_info_ChaCha20_8x:
        DB      9,0,0,0
        DD      full_handler wrt ..imagebase
        DD      $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase