#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
# ECB   4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR   5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC   4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM   5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB   5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB   5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead.
# This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible?
# It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.74
# CTR                                   1.14    0.91    0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this.
# Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS     ECB     OCB
# Westmere      3.77/1.25       1.25    1.25    1.26
# * Bridge      5.07/0.74       0.75    0.90    0.85    0.98
# Haswell       4.44/0.63       0.63    0.73    0.63    0.70
# Skylake       2.62/0.63       0.63    0.63    0.63
# Silvermont    5.75/3.54       3.56    4.12    3.87(*) 4.11
# Goldmont      3.82/1.26       1.26    1.29    1.29    1.50
# Bulldozer     5.77/0.70       0.72    0.90    0.70    0.95
#
# (*)   Atom Silvermont ECB result is suboptimal because of penalties
#       incurred by operations on %xmm8-15.
# As ECB is not considered
# critical, nothing was done to mitigate the problem.

# Prefix of the public symbols emitted by this script.
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". A lone argument containing a dot
# is taken to be the output file name, in which case flavour is unset.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected by [nm]asm/mingw64 flavour or by
# .asm output file suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl, first next to this script, then in perlasm
# directory relative to it. $dir is explicitly empty when $0 carries no
# directory component, so that a stale $1 can never leak into the path.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Whatever is printed to STDOUT from now on is piped through the
# translator. Bail out if the pipe can't be established, otherwise the
# script would seem to succeed while producing no output at all.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys; both alternatives are currently
# "movups".
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# Registers carrying first four integer arguments.
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# $code accumulates the assembly text generated by the rest of the script.
$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that $in2..$in0 and $iv alias $inout4..$inout7.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $loop_serial;	# bumped on every call to keep inlined loop labels unique

# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends single-block
# cipher sequence to $code. $p is "enc" or "dec", $key and $rounds name
# the registers holding key schedule pointer and round counter [both
# are modified by the emitted code], $inout is the block register
# [$inout0 if omitted]. If $ivec is passed, round key 0 is xor-ed into
# it and the result into $inout, otherwise round key 0 is xor-ed into
# $inout directly.
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;

$inout=$inout0 unless (defined($inout));
my $label=".Loop_${p}1_".(++$loop_serial);

$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
if (defined($ivec)) {
$code.=<<___;
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
} else {
$code.=<<___;
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
}
$code.=<<___;
${label}:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	$label	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
{ my ($inp,$out,$key) = @_4args;

# Both entry points load the block and key->rounds the same way and
# wipe the registers the same way on return, so the two snippets are
# composed once and used twice.
my $load_block=<<___;
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
my $store_and_wipe=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
___

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
___
$code.=$load_block;
aesni_generate1("enc",$key,$rounds);
$code.=$store_and_wipe;
$code.=<<___;
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
___
$code.=$load_block;
aesni_generate1("dec",$key,$rounds);
$code.=$store_and_wipe;
$code.=<<___;
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e.
# when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals the corresponding instruction's latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# All aesni_generateN subroutines below append one private subroutine,
# _aesni_[en|de]cryptN, to $code and follow the same scheme:
#
# - $dir is "enc" or "dec", it selects both the instruction flavour
#   and the name of the emitted symbol;
# - round key 0 is xor-ed into every block up front;
# - $rounds is multiplied by 16 and $key is advanced by
#   32+16*$rounds bytes, after which %rax [which $rounds is the low
#   half of] is negated and used as negative offset from that point;
# - the loop applies two round keys per iteration, alternating
#   between $rndkey1 and $rndkey0, and is left when "add \$32,%rax"
#   yields zero; jnz consumes that flag after the intervening aes and
#   $movkey instructions;
# - one more aes${dir} and the closing aes${dir}last follow the loop.

# 2x interleave, operates on $inout0-1.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# 3x interleave, operates on $inout0-2. Same layout as the 2x
# subroutine with one more block register.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Operates on $inout0-3. The .byte sequence ahead of the loop encodes
# a 3-byte nop; presumably it's there to pad the loop entry, TODO
# confirm.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# 6x interleave, operates on $inout0-5. Unlike 2x-4x subroutines the
# first round is started on $inout0-2 while remaining blocks are still
# being xor-ed with round key 0, which is why the loop is entered in
# the middle, at .L${dir}_loop6_enter, where the first round is
# completed for $inout3-5.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# 8x interleave, operates on $inout0-7. As in 6x subroutine the first
# round overlaps with xor-ing in round key 0, here for $inout0-1, and
# the loop is entered at .L${dir}_loop8_inner. .L${dir}_loop8_enter is
# not referenced by this subroutine itself; presumably it serves as
# entry point for callers elsewhere in the file, verify against them.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Decrypt subroutines are always emitted, encrypt ones only if $PREFIX
# is "aesni".
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq
"aesni"); 581&aesni_generate3("dec"); 582&aesni_generate4("enc") if ($PREFIX eq "aesni"); 583&aesni_generate4("dec"); 584&aesni_generate6("enc") if ($PREFIX eq "aesni"); 585&aesni_generate6("dec"); 586&aesni_generate8("enc") if ($PREFIX eq "aesni"); 587&aesni_generate8("dec"); 588 589if ($PREFIX eq "aesni") { 590######################################################################## 591# void aesni_ecb_encrypt (const void *in, void *out, 592# size_t length, const AES_KEY *key, 593# int enc); 594$code.=<<___; 595.globl aesni_ecb_encrypt 596.type aesni_ecb_encrypt,\@function,5 597.align 16 598aesni_ecb_encrypt: 599___ 600$code.=<<___ if ($win64); 601 lea -0x58(%rsp),%rsp 602 movaps %xmm6,(%rsp) # offload $inout4..7 603 movaps %xmm7,0x10(%rsp) 604 movaps %xmm8,0x20(%rsp) 605 movaps %xmm9,0x30(%rsp) 606.Lecb_enc_body: 607___ 608$code.=<<___; 609 and \$-16,$len # if ($len<16) 610 jz .Lecb_ret # return 611 612 mov 240($key),$rounds # key->rounds 613 $movkey ($key),$rndkey0 614 mov $key,$key_ # backup $key 615 mov $rounds,$rnds_ # backup $rounds 616 test %r8d,%r8d # 5th argument 617 jz .Lecb_decrypt 618#--------------------------- ECB ENCRYPT ------------------------------# 619 cmp \$0x80,$len # if ($len<8*16) 620 jb .Lecb_enc_tail # short input 621 622 movdqu ($inp),$inout0 # load 8 input blocks 623 movdqu 0x10($inp),$inout1 624 movdqu 0x20($inp),$inout2 625 movdqu 0x30($inp),$inout3 626 movdqu 0x40($inp),$inout4 627 movdqu 0x50($inp),$inout5 628 movdqu 0x60($inp),$inout6 629 movdqu 0x70($inp),$inout7 630 lea 0x80($inp),$inp # $inp+=8*16 631 sub \$0x80,$len # $len-=8*16 (can be zero) 632 jmp .Lecb_enc_loop8_enter 633.align 16 634.Lecb_enc_loop8: 635 movups $inout0,($out) # store 8 output blocks 636 mov $key_,$key # restore $key 637 movdqu ($inp),$inout0 # load 8 input blocks 638 mov $rnds_,$rounds # restore $rounds 639 movups $inout1,0x10($out) 640 movdqu 0x10($inp),$inout1 641 movups $inout2,0x20($out) 642 movdqu 0x20($inp),$inout2 643 movups $inout3,0x30($out) 644 
movdqu 0x30($inp),$inout3 645 movups $inout4,0x40($out) 646 movdqu 0x40($inp),$inout4 647 movups $inout5,0x50($out) 648 movdqu 0x50($inp),$inout5 649 movups $inout6,0x60($out) 650 movdqu 0x60($inp),$inout6 651 movups $inout7,0x70($out) 652 lea 0x80($out),$out # $out+=8*16 653 movdqu 0x70($inp),$inout7 654 lea 0x80($inp),$inp # $inp+=8*16 655.Lecb_enc_loop8_enter: 656 657 call _aesni_encrypt8 658 659 sub \$0x80,$len 660 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 661 662 movups $inout0,($out) # store 8 output blocks 663 mov $key_,$key # restore $key 664 movups $inout1,0x10($out) 665 mov $rnds_,$rounds # restore $rounds 666 movups $inout2,0x20($out) 667 movups $inout3,0x30($out) 668 movups $inout4,0x40($out) 669 movups $inout5,0x50($out) 670 movups $inout6,0x60($out) 671 movups $inout7,0x70($out) 672 lea 0x80($out),$out # $out+=8*16 673 add \$0x80,$len # restore real remaining $len 674 jz .Lecb_ret # done if ($len==0) 675 676.Lecb_enc_tail: # $len is less than 8*16 677 movups ($inp),$inout0 678 cmp \$0x20,$len 679 jb .Lecb_enc_one 680 movups 0x10($inp),$inout1 681 je .Lecb_enc_two 682 movups 0x20($inp),$inout2 683 cmp \$0x40,$len 684 jb .Lecb_enc_three 685 movups 0x30($inp),$inout3 686 je .Lecb_enc_four 687 movups 0x40($inp),$inout4 688 cmp \$0x60,$len 689 jb .Lecb_enc_five 690 movups 0x50($inp),$inout5 691 je .Lecb_enc_six 692 movdqu 0x60($inp),$inout6 693 xorps $inout7,$inout7 694 call _aesni_encrypt8 695 movups $inout0,($out) # store 7 output blocks 696 movups $inout1,0x10($out) 697 movups $inout2,0x20($out) 698 movups $inout3,0x30($out) 699 movups $inout4,0x40($out) 700 movups $inout5,0x50($out) 701 movups $inout6,0x60($out) 702 jmp .Lecb_ret 703.align 16 704.Lecb_enc_one: 705___ 706 &aesni_generate1("enc",$key,$rounds); 707$code.=<<___; 708 movups $inout0,($out) # store one output block 709 jmp .Lecb_ret 710.align 16 711.Lecb_enc_two: 712 call _aesni_encrypt2 713 movups $inout0,($out) # store 2 output blocks 714 movups $inout1,0x10($out) 715 jmp 
.Lecb_ret 716.align 16 717.Lecb_enc_three: 718 call _aesni_encrypt3 719 movups $inout0,($out) # store 3 output blocks 720 movups $inout1,0x10($out) 721 movups $inout2,0x20($out) 722 jmp .Lecb_ret 723.align 16 724.Lecb_enc_four: 725 call _aesni_encrypt4 726 movups $inout0,($out) # store 4 output blocks 727 movups $inout1,0x10($out) 728 movups $inout2,0x20($out) 729 movups $inout3,0x30($out) 730 jmp .Lecb_ret 731.align 16 732.Lecb_enc_five: 733 xorps $inout5,$inout5 734 call _aesni_encrypt6 735 movups $inout0,($out) # store 5 output blocks 736 movups $inout1,0x10($out) 737 movups $inout2,0x20($out) 738 movups $inout3,0x30($out) 739 movups $inout4,0x40($out) 740 jmp .Lecb_ret 741.align 16 742.Lecb_enc_six: 743 call _aesni_encrypt6 744 movups $inout0,($out) # store 6 output blocks 745 movups $inout1,0x10($out) 746 movups $inout2,0x20($out) 747 movups $inout3,0x30($out) 748 movups $inout4,0x40($out) 749 movups $inout5,0x50($out) 750 jmp .Lecb_ret 751#--------------------------- ECB DECRYPT ------------------------------# 752.align 16 753.Lecb_decrypt: 754 cmp \$0x80,$len # if ($len<8*16) 755 jb .Lecb_dec_tail # short input 756 757 movdqu ($inp),$inout0 # load 8 input blocks 758 movdqu 0x10($inp),$inout1 759 movdqu 0x20($inp),$inout2 760 movdqu 0x30($inp),$inout3 761 movdqu 0x40($inp),$inout4 762 movdqu 0x50($inp),$inout5 763 movdqu 0x60($inp),$inout6 764 movdqu 0x70($inp),$inout7 765 lea 0x80($inp),$inp # $inp+=8*16 766 sub \$0x80,$len # $len-=8*16 (can be zero) 767 jmp .Lecb_dec_loop8_enter 768.align 16 769.Lecb_dec_loop8: 770 movups $inout0,($out) # store 8 output blocks 771 mov $key_,$key # restore $key 772 movdqu ($inp),$inout0 # load 8 input blocks 773 mov $rnds_,$rounds # restore $rounds 774 movups $inout1,0x10($out) 775 movdqu 0x10($inp),$inout1 776 movups $inout2,0x20($out) 777 movdqu 0x20($inp),$inout2 778 movups $inout3,0x30($out) 779 movdqu 0x30($inp),$inout3 780 movups $inout4,0x40($out) 781 movdqu 0x40($inp),$inout4 782 movups $inout5,0x50($out) 783 movdqu 
0x50($inp),$inout5 784 movups $inout6,0x60($out) 785 movdqu 0x60($inp),$inout6 786 movups $inout7,0x70($out) 787 lea 0x80($out),$out # $out+=8*16 788 movdqu 0x70($inp),$inout7 789 lea 0x80($inp),$inp # $inp+=8*16 790.Lecb_dec_loop8_enter: 791 792 call _aesni_decrypt8 793 794 $movkey ($key_),$rndkey0 795 sub \$0x80,$len 796 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 797 798 movups $inout0,($out) # store 8 output blocks 799 pxor $inout0,$inout0 # clear register bank 800 mov $key_,$key # restore $key 801 movups $inout1,0x10($out) 802 pxor $inout1,$inout1 803 mov $rnds_,$rounds # restore $rounds 804 movups $inout2,0x20($out) 805 pxor $inout2,$inout2 806 movups $inout3,0x30($out) 807 pxor $inout3,$inout3 808 movups $inout4,0x40($out) 809 pxor $inout4,$inout4 810 movups $inout5,0x50($out) 811 pxor $inout5,$inout5 812 movups $inout6,0x60($out) 813 pxor $inout6,$inout6 814 movups $inout7,0x70($out) 815 pxor $inout7,$inout7 816 lea 0x80($out),$out # $out+=8*16 817 add \$0x80,$len # restore real remaining $len 818 jz .Lecb_ret # done if ($len==0) 819 820.Lecb_dec_tail: 821 movups ($inp),$inout0 822 cmp \$0x20,$len 823 jb .Lecb_dec_one 824 movups 0x10($inp),$inout1 825 je .Lecb_dec_two 826 movups 0x20($inp),$inout2 827 cmp \$0x40,$len 828 jb .Lecb_dec_three 829 movups 0x30($inp),$inout3 830 je .Lecb_dec_four 831 movups 0x40($inp),$inout4 832 cmp \$0x60,$len 833 jb .Lecb_dec_five 834 movups 0x50($inp),$inout5 835 je .Lecb_dec_six 836 movups 0x60($inp),$inout6 837 $movkey ($key),$rndkey0 838 xorps $inout7,$inout7 839 call _aesni_decrypt8 840 movups $inout0,($out) # store 7 output blocks 841 pxor $inout0,$inout0 # clear register bank 842 movups $inout1,0x10($out) 843 pxor $inout1,$inout1 844 movups $inout2,0x20($out) 845 pxor $inout2,$inout2 846 movups $inout3,0x30($out) 847 pxor $inout3,$inout3 848 movups $inout4,0x40($out) 849 pxor $inout4,$inout4 850 movups $inout5,0x50($out) 851 pxor $inout5,$inout5 852 movups $inout6,0x60($out) 853 pxor $inout6,$inout6 854 pxor 
$inout7,$inout7 855 jmp .Lecb_ret 856.align 16 857.Lecb_dec_one: 858___ 859 &aesni_generate1("dec",$key,$rounds); 860$code.=<<___; 861 movups $inout0,($out) # store one output block 862 pxor $inout0,$inout0 # clear register bank 863 jmp .Lecb_ret 864.align 16 865.Lecb_dec_two: 866 call _aesni_decrypt2 867 movups $inout0,($out) # store 2 output blocks 868 pxor $inout0,$inout0 # clear register bank 869 movups $inout1,0x10($out) 870 pxor $inout1,$inout1 871 jmp .Lecb_ret 872.align 16 873.Lecb_dec_three: 874 call _aesni_decrypt3 875 movups $inout0,($out) # store 3 output blocks 876 pxor $inout0,$inout0 # clear register bank 877 movups $inout1,0x10($out) 878 pxor $inout1,$inout1 879 movups $inout2,0x20($out) 880 pxor $inout2,$inout2 881 jmp .Lecb_ret 882.align 16 883.Lecb_dec_four: 884 call _aesni_decrypt4 885 movups $inout0,($out) # store 4 output blocks 886 pxor $inout0,$inout0 # clear register bank 887 movups $inout1,0x10($out) 888 pxor $inout1,$inout1 889 movups $inout2,0x20($out) 890 pxor $inout2,$inout2 891 movups $inout3,0x30($out) 892 pxor $inout3,$inout3 893 jmp .Lecb_ret 894.align 16 895.Lecb_dec_five: 896 xorps $inout5,$inout5 897 call _aesni_decrypt6 898 movups $inout0,($out) # store 5 output blocks 899 pxor $inout0,$inout0 # clear register bank 900 movups $inout1,0x10($out) 901 pxor $inout1,$inout1 902 movups $inout2,0x20($out) 903 pxor $inout2,$inout2 904 movups $inout3,0x30($out) 905 pxor $inout3,$inout3 906 movups $inout4,0x40($out) 907 pxor $inout4,$inout4 908 pxor $inout5,$inout5 909 jmp .Lecb_ret 910.align 16 911.Lecb_dec_six: 912 call _aesni_decrypt6 913 movups $inout0,($out) # store 6 output blocks 914 pxor $inout0,$inout0 # clear register bank 915 movups $inout1,0x10($out) 916 pxor $inout1,$inout1 917 movups $inout2,0x20($out) 918 pxor $inout2,$inout2 919 movups $inout3,0x30($out) 920 pxor $inout3,$inout3 921 movups $inout4,0x40($out) 922 pxor $inout4,$inout4 923 movups $inout5,0x50($out) 924 pxor $inout5,$inout5 925 926.Lecb_ret: 927 xorps 
$rndkey0,$rndkey0		# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Register roles, as set up by the code below:
#   $iv          counter block after pshufb with $bswap_mask; advanced
#                once per block with paddq $increment
#   $increment   constant loaded from .Lincrement64
#   $bswap_mask  constant loaded from .Lbswap_mask
#   $inout0      counter block being encrypted (key stream)
#   $inout1      running CMAC value, loaded from and stored to *cmac
#   $in0         current input block
# Both AES streams ($inout0 and $inout1) are driven by the same aesenc
# sequence, which is how CTR and CBC-MAC get interleaved.
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
# Win64 only: spill xmm6-xmm9 (the registers named in the comments) to a
# 0x58-byte stack frame; the epilogue restores them and wipes the slots.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)			# $iv
	movaps	%xmm7,0x10(%rsp)		# $bswap_mask
	movaps	%xmm8,0x20(%rsp)		# $in0
	movaps	%xmm9,0x30(%rsp)		# $increment
.Lccm64_enc_body:
___
# Setup, then one outer iteration per block.  %r10 is preloaded with
# 16-(rounds*16) ("twisted" rounds) and $key is moved to the end of the
# key schedule, so the inner loop indexes round keys with %rax counting
# up towards zero and terminates on the flags set by the add.
# The dec of $len is issued before the aesenclast pair; the instructions
# between it and the closing jnz do not touch the flags it set.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len				# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out			# $out+=16
	jnz	.Lccm64_enc_outer		# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore xmm6-xmm9 and overwrite each save slot with %xmm0
# (presumably $rndkey0, which was zeroed just above -- confirm against
# the register assignments at the top of the file).
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
# Decrypt counterpart.  The first counter block is encrypted on its own
# (aesni_generate1 call before the loop) because the CMAC input is the
# decrypted output, which is not available until E(iv) is known.  After
# that each outer iteration produces one output block and, unless it
# was the last one, runs the next counter encryption and the CMAC update
# side by side.  The final CMAC block is processed on its own at
# .Lccm64_dec_break.
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
# Win64 only: same xmm6-xmm9 spill as in the encrypt routine.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)			# $iv
	movaps	%xmm7,0x10(%rsp)		# $bswap_mask
	movaps	%xmm8,0x20(%rsp)		# $in0
	movaps	%xmm9,0x30(%rsp)		# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	# stand-alone encryption of the first counter block
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp			# $inp+=16
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out			# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len			# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break		# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp			# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
	mov	240($key_),$rounds
___
	# stand-alone encryption of the last CMAC block; presumably the
	# extra $in0 argument is folded into $inout1 by aesni_generate1
	# (it replaces the commented-out xorps above) -- confirm there.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
# Win64 only: restore xmm6-xmm9 and wipe the save slots.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
# Local register aliases for this routine:
#   $in0..$in5   %xmm10..%xmm15, input blocks / next counter blocks
#   $key0        %ebp, last dword of the 0-round key
#   $ctr         low dword of $ivp, counter value after bswap
# The stack frame holds 0x80 bytes (eight 16-byte counter blocks already
# xor-ed with the 0-round key); on Win64 another 160 bytes are reserved
# for saving xmm6-xmm15.  $key_ is used as the frame pointer.
#
# Paths through the generated code:
#   - exactly one block: handled up front without a stack frame;
#   - .Lctr32_loop6: six blocks per iteration, taken when the capability
#     word has MOVBE set but XSAVE clear;
#   - .Lctr32_loop8: eight blocks per iteration otherwise;
#   - .Lctr32_tail: the remaining 1..7 blocks.
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx			# key->rounds
___
	&aesni_generate1("enc",$key,"%edx");
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_			# use $key_ as frame pointer
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save xmm6-xmm15 below the frame pointer.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)		# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
# Build the eight counter blocks (counter+0 .. counter+7, each xor-ed
# with the 0-round key) on the stack, pick the 6x or 8x main loop, and
# run it.  Only the last dword of each block differs, so the later
# slots are patched with 32-bit stores at offset +12.
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr			# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0			# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)		# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10			# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx			# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds		# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	4(%r10),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len			# $len is in blocks
	jb	.Lctr32_tail			# short input if ($len<8)

	sub	\$6,$len			# $len is biased by -6
	cmp	\$`1<<22`,%r10d			# check for MOVBE without XSAVE
	je	.Lctr32_6x			# [which denotes Atom Silvermont]

	lea	0x80($key),$key			# size optimization
	sub	\$2,$len			# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key		# end of key schedule
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	add	\$6,$ctr			# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	mov	$ctr,%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	movbe	%eax,`0x00+12`(%rsp)		# store next counter value
	lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	xor	$key0,%eax
	movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	lea	2($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	movbe	%eax,`0x20+12`(%rsp)
	lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	movbe	%eax,`0x30+12`(%rsp)
	lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	xor	$key0,%eax
	movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	lea	5($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	movbe	%eax,`0x50+12`(%rsp)
	mov	%r10,%rax			# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6			# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp			# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6			# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0		# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)			# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out			# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6			# loop if $len-=6 didn't borrow

	add	\$6,$len			# restore real remaining $len
	jz	.Lctr32_done			# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key		# restore $key
	neg	$rounds
	shr	\$4,$rounds			# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	add	\$8,$ctr			# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)		# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
# Rounds 2..7 of the 8x loop.  Each pass emits one aesenc round over all
# eight blocks, interleaved with finishing the next counter word for
# stack slot $i-1 and starting the one for slot $i.  The round-key
# register alternates between $rndkey0 (even $i) and $rndkey1 (odd $i).
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# Remaining rounds.  The flags from "cmp 11,rounds" are consumed twice
# (jb, then je) to skip the extra round pairs when the key schedule is
# shorter; the intervening aesenc/movkey instructions leave them alone.
# The last round is done as aesenclast with inp[N]^round[last], which
# yields the output block directly.
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0			# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0			# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1		# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp			# $inp+=8*16

	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1		# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1			# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1	# real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)			# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out			# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow

	add	\$8,$len			# restore real remaining $len
	jz	.Lctr32_done			# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax			# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1			# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)			# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done			# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done			# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done			# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0			# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)			# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done			# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0			# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)			# store output
	cmp	\$2,$len
	jb	.Lctr32_done			# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done			# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)		# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0			# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: wipe xmm6-xmm15 as well, and overwrite the 0x80 bytes of
# counter blocks on the stack with the zeroed %xmm0.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
# Win64: restore xmm6-xmm15 from the save area, overwriting every save
# slot and the counter blocks with the zeroed %xmm0.
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)		# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
# Common exit: reload %rbp from the slot just below the frame pointer
# (where the push in .Lctr32_bulk stored it) and drop the frame.
$code.=<<___;
	mov	-8($key_),%rbp
	lea	($key_),%rsp
.Lctr32_epilogue:
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";	# override so that we can use %r11 as FP

$code.=<<___;
.globl
aesni_xts_encrypt 1747.type aesni_xts_encrypt,\@function,6 1748.align 16 1749aesni_xts_encrypt: 1750 lea (%rsp),%r11 # frame pointer 1751 push %rbp 1752 sub \$$frame_size,%rsp 1753 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1754___ 1755$code.=<<___ if ($win64); 1756 movaps %xmm6,-0xa8(%r11) # offload everything 1757 movaps %xmm7,-0x98(%r11) 1758 movaps %xmm8,-0x88(%r11) 1759 movaps %xmm9,-0x78(%r11) 1760 movaps %xmm10,-0x68(%r11) 1761 movaps %xmm11,-0x58(%r11) 1762 movaps %xmm12,-0x48(%r11) 1763 movaps %xmm13,-0x38(%r11) 1764 movaps %xmm14,-0x28(%r11) 1765 movaps %xmm15,-0x18(%r11) 1766.Lxts_enc_body: 1767___ 1768$code.=<<___; 1769 movups ($ivp),$inout0 # load clear-text tweak 1770 mov 240(%r8),$rounds # key2->rounds 1771 mov 240($key),$rnds_ # key1->rounds 1772___ 1773 # generate the tweak 1774 &aesni_generate1("enc",$key2,$rounds,$inout0); 1775$code.=<<___; 1776 $movkey ($key),$rndkey0 # zero round key 1777 mov $key,$key_ # backup $key 1778 mov $rnds_,$rounds # backup $rounds 1779 shl \$4,$rnds_ 1780 mov $len,$len_ # backup $len 1781 and \$-16,$len 1782 1783 $movkey 16($key,$rnds_),$rndkey1 # last round key 1784 1785 movdqa .Lxts_magic(%rip),$twmask 1786 movdqa $inout0,@tweak[5] 1787 pshufd \$0x5f,$inout0,$twres 1788 pxor $rndkey0,$rndkey1 1789___ 1790 # alternative tweak calculation algorithm is based on suggestions 1791 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1792 # and should help in the future... 
1793 for ($i=0;$i<4;$i++) { 1794 $code.=<<___; 1795 movdqa $twres,$twtmp 1796 paddd $twres,$twres 1797 movdqa @tweak[5],@tweak[$i] 1798 psrad \$31,$twtmp # broadcast upper bits 1799 paddq @tweak[5],@tweak[5] 1800 pand $twmask,$twtmp 1801 pxor $rndkey0,@tweak[$i] 1802 pxor $twtmp,@tweak[5] 1803___ 1804 } 1805$code.=<<___; 1806 movdqa @tweak[5],@tweak[4] 1807 psrad \$31,$twres 1808 paddq @tweak[5],@tweak[5] 1809 pand $twmask,$twres 1810 pxor $rndkey0,@tweak[4] 1811 pxor $twres,@tweak[5] 1812 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1813 1814 sub \$16*6,$len 1815 jc .Lxts_enc_short # if $len-=6*16 borrowed 1816 1817 mov \$16+96,$rounds 1818 lea 32($key_,$rnds_),$key # end of key schedule 1819 sub %r10,%rax # twisted $rounds 1820 $movkey 16($key_),$rndkey1 1821 mov %rax,%r10 # backup twisted $rounds 1822 lea .Lxts_magic(%rip),%r8 1823 jmp .Lxts_enc_grandloop 1824 1825.align 32 1826.Lxts_enc_grandloop: 1827 movdqu `16*0`($inp),$inout0 # load input 1828 movdqa $rndkey0,$twmask 1829 movdqu `16*1`($inp),$inout1 1830 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1831 movdqu `16*2`($inp),$inout2 1832 pxor @tweak[1],$inout1 1833 aesenc $rndkey1,$inout0 1834 movdqu `16*3`($inp),$inout3 1835 pxor @tweak[2],$inout2 1836 aesenc $rndkey1,$inout1 1837 movdqu `16*4`($inp),$inout4 1838 pxor @tweak[3],$inout3 1839 aesenc $rndkey1,$inout2 1840 movdqu `16*5`($inp),$inout5 1841 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1842 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1843 pxor @tweak[4],$inout4 1844 aesenc $rndkey1,$inout3 1845 $movkey 32($key_),$rndkey0 1846 lea `16*6`($inp),$inp 1847 pxor $twmask,$inout5 1848 1849 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 1850 aesenc $rndkey1,$inout4 1851 pxor $twres,@tweak[1] 1852 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1853 aesenc $rndkey1,$inout5 1854 $movkey 48($key_),$rndkey1 1855 pxor $twres,@tweak[2] 1856 1857 aesenc $rndkey0,$inout0 1858 pxor $twres,@tweak[3] 1859 movdqa 
@tweak[1],`16*1`(%rsp) 1860 aesenc $rndkey0,$inout1 1861 pxor $twres,@tweak[4] 1862 movdqa @tweak[2],`16*2`(%rsp) 1863 aesenc $rndkey0,$inout2 1864 aesenc $rndkey0,$inout3 1865 pxor $twres,$twmask 1866 movdqa @tweak[4],`16*4`(%rsp) 1867 aesenc $rndkey0,$inout4 1868 aesenc $rndkey0,$inout5 1869 $movkey 64($key_),$rndkey0 1870 movdqa $twmask,`16*5`(%rsp) 1871 pshufd \$0x5f,@tweak[5],$twres 1872 jmp .Lxts_enc_loop6 1873.align 32 1874.Lxts_enc_loop6: 1875 aesenc $rndkey1,$inout0 1876 aesenc $rndkey1,$inout1 1877 aesenc $rndkey1,$inout2 1878 aesenc $rndkey1,$inout3 1879 aesenc $rndkey1,$inout4 1880 aesenc $rndkey1,$inout5 1881 $movkey -64($key,%rax),$rndkey1 1882 add \$32,%rax 1883 1884 aesenc $rndkey0,$inout0 1885 aesenc $rndkey0,$inout1 1886 aesenc $rndkey0,$inout2 1887 aesenc $rndkey0,$inout3 1888 aesenc $rndkey0,$inout4 1889 aesenc $rndkey0,$inout5 1890 $movkey -80($key,%rax),$rndkey0 1891 jnz .Lxts_enc_loop6 1892 1893 movdqa (%r8),$twmask # start calculating next tweak 1894 movdqa $twres,$twtmp 1895 paddd $twres,$twres 1896 aesenc $rndkey1,$inout0 1897 paddq @tweak[5],@tweak[5] 1898 psrad \$31,$twtmp 1899 aesenc $rndkey1,$inout1 1900 pand $twmask,$twtmp 1901 $movkey ($key_),@tweak[0] # load round[0] 1902 aesenc $rndkey1,$inout2 1903 aesenc $rndkey1,$inout3 1904 aesenc $rndkey1,$inout4 1905 pxor $twtmp,@tweak[5] 1906 movaps @tweak[0],@tweak[1] # copy round[0] 1907 aesenc $rndkey1,$inout5 1908 $movkey -64($key),$rndkey1 1909 1910 movdqa $twres,$twtmp 1911 aesenc $rndkey0,$inout0 1912 paddd $twres,$twres 1913 pxor @tweak[5],@tweak[0] 1914 aesenc $rndkey0,$inout1 1915 psrad \$31,$twtmp 1916 paddq @tweak[5],@tweak[5] 1917 aesenc $rndkey0,$inout2 1918 aesenc $rndkey0,$inout3 1919 pand $twmask,$twtmp 1920 movaps @tweak[1],@tweak[2] 1921 aesenc $rndkey0,$inout4 1922 pxor $twtmp,@tweak[5] 1923 movdqa $twres,$twtmp 1924 aesenc $rndkey0,$inout5 1925 $movkey -48($key),$rndkey0 1926 1927 paddd $twres,$twres 1928 aesenc $rndkey1,$inout0 1929 pxor @tweak[5],@tweak[1] 1930 psrad 
\$31,$twtmp 1931 aesenc $rndkey1,$inout1 1932 paddq @tweak[5],@tweak[5] 1933 pand $twmask,$twtmp 1934 aesenc $rndkey1,$inout2 1935 aesenc $rndkey1,$inout3 1936 movdqa @tweak[3],`16*3`(%rsp) 1937 pxor $twtmp,@tweak[5] 1938 aesenc $rndkey1,$inout4 1939 movaps @tweak[2],@tweak[3] 1940 movdqa $twres,$twtmp 1941 aesenc $rndkey1,$inout5 1942 $movkey -32($key),$rndkey1 1943 1944 paddd $twres,$twres 1945 aesenc $rndkey0,$inout0 1946 pxor @tweak[5],@tweak[2] 1947 psrad \$31,$twtmp 1948 aesenc $rndkey0,$inout1 1949 paddq @tweak[5],@tweak[5] 1950 pand $twmask,$twtmp 1951 aesenc $rndkey0,$inout2 1952 aesenc $rndkey0,$inout3 1953 aesenc $rndkey0,$inout4 1954 pxor $twtmp,@tweak[5] 1955 movaps @tweak[3],@tweak[4] 1956 aesenc $rndkey0,$inout5 1957 1958 movdqa $twres,$rndkey0 1959 paddd $twres,$twres 1960 aesenc $rndkey1,$inout0 1961 pxor @tweak[5],@tweak[3] 1962 psrad \$31,$rndkey0 1963 aesenc $rndkey1,$inout1 1964 paddq @tweak[5],@tweak[5] 1965 pand $twmask,$rndkey0 1966 aesenc $rndkey1,$inout2 1967 aesenc $rndkey1,$inout3 1968 pxor $rndkey0,@tweak[5] 1969 $movkey ($key_),$rndkey0 1970 aesenc $rndkey1,$inout4 1971 aesenc $rndkey1,$inout5 1972 $movkey 16($key_),$rndkey1 1973 1974 pxor @tweak[5],@tweak[4] 1975 aesenclast `16*0`(%rsp),$inout0 1976 psrad \$31,$twres 1977 paddq @tweak[5],@tweak[5] 1978 aesenclast `16*1`(%rsp),$inout1 1979 aesenclast `16*2`(%rsp),$inout2 1980 pand $twmask,$twres 1981 mov %r10,%rax # restore $rounds 1982 aesenclast `16*3`(%rsp),$inout3 1983 aesenclast `16*4`(%rsp),$inout4 1984 aesenclast `16*5`(%rsp),$inout5 1985 pxor $twres,@tweak[5] 1986 1987 lea `16*6`($out),$out # $out+=6*16 1988 movups $inout0,`-16*6`($out) # store 6 output blocks 1989 movups $inout1,`-16*5`($out) 1990 movups $inout2,`-16*4`($out) 1991 movups $inout3,`-16*3`($out) 1992 movups $inout4,`-16*2`($out) 1993 movups $inout5,`-16*1`($out) 1994 sub \$16*6,$len 1995 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 1996 1997 mov \$16+96,$rounds 1998 sub $rnds_,$rounds 1999 mov 
$key_,$key # restore $key 2000 shr \$4,$rounds # restore original value 2001 2002.Lxts_enc_short: 2003 # at the point @tweak[0..5] are populated with tweak values 2004 mov $rounds,$rnds_ # backup $rounds 2005 pxor $rndkey0,@tweak[0] 2006 add \$16*6,$len # restore real remaining $len 2007 jz .Lxts_enc_done # done if ($len==0) 2008 2009 pxor $rndkey0,@tweak[1] 2010 cmp \$0x20,$len 2011 jb .Lxts_enc_one # $len is 1*16 2012 pxor $rndkey0,@tweak[2] 2013 je .Lxts_enc_two # $len is 2*16 2014 2015 pxor $rndkey0,@tweak[3] 2016 cmp \$0x40,$len 2017 jb .Lxts_enc_three # $len is 3*16 2018 pxor $rndkey0,@tweak[4] 2019 je .Lxts_enc_four # $len is 4*16 2020 2021 movdqu ($inp),$inout0 # $len is 5*16 2022 movdqu 16*1($inp),$inout1 2023 movdqu 16*2($inp),$inout2 2024 pxor @tweak[0],$inout0 2025 movdqu 16*3($inp),$inout3 2026 pxor @tweak[1],$inout1 2027 movdqu 16*4($inp),$inout4 2028 lea 16*5($inp),$inp # $inp+=5*16 2029 pxor @tweak[2],$inout2 2030 pxor @tweak[3],$inout3 2031 pxor @tweak[4],$inout4 2032 pxor $inout5,$inout5 2033 2034 call _aesni_encrypt6 2035 2036 xorps @tweak[0],$inout0 2037 movdqa @tweak[5],@tweak[0] 2038 xorps @tweak[1],$inout1 2039 xorps @tweak[2],$inout2 2040 movdqu $inout0,($out) # store 5 output blocks 2041 xorps @tweak[3],$inout3 2042 movdqu $inout1,16*1($out) 2043 xorps @tweak[4],$inout4 2044 movdqu $inout2,16*2($out) 2045 movdqu $inout3,16*3($out) 2046 movdqu $inout4,16*4($out) 2047 lea 16*5($out),$out # $out+=5*16 2048 jmp .Lxts_enc_done 2049 2050.align 16 2051.Lxts_enc_one: 2052 movups ($inp),$inout0 2053 lea 16*1($inp),$inp # inp+=1*16 2054 xorps @tweak[0],$inout0 2055___ 2056 &aesni_generate1("enc",$key,$rounds); 2057$code.=<<___; 2058 xorps @tweak[0],$inout0 2059 movdqa @tweak[1],@tweak[0] 2060 movups $inout0,($out) # store one output block 2061 lea 16*1($out),$out # $out+=1*16 2062 jmp .Lxts_enc_done 2063 2064.align 16 2065.Lxts_enc_two: 2066 movups ($inp),$inout0 2067 movups 16($inp),$inout1 2068 lea 32($inp),$inp # $inp+=2*16 2069 xorps 
@tweak[0],$inout0 2070 xorps @tweak[1],$inout1 2071 2072 call _aesni_encrypt2 2073 2074 xorps @tweak[0],$inout0 2075 movdqa @tweak[2],@tweak[0] 2076 xorps @tweak[1],$inout1 2077 movups $inout0,($out) # store 2 output blocks 2078 movups $inout1,16*1($out) 2079 lea 16*2($out),$out # $out+=2*16 2080 jmp .Lxts_enc_done 2081 2082.align 16 2083.Lxts_enc_three: 2084 movups ($inp),$inout0 2085 movups 16*1($inp),$inout1 2086 movups 16*2($inp),$inout2 2087 lea 16*3($inp),$inp # $inp+=3*16 2088 xorps @tweak[0],$inout0 2089 xorps @tweak[1],$inout1 2090 xorps @tweak[2],$inout2 2091 2092 call _aesni_encrypt3 2093 2094 xorps @tweak[0],$inout0 2095 movdqa @tweak[3],@tweak[0] 2096 xorps @tweak[1],$inout1 2097 xorps @tweak[2],$inout2 2098 movups $inout0,($out) # store 3 output blocks 2099 movups $inout1,16*1($out) 2100 movups $inout2,16*2($out) 2101 lea 16*3($out),$out # $out+=3*16 2102 jmp .Lxts_enc_done 2103 2104.align 16 2105.Lxts_enc_four: 2106 movups ($inp),$inout0 2107 movups 16*1($inp),$inout1 2108 movups 16*2($inp),$inout2 2109 xorps @tweak[0],$inout0 2110 movups 16*3($inp),$inout3 2111 lea 16*4($inp),$inp # $inp+=4*16 2112 xorps @tweak[1],$inout1 2113 xorps @tweak[2],$inout2 2114 xorps @tweak[3],$inout3 2115 2116 call _aesni_encrypt4 2117 2118 pxor @tweak[0],$inout0 2119 movdqa @tweak[4],@tweak[0] 2120 pxor @tweak[1],$inout1 2121 pxor @tweak[2],$inout2 2122 movdqu $inout0,($out) # store 4 output blocks 2123 pxor @tweak[3],$inout3 2124 movdqu $inout1,16*1($out) 2125 movdqu $inout2,16*2($out) 2126 movdqu $inout3,16*3($out) 2127 lea 16*4($out),$out # $out+=4*16 2128 jmp .Lxts_enc_done 2129 2130.align 16 2131.Lxts_enc_done: 2132 and \$15,$len_ # see if $len%16 is 0 2133 jz .Lxts_enc_ret 2134 mov $len_,$len 2135 2136.Lxts_enc_steal: 2137 movzb ($inp),%eax # borrow $rounds ... 2138 movzb -16($out),%ecx # ... 
and $key 2139 lea 1($inp),$inp 2140 mov %al,-16($out) 2141 mov %cl,0($out) 2142 lea 1($out),$out 2143 sub \$1,$len 2144 jnz .Lxts_enc_steal 2145 2146 sub $len_,$out # rewind $out 2147 mov $key_,$key # restore $key 2148 mov $rnds_,$rounds # restore $rounds 2149 2150 movups -16($out),$inout0 2151 xorps @tweak[0],$inout0 2152___ 2153 &aesni_generate1("enc",$key,$rounds); 2154$code.=<<___; 2155 xorps @tweak[0],$inout0 2156 movups $inout0,-16($out) 2157 2158.Lxts_enc_ret: 2159 xorps %xmm0,%xmm0 # clear register bank 2160 pxor %xmm1,%xmm1 2161 pxor %xmm2,%xmm2 2162 pxor %xmm3,%xmm3 2163 pxor %xmm4,%xmm4 2164 pxor %xmm5,%xmm5 2165___ 2166$code.=<<___ if (!$win64); 2167 pxor %xmm6,%xmm6 2168 pxor %xmm7,%xmm7 2169 movaps %xmm0,0x00(%rsp) # clear stack 2170 pxor %xmm8,%xmm8 2171 movaps %xmm0,0x10(%rsp) 2172 pxor %xmm9,%xmm9 2173 movaps %xmm0,0x20(%rsp) 2174 pxor %xmm10,%xmm10 2175 movaps %xmm0,0x30(%rsp) 2176 pxor %xmm11,%xmm11 2177 movaps %xmm0,0x40(%rsp) 2178 pxor %xmm12,%xmm12 2179 movaps %xmm0,0x50(%rsp) 2180 pxor %xmm13,%xmm13 2181 movaps %xmm0,0x60(%rsp) 2182 pxor %xmm14,%xmm14 2183 pxor %xmm15,%xmm15 2184___ 2185$code.=<<___ if ($win64); 2186 movaps -0xa8(%r11),%xmm6 2187 movaps %xmm0,-0xa8(%r11) # clear stack 2188 movaps -0x98(%r11),%xmm7 2189 movaps %xmm0,-0x98(%r11) 2190 movaps -0x88(%r11),%xmm8 2191 movaps %xmm0,-0x88(%r11) 2192 movaps -0x78(%r11),%xmm9 2193 movaps %xmm0,-0x78(%r11) 2194 movaps -0x68(%r11),%xmm10 2195 movaps %xmm0,-0x68(%r11) 2196 movaps -0x58(%r11),%xmm11 2197 movaps %xmm0,-0x58(%r11) 2198 movaps -0x48(%r11),%xmm12 2199 movaps %xmm0,-0x48(%r11) 2200 movaps -0x38(%r11),%xmm13 2201 movaps %xmm0,-0x38(%r11) 2202 movaps -0x28(%r11),%xmm14 2203 movaps %xmm0,-0x28(%r11) 2204 movaps -0x18(%r11),%xmm15 2205 movaps %xmm0,-0x18(%r11) 2206 movaps %xmm0,0x00(%rsp) 2207 movaps %xmm0,0x10(%rsp) 2208 movaps %xmm0,0x20(%rsp) 2209 movaps %xmm0,0x30(%rsp) 2210 movaps %xmm0,0x40(%rsp) 2211 movaps %xmm0,0x50(%rsp) 2212 movaps %xmm0,0x60(%rsp) 2213___ 
######################################################################
# The first heredoc below finishes aesni_xts_encrypt: %rbp is reloaded
# from -8(%r11) and %rsp is reset to the %r11 frame pointer before ret.
#
# aesni_xts_decrypt follows (declared \@function,6). Outline of the
# generated code:
#
# - prologue: %r11 keeps the entry %rsp as frame pointer, %rbp is
#   pushed, $frame_size bytes are reserved and %rsp is aligned down to
#   16 bytes; on Win64 %xmm6-%xmm15 are additionally saved at
#   -0xa8..-0x18(%r11);
# - the clear-text tweak is loaded from ($ivp) and encrypted with $key2
#   (aesni_generate1 "enc"); key1's round count is read from 240($key);
# - if $len is not a multiple of 16, $len is reduced by 16, so one whole
#   block is left for the tail at .Lxts_dec_done2;
# - six tweak values are prepared in @tweak[0..5] with the .Lxts_magic
#   mask, and round[0]^round[last] is saved at 0x60(%rsp);
# - .Lxts_dec_grandloop handles 6 blocks per iteration; tweaks xored
#   with the last round key are put aside at 16*0..16*5(%rsp) and
#   consumed by aesdeclast, while the next six tweaks are calculated in
#   between the aesdec rounds;
# - .Lxts_dec_short dispatches the remaining 1..5 whole blocks to
#   .Lxts_dec_one/two/three/four or to the 5-block path, which calls
#   _aesni_decrypt6;
# - when $len_&15 is non-zero, .Lxts_dec_done2 decrypts the next whole
#   block with @tweak[1], .Lxts_dec_steal then swaps the trailing bytes
#   one at a time, and the block at ($out) is decrypted with @tweak[0];
# - .Lxts_dec_ret zeroes the %xmm registers and the 0x00..0x60(%rsp)
#   scratch area (on Win64 %xmm6-%xmm15 are restored and their save
#   slots overwritten instead), then restores %rbp and %rsp via %r11.
######################################################################
2214$code.=<<___; 2215 mov -8(%r11),%rbp 2216 lea (%r11),%rsp 2217.Lxts_enc_epilogue: 2218 ret 2219.size aesni_xts_encrypt,.-aesni_xts_encrypt 2220___ 2221 2222$code.=<<___; 2223.globl aesni_xts_decrypt 2224.type aesni_xts_decrypt,\@function,6 2225.align 16 2226aesni_xts_decrypt: 2227 lea (%rsp),%r11 # frame pointer 2228 push %rbp 2229 sub \$$frame_size,%rsp 2230 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2231___ 2232$code.=<<___ if ($win64); 2233 movaps %xmm6,-0xa8(%r11) # offload everything 2234 movaps %xmm7,-0x98(%r11) 2235 movaps %xmm8,-0x88(%r11) 2236 movaps %xmm9,-0x78(%r11) 2237 movaps %xmm10,-0x68(%r11) 2238 movaps %xmm11,-0x58(%r11) 2239 movaps %xmm12,-0x48(%r11) 2240 movaps %xmm13,-0x38(%r11) 2241 movaps %xmm14,-0x28(%r11) 2242 movaps %xmm15,-0x18(%r11) 2243.Lxts_dec_body: 2244___ 2245$code.=<<___; 2246 movups ($ivp),$inout0 # load clear-text tweak 2247 mov 240($key2),$rounds # key2->rounds 2248 mov 240($key),$rnds_ # key1->rounds 2249___ 2250 # generate the tweak 2251 &aesni_generate1("enc",$key2,$rounds,$inout0); 2252$code.=<<___; 2253 xor %eax,%eax # if ($len%16) len-=16; 2254 test \$15,$len 2255 setnz %al 2256 shl \$4,%rax 2257 sub %rax,$len 2258 2259 $movkey ($key),$rndkey0 # zero round key 2260 mov $key,$key_ # backup $key 2261 mov $rnds_,$rounds # backup $rounds 2262 shl \$4,$rnds_ 2263 mov $len,$len_ # backup $len 2264 and \$-16,$len 2265 2266 $movkey 16($key,$rnds_),$rndkey1 # last round key 2267 2268 movdqa .Lxts_magic(%rip),$twmask 2269 movdqa $inout0,@tweak[5] 2270 pshufd \$0x5f,$inout0,$twres 2271 pxor $rndkey0,$rndkey1 2272___ 2273 for ($i=0;$i<4;$i++) { 2274 $code.=<<___; 2275 movdqa $twres,$twtmp 2276 paddd $twres,$twres 2277 movdqa @tweak[5],@tweak[$i] 2278 psrad \$31,$twtmp # broadcast upper bits 2279 paddq @tweak[5],@tweak[5] 2280 pand $twmask,$twtmp 2281 pxor $rndkey0,@tweak[$i] 2282 pxor $twtmp,@tweak[5] 2283___ 2284 } 2285$code.=<<___; 2286 movdqa @tweak[5],@tweak[4] 2287 psrad \$31,$twres 2288 paddq
@tweak[5],@tweak[5] 2289 pand $twmask,$twres 2290 pxor $rndkey0,@tweak[4] 2291 pxor $twres,@tweak[5] 2292 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2293 2294 sub \$16*6,$len 2295 jc .Lxts_dec_short # if $len-=6*16 borrowed 2296 2297 mov \$16+96,$rounds 2298 lea 32($key_,$rnds_),$key # end of key schedule 2299 sub %r10,%rax # twisted $rounds 2300 $movkey 16($key_),$rndkey1 2301 mov %rax,%r10 # backup twisted $rounds 2302 lea .Lxts_magic(%rip),%r8 2303 jmp .Lxts_dec_grandloop 2304 2305.align 32 2306.Lxts_dec_grandloop: 2307 movdqu `16*0`($inp),$inout0 # load input 2308 movdqa $rndkey0,$twmask 2309 movdqu `16*1`($inp),$inout1 2310 pxor @tweak[0],$inout0 # intput^=tweak^round[0] 2311 movdqu `16*2`($inp),$inout2 2312 pxor @tweak[1],$inout1 2313 aesdec $rndkey1,$inout0 2314 movdqu `16*3`($inp),$inout3 2315 pxor @tweak[2],$inout2 2316 aesdec $rndkey1,$inout1 2317 movdqu `16*4`($inp),$inout4 2318 pxor @tweak[3],$inout3 2319 aesdec $rndkey1,$inout2 2320 movdqu `16*5`($inp),$inout5 2321 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2322 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2323 pxor @tweak[4],$inout4 2324 aesdec $rndkey1,$inout3 2325 $movkey 32($key_),$rndkey0 2326 lea `16*6`($inp),$inp 2327 pxor $twmask,$inout5 2328 2329 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 2330 aesdec $rndkey1,$inout4 2331 pxor $twres,@tweak[1] 2332 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2333 aesdec $rndkey1,$inout5 2334 $movkey 48($key_),$rndkey1 2335 pxor $twres,@tweak[2] 2336 2337 aesdec $rndkey0,$inout0 2338 pxor $twres,@tweak[3] 2339 movdqa @tweak[1],`16*1`(%rsp) 2340 aesdec $rndkey0,$inout1 2341 pxor $twres,@tweak[4] 2342 movdqa @tweak[2],`16*2`(%rsp) 2343 aesdec $rndkey0,$inout2 2344 aesdec $rndkey0,$inout3 2345 pxor $twres,$twmask 2346 movdqa @tweak[4],`16*4`(%rsp) 2347 aesdec $rndkey0,$inout4 2348 aesdec $rndkey0,$inout5 2349 $movkey 64($key_),$rndkey0 2350 movdqa $twmask,`16*5`(%rsp) 2351 pshufd \$0x5f,@tweak[5],$twres 2352 jmp
.Lxts_dec_loop6 2353.align 32 2354.Lxts_dec_loop6: 2355 aesdec $rndkey1,$inout0 2356 aesdec $rndkey1,$inout1 2357 aesdec $rndkey1,$inout2 2358 aesdec $rndkey1,$inout3 2359 aesdec $rndkey1,$inout4 2360 aesdec $rndkey1,$inout5 2361 $movkey -64($key,%rax),$rndkey1 2362 add \$32,%rax 2363 2364 aesdec $rndkey0,$inout0 2365 aesdec $rndkey0,$inout1 2366 aesdec $rndkey0,$inout2 2367 aesdec $rndkey0,$inout3 2368 aesdec $rndkey0,$inout4 2369 aesdec $rndkey0,$inout5 2370 $movkey -80($key,%rax),$rndkey0 2371 jnz .Lxts_dec_loop6 2372 2373 movdqa (%r8),$twmask # start calculating next tweak 2374 movdqa $twres,$twtmp 2375 paddd $twres,$twres 2376 aesdec $rndkey1,$inout0 2377 paddq @tweak[5],@tweak[5] 2378 psrad \$31,$twtmp 2379 aesdec $rndkey1,$inout1 2380 pand $twmask,$twtmp 2381 $movkey ($key_),@tweak[0] # load round[0] 2382 aesdec $rndkey1,$inout2 2383 aesdec $rndkey1,$inout3 2384 aesdec $rndkey1,$inout4 2385 pxor $twtmp,@tweak[5] 2386 movaps @tweak[0],@tweak[1] # copy round[0] 2387 aesdec $rndkey1,$inout5 2388 $movkey -64($key),$rndkey1 2389 2390 movdqa $twres,$twtmp 2391 aesdec $rndkey0,$inout0 2392 paddd $twres,$twres 2393 pxor @tweak[5],@tweak[0] 2394 aesdec $rndkey0,$inout1 2395 psrad \$31,$twtmp 2396 paddq @tweak[5],@tweak[5] 2397 aesdec $rndkey0,$inout2 2398 aesdec $rndkey0,$inout3 2399 pand $twmask,$twtmp 2400 movaps @tweak[1],@tweak[2] 2401 aesdec $rndkey0,$inout4 2402 pxor $twtmp,@tweak[5] 2403 movdqa $twres,$twtmp 2404 aesdec $rndkey0,$inout5 2405 $movkey -48($key),$rndkey0 2406 2407 paddd $twres,$twres 2408 aesdec $rndkey1,$inout0 2409 pxor @tweak[5],@tweak[1] 2410 psrad \$31,$twtmp 2411 aesdec $rndkey1,$inout1 2412 paddq @tweak[5],@tweak[5] 2413 pand $twmask,$twtmp 2414 aesdec $rndkey1,$inout2 2415 aesdec $rndkey1,$inout3 2416 movdqa @tweak[3],`16*3`(%rsp) 2417 pxor $twtmp,@tweak[5] 2418 aesdec $rndkey1,$inout4 2419 movaps @tweak[2],@tweak[3] 2420 movdqa $twres,$twtmp 2421 aesdec $rndkey1,$inout5 2422 $movkey -32($key),$rndkey1 2423 2424 paddd $twres,$twres 2425
aesdec $rndkey0,$inout0 2426 pxor @tweak[5],@tweak[2] 2427 psrad \$31,$twtmp 2428 aesdec $rndkey0,$inout1 2429 paddq @tweak[5],@tweak[5] 2430 pand $twmask,$twtmp 2431 aesdec $rndkey0,$inout2 2432 aesdec $rndkey0,$inout3 2433 aesdec $rndkey0,$inout4 2434 pxor $twtmp,@tweak[5] 2435 movaps @tweak[3],@tweak[4] 2436 aesdec $rndkey0,$inout5 2437 2438 movdqa $twres,$rndkey0 2439 paddd $twres,$twres 2440 aesdec $rndkey1,$inout0 2441 pxor @tweak[5],@tweak[3] 2442 psrad \$31,$rndkey0 2443 aesdec $rndkey1,$inout1 2444 paddq @tweak[5],@tweak[5] 2445 pand $twmask,$rndkey0 2446 aesdec $rndkey1,$inout2 2447 aesdec $rndkey1,$inout3 2448 pxor $rndkey0,@tweak[5] 2449 $movkey ($key_),$rndkey0 2450 aesdec $rndkey1,$inout4 2451 aesdec $rndkey1,$inout5 2452 $movkey 16($key_),$rndkey1 2453 2454 pxor @tweak[5],@tweak[4] 2455 aesdeclast `16*0`(%rsp),$inout0 2456 psrad \$31,$twres 2457 paddq @tweak[5],@tweak[5] 2458 aesdeclast `16*1`(%rsp),$inout1 2459 aesdeclast `16*2`(%rsp),$inout2 2460 pand $twmask,$twres 2461 mov %r10,%rax # restore $rounds 2462 aesdeclast `16*3`(%rsp),$inout3 2463 aesdeclast `16*4`(%rsp),$inout4 2464 aesdeclast `16*5`(%rsp),$inout5 2465 pxor $twres,@tweak[5] 2466 2467 lea `16*6`($out),$out # $out+=6*16 2468 movups $inout0,`-16*6`($out) # store 6 output blocks 2469 movups $inout1,`-16*5`($out) 2470 movups $inout2,`-16*4`($out) 2471 movups $inout3,`-16*3`($out) 2472 movups $inout4,`-16*2`($out) 2473 movups $inout5,`-16*1`($out) 2474 sub \$16*6,$len 2475 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2476 2477 mov \$16+96,$rounds 2478 sub $rnds_,$rounds 2479 mov $key_,$key # restore $key 2480 shr \$4,$rounds # restore original value 2481 2482.Lxts_dec_short: 2483 # at the point @tweak[0..5] are populated with tweak values 2484 mov $rounds,$rnds_ # backup $rounds 2485 pxor $rndkey0,@tweak[0] 2486 pxor $rndkey0,@tweak[1] 2487 add \$16*6,$len # restore real remaining $len 2488 jz .Lxts_dec_done # done if ($len==0) 2489 2490 pxor $rndkey0,@tweak[2] 2491 cmp
\$0x20,$len 2492 jb .Lxts_dec_one # $len is 1*16 2493 pxor $rndkey0,@tweak[3] 2494 je .Lxts_dec_two # $len is 2*16 2495 2496 pxor $rndkey0,@tweak[4] 2497 cmp \$0x40,$len 2498 jb .Lxts_dec_three # $len is 3*16 2499 je .Lxts_dec_four # $len is 4*16 2500 2501 movdqu ($inp),$inout0 # $len is 5*16 2502 movdqu 16*1($inp),$inout1 2503 movdqu 16*2($inp),$inout2 2504 pxor @tweak[0],$inout0 2505 movdqu 16*3($inp),$inout3 2506 pxor @tweak[1],$inout1 2507 movdqu 16*4($inp),$inout4 2508 lea 16*5($inp),$inp # $inp+=5*16 2509 pxor @tweak[2],$inout2 2510 pxor @tweak[3],$inout3 2511 pxor @tweak[4],$inout4 2512 2513 call _aesni_decrypt6 2514 2515 xorps @tweak[0],$inout0 2516 xorps @tweak[1],$inout1 2517 xorps @tweak[2],$inout2 2518 movdqu $inout0,($out) # store 5 output blocks 2519 xorps @tweak[3],$inout3 2520 movdqu $inout1,16*1($out) 2521 xorps @tweak[4],$inout4 2522 movdqu $inout2,16*2($out) 2523 pxor $twtmp,$twtmp 2524 movdqu $inout3,16*3($out) 2525 pcmpgtd @tweak[5],$twtmp 2526 movdqu $inout4,16*4($out) 2527 lea 16*5($out),$out # $out+=5*16 2528 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2529 and \$15,$len_ 2530 jz .Lxts_dec_ret 2531 2532 movdqa @tweak[5],@tweak[0] 2533 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2534 pand $twmask,@tweak[1] # isolate carry and residue 2535 pxor @tweak[5],@tweak[1] 2536 jmp .Lxts_dec_done2 2537 2538.align 16 2539.Lxts_dec_one: 2540 movups ($inp),$inout0 2541 lea 16*1($inp),$inp # $inp+=1*16 2542 xorps @tweak[0],$inout0 2543___ 2544 &aesni_generate1("dec",$key,$rounds); 2545$code.=<<___; 2546 xorps @tweak[0],$inout0 2547 movdqa @tweak[1],@tweak[0] 2548 movups $inout0,($out) # store one output block 2549 movdqa @tweak[2],@tweak[1] 2550 lea 16*1($out),$out # $out+=1*16 2551 jmp .Lxts_dec_done 2552 2553.align 16 2554.Lxts_dec_two: 2555 movups ($inp),$inout0 2556 movups 16($inp),$inout1 2557 lea 32($inp),$inp # $inp+=2*16 2558 xorps @tweak[0],$inout0 2559 xorps @tweak[1],$inout1 2560 2561 call _aesni_decrypt2 2562 2563 xorps @tweak[0],$inout0 2564 movdqa
@tweak[2],@tweak[0] 2565 xorps @tweak[1],$inout1 2566 movdqa @tweak[3],@tweak[1] 2567 movups $inout0,($out) # store 2 output blocks 2568 movups $inout1,16*1($out) 2569 lea 16*2($out),$out # $out+=2*16 2570 jmp .Lxts_dec_done 2571 2572.align 16 2573.Lxts_dec_three: 2574 movups ($inp),$inout0 2575 movups 16*1($inp),$inout1 2576 movups 16*2($inp),$inout2 2577 lea 16*3($inp),$inp # $inp+=3*16 2578 xorps @tweak[0],$inout0 2579 xorps @tweak[1],$inout1 2580 xorps @tweak[2],$inout2 2581 2582 call _aesni_decrypt3 2583 2584 xorps @tweak[0],$inout0 2585 movdqa @tweak[3],@tweak[0] 2586 xorps @tweak[1],$inout1 2587 movdqa @tweak[4],@tweak[1] 2588 xorps @tweak[2],$inout2 2589 movups $inout0,($out) # store 3 output blocks 2590 movups $inout1,16*1($out) 2591 movups $inout2,16*2($out) 2592 lea 16*3($out),$out # $out+=3*16 2593 jmp .Lxts_dec_done 2594 2595.align 16 2596.Lxts_dec_four: 2597 movups ($inp),$inout0 2598 movups 16*1($inp),$inout1 2599 movups 16*2($inp),$inout2 2600 xorps @tweak[0],$inout0 2601 movups 16*3($inp),$inout3 2602 lea 16*4($inp),$inp # $inp+=4*16 2603 xorps @tweak[1],$inout1 2604 xorps @tweak[2],$inout2 2605 xorps @tweak[3],$inout3 2606 2607 call _aesni_decrypt4 2608 2609 pxor @tweak[0],$inout0 2610 movdqa @tweak[4],@tweak[0] 2611 pxor @tweak[1],$inout1 2612 movdqa @tweak[5],@tweak[1] 2613 pxor @tweak[2],$inout2 2614 movdqu $inout0,($out) # store 4 output blocks 2615 pxor @tweak[3],$inout3 2616 movdqu $inout1,16*1($out) 2617 movdqu $inout2,16*2($out) 2618 movdqu $inout3,16*3($out) 2619 lea 16*4($out),$out # $out+=4*16 2620 jmp .Lxts_dec_done 2621 2622.align 16 2623.Lxts_dec_done: 2624 and \$15,$len_ # see if $len%16 is 0 2625 jz .Lxts_dec_ret 2626.Lxts_dec_done2: 2627 mov $len_,$len 2628 mov $key_,$key # restore $key 2629 mov $rnds_,$rounds # restore $rounds 2630 2631 movups ($inp),$inout0 2632 xorps @tweak[1],$inout0 2633___ 2634 &aesni_generate1("dec",$key,$rounds); 2635$code.=<<___; 2636 xorps @tweak[1],$inout0 2637 movups $inout0,($out) 2638
2639.Lxts_dec_steal: 2640 movzb 16($inp),%eax # borrow $rounds ... 2641 movzb ($out),%ecx # ... and $key 2642 lea 1($inp),$inp 2643 mov %al,($out) 2644 mov %cl,16($out) 2645 lea 1($out),$out 2646 sub \$1,$len 2647 jnz .Lxts_dec_steal 2648 2649 sub $len_,$out # rewind $out 2650 mov $key_,$key # restore $key 2651 mov $rnds_,$rounds # restore $rounds 2652 2653 movups ($out),$inout0 2654 xorps @tweak[0],$inout0 2655___ 2656 &aesni_generate1("dec",$key,$rounds); 2657$code.=<<___; 2658 xorps @tweak[0],$inout0 2659 movups $inout0,($out) 2660 2661.Lxts_dec_ret: 2662 xorps %xmm0,%xmm0 # clear register bank 2663 pxor %xmm1,%xmm1 2664 pxor %xmm2,%xmm2 2665 pxor %xmm3,%xmm3 2666 pxor %xmm4,%xmm4 2667 pxor %xmm5,%xmm5 2668___ 2669$code.=<<___ if (!$win64); 2670 pxor %xmm6,%xmm6 2671 pxor %xmm7,%xmm7 2672 movaps %xmm0,0x00(%rsp) # clear stack 2673 pxor %xmm8,%xmm8 2674 movaps %xmm0,0x10(%rsp) 2675 pxor %xmm9,%xmm9 2676 movaps %xmm0,0x20(%rsp) 2677 pxor %xmm10,%xmm10 2678 movaps %xmm0,0x30(%rsp) 2679 pxor %xmm11,%xmm11 2680 movaps %xmm0,0x40(%rsp) 2681 pxor %xmm12,%xmm12 2682 movaps %xmm0,0x50(%rsp) 2683 pxor %xmm13,%xmm13 2684 movaps %xmm0,0x60(%rsp) 2685 pxor %xmm14,%xmm14 2686 pxor %xmm15,%xmm15 2687___ 2688$code.=<<___ if ($win64); 2689 movaps -0xa8(%r11),%xmm6 2690 movaps %xmm0,-0xa8(%r11) # clear stack 2691 movaps -0x98(%r11),%xmm7 2692 movaps %xmm0,-0x98(%r11) 2693 movaps -0x88(%r11),%xmm8 2694 movaps %xmm0,-0x88(%r11) 2695 movaps -0x78(%r11),%xmm9 2696 movaps %xmm0,-0x78(%r11) 2697 movaps -0x68(%r11),%xmm10 2698 movaps %xmm0,-0x68(%r11) 2699 movaps -0x58(%r11),%xmm11 2700 movaps %xmm0,-0x58(%r11) 2701 movaps -0x48(%r11),%xmm12 2702 movaps %xmm0,-0x48(%r11) 2703 movaps -0x38(%r11),%xmm13 2704 movaps %xmm0,-0x38(%r11) 2705 movaps -0x28(%r11),%xmm14 2706 movaps %xmm0,-0x28(%r11) 2707 movaps -0x18(%r11),%xmm15 2708 movaps %xmm0,-0x18(%r11) 2709 movaps %xmm0,0x00(%rsp) 2710 movaps %xmm0,0x10(%rsp) 2711 movaps %xmm0,0x20(%rsp) 2712 movaps %xmm0,0x30(%rsp) 2713 movaps
%xmm0,0x40(%rsp) 2714 movaps %xmm0,0x50(%rsp) 2715 movaps %xmm0,0x60(%rsp) 2716___ 2717$code.=<<___; 2718 mov -8(%r11),%rbp 2719 lea (%r11),%rsp 2720.Lxts_dec_epilogue: 2721 ret 2722.size aesni_xts_decrypt,.-aesni_xts_decrypt 2723___ 2724} 2725 2726###################################################################### 2727# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 2728# const AES_KEY *key, unsigned int start_block_num, 2729# unsigned char offset_i[16], const unsigned char L_[][16], 2730# unsigned char checksum[16]); 2731# 2732{ 2733my @offset=map("%xmm$_",(10..15)); 2734my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); 2735my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments 2736my ($L_p,$checksum_p) = ("%rbx","%rbp"); 2737my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); 2738my $seventh_arg = $win64 ? 56 : 8; 2739my $blocks = $len; 2740 2741$code.=<<___; 2742.globl aesni_ocb_encrypt 2743.type aesni_ocb_encrypt,\@function,6 2744.align 32 2745aesni_ocb_encrypt: 2746 lea (%rsp),%rax 2747 push %rbx 2748 push %rbp 2749 push %r12 2750 push %r13 2751 push %r14 2752___ 2753$code.=<<___ if ($win64); 2754 lea -0xa0(%rsp),%rsp 2755 movaps %xmm6,0x00(%rsp) # offload everything 2756 movaps %xmm7,0x10(%rsp) 2757 movaps %xmm8,0x20(%rsp) 2758 movaps %xmm9,0x30(%rsp) 2759 movaps %xmm10,0x40(%rsp) 2760 movaps %xmm11,0x50(%rsp) 2761 movaps %xmm12,0x60(%rsp) 2762 movaps %xmm13,0x70(%rsp) 2763 movaps %xmm14,0x80(%rsp) 2764 movaps %xmm15,0x90(%rsp) 2765.Locb_enc_body: 2766___ 2767$code.=<<___; 2768 mov $seventh_arg(%rax),$L_p # 7th argument 2769 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 2770 2771 mov 240($key),$rnds_ 2772 mov $key,$key_ 2773 shl \$4,$rnds_ 2774 $movkey ($key),$rndkey0l # round[0] 2775 $movkey 16($key,$rnds_),$rndkey1 # round[last] 2776 2777 movdqu ($offset_p),@offset[5] # load last offset_i 2778 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 2779 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 2780 2781 mov \$16+32,$rounds 2782
lea 32($key_,$rnds_),$key 2783 $movkey 16($key_),$rndkey1 # round[1] 2784 sub %r10,%rax # twisted $rounds 2785 mov %rax,%r10 # backup twisted $rounds 2786 2787 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 2788 movdqu ($checksum_p),$checksum # load checksum 2789 2790 test \$1,$block_num # is first block number odd? 2791 jnz .Locb_enc_odd 2792 2793 bsf $block_num,$i1 2794 add \$1,$block_num 2795 shl \$4,$i1 2796 movdqu ($L_p,$i1),$inout5 # borrow 2797 movdqu ($inp),$inout0 2798 lea 16($inp),$inp 2799 2800 call __ocb_encrypt1 2801 2802 movdqa $inout5,@offset[5] 2803 movups $inout0,($out) 2804 lea 16($out),$out 2805 sub \$1,$blocks 2806 jz .Locb_enc_done 2807 2808.Locb_enc_odd: 2809 lea 1($block_num),$i1 # even-numbered blocks 2810 lea 3($block_num),$i3 2811 lea 5($block_num),$i5 2812 lea 6($block_num),$block_num 2813 bsf $i1,$i1 # ntz(block) 2814 bsf $i3,$i3 2815 bsf $i5,$i5 2816 shl \$4,$i1 # ntz(block) -> table offset 2817 shl \$4,$i3 2818 shl \$4,$i5 2819 2820 sub \$6,$blocks 2821 jc .Locb_enc_short 2822 jmp .Locb_enc_grandloop 2823 2824.align 32 2825.Locb_enc_grandloop: 2826 movdqu `16*0`($inp),$inout0 # load input 2827 movdqu `16*1`($inp),$inout1 2828 movdqu `16*2`($inp),$inout2 2829 movdqu `16*3`($inp),$inout3 2830 movdqu `16*4`($inp),$inout4 2831 movdqu `16*5`($inp),$inout5 2832 lea `16*6`($inp),$inp 2833 2834 call __ocb_encrypt6 2835 2836 movups $inout0,`16*0`($out) # store output 2837 movups $inout1,`16*1`($out) 2838 movups $inout2,`16*2`($out) 2839 movups $inout3,`16*3`($out) 2840 movups $inout4,`16*4`($out) 2841 movups $inout5,`16*5`($out) 2842 lea `16*6`($out),$out 2843 sub \$6,$blocks 2844 jnc .Locb_enc_grandloop 2845 2846.Locb_enc_short: 2847 add \$6,$blocks 2848 jz .Locb_enc_done 2849 2850 movdqu `16*0`($inp),$inout0 2851 cmp \$2,$blocks 2852 jb .Locb_enc_one 2853 movdqu `16*1`($inp),$inout1 2854 je .Locb_enc_two 2855 2856 movdqu `16*2`($inp),$inout2 2857 cmp \$4,$blocks 2858 jb .Locb_enc_three 2859 movdqu `16*3`($inp),$inout3 2860 je 
.Locb_enc_four 2861 2862 movdqu `16*4`($inp),$inout4 2863 pxor $inout5,$inout5 2864 2865 call __ocb_encrypt6 2866 2867 movdqa @offset[4],@offset[5] 2868 movups $inout0,`16*0`($out) 2869 movups $inout1,`16*1`($out) 2870 movups $inout2,`16*2`($out) 2871 movups $inout3,`16*3`($out) 2872 movups $inout4,`16*4`($out) 2873 2874 jmp .Locb_enc_done 2875 2876.align 16 2877.Locb_enc_one: 2878 movdqa @offset[0],$inout5 # borrow 2879 2880 call __ocb_encrypt1 2881 2882 movdqa $inout5,@offset[5] 2883 movups $inout0,`16*0`($out) 2884 jmp .Locb_enc_done 2885 2886.align 16 2887.Locb_enc_two: 2888 pxor $inout2,$inout2 2889 pxor $inout3,$inout3 2890 2891 call __ocb_encrypt4 2892 2893 movdqa @offset[1],@offset[5] 2894 movups $inout0,`16*0`($out) 2895 movups $inout1,`16*1`($out) 2896 2897 jmp .Locb_enc_done 2898 2899.align 16 2900.Locb_enc_three: 2901 pxor $inout3,$inout3 2902 2903 call __ocb_encrypt4 2904 2905 movdqa @offset[2],@offset[5] 2906 movups $inout0,`16*0`($out) 2907 movups $inout1,`16*1`($out) 2908 movups $inout2,`16*2`($out) 2909 2910 jmp .Locb_enc_done 2911 2912.align 16 2913.Locb_enc_four: 2914 call __ocb_encrypt4 2915 2916 movdqa @offset[3],@offset[5] 2917 movups $inout0,`16*0`($out) 2918 movups $inout1,`16*1`($out) 2919 movups $inout2,`16*2`($out) 2920 movups $inout3,`16*3`($out) 2921 2922.Locb_enc_done: 2923 pxor $rndkey0,@offset[5] # "remove" round[last] 2924 movdqu $checksum,($checksum_p) # store checksum 2925 movdqu @offset[5],($offset_p) # store last offset_i 2926 2927 xorps %xmm0,%xmm0 # clear register bank 2928 pxor %xmm1,%xmm1 2929 pxor %xmm2,%xmm2 2930 pxor %xmm3,%xmm3 2931 pxor %xmm4,%xmm4 2932 pxor %xmm5,%xmm5 2933___ 2934$code.=<<___ if (!$win64); 2935 pxor %xmm6,%xmm6 2936 pxor %xmm7,%xmm7 2937 pxor %xmm8,%xmm8 2938 pxor %xmm9,%xmm9 2939 pxor %xmm10,%xmm10 2940 pxor %xmm11,%xmm11 2941 pxor %xmm12,%xmm12 2942 pxor %xmm13,%xmm13 2943 pxor %xmm14,%xmm14 2944 pxor %xmm15,%xmm15 2945 lea 0x28(%rsp),%rax 2946___ 2947$code.=<<___ if ($win64); 2948 movaps 
0x00(%rsp),%xmm6 2949 movaps %xmm0,0x00(%rsp) # clear stack 2950 movaps 0x10(%rsp),%xmm7 2951 movaps %xmm0,0x10(%rsp) 2952 movaps 0x20(%rsp),%xmm8 2953 movaps %xmm0,0x20(%rsp) 2954 movaps 0x30(%rsp),%xmm9 2955 movaps %xmm0,0x30(%rsp) 2956 movaps 0x40(%rsp),%xmm10 2957 movaps %xmm0,0x40(%rsp) 2958 movaps 0x50(%rsp),%xmm11 2959 movaps %xmm0,0x50(%rsp) 2960 movaps 0x60(%rsp),%xmm12 2961 movaps %xmm0,0x60(%rsp) 2962 movaps 0x70(%rsp),%xmm13 2963 movaps %xmm0,0x70(%rsp) 2964 movaps 0x80(%rsp),%xmm14 2965 movaps %xmm0,0x80(%rsp) 2966 movaps 0x90(%rsp),%xmm15 2967 movaps %xmm0,0x90(%rsp) 2968 lea 0xa0+0x28(%rsp),%rax 2969.Locb_enc_pop: 2970___ 2971$code.=<<___; 2972 mov -40(%rax),%r14 2973 mov -32(%rax),%r13 2974 mov -24(%rax),%r12 2975 mov -16(%rax),%rbp 2976 mov -8(%rax),%rbx 2977 lea (%rax),%rsp 2978.Locb_enc_epilogue: 2979 ret 2980.size aesni_ocb_encrypt,.-aesni_ocb_encrypt 2981 2982.type __ocb_encrypt6,\@abi-omnipotent 2983.align 32 2984__ocb_encrypt6: 2985 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 2986 movdqu ($L_p,$i1),@offset[1] 2987 movdqa @offset[0],@offset[2] 2988 movdqu ($L_p,$i3),@offset[3] 2989 movdqa @offset[0],@offset[4] 2990 pxor @offset[5],@offset[0] 2991 movdqu ($L_p,$i5),@offset[5] 2992 pxor @offset[0],@offset[1] 2993 pxor $inout0,$checksum # accumulate checksum 2994 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 2995 pxor @offset[1],@offset[2] 2996 pxor $inout1,$checksum 2997 pxor @offset[1],$inout1 2998 pxor @offset[2],@offset[3] 2999 pxor $inout2,$checksum 3000 pxor @offset[2],$inout2 3001 pxor @offset[3],@offset[4] 3002 pxor $inout3,$checksum 3003 pxor @offset[3],$inout3 3004 pxor @offset[4],@offset[5] 3005 pxor $inout4,$checksum 3006 pxor @offset[4],$inout4 3007 pxor $inout5,$checksum 3008 pxor @offset[5],$inout5 3009 $movkey 32($key_),$rndkey0 3010 3011 lea 1($block_num),$i1 # even-numbered blocks 3012 lea 3($block_num),$i3 3013 lea 5($block_num),$i5 3014 add \$6,$block_num 3015 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3016 
bsf $i1,$i1 # ntz(block) 3017 bsf $i3,$i3 3018 bsf $i5,$i5 3019 3020 aesenc $rndkey1,$inout0 3021 aesenc $rndkey1,$inout1 3022 aesenc $rndkey1,$inout2 3023 aesenc $rndkey1,$inout3 3024 pxor $rndkey0l,@offset[1] 3025 pxor $rndkey0l,@offset[2] 3026 aesenc $rndkey1,$inout4 3027 pxor $rndkey0l,@offset[3] 3028 pxor $rndkey0l,@offset[4] 3029 aesenc $rndkey1,$inout5 3030 $movkey 48($key_),$rndkey1 3031 pxor $rndkey0l,@offset[5] 3032 3033 aesenc $rndkey0,$inout0 3034 aesenc $rndkey0,$inout1 3035 aesenc $rndkey0,$inout2 3036 aesenc $rndkey0,$inout3 3037 aesenc $rndkey0,$inout4 3038 aesenc $rndkey0,$inout5 3039 $movkey 64($key_),$rndkey0 3040 shl \$4,$i1 # ntz(block) -> table offset 3041 shl \$4,$i3 3042 jmp .Locb_enc_loop6 3043 3044.align 32 3045.Locb_enc_loop6: 3046 aesenc $rndkey1,$inout0 3047 aesenc $rndkey1,$inout1 3048 aesenc $rndkey1,$inout2 3049 aesenc $rndkey1,$inout3 3050 aesenc $rndkey1,$inout4 3051 aesenc $rndkey1,$inout5 3052 $movkey ($key,%rax),$rndkey1 3053 add \$32,%rax 3054 3055 aesenc $rndkey0,$inout0 3056 aesenc $rndkey0,$inout1 3057 aesenc $rndkey0,$inout2 3058 aesenc $rndkey0,$inout3 3059 aesenc $rndkey0,$inout4 3060 aesenc $rndkey0,$inout5 3061 $movkey -16($key,%rax),$rndkey0 3062 jnz .Locb_enc_loop6 3063 3064 aesenc $rndkey1,$inout0 3065 aesenc $rndkey1,$inout1 3066 aesenc $rndkey1,$inout2 3067 aesenc $rndkey1,$inout3 3068 aesenc $rndkey1,$inout4 3069 aesenc $rndkey1,$inout5 3070 $movkey 16($key_),$rndkey1 3071 shl \$4,$i5 3072 3073 aesenclast @offset[0],$inout0 3074 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3075 mov %r10,%rax # restore twisted rounds 3076 aesenclast @offset[1],$inout1 3077 aesenclast @offset[2],$inout2 3078 aesenclast @offset[3],$inout3 3079 aesenclast @offset[4],$inout4 3080 aesenclast @offset[5],$inout5 3081 ret 3082.size __ocb_encrypt6,.-__ocb_encrypt6 3083 3084.type __ocb_encrypt4,\@abi-omnipotent 3085.align 32 3086__ocb_encrypt4: 3087 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3088 movdqu 
($L_p,$i1),@offset[1] 3089 movdqa @offset[0],@offset[2] 3090 movdqu ($L_p,$i3),@offset[3] 3091 pxor @offset[5],@offset[0] 3092 pxor @offset[0],@offset[1] 3093 pxor $inout0,$checksum # accumulate checksum 3094 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3095 pxor @offset[1],@offset[2] 3096 pxor $inout1,$checksum 3097 pxor @offset[1],$inout1 3098 pxor @offset[2],@offset[3] 3099 pxor $inout2,$checksum 3100 pxor @offset[2],$inout2 3101 pxor $inout3,$checksum 3102 pxor @offset[3],$inout3 3103 $movkey 32($key_),$rndkey0 3104 3105 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3106 pxor $rndkey0l,@offset[1] 3107 pxor $rndkey0l,@offset[2] 3108 pxor $rndkey0l,@offset[3] 3109 3110 aesenc $rndkey1,$inout0 3111 aesenc $rndkey1,$inout1 3112 aesenc $rndkey1,$inout2 3113 aesenc $rndkey1,$inout3 3114 $movkey 48($key_),$rndkey1 3115 3116 aesenc $rndkey0,$inout0 3117 aesenc $rndkey0,$inout1 3118 aesenc $rndkey0,$inout2 3119 aesenc $rndkey0,$inout3 3120 $movkey 64($key_),$rndkey0 3121 jmp .Locb_enc_loop4 3122 3123.align 32 3124.Locb_enc_loop4: 3125 aesenc $rndkey1,$inout0 3126 aesenc $rndkey1,$inout1 3127 aesenc $rndkey1,$inout2 3128 aesenc $rndkey1,$inout3 3129 $movkey ($key,%rax),$rndkey1 3130 add \$32,%rax 3131 3132 aesenc $rndkey0,$inout0 3133 aesenc $rndkey0,$inout1 3134 aesenc $rndkey0,$inout2 3135 aesenc $rndkey0,$inout3 3136 $movkey -16($key,%rax),$rndkey0 3137 jnz .Locb_enc_loop4 3138 3139 aesenc $rndkey1,$inout0 3140 aesenc $rndkey1,$inout1 3141 aesenc $rndkey1,$inout2 3142 aesenc $rndkey1,$inout3 3143 $movkey 16($key_),$rndkey1 3144 mov %r10,%rax # restore twisted rounds 3145 3146 aesenclast @offset[0],$inout0 3147 aesenclast @offset[1],$inout1 3148 aesenclast @offset[2],$inout2 3149 aesenclast @offset[3],$inout3 3150 ret 3151.size __ocb_encrypt4,.-__ocb_encrypt4 3152 3153.type __ocb_encrypt1,\@abi-omnipotent 3154.align 32 3155__ocb_encrypt1: 3156 pxor @offset[5],$inout5 # offset_i 3157 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3158 pxor $inout0,$checksum 
# accumulate checksum 3159 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3160 $movkey 32($key_),$rndkey0 3161 3162 aesenc $rndkey1,$inout0 3163 $movkey 48($key_),$rndkey1 3164 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3165 3166 aesenc $rndkey0,$inout0 3167 $movkey 64($key_),$rndkey0 3168 jmp .Locb_enc_loop1 3169 3170.align 32 3171.Locb_enc_loop1: 3172 aesenc $rndkey1,$inout0 3173 $movkey ($key,%rax),$rndkey1 3174 add \$32,%rax 3175 3176 aesenc $rndkey0,$inout0 3177 $movkey -16($key,%rax),$rndkey0 3178 jnz .Locb_enc_loop1 3179 3180 aesenc $rndkey1,$inout0 3181 $movkey 16($key_),$rndkey1 # redundant in tail 3182 mov %r10,%rax # restore twisted rounds 3183 3184 aesenclast $inout5,$inout0 3185 ret 3186.size __ocb_encrypt1,.-__ocb_encrypt1 3187 3188.globl aesni_ocb_decrypt 3189.type aesni_ocb_decrypt,\@function,6 3190.align 32 3191aesni_ocb_decrypt: 3192 lea (%rsp),%rax 3193 push %rbx 3194 push %rbp 3195 push %r12 3196 push %r13 3197 push %r14 3198___ 3199$code.=<<___ if ($win64); 3200 lea -0xa0(%rsp),%rsp 3201 movaps %xmm6,0x00(%rsp) # offload everything 3202 movaps %xmm7,0x10(%rsp) 3203 movaps %xmm8,0x20(%rsp) 3204 movaps %xmm9,0x30(%rsp) 3205 movaps %xmm10,0x40(%rsp) 3206 movaps %xmm11,0x50(%rsp) 3207 movaps %xmm12,0x60(%rsp) 3208 movaps %xmm13,0x70(%rsp) 3209 movaps %xmm14,0x80(%rsp) 3210 movaps %xmm15,0x90(%rsp) 3211.Locb_dec_body: 3212___ 3213$code.=<<___; 3214 mov $seventh_arg(%rax),$L_p # 7th argument 3215 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 3216 3217 mov 240($key),$rnds_ 3218 mov $key,$key_ 3219 shl \$4,$rnds_ 3220 $movkey ($key),$rndkey0l # round[0] 3221 $movkey 16($key,$rnds_),$rndkey1 # round[last] 3222 3223 movdqu ($offset_p),@offset[5] # load last offset_i 3224 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 3225 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 3226 3227 mov \$16+32,$rounds 3228 lea 32($key_,$rnds_),$key 3229 $movkey 16($key_),$rndkey1 # round[1] 3230 sub %r10,%rax # twisted $rounds 3231 mov %rax,%r10 # backup 
twisted $rounds 3232 3233 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3234 movdqu ($checksum_p),$checksum # load checksum 3235 3236 test \$1,$block_num # is first block number odd? 3237 jnz .Locb_dec_odd 3238 3239 bsf $block_num,$i1 3240 add \$1,$block_num 3241 shl \$4,$i1 3242 movdqu ($L_p,$i1),$inout5 # borrow 3243 movdqu ($inp),$inout0 3244 lea 16($inp),$inp 3245 3246 call __ocb_decrypt1 3247 3248 movdqa $inout5,@offset[5] 3249 movups $inout0,($out) 3250 xorps $inout0,$checksum # accumulate checksum 3251 lea 16($out),$out 3252 sub \$1,$blocks 3253 jz .Locb_dec_done 3254 3255.Locb_dec_odd: 3256 lea 1($block_num),$i1 # even-numbered blocks 3257 lea 3($block_num),$i3 3258 lea 5($block_num),$i5 3259 lea 6($block_num),$block_num 3260 bsf $i1,$i1 # ntz(block) 3261 bsf $i3,$i3 3262 bsf $i5,$i5 3263 shl \$4,$i1 # ntz(block) -> table offset 3264 shl \$4,$i3 3265 shl \$4,$i5 3266 3267 sub \$6,$blocks 3268 jc .Locb_dec_short 3269 jmp .Locb_dec_grandloop 3270 3271.align 32 3272.Locb_dec_grandloop: 3273 movdqu `16*0`($inp),$inout0 # load input 3274 movdqu `16*1`($inp),$inout1 3275 movdqu `16*2`($inp),$inout2 3276 movdqu `16*3`($inp),$inout3 3277 movdqu `16*4`($inp),$inout4 3278 movdqu `16*5`($inp),$inout5 3279 lea `16*6`($inp),$inp 3280 3281 call __ocb_decrypt6 3282 3283 movups $inout0,`16*0`($out) # store output 3284 pxor $inout0,$checksum # accumulate checksum 3285 movups $inout1,`16*1`($out) 3286 pxor $inout1,$checksum 3287 movups $inout2,`16*2`($out) 3288 pxor $inout2,$checksum 3289 movups $inout3,`16*3`($out) 3290 pxor $inout3,$checksum 3291 movups $inout4,`16*4`($out) 3292 pxor $inout4,$checksum 3293 movups $inout5,`16*5`($out) 3294 pxor $inout5,$checksum 3295 lea `16*6`($out),$out 3296 sub \$6,$blocks 3297 jnc .Locb_dec_grandloop 3298 3299.Locb_dec_short: 3300 add \$6,$blocks 3301 jz .Locb_dec_done 3302 3303 movdqu `16*0`($inp),$inout0 3304 cmp \$2,$blocks 3305 jb .Locb_dec_one 3306 movdqu `16*1`($inp),$inout1 3307 je .Locb_dec_two 3308 3309 movdqu 
`16*2`($inp),$inout2 3310 cmp \$4,$blocks 3311 jb .Locb_dec_three 3312 movdqu `16*3`($inp),$inout3 3313 je .Locb_dec_four 3314 3315 movdqu `16*4`($inp),$inout4 3316 pxor $inout5,$inout5 3317 3318 call __ocb_decrypt6 3319 3320 movdqa @offset[4],@offset[5] 3321 movups $inout0,`16*0`($out) # store output 3322 pxor $inout0,$checksum # accumulate checksum 3323 movups $inout1,`16*1`($out) 3324 pxor $inout1,$checksum 3325 movups $inout2,`16*2`($out) 3326 pxor $inout2,$checksum 3327 movups $inout3,`16*3`($out) 3328 pxor $inout3,$checksum 3329 movups $inout4,`16*4`($out) 3330 pxor $inout4,$checksum 3331 3332 jmp .Locb_dec_done 3333 3334.align 16 3335.Locb_dec_one: 3336 movdqa @offset[0],$inout5 # borrow 3337 3338 call __ocb_decrypt1 3339 3340 movdqa $inout5,@offset[5] 3341 movups $inout0,`16*0`($out) # store output 3342 xorps $inout0,$checksum # accumulate checksum 3343 jmp .Locb_dec_done 3344 3345.align 16 3346.Locb_dec_two: 3347 pxor $inout2,$inout2 3348 pxor $inout3,$inout3 3349 3350 call __ocb_decrypt4 3351 3352 movdqa @offset[1],@offset[5] 3353 movups $inout0,`16*0`($out) # store output 3354 xorps $inout0,$checksum # accumulate checksum 3355 movups $inout1,`16*1`($out) 3356 xorps $inout1,$checksum 3357 3358 jmp .Locb_dec_done 3359 3360.align 16 3361.Locb_dec_three: 3362 pxor $inout3,$inout3 3363 3364 call __ocb_decrypt4 3365 3366 movdqa @offset[2],@offset[5] 3367 movups $inout0,`16*0`($out) # store output 3368 xorps $inout0,$checksum # accumulate checksum 3369 movups $inout1,`16*1`($out) 3370 xorps $inout1,$checksum 3371 movups $inout2,`16*2`($out) 3372 xorps $inout2,$checksum 3373 3374 jmp .Locb_dec_done 3375 3376.align 16 3377.Locb_dec_four: 3378 call __ocb_decrypt4 3379 3380 movdqa @offset[3],@offset[5] 3381 movups $inout0,`16*0`($out) # store output 3382 pxor $inout0,$checksum # accumulate checksum 3383 movups $inout1,`16*1`($out) 3384 pxor $inout1,$checksum 3385 movups $inout2,`16*2`($out) 3386 pxor $inout2,$checksum 3387 movups $inout3,`16*3`($out) 3388 pxor 
$inout3,$checksum 3389 3390.Locb_dec_done: 3391 pxor $rndkey0,@offset[5] # "remove" round[last] 3392 movdqu $checksum,($checksum_p) # store checksum 3393 movdqu @offset[5],($offset_p) # store last offset_i 3394 3395 xorps %xmm0,%xmm0 # clear register bank 3396 pxor %xmm1,%xmm1 3397 pxor %xmm2,%xmm2 3398 pxor %xmm3,%xmm3 3399 pxor %xmm4,%xmm4 3400 pxor %xmm5,%xmm5 3401___ 3402$code.=<<___ if (!$win64); 3403 pxor %xmm6,%xmm6 3404 pxor %xmm7,%xmm7 3405 pxor %xmm8,%xmm8 3406 pxor %xmm9,%xmm9 3407 pxor %xmm10,%xmm10 3408 pxor %xmm11,%xmm11 3409 pxor %xmm12,%xmm12 3410 pxor %xmm13,%xmm13 3411 pxor %xmm14,%xmm14 3412 pxor %xmm15,%xmm15 3413 lea 0x28(%rsp),%rax 3414___ 3415$code.=<<___ if ($win64); 3416 movaps 0x00(%rsp),%xmm6 3417 movaps %xmm0,0x00(%rsp) # clear stack 3418 movaps 0x10(%rsp),%xmm7 3419 movaps %xmm0,0x10(%rsp) 3420 movaps 0x20(%rsp),%xmm8 3421 movaps %xmm0,0x20(%rsp) 3422 movaps 0x30(%rsp),%xmm9 3423 movaps %xmm0,0x30(%rsp) 3424 movaps 0x40(%rsp),%xmm10 3425 movaps %xmm0,0x40(%rsp) 3426 movaps 0x50(%rsp),%xmm11 3427 movaps %xmm0,0x50(%rsp) 3428 movaps 0x60(%rsp),%xmm12 3429 movaps %xmm0,0x60(%rsp) 3430 movaps 0x70(%rsp),%xmm13 3431 movaps %xmm0,0x70(%rsp) 3432 movaps 0x80(%rsp),%xmm14 3433 movaps %xmm0,0x80(%rsp) 3434 movaps 0x90(%rsp),%xmm15 3435 movaps %xmm0,0x90(%rsp) 3436 lea 0xa0+0x28(%rsp),%rax 3437.Locb_dec_pop: 3438___ 3439$code.=<<___; 3440 mov -40(%rax),%r14 3441 mov -32(%rax),%r13 3442 mov -24(%rax),%r12 3443 mov -16(%rax),%rbp 3444 mov -8(%rax),%rbx 3445 lea (%rax),%rsp 3446.Locb_dec_epilogue: 3447 ret 3448.size aesni_ocb_decrypt,.-aesni_ocb_decrypt 3449 3450.type __ocb_decrypt6,\@abi-omnipotent 3451.align 32 3452__ocb_decrypt6: 3453 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3454 movdqu ($L_p,$i1),@offset[1] 3455 movdqa @offset[0],@offset[2] 3456 movdqu ($L_p,$i3),@offset[3] 3457 movdqa @offset[0],@offset[4] 3458 pxor @offset[5],@offset[0] 3459 movdqu ($L_p,$i5),@offset[5] 3460 pxor @offset[0],@offset[1] 3461 pxor @offset[0],$inout0 # 
input ^ round[0] ^ offset_i 3462 pxor @offset[1],@offset[2] 3463 pxor @offset[1],$inout1 3464 pxor @offset[2],@offset[3] 3465 pxor @offset[2],$inout2 3466 pxor @offset[3],@offset[4] 3467 pxor @offset[3],$inout3 3468 pxor @offset[4],@offset[5] 3469 pxor @offset[4],$inout4 3470 pxor @offset[5],$inout5 3471 $movkey 32($key_),$rndkey0 3472 3473 lea 1($block_num),$i1 # even-numbered blocks 3474 lea 3($block_num),$i3 3475 lea 5($block_num),$i5 3476 add \$6,$block_num 3477 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3478 bsf $i1,$i1 # ntz(block) 3479 bsf $i3,$i3 3480 bsf $i5,$i5 3481 3482 aesdec $rndkey1,$inout0 3483 aesdec $rndkey1,$inout1 3484 aesdec $rndkey1,$inout2 3485 aesdec $rndkey1,$inout3 3486 pxor $rndkey0l,@offset[1] 3487 pxor $rndkey0l,@offset[2] 3488 aesdec $rndkey1,$inout4 3489 pxor $rndkey0l,@offset[3] 3490 pxor $rndkey0l,@offset[4] 3491 aesdec $rndkey1,$inout5 3492 $movkey 48($key_),$rndkey1 3493 pxor $rndkey0l,@offset[5] 3494 3495 aesdec $rndkey0,$inout0 3496 aesdec $rndkey0,$inout1 3497 aesdec $rndkey0,$inout2 3498 aesdec $rndkey0,$inout3 3499 aesdec $rndkey0,$inout4 3500 aesdec $rndkey0,$inout5 3501 $movkey 64($key_),$rndkey0 3502 shl \$4,$i1 # ntz(block) -> table offset 3503 shl \$4,$i3 3504 jmp .Locb_dec_loop6 3505 3506.align 32 3507.Locb_dec_loop6: 3508 aesdec $rndkey1,$inout0 3509 aesdec $rndkey1,$inout1 3510 aesdec $rndkey1,$inout2 3511 aesdec $rndkey1,$inout3 3512 aesdec $rndkey1,$inout4 3513 aesdec $rndkey1,$inout5 3514 $movkey ($key,%rax),$rndkey1 3515 add \$32,%rax 3516 3517 aesdec $rndkey0,$inout0 3518 aesdec $rndkey0,$inout1 3519 aesdec $rndkey0,$inout2 3520 aesdec $rndkey0,$inout3 3521 aesdec $rndkey0,$inout4 3522 aesdec $rndkey0,$inout5 3523 $movkey -16($key,%rax),$rndkey0 3524 jnz .Locb_dec_loop6 3525 3526 aesdec $rndkey1,$inout0 3527 aesdec $rndkey1,$inout1 3528 aesdec $rndkey1,$inout2 3529 aesdec $rndkey1,$inout3 3530 aesdec $rndkey1,$inout4 3531 aesdec $rndkey1,$inout5 3532 $movkey 16($key_),$rndkey1 3533 shl \$4,$i5 3534 3535 
aesdeclast @offset[0],$inout0 3536 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3537 mov %r10,%rax # restore twisted rounds 3538 aesdeclast @offset[1],$inout1 3539 aesdeclast @offset[2],$inout2 3540 aesdeclast @offset[3],$inout3 3541 aesdeclast @offset[4],$inout4 3542 aesdeclast @offset[5],$inout5 3543 ret 3544.size __ocb_decrypt6,.-__ocb_decrypt6 3545 3546.type __ocb_decrypt4,\@abi-omnipotent 3547.align 32 3548__ocb_decrypt4: 3549 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3550 movdqu ($L_p,$i1),@offset[1] 3551 movdqa @offset[0],@offset[2] 3552 movdqu ($L_p,$i3),@offset[3] 3553 pxor @offset[5],@offset[0] 3554 pxor @offset[0],@offset[1] 3555 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3556 pxor @offset[1],@offset[2] 3557 pxor @offset[1],$inout1 3558 pxor @offset[2],@offset[3] 3559 pxor @offset[2],$inout2 3560 pxor @offset[3],$inout3 3561 $movkey 32($key_),$rndkey0 3562 3563 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3564 pxor $rndkey0l,@offset[1] 3565 pxor $rndkey0l,@offset[2] 3566 pxor $rndkey0l,@offset[3] 3567 3568 aesdec $rndkey1,$inout0 3569 aesdec $rndkey1,$inout1 3570 aesdec $rndkey1,$inout2 3571 aesdec $rndkey1,$inout3 3572 $movkey 48($key_),$rndkey1 3573 3574 aesdec $rndkey0,$inout0 3575 aesdec $rndkey0,$inout1 3576 aesdec $rndkey0,$inout2 3577 aesdec $rndkey0,$inout3 3578 $movkey 64($key_),$rndkey0 3579 jmp .Locb_dec_loop4 3580 3581.align 32 3582.Locb_dec_loop4: 3583 aesdec $rndkey1,$inout0 3584 aesdec $rndkey1,$inout1 3585 aesdec $rndkey1,$inout2 3586 aesdec $rndkey1,$inout3 3587 $movkey ($key,%rax),$rndkey1 3588 add \$32,%rax 3589 3590 aesdec $rndkey0,$inout0 3591 aesdec $rndkey0,$inout1 3592 aesdec $rndkey0,$inout2 3593 aesdec $rndkey0,$inout3 3594 $movkey -16($key,%rax),$rndkey0 3595 jnz .Locb_dec_loop4 3596 3597 aesdec $rndkey1,$inout0 3598 aesdec $rndkey1,$inout1 3599 aesdec $rndkey1,$inout2 3600 aesdec $rndkey1,$inout3 3601 $movkey 16($key_),$rndkey1 3602 mov %r10,%rax # restore twisted rounds 3603 3604 
aesdeclast @offset[0],$inout0 3605 aesdeclast @offset[1],$inout1 3606 aesdeclast @offset[2],$inout2 3607 aesdeclast @offset[3],$inout3 3608 ret 3609.size __ocb_decrypt4,.-__ocb_decrypt4 3610 3611.type __ocb_decrypt1,\@abi-omnipotent 3612.align 32 3613__ocb_decrypt1: 3614 pxor @offset[5],$inout5 # offset_i 3615 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3616 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3617 $movkey 32($key_),$rndkey0 3618 3619 aesdec $rndkey1,$inout0 3620 $movkey 48($key_),$rndkey1 3621 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3622 3623 aesdec $rndkey0,$inout0 3624 $movkey 64($key_),$rndkey0 3625 jmp .Locb_dec_loop1 3626 3627.align 32 3628.Locb_dec_loop1: 3629 aesdec $rndkey1,$inout0 3630 $movkey ($key,%rax),$rndkey1 3631 add \$32,%rax 3632 3633 aesdec $rndkey0,$inout0 3634 $movkey -16($key,%rax),$rndkey0 3635 jnz .Locb_dec_loop1 3636 3637 aesdec $rndkey1,$inout0 3638 $movkey 16($key_),$rndkey1 # redundant in tail 3639 mov %r10,%rax # restore twisted rounds 3640 3641 aesdeclast $inout5,$inout0 3642 ret 3643.size __ocb_decrypt1,.-__ocb_decrypt1 3644___ 3645} }} 3646 3647######################################################################## 3648# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3649# size_t length, const AES_KEY *key, 3650# unsigned char *ivp,const int enc); 3651{ 3652my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3653my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3654 3655$code.=<<___; 3656.globl ${PREFIX}_cbc_encrypt 3657.type ${PREFIX}_cbc_encrypt,\@function,6 3658.align 16 3659${PREFIX}_cbc_encrypt: 3660 test $len,$len # check length 3661 jz .Lcbc_ret 3662 3663 mov 240($key),$rnds_ # key->rounds 3664 mov $key,$key_ # backup $key 3665 test %r9d,%r9d # 6th argument 3666 jz .Lcbc_decrypt 3667#--------------------------- CBC ENCRYPT ------------------------------# 3668 movups ($ivp),$inout0 # load iv as initial state 3669 mov $rnds_,$rounds 3670 cmp \$16,$len 3671 jb .Lcbc_enc_tail 3672 sub 
\$16,$len 3673 jmp .Lcbc_enc_loop 3674.align 16 3675.Lcbc_enc_loop: 3676 movups ($inp),$inout1 # load input 3677 lea 16($inp),$inp 3678 #xorps $inout1,$inout0 3679___ 3680 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3681$code.=<<___; 3682 mov $rnds_,$rounds # restore $rounds 3683 mov $key_,$key # restore $key 3684 movups $inout0,0($out) # store output 3685 lea 16($out),$out 3686 sub \$16,$len 3687 jnc .Lcbc_enc_loop 3688 add \$16,$len 3689 jnz .Lcbc_enc_tail 3690 pxor $rndkey0,$rndkey0 # clear register bank 3691 pxor $rndkey1,$rndkey1 3692 movups $inout0,($ivp) 3693 pxor $inout0,$inout0 3694 pxor $inout1,$inout1 3695 jmp .Lcbc_ret 3696 3697.Lcbc_enc_tail: 3698 mov $len,%rcx # zaps $key 3699 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3700 .long 0x9066A4F3 # rep movsb 3701 mov \$16,%ecx # zero tail 3702 sub $len,%rcx 3703 xor %eax,%eax 3704 .long 0x9066AAF3 # rep stosb 3705 lea -16(%rdi),%rdi # rewind $out by 1 block 3706 mov $rnds_,$rounds # restore $rounds 3707 mov %rdi,%rsi # $inp and $out are the same 3708 mov $key_,$key # restore $key 3709 xor $len,$len # len=16 3710 jmp .Lcbc_enc_loop # one more spin 3711#--------------------------- CBC DECRYPT ------------------------------# 3712.align 16 3713.Lcbc_decrypt: 3714 cmp \$16,$len 3715 jne .Lcbc_decrypt_bulk 3716 3717 # handle single block without allocating stack frame, 3718 # useful in ciphertext stealing mode 3719 movdqu ($inp),$inout0 # load input 3720 movdqu ($ivp),$inout1 # load iv 3721 movdqa $inout0,$inout2 # future iv 3722___ 3723 &aesni_generate1("dec",$key,$rnds_); 3724$code.=<<___; 3725 pxor $rndkey0,$rndkey0 # clear register bank 3726 pxor $rndkey1,$rndkey1 3727 movdqu $inout2,($ivp) # store iv 3728 xorps $inout1,$inout0 # ^=iv 3729 pxor $inout1,$inout1 3730 movups $inout0,($out) # store output 3731 pxor $inout0,$inout0 3732 jmp .Lcbc_ret 3733.align 16 3734.Lcbc_decrypt_bulk: 3735 lea (%rsp),%r11 # frame pointer 3736 push %rbp 3737 sub \$$frame_size,%rsp 3738 and \$-16,%rsp # Linux 
kernel stack can be incorrectly seeded 3739___ 3740$code.=<<___ if ($win64); 3741 movaps %xmm6,0x10(%rsp) 3742 movaps %xmm7,0x20(%rsp) 3743 movaps %xmm8,0x30(%rsp) 3744 movaps %xmm9,0x40(%rsp) 3745 movaps %xmm10,0x50(%rsp) 3746 movaps %xmm11,0x60(%rsp) 3747 movaps %xmm12,0x70(%rsp) 3748 movaps %xmm13,0x80(%rsp) 3749 movaps %xmm14,0x90(%rsp) 3750 movaps %xmm15,0xa0(%rsp) 3751.Lcbc_decrypt_body: 3752___ 3753 3754my $inp_=$key_="%rbp"; # reassign $key_ 3755 3756$code.=<<___; 3757 mov $key,$key_ # [re-]backup $key [after reassignment] 3758 movups ($ivp),$iv 3759 mov $rnds_,$rounds 3760 cmp \$0x50,$len 3761 jbe .Lcbc_dec_tail 3762 3763 $movkey ($key),$rndkey0 3764 movdqu 0x00($inp),$inout0 # load input 3765 movdqu 0x10($inp),$inout1 3766 movdqa $inout0,$in0 3767 movdqu 0x20($inp),$inout2 3768 movdqa $inout1,$in1 3769 movdqu 0x30($inp),$inout3 3770 movdqa $inout2,$in2 3771 movdqu 0x40($inp),$inout4 3772 movdqa $inout3,$in3 3773 movdqu 0x50($inp),$inout5 3774 movdqa $inout4,$in4 3775 leaq OPENSSL_ia32cap_P(%rip),%r9 3776 mov 4(%r9),%r9d 3777 cmp \$0x70,$len 3778 jbe .Lcbc_dec_six_or_seven 3779 3780 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3781 sub \$0x50,$len # $len is biased by -5*16 3782 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3783 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3784 sub \$0x20,$len # $len is biased by -7*16 3785 lea 0x70($key),$key # size optimization 3786 jmp .Lcbc_dec_loop8_enter 3787.align 16 3788.Lcbc_dec_loop8: 3789 movups $inout7,($out) 3790 lea 0x10($out),$out 3791.Lcbc_dec_loop8_enter: 3792 movdqu 0x60($inp),$inout6 3793 pxor $rndkey0,$inout0 3794 movdqu 0x70($inp),$inout7 3795 pxor $rndkey0,$inout1 3796 $movkey 0x10-0x70($key),$rndkey1 3797 pxor $rndkey0,$inout2 3798 mov \$-1,$inp_ 3799 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 
3800 pxor $rndkey0,$inout3 3801 pxor $rndkey0,$inout4 3802 pxor $rndkey0,$inout5 3803 pxor $rndkey0,$inout6 3804 3805 aesdec $rndkey1,$inout0 3806 pxor $rndkey0,$inout7 3807 $movkey 0x20-0x70($key),$rndkey0 3808 aesdec $rndkey1,$inout1 3809 aesdec $rndkey1,$inout2 3810 aesdec $rndkey1,$inout3 3811 aesdec $rndkey1,$inout4 3812 aesdec $rndkey1,$inout5 3813 aesdec $rndkey1,$inout6 3814 adc \$0,$inp_ 3815 and \$128,$inp_ 3816 aesdec $rndkey1,$inout7 3817 add $inp,$inp_ 3818 $movkey 0x30-0x70($key),$rndkey1 3819___ 3820for($i=1;$i<12;$i++) { 3821my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3822$code.=<<___ if ($i==7); 3823 cmp \$11,$rounds 3824___ 3825$code.=<<___; 3826 aesdec $rndkeyx,$inout0 3827 aesdec $rndkeyx,$inout1 3828 aesdec $rndkeyx,$inout2 3829 aesdec $rndkeyx,$inout3 3830 aesdec $rndkeyx,$inout4 3831 aesdec $rndkeyx,$inout5 3832 aesdec $rndkeyx,$inout6 3833 aesdec $rndkeyx,$inout7 3834 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3835___ 3836$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3837 nop 3838___ 3839$code.=<<___ if ($i==7); 3840 jb .Lcbc_dec_done 3841___ 3842$code.=<<___ if ($i==9); 3843 je .Lcbc_dec_done 3844___ 3845$code.=<<___ if ($i==11); 3846 jmp .Lcbc_dec_done 3847___ 3848} 3849$code.=<<___; 3850.align 16 3851.Lcbc_dec_done: 3852 aesdec $rndkey1,$inout0 3853 aesdec $rndkey1,$inout1 3854 pxor $rndkey0,$iv 3855 pxor $rndkey0,$in0 3856 aesdec $rndkey1,$inout2 3857 aesdec $rndkey1,$inout3 3858 pxor $rndkey0,$in1 3859 pxor $rndkey0,$in2 3860 aesdec $rndkey1,$inout4 3861 aesdec $rndkey1,$inout5 3862 pxor $rndkey0,$in3 3863 pxor $rndkey0,$in4 3864 aesdec $rndkey1,$inout6 3865 aesdec $rndkey1,$inout7 3866 movdqu 0x50($inp),$rndkey1 3867 3868 aesdeclast $iv,$inout0 3869 movdqu 0x60($inp),$iv # borrow $iv 3870 pxor $rndkey0,$rndkey1 3871 aesdeclast $in0,$inout1 3872 pxor $rndkey0,$iv 3873 movdqu 0x70($inp),$rndkey0 # next IV 3874 aesdeclast $in1,$inout2 3875 lea 0x80($inp),$inp 3876 movdqu 0x00($inp_),$in0 3877 aesdeclast $in2,$inout3 3878 aesdeclast 
$in3,$inout4 3879 movdqu 0x10($inp_),$in1 3880 movdqu 0x20($inp_),$in2 3881 aesdeclast $in4,$inout5 3882 aesdeclast $rndkey1,$inout6 3883 movdqu 0x30($inp_),$in3 3884 movdqu 0x40($inp_),$in4 3885 aesdeclast $iv,$inout7 3886 movdqa $rndkey0,$iv # return $iv 3887 movdqu 0x50($inp_),$rndkey1 3888 $movkey -0x70($key),$rndkey0 3889 3890 movups $inout0,($out) # store output 3891 movdqa $in0,$inout0 3892 movups $inout1,0x10($out) 3893 movdqa $in1,$inout1 3894 movups $inout2,0x20($out) 3895 movdqa $in2,$inout2 3896 movups $inout3,0x30($out) 3897 movdqa $in3,$inout3 3898 movups $inout4,0x40($out) 3899 movdqa $in4,$inout4 3900 movups $inout5,0x50($out) 3901 movdqa $rndkey1,$inout5 3902 movups $inout6,0x60($out) 3903 lea 0x70($out),$out 3904 3905 sub \$0x80,$len 3906 ja .Lcbc_dec_loop8 3907 3908 movaps $inout7,$inout0 3909 lea -0x70($key),$key 3910 add \$0x70,$len 3911 jle .Lcbc_dec_clear_tail_collected 3912 movups $inout7,($out) 3913 lea 0x10($out),$out 3914 cmp \$0x50,$len 3915 jbe .Lcbc_dec_tail 3916 3917 movaps $in0,$inout0 3918.Lcbc_dec_six_or_seven: 3919 cmp \$0x60,$len 3920 ja .Lcbc_dec_seven 3921 3922 movaps $inout5,$inout6 3923 call _aesni_decrypt6 3924 pxor $iv,$inout0 # ^= IV 3925 movaps $inout6,$iv 3926 pxor $in0,$inout1 3927 movdqu $inout0,($out) 3928 pxor $in1,$inout2 3929 movdqu $inout1,0x10($out) 3930 pxor $inout1,$inout1 # clear register bank 3931 pxor $in2,$inout3 3932 movdqu $inout2,0x20($out) 3933 pxor $inout2,$inout2 3934 pxor $in3,$inout4 3935 movdqu $inout3,0x30($out) 3936 pxor $inout3,$inout3 3937 pxor $in4,$inout5 3938 movdqu $inout4,0x40($out) 3939 pxor $inout4,$inout4 3940 lea 0x50($out),$out 3941 movdqa $inout5,$inout0 3942 pxor $inout5,$inout5 3943 jmp .Lcbc_dec_tail_collected 3944 3945.align 16 3946.Lcbc_dec_seven: 3947 movups 0x60($inp),$inout6 3948 xorps $inout7,$inout7 3949 call _aesni_decrypt8 3950 movups 0x50($inp),$inout7 3951 pxor $iv,$inout0 # ^= IV 3952 movups 0x60($inp),$iv 3953 pxor $in0,$inout1 3954 movdqu $inout0,($out) 3955 pxor 
$in1,$inout2 3956 movdqu $inout1,0x10($out) 3957 pxor $inout1,$inout1 # clear register bank 3958 pxor $in2,$inout3 3959 movdqu $inout2,0x20($out) 3960 pxor $inout2,$inout2 3961 pxor $in3,$inout4 3962 movdqu $inout3,0x30($out) 3963 pxor $inout3,$inout3 3964 pxor $in4,$inout5 3965 movdqu $inout4,0x40($out) 3966 pxor $inout4,$inout4 3967 pxor $inout7,$inout6 3968 movdqu $inout5,0x50($out) 3969 pxor $inout5,$inout5 3970 lea 0x60($out),$out 3971 movdqa $inout6,$inout0 3972 pxor $inout6,$inout6 3973 pxor $inout7,$inout7 3974 jmp .Lcbc_dec_tail_collected 3975 3976.align 16 3977.Lcbc_dec_loop6: 3978 movups $inout5,($out) 3979 lea 0x10($out),$out 3980 movdqu 0x00($inp),$inout0 # load input 3981 movdqu 0x10($inp),$inout1 3982 movdqa $inout0,$in0 3983 movdqu 0x20($inp),$inout2 3984 movdqa $inout1,$in1 3985 movdqu 0x30($inp),$inout3 3986 movdqa $inout2,$in2 3987 movdqu 0x40($inp),$inout4 3988 movdqa $inout3,$in3 3989 movdqu 0x50($inp),$inout5 3990 movdqa $inout4,$in4 3991.Lcbc_dec_loop6_enter: 3992 lea 0x60($inp),$inp 3993 movdqa $inout5,$inout6 3994 3995 call _aesni_decrypt6 3996 3997 pxor $iv,$inout0 # ^= IV 3998 movdqa $inout6,$iv 3999 pxor $in0,$inout1 4000 movdqu $inout0,($out) 4001 pxor $in1,$inout2 4002 movdqu $inout1,0x10($out) 4003 pxor $in2,$inout3 4004 movdqu $inout2,0x20($out) 4005 pxor $in3,$inout4 4006 mov $key_,$key 4007 movdqu $inout3,0x30($out) 4008 pxor $in4,$inout5 4009 mov $rnds_,$rounds 4010 movdqu $inout4,0x40($out) 4011 lea 0x50($out),$out 4012 sub \$0x60,$len 4013 ja .Lcbc_dec_loop6 4014 4015 movdqa $inout5,$inout0 4016 add \$0x50,$len 4017 jle .Lcbc_dec_clear_tail_collected 4018 movups $inout5,($out) 4019 lea 0x10($out),$out 4020 4021.Lcbc_dec_tail: 4022 movups ($inp),$inout0 4023 sub \$0x10,$len 4024 jbe .Lcbc_dec_one # $len is 1*16 or less 4025 4026 movups 0x10($inp),$inout1 4027 movaps $inout0,$in0 4028 sub \$0x10,$len 4029 jbe .Lcbc_dec_two # $len is 2*16 or less 4030 4031 movups 0x20($inp),$inout2 4032 movaps $inout1,$in1 4033 sub \$0x10,$len 4034 
jbe .Lcbc_dec_three # $len is 3*16 or less 4035 4036 movups 0x30($inp),$inout3 4037 movaps $inout2,$in2 4038 sub \$0x10,$len 4039 jbe .Lcbc_dec_four # $len is 4*16 or less 4040 4041 movups 0x40($inp),$inout4 # $len is 5*16 or less 4042 movaps $inout3,$in3 4043 movaps $inout4,$in4 4044 xorps $inout5,$inout5 4045 call _aesni_decrypt6 4046 pxor $iv,$inout0 4047 movaps $in4,$iv 4048 pxor $in0,$inout1 4049 movdqu $inout0,($out) 4050 pxor $in1,$inout2 4051 movdqu $inout1,0x10($out) 4052 pxor $inout1,$inout1 # clear register bank 4053 pxor $in2,$inout3 4054 movdqu $inout2,0x20($out) 4055 pxor $inout2,$inout2 4056 pxor $in3,$inout4 4057 movdqu $inout3,0x30($out) 4058 pxor $inout3,$inout3 4059 lea 0x40($out),$out 4060 movdqa $inout4,$inout0 4061 pxor $inout4,$inout4 4062 pxor $inout5,$inout5 4063 sub \$0x10,$len 4064 jmp .Lcbc_dec_tail_collected 4065 4066.align 16 4067.Lcbc_dec_one: 4068 movaps $inout0,$in0 4069___ 4070 &aesni_generate1("dec",$key,$rounds); 4071$code.=<<___; 4072 xorps $iv,$inout0 4073 movaps $in0,$iv 4074 jmp .Lcbc_dec_tail_collected 4075.align 16 4076.Lcbc_dec_two: 4077 movaps $inout1,$in1 4078 call _aesni_decrypt2 4079 pxor $iv,$inout0 4080 movaps $in1,$iv 4081 pxor $in0,$inout1 4082 movdqu $inout0,($out) 4083 movdqa $inout1,$inout0 4084 pxor $inout1,$inout1 # clear register bank 4085 lea 0x10($out),$out 4086 jmp .Lcbc_dec_tail_collected 4087.align 16 4088.Lcbc_dec_three: 4089 movaps $inout2,$in2 4090 call _aesni_decrypt3 4091 pxor $iv,$inout0 4092 movaps $in2,$iv 4093 pxor $in0,$inout1 4094 movdqu $inout0,($out) 4095 pxor $in1,$inout2 4096 movdqu $inout1,0x10($out) 4097 pxor $inout1,$inout1 # clear register bank 4098 movdqa $inout2,$inout0 4099 pxor $inout2,$inout2 4100 lea 0x20($out),$out 4101 jmp .Lcbc_dec_tail_collected 4102.align 16 4103.Lcbc_dec_four: 4104 movaps $inout3,$in3 4105 call _aesni_decrypt4 4106 pxor $iv,$inout0 4107 movaps $in3,$iv 4108 pxor $in0,$inout1 4109 movdqu $inout0,($out) 4110 pxor $in1,$inout2 4111 movdqu $inout1,0x10($out) 
4112 pxor $inout1,$inout1 # clear register bank 4113 pxor $in2,$inout3 4114 movdqu $inout2,0x20($out) 4115 pxor $inout2,$inout2 4116 movdqa $inout3,$inout0 4117 pxor $inout3,$inout3 4118 lea 0x30($out),$out 4119 jmp .Lcbc_dec_tail_collected 4120 4121.align 16 4122.Lcbc_dec_clear_tail_collected: 4123 pxor $inout1,$inout1 # clear register bank 4124 pxor $inout2,$inout2 4125 pxor $inout3,$inout3 4126___ 4127$code.=<<___ if (!$win64); 4128 pxor $inout4,$inout4 # %xmm6..9 4129 pxor $inout5,$inout5 4130 pxor $inout6,$inout6 4131 pxor $inout7,$inout7 4132___ 4133$code.=<<___; 4134.Lcbc_dec_tail_collected: 4135 movups $iv,($ivp) 4136 and \$15,$len 4137 jnz .Lcbc_dec_tail_partial 4138 movups $inout0,($out) 4139 pxor $inout0,$inout0 4140 jmp .Lcbc_dec_ret 4141.align 16 4142.Lcbc_dec_tail_partial: 4143 movaps $inout0,(%rsp) 4144 pxor $inout0,$inout0 4145 mov \$16,%rcx 4146 mov $out,%rdi 4147 sub $len,%rcx 4148 lea (%rsp),%rsi 4149 .long 0x9066A4F3 # rep movsb 4150 movdqa $inout0,(%rsp) 4151 4152.Lcbc_dec_ret: 4153 xorps $rndkey0,$rndkey0 # %xmm0 4154 pxor $rndkey1,$rndkey1 4155___ 4156$code.=<<___ if ($win64); 4157 movaps 0x10(%rsp),%xmm6 4158 movaps %xmm0,0x10(%rsp) # clear stack 4159 movaps 0x20(%rsp),%xmm7 4160 movaps %xmm0,0x20(%rsp) 4161 movaps 0x30(%rsp),%xmm8 4162 movaps %xmm0,0x30(%rsp) 4163 movaps 0x40(%rsp),%xmm9 4164 movaps %xmm0,0x40(%rsp) 4165 movaps 0x50(%rsp),%xmm10 4166 movaps %xmm0,0x50(%rsp) 4167 movaps 0x60(%rsp),%xmm11 4168 movaps %xmm0,0x60(%rsp) 4169 movaps 0x70(%rsp),%xmm12 4170 movaps %xmm0,0x70(%rsp) 4171 movaps 0x80(%rsp),%xmm13 4172 movaps %xmm0,0x80(%rsp) 4173 movaps 0x90(%rsp),%xmm14 4174 movaps %xmm0,0x90(%rsp) 4175 movaps 0xa0(%rsp),%xmm15 4176 movaps %xmm0,0xa0(%rsp) 4177___ 4178$code.=<<___; 4179 mov -8(%r11),%rbp 4180 lea (%r11),%rsp 4181.Lcbc_ret: 4182 ret 4183.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4184___ 4185} 4186# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 4187# int bits, AES_KEY *key) 4188# 4189# input: $inp 
user-supplied key 4190# $bits $inp length in bits 4191# $key pointer to key schedule 4192# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4193# *$key key schedule 4194# 4195{ my ($inp,$bits,$key) = @_4args; 4196 $bits =~ s/%r/%e/; 4197 4198$code.=<<___; 4199.globl ${PREFIX}_set_decrypt_key 4200.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 4201.align 16 4202${PREFIX}_set_decrypt_key: 4203 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4204 call __aesni_set_encrypt_key 4205 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 4206 test %eax,%eax 4207 jnz .Ldec_key_ret 4208 lea 16($key,$bits),$inp # points at the end of key schedule 4209 4210 $movkey ($key),%xmm0 # just swap 4211 $movkey ($inp),%xmm1 4212 $movkey %xmm0,($inp) 4213 $movkey %xmm1,($key) 4214 lea 16($key),$key 4215 lea -16($inp),$inp 4216 4217.Ldec_key_inverse: 4218 $movkey ($key),%xmm0 # swap and inverse 4219 $movkey ($inp),%xmm1 4220 aesimc %xmm0,%xmm0 4221 aesimc %xmm1,%xmm1 4222 lea 16($key),$key 4223 lea -16($inp),$inp 4224 $movkey %xmm0,16($inp) 4225 $movkey %xmm1,-16($key) 4226 cmp $key,$inp 4227 ja .Ldec_key_inverse 4228 4229 $movkey ($key),%xmm0 # inverse middle 4230 aesimc %xmm0,%xmm0 4231 pxor %xmm1,%xmm1 4232 $movkey %xmm0,($inp) 4233 pxor %xmm0,%xmm0 4234.Ldec_key_ret: 4235 add \$8,%rsp 4236 ret 4237.LSEH_end_set_decrypt_key: 4238.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 4239___ 4240 4241# This is based on submission by 4242# 4243# Huang Ying <ying.huang@intel.com> 4244# Vinodh Gopal <vinodh.gopal@intel.com> 4245# Kahraman Akdemir 4246# 4247# Aggressively optimized in respect to aeskeygenassist's critical path 4248# and is contained in %xmm0-5 to meet Win64 ABI requirement. 
4249# 4250# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4251# int bits, AES_KEY * const key); 4252# 4253# input: $inp user-supplied key 4254# $bits $inp length in bits 4255# $key pointer to key schedule 4256# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4257# $bits rounds-1 (used in aesni_set_decrypt_key) 4258# *$key key schedule 4259# $key pointer to key schedule (used in 4260# aesni_set_decrypt_key) 4261# 4262# Subroutine is frame-less, which means that only volatile registers 4263# are used. Note that it's declared "abi-omnipotent", which means that 4264# amount of volatile registers is smaller on Windows. 4265# 4266$code.=<<___; 4267.globl ${PREFIX}_set_encrypt_key 4268.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4269.align 16 4270${PREFIX}_set_encrypt_key: 4271__aesni_set_encrypt_key: 4272 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4273 mov \$-1,%rax 4274 test $inp,$inp 4275 jz .Lenc_key_ret 4276 test $key,$key 4277 jz .Lenc_key_ret 4278 4279 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4280 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4281 leaq OPENSSL_ia32cap_P(%rip),%r10 4282 movl 4(%r10),%r10d 4283 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4284 lea 16($key),%rax # %rax is used as modifiable copy of $key 4285 cmp \$256,$bits 4286 je .L14rounds 4287 cmp \$192,$bits 4288 je .L12rounds 4289 cmp \$128,$bits 4290 jne .Lbad_keybits 4291 4292.L10rounds: 4293 mov \$9,$bits # 10 rounds for 128-bit key 4294 cmp \$`1<<28`,%r10d # AVX, but no XOP 4295 je .L10rounds_alt 4296 4297 $movkey %xmm0,($key) # round 0 4298 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4299 call .Lkey_expansion_128_cold 4300 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4301 call .Lkey_expansion_128 4302 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4303 call .Lkey_expansion_128 4304 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4305 call .Lkey_expansion_128 4306 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4307 call .Lkey_expansion_128 4308 aeskeygenassist
\$0x20,%xmm0,%xmm1 # round 6 4309 call .Lkey_expansion_128 4310 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4311 call .Lkey_expansion_128 4312 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4313 call .Lkey_expansion_128 4314 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4315 call .Lkey_expansion_128 4316 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4317 call .Lkey_expansion_128 4318 $movkey %xmm0,(%rax) 4319 mov $bits,80(%rax) # 240(%rdx) 4320 xor %eax,%eax 4321 jmp .Lenc_key_ret 4322 4323.align 16 4324.L10rounds_alt: 4325 movdqa .Lkey_rotate(%rip),%xmm5 4326 mov \$8,%r10d 4327 movdqa .Lkey_rcon1(%rip),%xmm4 4328 movdqa %xmm0,%xmm2 4329 movdqu %xmm0,($key) 4330 jmp .Loop_key128 4331 4332.align 16 4333.Loop_key128: 4334 pshufb %xmm5,%xmm0 4335 aesenclast %xmm4,%xmm0 4336 pslld \$1,%xmm4 4337 lea 16(%rax),%rax 4338 4339 movdqa %xmm2,%xmm3 4340 pslldq \$4,%xmm2 4341 pxor %xmm2,%xmm3 4342 pslldq \$4,%xmm2 4343 pxor %xmm2,%xmm3 4344 pslldq \$4,%xmm2 4345 pxor %xmm3,%xmm2 4346 4347 pxor %xmm2,%xmm0 4348 movdqu %xmm0,-16(%rax) 4349 movdqa %xmm0,%xmm2 4350 4351 dec %r10d 4352 jnz .Loop_key128 4353 4354 movdqa .Lkey_rcon1b(%rip),%xmm4 4355 4356 pshufb %xmm5,%xmm0 4357 aesenclast %xmm4,%xmm0 4358 pslld \$1,%xmm4 4359 4360 movdqa %xmm2,%xmm3 4361 pslldq \$4,%xmm2 4362 pxor %xmm2,%xmm3 4363 pslldq \$4,%xmm2 4364 pxor %xmm2,%xmm3 4365 pslldq \$4,%xmm2 4366 pxor %xmm3,%xmm2 4367 4368 pxor %xmm2,%xmm0 4369 movdqu %xmm0,(%rax) 4370 4371 movdqa %xmm0,%xmm2 4372 pshufb %xmm5,%xmm0 4373 aesenclast %xmm4,%xmm0 4374 4375 movdqa %xmm2,%xmm3 4376 pslldq \$4,%xmm2 4377 pxor %xmm2,%xmm3 4378 pslldq \$4,%xmm2 4379 pxor %xmm2,%xmm3 4380 pslldq \$4,%xmm2 4381 pxor %xmm3,%xmm2 4382 4383 pxor %xmm2,%xmm0 4384 movdqu %xmm0,16(%rax) 4385 4386 mov $bits,96(%rax) # 240($key) 4387 xor %eax,%eax 4388 jmp .Lenc_key_ret 4389 4390.align 16 4391.L12rounds: 4392 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4393 mov \$11,$bits # 12 rounds for 192 4394 cmp \$`1<<28`,%r10d # AVX, but no XOP 4395 je 
.L12rounds_alt 4396 4397 $movkey %xmm0,($key) # round 0 4398 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4399 call .Lkey_expansion_192a_cold 4400 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4401 call .Lkey_expansion_192b 4402 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4403 call .Lkey_expansion_192a 4404 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4405 call .Lkey_expansion_192b 4406 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4407 call .Lkey_expansion_192a 4408 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4409 call .Lkey_expansion_192b 4410 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4411 call .Lkey_expansion_192a 4412 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4413 call .Lkey_expansion_192b 4414 $movkey %xmm0,(%rax) 4415 mov $bits,48(%rax) # 240(%rdx) 4416 xor %rax, %rax 4417 jmp .Lenc_key_ret 4418 4419.align 16 4420.L12rounds_alt: 4421 movdqa .Lkey_rotate192(%rip),%xmm5 4422 movdqa .Lkey_rcon1(%rip),%xmm4 4423 mov \$8,%r10d 4424 movdqu %xmm0,($key) 4425 jmp .Loop_key192 4426 4427.align 16 4428.Loop_key192: 4429 movq %xmm2,0(%rax) 4430 movdqa %xmm2,%xmm1 4431 pshufb %xmm5,%xmm2 4432 aesenclast %xmm4,%xmm2 4433 pslld \$1, %xmm4 4434 lea 24(%rax),%rax 4435 4436 movdqa %xmm0,%xmm3 4437 pslldq \$4,%xmm0 4438 pxor %xmm0,%xmm3 4439 pslldq \$4,%xmm0 4440 pxor %xmm0,%xmm3 4441 pslldq \$4,%xmm0 4442 pxor %xmm3,%xmm0 4443 4444 pshufd \$0xff,%xmm0,%xmm3 4445 pxor %xmm1,%xmm3 4446 pslldq \$4,%xmm1 4447 pxor %xmm1,%xmm3 4448 4449 pxor %xmm2,%xmm0 4450 pxor %xmm3,%xmm2 4451 movdqu %xmm0,-16(%rax) 4452 4453 dec %r10d 4454 jnz .Loop_key192 4455 4456 mov $bits,32(%rax) # 240($key) 4457 xor %eax,%eax 4458 jmp .Lenc_key_ret 4459 4460.align 16 4461.L14rounds: 4462 movups 16($inp),%xmm2 # remaining half of *userKey 4463 mov \$13,$bits # 14 rounds for 256 4464 lea 16(%rax),%rax 4465 cmp \$`1<<28`,%r10d # AVX, but no XOP 4466 je .L14rounds_alt 4467 4468 $movkey %xmm0,($key) # round 0 4469 $movkey %xmm2,16($key) # round 1 4470 aeskeygenassist \$0x1,%xmm2,%xmm1 #
round 2 4471 call .Lkey_expansion_256a_cold 4472 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4473 call .Lkey_expansion_256b 4474 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4475 call .Lkey_expansion_256a 4476 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4477 call .Lkey_expansion_256b 4478 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4479 call .Lkey_expansion_256a 4480 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4481 call .Lkey_expansion_256b 4482 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4483 call .Lkey_expansion_256a 4484 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4485 call .Lkey_expansion_256b 4486 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4487 call .Lkey_expansion_256a 4488 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4489 call .Lkey_expansion_256b 4490 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4491 call .Lkey_expansion_256a 4492 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4493 call .Lkey_expansion_256b 4494 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4495 call .Lkey_expansion_256a 4496 $movkey %xmm0,(%rax) 4497 mov $bits,16(%rax) # 240(%rdx) 4498 xor %rax,%rax 4499 jmp .Lenc_key_ret 4500 4501.align 16 4502.L14rounds_alt: 4503 movdqa .Lkey_rotate(%rip),%xmm5 4504 movdqa .Lkey_rcon1(%rip),%xmm4 4505 mov \$7,%r10d 4506 movdqu %xmm0,0($key) 4507 movdqa %xmm2,%xmm1 4508 movdqu %xmm2,16($key) 4509 jmp .Loop_key256 4510 4511.align 16 4512.Loop_key256: 4513 pshufb %xmm5,%xmm2 4514 aesenclast %xmm4,%xmm2 4515 4516 movdqa %xmm0,%xmm3 4517 pslldq \$4,%xmm0 4518 pxor %xmm0,%xmm3 4519 pslldq \$4,%xmm0 4520 pxor %xmm0,%xmm3 4521 pslldq \$4,%xmm0 4522 pxor %xmm3,%xmm0 4523 pslld \$1,%xmm4 4524 4525 pxor %xmm2,%xmm0 4526 movdqu %xmm0,(%rax) 4527 4528 dec %r10d 4529 jz .Ldone_key256 4530 4531 pshufd \$0xff,%xmm0,%xmm2 4532 pxor %xmm3,%xmm3 4533 aesenclast %xmm3,%xmm2 4534 4535 movdqa %xmm1,%xmm3 4536 pslldq \$4,%xmm1 4537 pxor %xmm1,%xmm3 4538 pslldq \$4,%xmm1 4539 pxor %xmm1,%xmm3 4540 pslldq \$4,%xmm1 4541 pxor %xmm3,%xmm1 4542 4543 pxor %xmm1,%xmm2 4544 movdqu 
%xmm2,16(%rax) 4545 lea 32(%rax),%rax 4546 movdqa %xmm2,%xmm1 4547 4548 jmp .Loop_key256 4549 4550.Ldone_key256: 4551 mov $bits,16(%rax) # 240($key) 4552 xor %eax,%eax 4553 jmp .Lenc_key_ret 4554 4555.align 16 4556.Lbad_keybits: 4557 mov \$-2,%rax 4558.Lenc_key_ret: 4559 pxor %xmm0,%xmm0 4560 pxor %xmm1,%xmm1 4561 pxor %xmm2,%xmm2 4562 pxor %xmm3,%xmm3 4563 pxor %xmm4,%xmm4 4564 pxor %xmm5,%xmm5 4565 add \$8,%rsp 4566 ret 4567.LSEH_end_set_encrypt_key: 4568 4569.align 16 4570.Lkey_expansion_128: 4571 $movkey %xmm0,(%rax) 4572 lea 16(%rax),%rax 4573.Lkey_expansion_128_cold: 4574 shufps \$0b00010000,%xmm0,%xmm4 4575 xorps %xmm4, %xmm0 4576 shufps \$0b10001100,%xmm0,%xmm4 4577 xorps %xmm4, %xmm0 4578 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4579 xorps %xmm1,%xmm0 4580 ret 4581 4582.align 16 4583.Lkey_expansion_192a: 4584 $movkey %xmm0,(%rax) 4585 lea 16(%rax),%rax 4586.Lkey_expansion_192a_cold: 4587 movaps %xmm2, %xmm5 4588.Lkey_expansion_192b_warm: 4589 shufps \$0b00010000,%xmm0,%xmm4 4590 movdqa %xmm2,%xmm3 4591 xorps %xmm4,%xmm0 4592 shufps \$0b10001100,%xmm0,%xmm4 4593 pslldq \$4,%xmm3 4594 xorps %xmm4,%xmm0 4595 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4596 pxor %xmm3,%xmm2 4597 pxor %xmm1,%xmm0 4598 pshufd \$0b11111111,%xmm0,%xmm3 4599 pxor %xmm3,%xmm2 4600 ret 4601 4602.align 16 4603.Lkey_expansion_192b: 4604 movaps %xmm0,%xmm3 4605 shufps \$0b01000100,%xmm0,%xmm5 4606 $movkey %xmm5,(%rax) 4607 shufps \$0b01001110,%xmm2,%xmm3 4608 $movkey %xmm3,16(%rax) 4609 lea 32(%rax),%rax 4610 jmp .Lkey_expansion_192b_warm 4611 4612.align 16 4613.Lkey_expansion_256a: 4614 $movkey %xmm2,(%rax) 4615 lea 16(%rax),%rax 4616.Lkey_expansion_256a_cold: 4617 shufps \$0b00010000,%xmm0,%xmm4 4618 xorps %xmm4,%xmm0 4619 shufps \$0b10001100,%xmm0,%xmm4 4620 xorps %xmm4,%xmm0 4621 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4622 xorps %xmm1,%xmm0 4623 ret 4624 4625.align 16 4626.Lkey_expansion_256b: 4627 $movkey %xmm0,(%rax) 4628 lea 16(%rax),%rax 4629 4630 shufps 
\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

# Constant pool, 64-byte aligned.  .Lkey_rotate and .Lkey_rcon1 are the
# pshufb mask and the initial round constant loaded by the
# .L14rounds_alt key schedule; the remaining labels are presumably
# consumed by routines outside this excerpt (TODO confirm).  The
# trailing .asciz embeds the module identification string.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz	"AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# Win64 structured-exception-handling support.  Every handler emitted
# below has the following signature:
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# The four handler arguments arrive in %rcx, %rdx, %r8 and %r9; bind
# symbolic names so the heredocs can interpolate them.  Only $context
# and $disp are referenced by the handlers visible here.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# RtlVirtualUnwind is called through its import-table slot from
# .Lcommon_seh_tail.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
# Handlers emitted only for the "aesni" prefix.  All three share one
# shape: push %rsi, %rdi, %rbx, %rbp, %r12-%r15 and the flags, load
# context->Rax and context->Rip, and turn the RVAs stored in
# HandlerData[] into addresses by adding disp->ImageBase.  If Rip lies
# before the prologue label, or at/after the epilogue label, they jump
# straight to .Lcommon_seh_tail; otherwise they first copy the saved
# %xmm registers into the CONTEXT record starting at context.Xmm6
# (offset 512):
#
#   ecb_ccm64_se_handler  4 registers from context->Rsp, then advances
#                         the recovered stack pointer by 0x58;
#   ctr_xts_se_handler    10 registers from -0xa8 relative to
#                         context->R11, and restores %rbp from -8
#                         relative to the same anchor;
#   ocb_se_handler        10 registers from context->Rsp unless Rip is
#                         at/after the HandlerData[2] "pop" label, then
#                         restores %rbx, %rbp and %r12-%r14.
#
# The ".long 0xa548f3fc" words are the hand-encoded "cld; rep movsq"
# noted in the adjacent assembly comments.
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue lable
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

.type	ocb_se_handler,\@abi-omnipotent
.align	16
ocb_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue lable
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10
	cmp	%r10,%rbx		# context->Rip>=pop label
	jae	.Locb_no_xmm

	mov	152($context),%rax	# pull context->Rsp

	lea	(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x28(%rax),%rax

.Locb_no_xmm:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	ocb_se_handler,.-ocb_se_handler
___
# cbc_se_handler is emitted for every prefix.  It compares context->Rip
# against fixed labels (.Lcbc_decrypt_bulk, .Lcbc_decrypt_body and
# .Lcbc_ret) instead of HandlerData[] entries.  Inside the body it
# copies 10 saved %xmm registers from 16 bytes above context->Rsp into
# the CONTEXT record and restores %rbp from -8 relative to context->R11.
#
# It then falls through into .Lcommon_seh_tail, the epilogue every
# handler in this file jumps to.  With the recovered stack pointer in
# %rax the tail stores Rsp, Rsi and Rdi into the CONTEXT record, copies
# the record (154 quadwords) to disp->ContextRecord, calls
# RtlVirtualUnwind, pops the registers saved on entry and returns 1
# (ExceptionContinueSearch).
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt_bulk(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	208($context),%rax	# pull context->R11

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
# .pdata table: one triple of RVAs per covered function (begin label,
# end label, unwind-info label).  The mode routines are listed only for
# the "aesni" prefix.
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_info_ccm64_enc

	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_info_ccm64_dec

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32

	.rva	.LSEH_begin_aesni_xts_encrypt
	.rva	.LSEH_end_aesni_xts_encrypt
	.rva	.LSEH_info_xts_enc

	.rva	.LSEH_begin_aesni_xts_decrypt
	.rva	.LSEH_end_aesni_xts_decrypt
	.rva	.LSEH_info_xts_dec

	.rva	.LSEH_begin_aesni_ocb_encrypt
	.rva	.LSEH_end_aesni_ocb_encrypt
	.rva	.LSEH_info_ocb_enc

	.rva	.LSEH_begin_aesni_ocb_decrypt
	.rva	.LSEH_end_aesni_ocb_decrypt
	.rva	.LSEH_info_ocb_dec
___
# Entries emitted for every prefix: CBC and the two key-setup routines,
# which share the .LSEH_info_key record.
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
# .xdata records for the "aesni"-only routines.  Each record names its
# handler and is followed by the HandlerData[] RVAs that handler reads:
# a body label and an epilogue/ret label, plus a third "pop" label for
# the two ocb records.
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
.LSEH_info_ccm64_enc:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
.LSEH_info_ccm64_dec:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
.LSEH_info_xts_enc:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.LSEH_info_xts_dec:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
.LSEH_info_ocb_enc:
	.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
	.rva	.Locb_enc_pop
	.long	0
.LSEH_info_ocb_dec:
.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
# Unwind records emitted for every prefix: CBC uses cbc_se_handler with
# no HandlerData[], and the key-setup routines use plain unwind codes.
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@opcode,$dst,$src)
#
# Appends a REX prefix byte to the caller's opcode array when either
# register number is 8 or above: $dst>=8 sets bit 0x04, $src>=8 sets
# bit 0x01.  Nothing is appended when both numbers are below 8.
# The array is received as a plain reference rather than by aliasing a
# package glob with "local *opcode", so no package variable is touched.
sub rex {
  my $opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}

# aesni($line)
#
# Translates one AES-NI instruction into a ".byte" directive so the
# module can be built with assemblers that lack AES-NI support.  Three
# operand forms are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} off(%rsp),%xmmD   (8-bit displacement)
#
# Returns the ".byte" string for a recognised instruction.  Anything
# else, including an unknown aes* mnemonic, is returned unchanged so the
# assembler reports it; previously such a mnemonic produced undef and
# the instruction silently vanished from the generated code.
#
# Capture variables are copied into lexicals straight away, because the
# later /^0/ tests are matches in their own right and reset $1..$4.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;		# oct() also parses 0x..
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return $line if (!defined($opcodelet{$mnemonic}));
	push @opcode,0x44 if ($dst>=8);			# REX for %xmm8-15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;		# ModR/M + SIB (%rsp)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;	# disp8
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# movbe($offset)
#
# Returns the ".byte" encoding used for "movbe %eax,offset(%rsp)";
# the argument is appended verbatim as the final displacement byte.
sub movbe {
	return ".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-process the accumulated assembly: evaluate `...` expressions,
# hand-encode AES-NI instructions and movbe stores, then emit the result.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap	%eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors (full disk, closed pipe) only surface at close;
# fail loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";