1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# This module implements support for Intel AES-NI extension. In 11# OpenSSL context it's used with Intel engine, but can also be used as 12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 13# details]. 14# 15# Performance. 16# 17# To start with see corresponding paragraph in aesni-x86_64.pl... 18# Instead of filling table similar to one found there I've chosen to 19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 20# The simplified table below represents 32-bit performance relative 21# to 64-bit one in every given point. Ratios vary for different 22# encryption modes, therefore interval values. 23# 24# 16-byte 64-byte 256-byte 1-KB 8-KB 25# 53-67% 67-84% 91-94% 95-98% 97-99.5% 26# 27# Lower ratios for smaller block sizes are perfectly understandable, 28# because function call overhead is higher in 32-bit mode. Largest 29# 8-KB block performance is virtually same: 32-bit code is less than 30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 31 32# January 2011 33# 34# See aesni-x86_64.pl for details. Unlike x86_64 version this module 35# interleaves at most 6 aes[enc|dec] instructions, because there are 36# not enough registers for 8x interleave [which should be optimal for 37# Sandy Bridge]. Actually, performance results for 6x interleave 38# factor presented in aesni-x86_64.pl (except for CTR) are for this 39# module. 40 41# April 2011 42# 43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 45 46###################################################################### 47# Current large-block performance in cycles per byte processed with 48# 128-bit key (less is better). 49# 50# CBC en-/decrypt CTR XTS ECB 51# Westmere 3.77/1.37 1.37 1.52 1.27 52# * Bridge 5.07/0.98 0.99 1.09 0.91 53# Haswell 4.44/0.80 0.97 1.03 0.72 54# Skylake 2.68/0.65 0.65 0.66 0.64 55# Silvermont 5.77/3.56 3.67 4.03 3.46 56# Goldmont 3.84/1.39 1.39 1.63 1.31 57# Bulldozer 5.80/0.98 1.05 1.24 0.93 58 59$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 60 # generates drop-in replacement for 61 # crypto/aes/asm/aes-586.pl:-) 62$inline=1; # inline _aesni_[en|de]crypt 63 64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 65push(@INC,"${dir}","${dir}../../perlasm"); 66require "x86asm.pl"; 67 68$output = pop; 69open OUT,">$output"; 70*STDOUT=*OUT; 71 72&asm_init($ARGV[0],$0); 73 74&external_label("OPENSSL_ia32cap_P"); 75&static_label("key_const"); 76 77if ($PREFIX eq "aesni") { $movekey=\&movups; } 78else { $movekey=\&movups; } 79 80$len="eax"; 81$rounds="ecx"; 82$key="edx"; 83$inp="esi"; 84$out="edi"; 85$rounds_="ebx"; # backup copy for $rounds 86$key_="ebp"; # backup copy for $key 87 88$rndkey0="xmm0"; 89$rndkey1="xmm1"; 90$inout0="xmm2"; 91$inout1="xmm3"; 92$inout2="xmm4"; 93$inout3="xmm5"; $in1="xmm5"; 94$inout4="xmm6"; $in0="xmm6"; 95$inout5="xmm7"; $ivec="xmm7"; 96 97# AESNI extension 98sub aeskeygenassist 99{ my($dst,$src,$imm)=@_; 100 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 101 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 102} 103sub aescommon 104{ my($opcodelet,$dst,$src)=@_; 105 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 106 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 107} 108sub aesimc { aescommon(0xdb,@_); } 109sub aesenc { aescommon(0xdc,@_); } 110sub aesenclast { aescommon(0xdd,@_); } 111sub aesdec { aescommon(0xde,@_); } 112sub aesdeclast { aescommon(0xdf,@_); } 113 114# Inline version of internal aesni_[en|de]crypt1 115{ my $sn; 116sub aesni_inline_generate1 117{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 118 $sn++; 119 120 &$movekey ($rndkey0,&QWP(0,$key)); 121 &$movekey ($rndkey1,&QWP(16,$key)); 122 &xorps ($ivec,$rndkey0) if (defined($ivec)); 123 &lea ($key,&DWP(32,$key)); 124 &xorps ($inout,$ivec) if (defined($ivec)); 125 &xorps ($inout,$rndkey0) if (!defined($ivec)); 126 &set_label("${p}1_loop_$sn"); 127 eval"&aes${p} ($inout,$rndkey1)"; 128 &dec ($rounds); 129 &$movekey ($rndkey1,&QWP(0,$key)); 130 &lea ($key,&DWP(16,$key)); 131 &jnz (&label("${p}1_loop_$sn")); 132 eval"&aes${p}last ($inout,$rndkey1)"; 133}} 134 135sub aesni_generate1 # fully unrolled loop 136{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 137 138 &function_begin_B("_aesni_${p}rypt1"); 139 &movups ($rndkey0,&QWP(0,$key)); 140 &$movekey ($rndkey1,&QWP(0x10,$key)); 141 &xorps ($inout,$rndkey0); 142 &$movekey ($rndkey0,&QWP(0x20,$key)); 143 &lea ($key,&DWP(0x30,$key)); 144 &cmp ($rounds,11); 145 &jb (&label("${p}128")); 146 &lea ($key,&DWP(0x20,$key)); 147 &je (&label("${p}192")); 148 &lea ($key,&DWP(0x20,$key)); 149 eval"&aes${p} ($inout,$rndkey1)"; 150 &$movekey ($rndkey1,&QWP(-0x40,$key)); 151 eval"&aes${p} ($inout,$rndkey0)"; 152 &$movekey ($rndkey0,&QWP(-0x30,$key)); 153 &set_label("${p}192"); 154 eval"&aes${p} ($inout,$rndkey1)"; 155 &$movekey ($rndkey1,&QWP(-0x20,$key)); 156 eval"&aes${p} ($inout,$rndkey0)"; 157 &$movekey ($rndkey0,&QWP(-0x10,$key)); 158 &set_label("${p}128"); 159 eval"&aes${p} ($inout,$rndkey1)"; 160 &$movekey ($rndkey1,&QWP(0,$key)); 161 eval"&aes${p} ($inout,$rndkey0)"; 162 &$movekey ($rndkey0,&QWP(0x10,$key)); 163 eval"&aes${p} ($inout,$rndkey1)"; 164 &$movekey ($rndkey1,&QWP(0x20,$key)); 165 eval"&aes${p} ($inout,$rndkey0)"; 166 &$movekey ($rndkey0,&QWP(0x30,$key)); 167 eval"&aes${p} ($inout,$rndkey1)"; 168 &$movekey ($rndkey1,&QWP(0x40,$key)); 169 eval"&aes${p} ($inout,$rndkey0)"; 170 &$movekey ($rndkey0,&QWP(0x50,$key)); 171 eval"&aes${p} ($inout,$rndkey1)"; 172 &$movekey ($rndkey1,&QWP(0x60,$key)); 173 eval"&aes${p} ($inout,$rndkey0)"; 174 &$movekey ($rndkey0,&QWP(0x70,$key)); 175 eval"&aes${p} ($inout,$rndkey1)"; 176 eval"&aes${p}last ($inout,$rndkey0)"; 177 &ret(); 178 &function_end_B("_aesni_${p}rypt1"); 179} 180 181# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 182&aesni_generate1("enc") if (!$inline); 183&function_begin_B("${PREFIX}_encrypt"); 184 &mov ("eax",&wparam(0)); 185 &mov ($key,&wparam(2)); 186 &movups ($inout0,&QWP(0,"eax")); 187 &mov ($rounds,&DWP(240,$key)); 188 &mov ("eax",&wparam(1)); 189 if ($inline) 190 { &aesni_inline_generate1("enc"); } 191 else 192 { &call ("_aesni_encrypt1"); } 193 &pxor ($rndkey0,$rndkey0); # clear register bank 194 &pxor ($rndkey1,$rndkey1); 195 &movups (&QWP(0,"eax"),$inout0); 196 &pxor ($inout0,$inout0); 197 &ret (); 198&function_end_B("${PREFIX}_encrypt"); 199 200# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 201&aesni_generate1("dec") if(!$inline); 202&function_begin_B("${PREFIX}_decrypt"); 203 &mov ("eax",&wparam(0)); 204 &mov ($key,&wparam(2)); 205 &movups ($inout0,&QWP(0,"eax")); 206 &mov ($rounds,&DWP(240,$key)); 207 &mov ("eax",&wparam(1)); 208 if ($inline) 209 { &aesni_inline_generate1("dec"); } 210 else 211 { &call ("_aesni_decrypt1"); } 212 &pxor ($rndkey0,$rndkey0); # clear register bank 213 &pxor ($rndkey1,$rndkey1); 214 &movups (&QWP(0,"eax"),$inout0); 215 &pxor ($inout0,$inout0); 216 &ret (); 217&function_end_B("${PREFIX}_decrypt"); 218 219# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 220# factor. Why 3x subroutine were originally used in loops? Even though 221# aes[enc|dec] latency was originally 6, it could be scheduled only 222# every *2nd* cycle. Thus 3x interleave was the one providing optimal 223# utilization, i.e. when subroutine's throughput is virtually same as 224# of non-interleaved subroutine [for number of input blocks up to 3]. 225# This is why it originally made no sense to implement 2x subroutine. 226# But times change and it became appropriate to spend extra 192 bytes 227# on 2x subroutine on Atom Silvermont account. For processors that 228# can schedule aes[enc|dec] every cycle optimal interleave factor 229# equals to corresponding instructions latency. 8x is optimal for 230# * Bridge, but it's unfeasible to accommodate such implementation 231# in XMM registers addreassable in 32-bit mode and therefore maximum 232# of 6x is used instead... 233 234sub aesni_generate2 235{ my $p=shift; 236 237 &function_begin_B("_aesni_${p}rypt2"); 238 &$movekey ($rndkey0,&QWP(0,$key)); 239 &shl ($rounds,4); 240 &$movekey ($rndkey1,&QWP(16,$key)); 241 &xorps ($inout0,$rndkey0); 242 &pxor ($inout1,$rndkey0); 243 &$movekey ($rndkey0,&QWP(32,$key)); 244 &lea ($key,&DWP(32,$key,$rounds)); 245 &neg ($rounds); 246 &add ($rounds,16); 247 248 &set_label("${p}2_loop"); 249 eval"&aes${p} ($inout0,$rndkey1)"; 250 eval"&aes${p} ($inout1,$rndkey1)"; 251 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 252 &add ($rounds,32); 253 eval"&aes${p} ($inout0,$rndkey0)"; 254 eval"&aes${p} ($inout1,$rndkey0)"; 255 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 256 &jnz (&label("${p}2_loop")); 257 eval"&aes${p} ($inout0,$rndkey1)"; 258 eval"&aes${p} ($inout1,$rndkey1)"; 259 eval"&aes${p}last ($inout0,$rndkey0)"; 260 eval"&aes${p}last ($inout1,$rndkey0)"; 261 &ret(); 262 &function_end_B("_aesni_${p}rypt2"); 263} 264 265sub aesni_generate3 266{ my $p=shift; 267 268 &function_begin_B("_aesni_${p}rypt3"); 269 &$movekey ($rndkey0,&QWP(0,$key)); 270 &shl ($rounds,4); 271 &$movekey ($rndkey1,&QWP(16,$key)); 272 &xorps ($inout0,$rndkey0); 273 &pxor ($inout1,$rndkey0); 274 &pxor ($inout2,$rndkey0); 275 &$movekey ($rndkey0,&QWP(32,$key)); 276 &lea ($key,&DWP(32,$key,$rounds)); 277 &neg ($rounds); 278 &add ($rounds,16); 279 280 &set_label("${p}3_loop"); 281 eval"&aes${p} ($inout0,$rndkey1)"; 282 eval"&aes${p} ($inout1,$rndkey1)"; 283 eval"&aes${p} ($inout2,$rndkey1)"; 284 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 285 &add ($rounds,32); 286 eval"&aes${p} ($inout0,$rndkey0)"; 287 eval"&aes${p} ($inout1,$rndkey0)"; 288 eval"&aes${p} ($inout2,$rndkey0)"; 289 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 290 &jnz (&label("${p}3_loop")); 291 eval"&aes${p} ($inout0,$rndkey1)"; 292 eval"&aes${p} ($inout1,$rndkey1)"; 293 eval"&aes${p} ($inout2,$rndkey1)"; 294 eval"&aes${p}last ($inout0,$rndkey0)"; 295 eval"&aes${p}last ($inout1,$rndkey0)"; 296 eval"&aes${p}last ($inout2,$rndkey0)"; 297 &ret(); 298 &function_end_B("_aesni_${p}rypt3"); 299} 300 301# 4x interleave is implemented to improve small block performance, 302# most notably [and naturally] 4 block by ~30%. One can argue that one 303# should have implemented 5x as well, but improvement would be <20%, 304# so it's not worth it... 305sub aesni_generate4 306{ my $p=shift; 307 308 &function_begin_B("_aesni_${p}rypt4"); 309 &$movekey ($rndkey0,&QWP(0,$key)); 310 &$movekey ($rndkey1,&QWP(16,$key)); 311 &shl ($rounds,4); 312 &xorps ($inout0,$rndkey0); 313 &pxor ($inout1,$rndkey0); 314 &pxor ($inout2,$rndkey0); 315 &pxor ($inout3,$rndkey0); 316 &$movekey ($rndkey0,&QWP(32,$key)); 317 &lea ($key,&DWP(32,$key,$rounds)); 318 &neg ($rounds); 319 &data_byte (0x0f,0x1f,0x40,0x00); 320 &add ($rounds,16); 321 322 &set_label("${p}4_loop"); 323 eval"&aes${p} ($inout0,$rndkey1)"; 324 eval"&aes${p} ($inout1,$rndkey1)"; 325 eval"&aes${p} ($inout2,$rndkey1)"; 326 eval"&aes${p} ($inout3,$rndkey1)"; 327 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 328 &add ($rounds,32); 329 eval"&aes${p} ($inout0,$rndkey0)"; 330 eval"&aes${p} ($inout1,$rndkey0)"; 331 eval"&aes${p} ($inout2,$rndkey0)"; 332 eval"&aes${p} ($inout3,$rndkey0)"; 333 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 334 &jnz (&label("${p}4_loop")); 335 336 eval"&aes${p} ($inout0,$rndkey1)"; 337 eval"&aes${p} ($inout1,$rndkey1)"; 338 eval"&aes${p} ($inout2,$rndkey1)"; 339 eval"&aes${p} ($inout3,$rndkey1)"; 340 eval"&aes${p}last ($inout0,$rndkey0)"; 341 eval"&aes${p}last ($inout1,$rndkey0)"; 342 eval"&aes${p}last ($inout2,$rndkey0)"; 343 eval"&aes${p}last ($inout3,$rndkey0)"; 344 &ret(); 345 &function_end_B("_aesni_${p}rypt4"); 346} 347 348sub aesni_generate6 349{ my $p=shift; 350 351 &function_begin_B("_aesni_${p}rypt6"); 352 &static_label("_aesni_${p}rypt6_enter"); 353 &$movekey ($rndkey0,&QWP(0,$key)); 354 &shl ($rounds,4); 355 &$movekey ($rndkey1,&QWP(16,$key)); 356 &xorps ($inout0,$rndkey0); 357 &pxor ($inout1,$rndkey0); # pxor does better here 358 &pxor ($inout2,$rndkey0); 359 eval"&aes${p} ($inout0,$rndkey1)"; 360 &pxor ($inout3,$rndkey0); 361 &pxor ($inout4,$rndkey0); 362 eval"&aes${p} ($inout1,$rndkey1)"; 363 &lea ($key,&DWP(32,$key,$rounds)); 364 &neg ($rounds); 365 eval"&aes${p} ($inout2,$rndkey1)"; 366 &pxor ($inout5,$rndkey0); 367 &$movekey ($rndkey0,&QWP(0,$key,$rounds)); 368 &add ($rounds,16); 369 &jmp (&label("_aesni_${p}rypt6_inner")); 370 371 &set_label("${p}6_loop",16); 372 eval"&aes${p} ($inout0,$rndkey1)"; 373 eval"&aes${p} ($inout1,$rndkey1)"; 374 eval"&aes${p} ($inout2,$rndkey1)"; 375 &set_label("_aesni_${p}rypt6_inner"); 376 eval"&aes${p} ($inout3,$rndkey1)"; 377 eval"&aes${p} ($inout4,$rndkey1)"; 378 eval"&aes${p} ($inout5,$rndkey1)"; 379 &set_label("_aesni_${p}rypt6_enter"); 380 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 381 &add ($rounds,32); 382 eval"&aes${p} ($inout0,$rndkey0)"; 383 eval"&aes${p} ($inout1,$rndkey0)"; 384 eval"&aes${p} ($inout2,$rndkey0)"; 385 eval"&aes${p} ($inout3,$rndkey0)"; 386 eval"&aes${p} ($inout4,$rndkey0)"; 387 eval"&aes${p} ($inout5,$rndkey0)"; 388 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 389 &jnz (&label("${p}6_loop")); 390 391 eval"&aes${p} ($inout0,$rndkey1)"; 392 eval"&aes${p} ($inout1,$rndkey1)"; 393 eval"&aes${p} ($inout2,$rndkey1)"; 394 eval"&aes${p} ($inout3,$rndkey1)"; 395 eval"&aes${p} ($inout4,$rndkey1)"; 396 eval"&aes${p} ($inout5,$rndkey1)"; 397 eval"&aes${p}last ($inout0,$rndkey0)"; 398 eval"&aes${p}last ($inout1,$rndkey0)"; 399 eval"&aes${p}last ($inout2,$rndkey0)"; 400 eval"&aes${p}last ($inout3,$rndkey0)"; 401 eval"&aes${p}last ($inout4,$rndkey0)"; 402 eval"&aes${p}last ($inout5,$rndkey0)"; 403 &ret(); 404 &function_end_B("_aesni_${p}rypt6"); 405} 406&aesni_generate2("enc") if ($PREFIX eq "aesni"); 407&aesni_generate2("dec"); 408&aesni_generate3("enc") if ($PREFIX eq "aesni"); 409&aesni_generate3("dec"); 410&aesni_generate4("enc") if ($PREFIX eq "aesni"); 411&aesni_generate4("dec"); 412&aesni_generate6("enc") if ($PREFIX eq "aesni"); 413&aesni_generate6("dec"); 414 415if ($PREFIX eq "aesni") { 416###################################################################### 417# void aesni_ecb_encrypt (const void *in, void *out, 418# size_t length, const AES_KEY *key, 419# int enc); 420&function_begin("aesni_ecb_encrypt"); 421 &mov ($inp,&wparam(0)); 422 &mov ($out,&wparam(1)); 423 &mov ($len,&wparam(2)); 424 &mov ($key,&wparam(3)); 425 &mov ($rounds_,&wparam(4)); 426 &and ($len,-16); 427 &jz (&label("ecb_ret")); 428 &mov ($rounds,&DWP(240,$key)); 429 &test ($rounds_,$rounds_); 430 &jz (&label("ecb_decrypt")); 431 432 &mov ($key_,$key); # backup $key 433 &mov ($rounds_,$rounds); # backup $rounds 434 &cmp ($len,0x60); 435 &jb (&label("ecb_enc_tail")); 436 437 &movdqu ($inout0,&QWP(0,$inp)); 438 &movdqu ($inout1,&QWP(0x10,$inp)); 439 &movdqu ($inout2,&QWP(0x20,$inp)); 440 &movdqu ($inout3,&QWP(0x30,$inp)); 441 &movdqu ($inout4,&QWP(0x40,$inp)); 442 &movdqu ($inout5,&QWP(0x50,$inp)); 443 &lea ($inp,&DWP(0x60,$inp)); 444 &sub ($len,0x60); 445 &jmp (&label("ecb_enc_loop6_enter")); 446 447&set_label("ecb_enc_loop6",16); 448 &movups (&QWP(0,$out),$inout0); 449 &movdqu ($inout0,&QWP(0,$inp)); 450 &movups (&QWP(0x10,$out),$inout1); 451 &movdqu ($inout1,&QWP(0x10,$inp)); 452 &movups (&QWP(0x20,$out),$inout2); 453 &movdqu ($inout2,&QWP(0x20,$inp)); 454 &movups (&QWP(0x30,$out),$inout3); 455 &movdqu ($inout3,&QWP(0x30,$inp)); 456 &movups (&QWP(0x40,$out),$inout4); 457 &movdqu ($inout4,&QWP(0x40,$inp)); 458 &movups (&QWP(0x50,$out),$inout5); 459 &lea ($out,&DWP(0x60,$out)); 460 &movdqu ($inout5,&QWP(0x50,$inp)); 461 &lea ($inp,&DWP(0x60,$inp)); 462&set_label("ecb_enc_loop6_enter"); 463 464 &call ("_aesni_encrypt6"); 465 466 &mov ($key,$key_); # restore $key 467 &mov ($rounds,$rounds_); # restore $rounds 468 &sub ($len,0x60); 469 &jnc (&label("ecb_enc_loop6")); 470 471 &movups (&QWP(0,$out),$inout0); 472 &movups (&QWP(0x10,$out),$inout1); 473 &movups (&QWP(0x20,$out),$inout2); 474 &movups (&QWP(0x30,$out),$inout3); 475 &movups (&QWP(0x40,$out),$inout4); 476 &movups (&QWP(0x50,$out),$inout5); 477 &lea ($out,&DWP(0x60,$out)); 478 &add ($len,0x60); 479 &jz (&label("ecb_ret")); 480 481&set_label("ecb_enc_tail"); 482 &movups ($inout0,&QWP(0,$inp)); 483 &cmp ($len,0x20); 484 &jb (&label("ecb_enc_one")); 485 &movups ($inout1,&QWP(0x10,$inp)); 486 &je (&label("ecb_enc_two")); 487 &movups ($inout2,&QWP(0x20,$inp)); 488 &cmp ($len,0x40); 489 &jb (&label("ecb_enc_three")); 490 &movups ($inout3,&QWP(0x30,$inp)); 491 &je (&label("ecb_enc_four")); 492 &movups ($inout4,&QWP(0x40,$inp)); 493 &xorps ($inout5,$inout5); 494 &call ("_aesni_encrypt6"); 495 &movups (&QWP(0,$out),$inout0); 496 &movups (&QWP(0x10,$out),$inout1); 497 &movups (&QWP(0x20,$out),$inout2); 498 &movups (&QWP(0x30,$out),$inout3); 499 &movups (&QWP(0x40,$out),$inout4); 500 jmp (&label("ecb_ret")); 501 502&set_label("ecb_enc_one",16); 503 if ($inline) 504 { &aesni_inline_generate1("enc"); } 505 else 506 { &call ("_aesni_encrypt1"); } 507 &movups (&QWP(0,$out),$inout0); 508 &jmp (&label("ecb_ret")); 509 510&set_label("ecb_enc_two",16); 511 &call ("_aesni_encrypt2"); 512 &movups (&QWP(0,$out),$inout0); 513 &movups (&QWP(0x10,$out),$inout1); 514 &jmp (&label("ecb_ret")); 515 516&set_label("ecb_enc_three",16); 517 &call ("_aesni_encrypt3"); 518 &movups (&QWP(0,$out),$inout0); 519 &movups (&QWP(0x10,$out),$inout1); 520 &movups (&QWP(0x20,$out),$inout2); 521 &jmp (&label("ecb_ret")); 522 523&set_label("ecb_enc_four",16); 524 &call ("_aesni_encrypt4"); 525 &movups (&QWP(0,$out),$inout0); 526 &movups (&QWP(0x10,$out),$inout1); 527 &movups (&QWP(0x20,$out),$inout2); 528 &movups (&QWP(0x30,$out),$inout3); 529 &jmp (&label("ecb_ret")); 530###################################################################### 531&set_label("ecb_decrypt",16); 532 &mov ($key_,$key); # backup $key 533 &mov ($rounds_,$rounds); # backup $rounds 534 &cmp ($len,0x60); 535 &jb (&label("ecb_dec_tail")); 536 537 &movdqu ($inout0,&QWP(0,$inp)); 538 &movdqu ($inout1,&QWP(0x10,$inp)); 539 &movdqu ($inout2,&QWP(0x20,$inp)); 540 &movdqu ($inout3,&QWP(0x30,$inp)); 541 &movdqu ($inout4,&QWP(0x40,$inp)); 542 &movdqu ($inout5,&QWP(0x50,$inp)); 543 &lea ($inp,&DWP(0x60,$inp)); 544 &sub ($len,0x60); 545 &jmp (&label("ecb_dec_loop6_enter")); 546 547&set_label("ecb_dec_loop6",16); 548 &movups (&QWP(0,$out),$inout0); 549 &movdqu ($inout0,&QWP(0,$inp)); 550 &movups (&QWP(0x10,$out),$inout1); 551 &movdqu ($inout1,&QWP(0x10,$inp)); 552 &movups (&QWP(0x20,$out),$inout2); 553 &movdqu ($inout2,&QWP(0x20,$inp)); 554 &movups (&QWP(0x30,$out),$inout3); 555 &movdqu ($inout3,&QWP(0x30,$inp)); 556 &movups (&QWP(0x40,$out),$inout4); 557 &movdqu ($inout4,&QWP(0x40,$inp)); 558 &movups (&QWP(0x50,$out),$inout5); 559 &lea ($out,&DWP(0x60,$out)); 560 &movdqu ($inout5,&QWP(0x50,$inp)); 561 &lea ($inp,&DWP(0x60,$inp)); 562&set_label("ecb_dec_loop6_enter"); 563 564 &call ("_aesni_decrypt6"); 565 566 &mov ($key,$key_); # restore $key 567 &mov ($rounds,$rounds_); # restore $rounds 568 &sub ($len,0x60); 569 &jnc (&label("ecb_dec_loop6")); 570 571 &movups (&QWP(0,$out),$inout0); 572 &movups (&QWP(0x10,$out),$inout1); 573 &movups (&QWP(0x20,$out),$inout2); 574 &movups (&QWP(0x30,$out),$inout3); 575 &movups (&QWP(0x40,$out),$inout4); 576 &movups (&QWP(0x50,$out),$inout5); 577 &lea ($out,&DWP(0x60,$out)); 578 &add ($len,0x60); 579 &jz (&label("ecb_ret")); 580 581&set_label("ecb_dec_tail"); 582 &movups ($inout0,&QWP(0,$inp)); 583 &cmp ($len,0x20); 584 &jb (&label("ecb_dec_one")); 585 &movups ($inout1,&QWP(0x10,$inp)); 586 &je (&label("ecb_dec_two")); 587 &movups ($inout2,&QWP(0x20,$inp)); 588 &cmp ($len,0x40); 589 &jb (&label("ecb_dec_three")); 590 &movups ($inout3,&QWP(0x30,$inp)); 591 &je (&label("ecb_dec_four")); 592 &movups ($inout4,&QWP(0x40,$inp)); 593 &xorps ($inout5,$inout5); 594 &call ("_aesni_decrypt6"); 595 &movups (&QWP(0,$out),$inout0); 596 &movups (&QWP(0x10,$out),$inout1); 597 &movups (&QWP(0x20,$out),$inout2); 598 &movups (&QWP(0x30,$out),$inout3); 599 &movups (&QWP(0x40,$out),$inout4); 600 &jmp (&label("ecb_ret")); 601 602&set_label("ecb_dec_one",16); 603 if ($inline) 604 { &aesni_inline_generate1("dec"); } 605 else 606 { &call ("_aesni_decrypt1"); } 607 &movups (&QWP(0,$out),$inout0); 608 &jmp (&label("ecb_ret")); 609 610&set_label("ecb_dec_two",16); 611 &call ("_aesni_decrypt2"); 612 &movups (&QWP(0,$out),$inout0); 613 &movups (&QWP(0x10,$out),$inout1); 614 &jmp (&label("ecb_ret")); 615 616&set_label("ecb_dec_three",16); 617 &call ("_aesni_decrypt3"); 618 &movups (&QWP(0,$out),$inout0); 619 &movups (&QWP(0x10,$out),$inout1); 620 &movups (&QWP(0x20,$out),$inout2); 621 &jmp (&label("ecb_ret")); 622 623&set_label("ecb_dec_four",16); 624 &call ("_aesni_decrypt4"); 625 &movups (&QWP(0,$out),$inout0); 626 &movups (&QWP(0x10,$out),$inout1); 627 &movups (&QWP(0x20,$out),$inout2); 628 &movups (&QWP(0x30,$out),$inout3); 629 630&set_label("ecb_ret"); 631 &pxor ("xmm0","xmm0"); # clear register bank 632 &pxor ("xmm1","xmm1"); 633 &pxor ("xmm2","xmm2"); 634 &pxor ("xmm3","xmm3"); 635 &pxor ("xmm4","xmm4"); 636 &pxor ("xmm5","xmm5"); 637 &pxor ("xmm6","xmm6"); 638 &pxor ("xmm7","xmm7"); 639&function_end("aesni_ecb_encrypt"); 640 641###################################################################### 642# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 643# size_t blocks, const AES_KEY *key, 644# const char *ivec,char *cmac); 645# 646# Handles only complete blocks, operates on 64-bit counter and 647# does not update *ivec! Nor does it finalize CMAC value 648# (see engine/eng_aesni.c for details) 649# 650{ my $cmac=$inout1; 651&function_begin("aesni_ccm64_encrypt_blocks"); 652 &mov ($inp,&wparam(0)); 653 &mov ($out,&wparam(1)); 654 &mov ($len,&wparam(2)); 655 &mov ($key,&wparam(3)); 656 &mov ($rounds_,&wparam(4)); 657 &mov ($rounds,&wparam(5)); 658 &mov ($key_,"esp"); 659 &sub ("esp",60); 660 &and ("esp",-16); # align stack 661 &mov (&DWP(48,"esp"),$key_); 662 663 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 664 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 665 &mov ($rounds,&DWP(240,$key)); 666 667 # compose byte-swap control mask for pshufb on stack 668 &mov (&DWP(0,"esp"),0x0c0d0e0f); 669 &mov (&DWP(4,"esp"),0x08090a0b); 670 &mov (&DWP(8,"esp"),0x04050607); 671 &mov (&DWP(12,"esp"),0x00010203); 672 673 # compose counter increment vector on stack 674 &mov ($rounds_,1); 675 &xor ($key_,$key_); 676 &mov (&DWP(16,"esp"),$rounds_); 677 &mov (&DWP(20,"esp"),$key_); 678 &mov (&DWP(24,"esp"),$key_); 679 &mov (&DWP(28,"esp"),$key_); 680 681 &shl ($rounds,4); 682 &mov ($rounds_,16); 683 &lea ($key_,&DWP(0,$key)); 684 &movdqa ($inout3,&QWP(0,"esp")); 685 &movdqa ($inout0,$ivec); 686 &lea ($key,&DWP(32,$key,$rounds)); 687 &sub ($rounds_,$rounds); 688 &pshufb ($ivec,$inout3); 689 690&set_label("ccm64_enc_outer"); 691 &$movekey ($rndkey0,&QWP(0,$key_)); 692 &mov ($rounds,$rounds_); 693 &movups ($in0,&QWP(0,$inp)); 694 695 &xorps ($inout0,$rndkey0); 696 &$movekey ($rndkey1,&QWP(16,$key_)); 697 &xorps ($rndkey0,$in0); 698 &xorps ($cmac,$rndkey0); # cmac^=inp 699 &$movekey ($rndkey0,&QWP(32,$key_)); 700 701&set_label("ccm64_enc2_loop"); 702 &aesenc ($inout0,$rndkey1); 703 &aesenc ($cmac,$rndkey1); 704 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 705 &add ($rounds,32); 706 &aesenc ($inout0,$rndkey0); 707 &aesenc ($cmac,$rndkey0); 708 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 709 &jnz (&label("ccm64_enc2_loop")); 710 &aesenc ($inout0,$rndkey1); 711 &aesenc ($cmac,$rndkey1); 712 &paddq ($ivec,&QWP(16,"esp")); 713 &dec ($len); 714 &aesenclast ($inout0,$rndkey0); 715 &aesenclast ($cmac,$rndkey0); 716 717 &lea ($inp,&DWP(16,$inp)); 718 &xorps ($in0,$inout0); # inp^=E(ivec) 719 &movdqa ($inout0,$ivec); 720 &movups (&QWP(0,$out),$in0); # save output 721 &pshufb ($inout0,$inout3); 722 &lea ($out,&DWP(16,$out)); 723 &jnz (&label("ccm64_enc_outer")); 724 725 &mov ("esp",&DWP(48,"esp")); 726 &mov ($out,&wparam(5)); 727 &movups (&QWP(0,$out),$cmac); 728 729 &pxor ("xmm0","xmm0"); # clear register bank 730 &pxor ("xmm1","xmm1"); 731 &pxor ("xmm2","xmm2"); 732 &pxor ("xmm3","xmm3"); 733 &pxor ("xmm4","xmm4"); 734 &pxor ("xmm5","xmm5"); 735 &pxor ("xmm6","xmm6"); 736 &pxor ("xmm7","xmm7"); 737&function_end("aesni_ccm64_encrypt_blocks"); 738 739&function_begin("aesni_ccm64_decrypt_blocks"); 740 &mov ($inp,&wparam(0)); 741 &mov ($out,&wparam(1)); 742 &mov ($len,&wparam(2)); 743 &mov ($key,&wparam(3)); 744 &mov ($rounds_,&wparam(4)); 745 &mov ($rounds,&wparam(5)); 746 &mov ($key_,"esp"); 747 &sub ("esp",60); 748 &and ("esp",-16); # align stack 749 &mov (&DWP(48,"esp"),$key_); 750 751 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 752 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 753 &mov ($rounds,&DWP(240,$key)); 754 755 # compose byte-swap control mask for pshufb on stack 756 &mov (&DWP(0,"esp"),0x0c0d0e0f); 757 &mov (&DWP(4,"esp"),0x08090a0b); 758 &mov (&DWP(8,"esp"),0x04050607); 759 &mov (&DWP(12,"esp"),0x00010203); 760 761 # compose counter increment vector on stack 762 &mov ($rounds_,1); 763 &xor ($key_,$key_); 764 &mov (&DWP(16,"esp"),$rounds_); 765 &mov (&DWP(20,"esp"),$key_); 766 &mov (&DWP(24,"esp"),$key_); 767 &mov (&DWP(28,"esp"),$key_); 768 769 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 770 &movdqa ($inout0,$ivec); 771 772 &mov ($key_,$key); 773 &mov ($rounds_,$rounds); 774 775 &pshufb ($ivec,$inout3); 776 if ($inline) 777 { &aesni_inline_generate1("enc"); } 778 else 779 { &call ("_aesni_encrypt1"); } 780 &shl ($rounds_,4); 781 &mov ($rounds,16); 782 &movups ($in0,&QWP(0,$inp)); # load inp 783 &paddq ($ivec,&QWP(16,"esp")); 784 &lea ($inp,&QWP(16,$inp)); 785 &sub ($rounds,$rounds_); 786 &lea ($key,&DWP(32,$key_,$rounds_)); 787 &mov ($rounds_,$rounds); 788 &jmp (&label("ccm64_dec_outer")); 789 790&set_label("ccm64_dec_outer",16); 791 &xorps ($in0,$inout0); # inp ^= E(ivec) 792 &movdqa ($inout0,$ivec); 793 &movups (&QWP(0,$out),$in0); # save output 794 &lea ($out,&DWP(16,$out)); 795 &pshufb ($inout0,$inout3); 796 797 &sub ($len,1); 798 &jz (&label("ccm64_dec_break")); 799 800 &$movekey ($rndkey0,&QWP(0,$key_)); 801 &mov ($rounds,$rounds_); 802 &$movekey ($rndkey1,&QWP(16,$key_)); 803 &xorps ($in0,$rndkey0); 804 &xorps ($inout0,$rndkey0); 805 &xorps ($cmac,$in0); # cmac^=out 806 &$movekey ($rndkey0,&QWP(32,$key_)); 807 808&set_label("ccm64_dec2_loop"); 809 &aesenc ($inout0,$rndkey1); 810 &aesenc ($cmac,$rndkey1); 811 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 812 &add ($rounds,32); 813 &aesenc ($inout0,$rndkey0); 814 &aesenc ($cmac,$rndkey0); 815 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 816 &jnz (&label("ccm64_dec2_loop")); 817 &movups ($in0,&QWP(0,$inp)); # load inp 818 &paddq ($ivec,&QWP(16,"esp")); 819 &aesenc ($inout0,$rndkey1); 820 &aesenc ($cmac,$rndkey1); 821 &aesenclast ($inout0,$rndkey0); 822 &aesenclast ($cmac,$rndkey0); 823 &lea ($inp,&QWP(16,$inp)); 824 &jmp (&label("ccm64_dec_outer")); 825 826&set_label("ccm64_dec_break",16); 827 &mov ($rounds,&DWP(240,$key_)); 828 &mov ($key,$key_); 829 if ($inline) 830 { &aesni_inline_generate1("enc",$cmac,$in0); } 831 else 832 { &call ("_aesni_encrypt1",$cmac); } 833 834 &mov ("esp",&DWP(48,"esp")); 835 &mov ($out,&wparam(5)); 836 &movups (&QWP(0,$out),$cmac); 837 838 &pxor ("xmm0","xmm0"); # clear register bank 839 &pxor ("xmm1","xmm1"); 840 &pxor ("xmm2","xmm2"); 841 &pxor ("xmm3","xmm3"); 842 &pxor ("xmm4","xmm4"); 843 &pxor ("xmm5","xmm5"); 844 &pxor ("xmm6","xmm6"); 845 &pxor ("xmm7","xmm7"); 846&function_end("aesni_ccm64_decrypt_blocks"); 847} 848 849###################################################################### 850# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 851# size_t blocks, const AES_KEY *key, 852# const char *ivec); 853# 854# Handles only complete blocks, operates on 32-bit counter and 855# does not update *ivec! (see crypto/modes/ctr128.c for details) 856# 857# stack layout: 858# 0 pshufb mask 859# 16 vector addend: 0,6,6,6 860# 32 counter-less ivec 861# 48 1st triplet of counter vector 862# 64 2nd triplet of counter vector 863# 80 saved %esp 864 865&function_begin("aesni_ctr32_encrypt_blocks"); 866 &mov ($inp,&wparam(0)); 867 &mov ($out,&wparam(1)); 868 &mov ($len,&wparam(2)); 869 &mov ($key,&wparam(3)); 870 &mov ($rounds_,&wparam(4)); 871 &mov ($key_,"esp"); 872 &sub ("esp",88); 873 &and ("esp",-16); # align stack 874 &mov (&DWP(80,"esp"),$key_); 875 876 &cmp ($len,1); 877 &je (&label("ctr32_one_shortcut")); 878 879 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 880 881 # compose byte-swap control mask for pshufb on stack 882 &mov (&DWP(0,"esp"),0x0c0d0e0f); 883 &mov (&DWP(4,"esp"),0x08090a0b); 884 &mov (&DWP(8,"esp"),0x04050607); 885 &mov (&DWP(12,"esp"),0x00010203); 886 887 # compose counter increment vector on stack 888 &mov ($rounds,6); 889 &xor ($key_,$key_); 890 &mov (&DWP(16,"esp"),$rounds); 891 &mov (&DWP(20,"esp"),$rounds); 892 &mov (&DWP(24,"esp"),$rounds); 893 &mov (&DWP(28,"esp"),$key_); 894 895 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 896 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 897 898 &mov ($rounds,&DWP(240,$key)); # key->rounds 899 900 # compose 2 vectors of 3x32-bit counters 901 &bswap ($rounds_); 902 &pxor ($rndkey0,$rndkey0); 903 &pxor ($rndkey1,$rndkey1); 904 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 905 &pinsrd ($rndkey0,$rounds_,0); 906 &lea ($key_,&DWP(3,$rounds_)); 907 &pinsrd ($rndkey1,$key_,0); 908 &inc ($rounds_); 909 &pinsrd ($rndkey0,$rounds_,1); 910 &inc ($key_); 911 &pinsrd ($rndkey1,$key_,1); 912 &inc ($rounds_); 913 &pinsrd ($rndkey0,$rounds_,2); 914 &inc ($key_); 915 &pinsrd ($rndkey1,$key_,2); 916 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 917 &pshufb ($rndkey0,$inout0); # byte swap 918 &movdqu ($inout4,&QWP(0,$key)); # key[0] 919 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 920 &pshufb ($rndkey1,$inout0); # byte swap 921 922 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword 923 &pshufd ($inout1,$rndkey0,2<<6); 924 &cmp ($len,6); 925 &jb (&label("ctr32_tail")); 926 &pxor ($inout5,$inout4); # counter-less ivec^key[0] 927 &shl ($rounds,4); 928 &mov ($rounds_,16); 929 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] 930 &mov ($key_,$key); # backup $key 931 &sub ($rounds_,$rounds); # backup twisted $rounds 932 &lea ($key,&DWP(32,$key,$rounds)); 933 &sub ($len,6); 934 &jmp (&label("ctr32_loop6")); 935 936&set_label("ctr32_loop6",16); 937 # inlining _aesni_encrypt6's prologue gives ~6% improvement... 938 &pshufd ($inout2,$rndkey0,1<<6); 939 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec 940 &pshufd ($inout3,$rndkey1,3<<6); 941 &pxor ($inout0,$rndkey0); # merge counter-less ivec 942 &pshufd ($inout4,$rndkey1,2<<6); 943 &pxor ($inout1,$rndkey0); 944 &pshufd ($inout5,$rndkey1,1<<6); 945 &$movekey ($rndkey1,&QWP(16,$key_)); 946 &pxor ($inout2,$rndkey0); 947 &pxor ($inout3,$rndkey0); 948 &aesenc ($inout0,$rndkey1); 949 &pxor ($inout4,$rndkey0); 950 &pxor ($inout5,$rndkey0); 951 &aesenc ($inout1,$rndkey1); 952 &$movekey ($rndkey0,&QWP(32,$key_)); 953 &mov ($rounds,$rounds_); 954 &aesenc ($inout2,$rndkey1); 955 &aesenc ($inout3,$rndkey1); 956 &aesenc ($inout4,$rndkey1); 957 &aesenc ($inout5,$rndkey1); 958 959 &call (&label("_aesni_encrypt6_enter")); 960 961 &movups ($rndkey1,&QWP(0,$inp)); 962 &movups ($rndkey0,&QWP(0x10,$inp)); 963 &xorps ($inout0,$rndkey1); 964 &movups ($rndkey1,&QWP(0x20,$inp)); 965 &xorps ($inout1,$rndkey0); 966 &movups (&QWP(0,$out),$inout0); 967 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 968 &xorps ($inout2,$rndkey1); 969 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet 970 &movups (&QWP(0x10,$out),$inout1); 971 &movups (&QWP(0x20,$out),$inout2); 972 973 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment 974 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment 975 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 976 977 &movups ($inout1,&QWP(0x30,$inp)); 978 &movups ($inout2,&QWP(0x40,$inp)); 979 &xorps ($inout3,$inout1); 980 &movups ($inout1,&QWP(0x50,$inp)); 981 &lea ($inp,&DWP(0x60,$inp)); 982 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 983 &pshufb ($rndkey0,$inout0); # byte swap 984 &xorps ($inout4,$inout2); 985 &movups (&QWP(0x30,$out),$inout3); 986 &xorps ($inout5,$inout1); 987 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 988 &pshufb ($rndkey1,$inout0); # byte swap 989 &movups (&QWP(0x40,$out),$inout4); 990 &pshufd ($inout0,$rndkey0,3<<6); 991 &movups (&QWP(0x50,$out),$inout5); 992 &lea ($out,&DWP(0x60,$out)); 993 994 &pshufd ($inout1,$rndkey0,2<<6); 995 &sub ($len,6); 996 &jnc (&label("ctr32_loop6")); 997 998 &add ($len,6); 999 &jz (&label("ctr32_ret")); 1000 &movdqu ($inout5,&QWP(0,$key_)); 1001 &mov ($key,$key_); 1002 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec 1003 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1004 1005&set_label("ctr32_tail"); 1006 &por ($inout0,$inout5); 1007 &cmp ($len,2); 1008 &jb (&label("ctr32_one")); 1009 1010 &pshufd ($inout2,$rndkey0,1<<6); 1011 &por ($inout1,$inout5); 1012 &je (&label("ctr32_two")); 1013 1014 &pshufd ($inout3,$rndkey1,3<<6); 1015 &por ($inout2,$inout5); 1016 &cmp ($len,4); 1017 &jb (&label("ctr32_three")); 1018 1019 &pshufd ($inout4,$rndkey1,2<<6); 1020 &por ($inout3,$inout5); 1021 &je (&label("ctr32_four")); 1022 1023 &por ($inout4,$inout5); 1024 &call ("_aesni_encrypt6"); 1025 &movups ($rndkey1,&QWP(0,$inp)); 1026 &movups ($rndkey0,&QWP(0x10,$inp)); 1027 &xorps ($inout0,$rndkey1); 1028 &movups ($rndkey1,&QWP(0x20,$inp)); 1029 &xorps ($inout1,$rndkey0); 1030 &movups ($rndkey0,&QWP(0x30,$inp)); 1031 &xorps ($inout2,$rndkey1); 1032 &movups ($rndkey1,&QWP(0x40,$inp)); 1033 &xorps ($inout3,$rndkey0); 1034 &movups (&QWP(0,$out),$inout0); 1035 &xorps ($inout4,$rndkey1); 1036 &movups (&QWP(0x10,$out),$inout1); 1037 &movups (&QWP(0x20,$out),$inout2); 1038 &movups (&QWP(0x30,$out),$inout3); 1039 &movups (&QWP(0x40,$out),$inout4); 1040 &jmp (&label("ctr32_ret")); 1041 1042&set_label("ctr32_one_shortcut",16); 1043 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 1044 &mov ($rounds,&DWP(240,$key)); 1045 1046&set_label("ctr32_one"); 1047 if ($inline) 1048 { &aesni_inline_generate1("enc"); } 1049 else 1050 { &call ("_aesni_encrypt1"); } 1051 &movups ($in0,&QWP(0,$inp)); 1052 &xorps ($in0,$inout0); 1053 &movups (&QWP(0,$out),$in0); 1054 &jmp (&label("ctr32_ret")); 1055 1056&set_label("ctr32_two",16); 1057 &call ("_aesni_encrypt2"); 1058 &movups ($inout3,&QWP(0,$inp)); 1059 &movups ($inout4,&QWP(0x10,$inp)); 1060 &xorps ($inout0,$inout3); 1061 &xorps ($inout1,$inout4); 1062 &movups (&QWP(0,$out),$inout0); 1063 &movups (&QWP(0x10,$out),$inout1); 1064 &jmp (&label("ctr32_ret")); 1065 1066&set_label("ctr32_three",16); 1067 &call ("_aesni_encrypt3"); 1068 &movups ($inout3,&QWP(0,$inp)); 1069 &movups ($inout4,&QWP(0x10,$inp)); 1070 &xorps ($inout0,$inout3); 1071 &movups ($inout5,&QWP(0x20,$inp)); 1072 &xorps ($inout1,$inout4); 1073 &movups (&QWP(0,$out),$inout0); 1074 &xorps ($inout2,$inout5); 1075 &movups (&QWP(0x10,$out),$inout1); 1076 &movups (&QWP(0x20,$out),$inout2); 1077 &jmp (&label("ctr32_ret")); 1078 1079&set_label("ctr32_four",16); 1080 &call ("_aesni_encrypt4"); 1081 &movups ($inout4,&QWP(0,$inp)); 1082 &movups ($inout5,&QWP(0x10,$inp)); 1083 &movups ($rndkey1,&QWP(0x20,$inp)); 1084 &xorps ($inout0,$inout4); 1085 &movups ($rndkey0,&QWP(0x30,$inp)); 1086 &xorps ($inout1,$inout5); 1087 &movups (&QWP(0,$out),$inout0); 1088 &xorps ($inout2,$rndkey1); 1089 &movups (&QWP(0x10,$out),$inout1); 1090 &xorps ($inout3,$rndkey0); 1091 &movups (&QWP(0x20,$out),$inout2); 1092 &movups (&QWP(0x30,$out),$inout3); 1093 1094&set_label("ctr32_ret"); 1095 &pxor ("xmm0","xmm0"); # clear register bank 1096 &pxor ("xmm1","xmm1"); 1097 &pxor ("xmm2","xmm2"); 1098 &pxor ("xmm3","xmm3"); 1099 &pxor ("xmm4","xmm4"); 1100 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack 1101 &pxor ("xmm5","xmm5"); 1102 &movdqa (&QWP(48,"esp"),"xmm0"); 1103 &pxor ("xmm6","xmm6"); 1104 &movdqa (&QWP(64,"esp"),"xmm0"); 1105 &pxor ("xmm7","xmm7"); 1106 &mov ("esp",&DWP(80,"esp")); 1107&function_end("aesni_ctr32_encrypt_blocks"); 1108 1109###################################################################### 1110# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1111# const AES_KEY *key1, const AES_KEY *key2 1112# const unsigned char iv[16]); 1113# 1114{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1115 1116&function_begin("aesni_xts_encrypt"); 1117 &mov ($key,&wparam(4)); # key2 1118 &mov ($inp,&wparam(5)); # clear-text tweak 1119 1120 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1121 &movups ($inout0,&QWP(0,$inp)); 1122 if ($inline) 1123 { &aesni_inline_generate1("enc"); } 1124 else 1125 { &call ("_aesni_encrypt1"); } 1126 1127 &mov ($inp,&wparam(0)); 1128 &mov ($out,&wparam(1)); 1129 &mov ($len,&wparam(2)); 1130 &mov ($key,&wparam(3)); # key1 1131 1132 &mov ($key_,"esp"); 1133 &sub ("esp",16*7+8); 1134 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1135 &and ("esp",-16); # align stack 1136 1137 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1138 &mov (&DWP(16*6+4,"esp"),0); 1139 &mov (&DWP(16*6+8,"esp"),1); 1140 &mov (&DWP(16*6+12,"esp"),0); 1141 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1142 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1143 1144 &movdqa ($tweak,$inout0); 1145 &pxor ($twtmp,$twtmp); 1146 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1147 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1148 1149 &and ($len,-16); 1150 &mov ($key_,$key); # backup $key 1151 &mov ($rounds_,$rounds); # backup $rounds 1152 &sub ($len,16*6); 1153 &jc (&label("xts_enc_short")); 1154 1155 &shl ($rounds,4); 1156 &mov ($rounds_,16); 1157 &sub ($rounds_,$rounds); 1158 &lea ($key,&DWP(32,$key,$rounds)); 1159 &jmp (&label("xts_enc_loop6")); 1160 1161&set_label("xts_enc_loop6",16); 1162 for ($i=0;$i<4;$i++) { 1163 &pshufd ($twres,$twtmp,0x13); 1164 &pxor ($twtmp,$twtmp); 1165 &movdqa (&QWP(16*$i,"esp"),$tweak); 1166 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1167 &pand ($twres,$twmask); # isolate carry and residue 1168 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1169 &pxor ($tweak,$twres); 1170 } 1171 &pshufd ($inout5,$twtmp,0x13); 1172 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1173 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1174 &$movekey ($rndkey0,&QWP(0,$key_)); 1175 &pand ($inout5,$twmask); # isolate carry and residue 1176 &movups ($inout0,&QWP(0,$inp)); # load input 1177 &pxor ($inout5,$tweak); 1178 1179 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1180 &mov ($rounds,$rounds_); # restore $rounds 1181 &movdqu ($inout1,&QWP(16*1,$inp)); 1182 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1183 &movdqu ($inout2,&QWP(16*2,$inp)); 1184 &pxor ($inout1,$rndkey0); 1185 &movdqu ($inout3,&QWP(16*3,$inp)); 1186 &pxor ($inout2,$rndkey0); 1187 &movdqu ($inout4,&QWP(16*4,$inp)); 1188 &pxor ($inout3,$rndkey0); 1189 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1190 &pxor ($inout4,$rndkey0); 1191 &lea ($inp,&DWP(16*6,$inp)); 1192 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1193 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1194 &pxor ($inout5,$rndkey1); 1195 1196 &$movekey ($rndkey1,&QWP(16,$key_)); 1197 &pxor ($inout1,&QWP(16*1,"esp")); 1198 &pxor ($inout2,&QWP(16*2,"esp")); 1199 &aesenc ($inout0,$rndkey1); 1200 &pxor ($inout3,&QWP(16*3,"esp")); 1201 &pxor ($inout4,&QWP(16*4,"esp")); 1202 &aesenc ($inout1,$rndkey1); 1203 &pxor ($inout5,$rndkey0); 1204 &$movekey ($rndkey0,&QWP(32,$key_)); 1205 &aesenc ($inout2,$rndkey1); 1206 &aesenc ($inout3,$rndkey1); 1207 &aesenc ($inout4,$rndkey1); 1208 &aesenc ($inout5,$rndkey1); 1209 &call (&label("_aesni_encrypt6_enter")); 1210 1211 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1212 &pxor ($twtmp,$twtmp); 1213 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1214 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1215 &xorps ($inout1,&QWP(16*1,"esp")); 1216 &movups (&QWP(16*0,$out),$inout0); # write output 1217 &xorps ($inout2,&QWP(16*2,"esp")); 1218 &movups (&QWP(16*1,$out),$inout1); 1219 &xorps ($inout3,&QWP(16*3,"esp")); 1220 &movups (&QWP(16*2,$out),$inout2); 1221 &xorps ($inout4,&QWP(16*4,"esp")); 1222 &movups (&QWP(16*3,$out),$inout3); 1223 &xorps ($inout5,$tweak); 1224 &movups (&QWP(16*4,$out),$inout4); 1225 &pshufd ($twres,$twtmp,0x13); 1226 &movups (&QWP(16*5,$out),$inout5); 1227 &lea ($out,&DWP(16*6,$out)); 1228 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1229 1230 &pxor ($twtmp,$twtmp); 1231 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1232 &pand ($twres,$twmask); # isolate carry and residue 1233 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1234 &pxor ($tweak,$twres); 1235 1236 &sub ($len,16*6); 1237 &jnc (&label("xts_enc_loop6")); 1238 1239 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1240 &mov ($key,$key_); # restore $key 1241 &mov ($rounds_,$rounds); 1242 1243&set_label("xts_enc_short"); 1244 &add ($len,16*6); 1245 &jz (&label("xts_enc_done6x")); 1246 1247 &movdqa ($inout3,$tweak); # put aside previous tweak 1248 &cmp ($len,0x20); 1249 &jb (&label("xts_enc_one")); 1250 1251 &pshufd ($twres,$twtmp,0x13); 1252 &pxor ($twtmp,$twtmp); 1253 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1254 &pand ($twres,$twmask); # isolate carry and residue 1255 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1256 &pxor ($tweak,$twres); 1257 &je (&label("xts_enc_two")); 1258 1259 &pshufd ($twres,$twtmp,0x13); 1260 &pxor ($twtmp,$twtmp); 1261 &movdqa ($inout4,$tweak); # put aside previous tweak 1262 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1263 &pand ($twres,$twmask); # isolate carry and residue 1264 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1265 &pxor ($tweak,$twres); 1266 &cmp ($len,0x40); 1267 &jb (&label("xts_enc_three")); 1268 1269 &pshufd ($twres,$twtmp,0x13); 1270 &pxor ($twtmp,$twtmp); 1271 &movdqa ($inout5,$tweak); # put aside previous tweak 1272 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1273 &pand ($twres,$twmask); # isolate carry and residue 1274 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1275 &pxor ($tweak,$twres); 1276 &movdqa (&QWP(16*0,"esp"),$inout3); 1277 &movdqa (&QWP(16*1,"esp"),$inout4); 1278 &je (&label("xts_enc_four")); 1279 1280 &movdqa (&QWP(16*2,"esp"),$inout5); 1281 &pshufd ($inout5,$twtmp,0x13); 1282 &movdqa (&QWP(16*3,"esp"),$tweak); 1283 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1284 &pand ($inout5,$twmask); # isolate carry and residue 1285 &pxor ($inout5,$tweak); 1286 1287 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1288 &movdqu ($inout1,&QWP(16*1,$inp)); 1289 &movdqu ($inout2,&QWP(16*2,$inp)); 1290 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1291 &movdqu ($inout3,&QWP(16*3,$inp)); 1292 &pxor ($inout1,&QWP(16*1,"esp")); 1293 &movdqu ($inout4,&QWP(16*4,$inp)); 1294 &pxor ($inout2,&QWP(16*2,"esp")); 1295 &lea ($inp,&DWP(16*5,$inp)); 1296 &pxor ($inout3,&QWP(16*3,"esp")); 1297 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1298 &pxor ($inout4,$inout5); 1299 1300 &call ("_aesni_encrypt6"); 1301 1302 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1303 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1304 &xorps ($inout1,&QWP(16*1,"esp")); 1305 &xorps ($inout2,&QWP(16*2,"esp")); 1306 &movups (&QWP(16*0,$out),$inout0); # write output 1307 &xorps ($inout3,&QWP(16*3,"esp")); 1308 &movups (&QWP(16*1,$out),$inout1); 1309 &xorps ($inout4,$tweak); 1310 &movups (&QWP(16*2,$out),$inout2); 1311 &movups (&QWP(16*3,$out),$inout3); 1312 &movups (&QWP(16*4,$out),$inout4); 1313 &lea ($out,&DWP(16*5,$out)); 1314 &jmp (&label("xts_enc_done")); 1315 1316&set_label("xts_enc_one",16); 1317 &movups ($inout0,&QWP(16*0,$inp)); # load input 1318 &lea ($inp,&DWP(16*1,$inp)); 1319 &xorps ($inout0,$inout3); # input^=tweak 1320 if ($inline) 1321 { &aesni_inline_generate1("enc"); } 1322 else 1323 { &call ("_aesni_encrypt1"); } 1324 &xorps ($inout0,$inout3); # output^=tweak 1325 &movups (&QWP(16*0,$out),$inout0); # write output 1326 &lea ($out,&DWP(16*1,$out)); 1327 1328 &movdqa ($tweak,$inout3); # last tweak 1329 &jmp (&label("xts_enc_done")); 1330 1331&set_label("xts_enc_two",16); 1332 &movaps ($inout4,$tweak); # put aside last tweak 1333 1334 &movups ($inout0,&QWP(16*0,$inp)); # load input 1335 &movups ($inout1,&QWP(16*1,$inp)); 1336 &lea ($inp,&DWP(16*2,$inp)); 1337 &xorps ($inout0,$inout3); # input^=tweak 1338 &xorps ($inout1,$inout4); 1339 1340 &call ("_aesni_encrypt2"); 1341 1342 &xorps ($inout0,$inout3); # output^=tweak 1343 &xorps ($inout1,$inout4); 1344 &movups (&QWP(16*0,$out),$inout0); # write output 1345 &movups (&QWP(16*1,$out),$inout1); 1346 &lea ($out,&DWP(16*2,$out)); 1347 1348 &movdqa ($tweak,$inout4); # last tweak 1349 &jmp (&label("xts_enc_done")); 1350 1351&set_label("xts_enc_three",16); 1352 &movaps ($inout5,$tweak); # put aside last tweak 1353 &movups ($inout0,&QWP(16*0,$inp)); # load input 1354 &movups ($inout1,&QWP(16*1,$inp)); 1355 &movups ($inout2,&QWP(16*2,$inp)); 1356 &lea ($inp,&DWP(16*3,$inp)); 1357 &xorps ($inout0,$inout3); # input^=tweak 1358 &xorps ($inout1,$inout4); 1359 &xorps ($inout2,$inout5); 1360 1361 &call ("_aesni_encrypt3"); 1362 1363 &xorps ($inout0,$inout3); # output^=tweak 1364 &xorps ($inout1,$inout4); 1365 &xorps ($inout2,$inout5); 1366 &movups (&QWP(16*0,$out),$inout0); # write output 1367 &movups (&QWP(16*1,$out),$inout1); 1368 &movups (&QWP(16*2,$out),$inout2); 1369 &lea ($out,&DWP(16*3,$out)); 1370 1371 &movdqa ($tweak,$inout5); # last tweak 1372 &jmp (&label("xts_enc_done")); 1373 1374&set_label("xts_enc_four",16); 1375 &movaps ($inout4,$tweak); # put aside last tweak 1376 1377 &movups ($inout0,&QWP(16*0,$inp)); # load input 1378 &movups ($inout1,&QWP(16*1,$inp)); 1379 &movups ($inout2,&QWP(16*2,$inp)); 1380 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1381 &movups ($inout3,&QWP(16*3,$inp)); 1382 &lea ($inp,&DWP(16*4,$inp)); 1383 &xorps ($inout1,&QWP(16*1,"esp")); 1384 &xorps ($inout2,$inout5); 1385 &xorps ($inout3,$inout4); 1386 1387 &call ("_aesni_encrypt4"); 1388 1389 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1390 &xorps ($inout1,&QWP(16*1,"esp")); 1391 &xorps ($inout2,$inout5); 1392 &movups (&QWP(16*0,$out),$inout0); # write output 1393 &xorps ($inout3,$inout4); 1394 &movups (&QWP(16*1,$out),$inout1); 1395 &movups (&QWP(16*2,$out),$inout2); 1396 &movups (&QWP(16*3,$out),$inout3); 1397 &lea ($out,&DWP(16*4,$out)); 1398 1399 &movdqa ($tweak,$inout4); # last tweak 1400 &jmp (&label("xts_enc_done")); 1401 1402&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1403 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1404 &and ($len,15); 1405 &jz (&label("xts_enc_ret")); 1406 &movdqa ($inout3,$tweak); 1407 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1408 &jmp (&label("xts_enc_steal")); 1409 1410&set_label("xts_enc_done",16); 1411 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1412 &pxor ($twtmp,$twtmp); 1413 &and ($len,15); 1414 &jz (&label("xts_enc_ret")); 1415 1416 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1417 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1418 &pshufd ($inout3,$twtmp,0x13); 1419 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1420 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1421 &pxor ($inout3,$tweak); 1422 1423&set_label("xts_enc_steal"); 1424 &movz ($rounds,&BP(0,$inp)); 1425 &movz ($key,&BP(-16,$out)); 1426 &lea ($inp,&DWP(1,$inp)); 1427 &mov (&BP(-16,$out),&LB($rounds)); 1428 &mov (&BP(0,$out),&LB($key)); 1429 &lea ($out,&DWP(1,$out)); 1430 &sub ($len,1); 1431 &jnz (&label("xts_enc_steal")); 1432 1433 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1434 &mov ($key,$key_); # restore $key 1435 &mov ($rounds,$rounds_); # restore $rounds 1436 1437 &movups ($inout0,&QWP(-16,$out)); # load input 1438 &xorps ($inout0,$inout3); # input^=tweak 1439 if ($inline) 1440 { &aesni_inline_generate1("enc"); } 1441 else 1442 { &call ("_aesni_encrypt1"); } 1443 &xorps ($inout0,$inout3); # output^=tweak 1444 &movups (&QWP(-16,$out),$inout0); # write output 1445 1446&set_label("xts_enc_ret"); 1447 &pxor ("xmm0","xmm0"); # clear register bank 1448 &pxor ("xmm1","xmm1"); 1449 &pxor ("xmm2","xmm2"); 1450 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1451 &pxor ("xmm3","xmm3"); 1452 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1453 &pxor ("xmm4","xmm4"); 1454 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1455 &pxor ("xmm5","xmm5"); 1456 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1457 &pxor ("xmm6","xmm6"); 1458 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1459 &pxor ("xmm7","xmm7"); 1460 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1461 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1462&function_end("aesni_xts_encrypt"); 1463 1464&function_begin("aesni_xts_decrypt"); 1465 &mov ($key,&wparam(4)); # key2 1466 &mov ($inp,&wparam(5)); # clear-text tweak 1467 1468 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1469 &movups ($inout0,&QWP(0,$inp)); 1470 if ($inline) 1471 { &aesni_inline_generate1("enc"); } 1472 else 1473 { &call ("_aesni_encrypt1"); } 1474 1475 &mov ($inp,&wparam(0)); 1476 &mov ($out,&wparam(1)); 1477 &mov ($len,&wparam(2)); 1478 &mov ($key,&wparam(3)); # key1 1479 1480 &mov ($key_,"esp"); 1481 &sub ("esp",16*7+8); 1482 &and ("esp",-16); # align stack 1483 1484 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1485 &test ($len,15); 1486 &setnz (&LB($rounds_)); 1487 &shl ($rounds_,4); 1488 &sub ($len,$rounds_); 1489 1490 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1491 &mov (&DWP(16*6+4,"esp"),0); 1492 &mov (&DWP(16*6+8,"esp"),1); 1493 &mov (&DWP(16*6+12,"esp"),0); 1494 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1495 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1496 1497 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1498 &mov ($key_,$key); # backup $key 1499 &mov ($rounds_,$rounds); # backup $rounds 1500 1501 &movdqa ($tweak,$inout0); 1502 &pxor ($twtmp,$twtmp); 1503 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1504 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1505 1506 &and ($len,-16); 1507 &sub ($len,16*6); 1508 &jc (&label("xts_dec_short")); 1509 1510 &shl ($rounds,4); 1511 &mov ($rounds_,16); 1512 &sub ($rounds_,$rounds); 1513 &lea ($key,&DWP(32,$key,$rounds)); 1514 &jmp (&label("xts_dec_loop6")); 1515 1516&set_label("xts_dec_loop6",16); 1517 for ($i=0;$i<4;$i++) { 1518 &pshufd ($twres,$twtmp,0x13); 1519 &pxor ($twtmp,$twtmp); 1520 &movdqa (&QWP(16*$i,"esp"),$tweak); 1521 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1522 &pand ($twres,$twmask); # isolate carry and residue 1523 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1524 &pxor ($tweak,$twres); 1525 } 1526 &pshufd ($inout5,$twtmp,0x13); 1527 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1528 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1529 &$movekey ($rndkey0,&QWP(0,$key_)); 1530 &pand ($inout5,$twmask); # isolate carry and residue 1531 &movups ($inout0,&QWP(0,$inp)); # load input 1532 &pxor ($inout5,$tweak); 1533 1534 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1535 &mov ($rounds,$rounds_); 1536 &movdqu ($inout1,&QWP(16*1,$inp)); 1537 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1538 &movdqu ($inout2,&QWP(16*2,$inp)); 1539 &pxor ($inout1,$rndkey0); 1540 &movdqu ($inout3,&QWP(16*3,$inp)); 1541 &pxor ($inout2,$rndkey0); 1542 &movdqu ($inout4,&QWP(16*4,$inp)); 1543 &pxor ($inout3,$rndkey0); 1544 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1545 &pxor ($inout4,$rndkey0); 1546 &lea ($inp,&DWP(16*6,$inp)); 1547 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1548 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1549 &pxor ($inout5,$rndkey1); 1550 1551 &$movekey ($rndkey1,&QWP(16,$key_)); 1552 &pxor ($inout1,&QWP(16*1,"esp")); 1553 &pxor ($inout2,&QWP(16*2,"esp")); 1554 &aesdec ($inout0,$rndkey1); 1555 &pxor ($inout3,&QWP(16*3,"esp")); 1556 &pxor ($inout4,&QWP(16*4,"esp")); 1557 &aesdec ($inout1,$rndkey1); 1558 &pxor ($inout5,$rndkey0); 1559 &$movekey ($rndkey0,&QWP(32,$key_)); 1560 &aesdec ($inout2,$rndkey1); 1561 &aesdec ($inout3,$rndkey1); 1562 &aesdec ($inout4,$rndkey1); 1563 &aesdec ($inout5,$rndkey1); 1564 &call (&label("_aesni_decrypt6_enter")); 1565 1566 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1567 &pxor ($twtmp,$twtmp); 1568 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1569 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1570 &xorps ($inout1,&QWP(16*1,"esp")); 1571 &movups (&QWP(16*0,$out),$inout0); # write output 1572 &xorps ($inout2,&QWP(16*2,"esp")); 1573 &movups (&QWP(16*1,$out),$inout1); 1574 &xorps ($inout3,&QWP(16*3,"esp")); 1575 &movups (&QWP(16*2,$out),$inout2); 1576 &xorps ($inout4,&QWP(16*4,"esp")); 1577 &movups (&QWP(16*3,$out),$inout3); 1578 &xorps ($inout5,$tweak); 1579 &movups (&QWP(16*4,$out),$inout4); 1580 &pshufd ($twres,$twtmp,0x13); 1581 &movups (&QWP(16*5,$out),$inout5); 1582 &lea ($out,&DWP(16*6,$out)); 1583 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1584 1585 &pxor ($twtmp,$twtmp); 1586 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1587 &pand ($twres,$twmask); # isolate carry and residue 1588 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1589 &pxor ($tweak,$twres); 1590 1591 &sub ($len,16*6); 1592 &jnc (&label("xts_dec_loop6")); 1593 1594 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1595 &mov ($key,$key_); # restore $key 1596 &mov ($rounds_,$rounds); 1597 1598&set_label("xts_dec_short"); 1599 &add ($len,16*6); 1600 &jz (&label("xts_dec_done6x")); 1601 1602 &movdqa ($inout3,$tweak); # put aside previous tweak 1603 &cmp ($len,0x20); 1604 &jb (&label("xts_dec_one")); 1605 1606 &pshufd ($twres,$twtmp,0x13); 1607 &pxor ($twtmp,$twtmp); 1608 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1609 &pand ($twres,$twmask); # isolate carry and residue 1610 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1611 &pxor ($tweak,$twres); 1612 &je (&label("xts_dec_two")); 1613 1614 &pshufd ($twres,$twtmp,0x13); 1615 &pxor ($twtmp,$twtmp); 1616 &movdqa ($inout4,$tweak); # put aside previous tweak 1617 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1618 &pand ($twres,$twmask); # isolate carry and residue 1619 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1620 &pxor ($tweak,$twres); 1621 &cmp ($len,0x40); 1622 &jb (&label("xts_dec_three")); 1623 1624 &pshufd ($twres,$twtmp,0x13); 1625 &pxor ($twtmp,$twtmp); 1626 &movdqa ($inout5,$tweak); # put aside previous tweak 1627 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1628 &pand ($twres,$twmask); # isolate carry and residue 1629 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1630 &pxor ($tweak,$twres); 1631 &movdqa (&QWP(16*0,"esp"),$inout3); 1632 &movdqa (&QWP(16*1,"esp"),$inout4); 1633 &je (&label("xts_dec_four")); 1634 1635 &movdqa (&QWP(16*2,"esp"),$inout5); 1636 &pshufd ($inout5,$twtmp,0x13); 1637 &movdqa (&QWP(16*3,"esp"),$tweak); 1638 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1639 &pand ($inout5,$twmask); # isolate carry and residue 1640 &pxor ($inout5,$tweak); 1641 1642 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1643 &movdqu ($inout1,&QWP(16*1,$inp)); 1644 &movdqu ($inout2,&QWP(16*2,$inp)); 1645 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1646 &movdqu ($inout3,&QWP(16*3,$inp)); 1647 &pxor ($inout1,&QWP(16*1,"esp")); 1648 &movdqu ($inout4,&QWP(16*4,$inp)); 1649 &pxor ($inout2,&QWP(16*2,"esp")); 1650 &lea ($inp,&DWP(16*5,$inp)); 1651 &pxor ($inout3,&QWP(16*3,"esp")); 1652 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1653 &pxor ($inout4,$inout5); 1654 1655 &call ("_aesni_decrypt6"); 1656 1657 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1658 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1659 &xorps ($inout1,&QWP(16*1,"esp")); 1660 &xorps ($inout2,&QWP(16*2,"esp")); 1661 &movups (&QWP(16*0,$out),$inout0); # write output 1662 &xorps ($inout3,&QWP(16*3,"esp")); 1663 &movups (&QWP(16*1,$out),$inout1); 1664 &xorps ($inout4,$tweak); 1665 &movups (&QWP(16*2,$out),$inout2); 1666 &movups (&QWP(16*3,$out),$inout3); 1667 &movups (&QWP(16*4,$out),$inout4); 1668 &lea ($out,&DWP(16*5,$out)); 1669 &jmp (&label("xts_dec_done")); 1670 1671&set_label("xts_dec_one",16); 1672 &movups ($inout0,&QWP(16*0,$inp)); # load input 1673 &lea ($inp,&DWP(16*1,$inp)); 1674 &xorps ($inout0,$inout3); # input^=tweak 1675 if ($inline) 1676 { &aesni_inline_generate1("dec"); } 1677 else 1678 { &call ("_aesni_decrypt1"); } 1679 &xorps ($inout0,$inout3); # output^=tweak 1680 &movups (&QWP(16*0,$out),$inout0); # write output 1681 &lea ($out,&DWP(16*1,$out)); 1682 1683 &movdqa ($tweak,$inout3); # last tweak 1684 &jmp (&label("xts_dec_done")); 1685 1686&set_label("xts_dec_two",16); 1687 &movaps ($inout4,$tweak); # put aside last tweak 1688 1689 &movups ($inout0,&QWP(16*0,$inp)); # load input 1690 &movups ($inout1,&QWP(16*1,$inp)); 1691 &lea ($inp,&DWP(16*2,$inp)); 1692 &xorps ($inout0,$inout3); # input^=tweak 1693 &xorps ($inout1,$inout4); 1694 1695 &call ("_aesni_decrypt2"); 1696 1697 &xorps ($inout0,$inout3); # output^=tweak 1698 &xorps ($inout1,$inout4); 1699 &movups (&QWP(16*0,$out),$inout0); # write output 1700 &movups (&QWP(16*1,$out),$inout1); 1701 &lea ($out,&DWP(16*2,$out)); 1702 1703 &movdqa ($tweak,$inout4); # last tweak 1704 &jmp (&label("xts_dec_done")); 1705 1706&set_label("xts_dec_three",16); 1707 &movaps ($inout5,$tweak); # put aside last tweak 1708 &movups ($inout0,&QWP(16*0,$inp)); # load input 1709 &movups ($inout1,&QWP(16*1,$inp)); 1710 &movups ($inout2,&QWP(16*2,$inp)); 1711 &lea ($inp,&DWP(16*3,$inp)); 1712 &xorps ($inout0,$inout3); # input^=tweak 1713 &xorps ($inout1,$inout4); 1714 &xorps ($inout2,$inout5); 1715 1716 &call ("_aesni_decrypt3"); 1717 1718 &xorps ($inout0,$inout3); # output^=tweak 1719 &xorps ($inout1,$inout4); 1720 &xorps ($inout2,$inout5); 1721 &movups (&QWP(16*0,$out),$inout0); # write output 1722 &movups (&QWP(16*1,$out),$inout1); 1723 &movups (&QWP(16*2,$out),$inout2); 1724 &lea ($out,&DWP(16*3,$out)); 1725 1726 &movdqa ($tweak,$inout5); # last tweak 1727 &jmp (&label("xts_dec_done")); 1728 1729&set_label("xts_dec_four",16); 1730 &movaps ($inout4,$tweak); # put aside last tweak 1731 1732 &movups ($inout0,&QWP(16*0,$inp)); # load input 1733 &movups ($inout1,&QWP(16*1,$inp)); 1734 &movups ($inout2,&QWP(16*2,$inp)); 1735 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1736 &movups ($inout3,&QWP(16*3,$inp)); 1737 &lea ($inp,&DWP(16*4,$inp)); 1738 &xorps ($inout1,&QWP(16*1,"esp")); 1739 &xorps ($inout2,$inout5); 1740 &xorps ($inout3,$inout4); 1741 1742 &call ("_aesni_decrypt4"); 1743 1744 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1745 &xorps ($inout1,&QWP(16*1,"esp")); 1746 &xorps ($inout2,$inout5); 1747 &movups (&QWP(16*0,$out),$inout0); # write output 1748 &xorps ($inout3,$inout4); 1749 &movups (&QWP(16*1,$out),$inout1); 1750 &movups (&QWP(16*2,$out),$inout2); 1751 &movups (&QWP(16*3,$out),$inout3); 1752 &lea ($out,&DWP(16*4,$out)); 1753 1754 &movdqa ($tweak,$inout4); # last tweak 1755 &jmp (&label("xts_dec_done")); 1756 1757&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1758 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1759 &and ($len,15); 1760 &jz (&label("xts_dec_ret")); 1761 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1762 &jmp (&label("xts_dec_only_one_more")); 1763 1764&set_label("xts_dec_done",16); 1765 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1766 &pxor ($twtmp,$twtmp); 1767 &and ($len,15); 1768 &jz (&label("xts_dec_ret")); 1769 1770 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1771 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1772 &pshufd ($twres,$twtmp,0x13); 1773 &pxor ($twtmp,$twtmp); 1774 &movdqa ($twmask,&QWP(16*6,"esp")); 1775 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1776 &pand ($twres,$twmask); # isolate carry and residue 1777 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1778 &pxor ($tweak,$twres); 1779 1780&set_label("xts_dec_only_one_more"); 1781 &pshufd ($inout3,$twtmp,0x13); 1782 &movdqa ($inout4,$tweak); # put aside previous tweak 1783 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1784 &pand ($inout3,$twmask); # isolate carry and residue 1785 &pxor ($inout3,$tweak); 1786 1787 &mov ($key,$key_); # restore $key 1788 &mov ($rounds,$rounds_); # restore $rounds 1789 1790 &movups ($inout0,&QWP(0,$inp)); # load input 1791 &xorps ($inout0,$inout3); # input^=tweak 1792 if ($inline) 1793 { &aesni_inline_generate1("dec"); } 1794 else 1795 { &call ("_aesni_decrypt1"); } 1796 &xorps ($inout0,$inout3); # output^=tweak 1797 &movups (&QWP(0,$out),$inout0); # write output 1798 1799&set_label("xts_dec_steal"); 1800 &movz ($rounds,&BP(16,$inp)); 1801 &movz ($key,&BP(0,$out)); 1802 &lea ($inp,&DWP(1,$inp)); 1803 &mov (&BP(0,$out),&LB($rounds)); 1804 &mov (&BP(16,$out),&LB($key)); 1805 &lea ($out,&DWP(1,$out)); 1806 &sub ($len,1); 1807 &jnz (&label("xts_dec_steal")); 1808 1809 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1810 &mov ($key,$key_); # restore $key 1811 &mov ($rounds,$rounds_); # restore $rounds 1812 1813 &movups ($inout0,&QWP(0,$out)); # load input 1814 &xorps ($inout0,$inout4); # input^=tweak 1815 if ($inline) 1816 { &aesni_inline_generate1("dec"); } 1817 else 1818 { &call ("_aesni_decrypt1"); } 1819 &xorps ($inout0,$inout4); # output^=tweak 1820 &movups (&QWP(0,$out),$inout0); # write output 1821 1822&set_label("xts_dec_ret"); 1823 &pxor ("xmm0","xmm0"); # clear register bank 1824 &pxor ("xmm1","xmm1"); 1825 &pxor ("xmm2","xmm2"); 1826 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1827 &pxor ("xmm3","xmm3"); 1828 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1829 &pxor ("xmm4","xmm4"); 1830 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1831 &pxor ("xmm5","xmm5"); 1832 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1833 &pxor ("xmm6","xmm6"); 1834 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1835 &pxor ("xmm7","xmm7"); 1836 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1837 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1838&function_end("aesni_xts_decrypt"); 1839} 1840} 1841 1842###################################################################### 1843# void $PREFIX_cbc_encrypt (const void *inp, void *out, 1844# size_t length, const AES_KEY *key, 1845# unsigned char *ivp,const int enc); 1846&function_begin("${PREFIX}_cbc_encrypt"); 1847 &mov ($inp,&wparam(0)); 1848 &mov ($rounds_,"esp"); 1849 &mov ($out,&wparam(1)); 1850 &sub ($rounds_,24); 1851 &mov ($len,&wparam(2)); 1852 &and ($rounds_,-16); 1853 &mov ($key,&wparam(3)); 1854 &mov ($key_,&wparam(4)); 1855 &test ($len,$len); 1856 &jz (&label("cbc_abort")); 1857 1858 &cmp (&wparam(5),0); 1859 &xchg ($rounds_,"esp"); # alloca 1860 &movups ($ivec,&QWP(0,$key_)); # load IV 1861 &mov ($rounds,&DWP(240,$key)); 1862 &mov ($key_,$key); # backup $key 1863 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1864 &mov ($rounds_,$rounds); # backup $rounds 1865 &je (&label("cbc_decrypt")); 1866 1867 &movaps ($inout0,$ivec); 1868 &cmp ($len,16); 1869 &jb (&label("cbc_enc_tail")); 1870 &sub ($len,16); 1871 &jmp (&label("cbc_enc_loop")); 1872 1873&set_label("cbc_enc_loop",16); 1874 &movups ($ivec,&QWP(0,$inp)); # input actually 1875 &lea ($inp,&DWP(16,$inp)); 1876 if ($inline) 1877 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1878 else 1879 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1880 &mov ($rounds,$rounds_); # restore $rounds 1881 &mov ($key,$key_); # restore $key 1882 &movups (&QWP(0,$out),$inout0); # store output 1883 &lea ($out,&DWP(16,$out)); 1884 &sub ($len,16); 1885 &jnc (&label("cbc_enc_loop")); 1886 &add ($len,16); 1887 &jnz (&label("cbc_enc_tail")); 1888 &movaps ($ivec,$inout0); 1889 &pxor ($inout0,$inout0); 1890 &jmp (&label("cbc_ret")); 1891 1892&set_label("cbc_enc_tail"); 1893 &mov ("ecx",$len); # zaps $rounds 1894 &data_word(0xA4F3F689); # rep movsb 1895 &mov ("ecx",16); # zero tail 1896 &sub ("ecx",$len); 1897 &xor ("eax","eax"); # zaps $len 1898 &data_word(0xAAF3F689); # rep stosb 1899 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1900 &mov ($rounds,$rounds_); # restore $rounds 1901 &mov ($inp,$out); # $inp and $out are the same 1902 &mov ($key,$key_); # restore $key 1903 &jmp (&label("cbc_enc_loop")); 1904###################################################################### 1905&set_label("cbc_decrypt",16); 1906 &cmp ($len,0x50); 1907 &jbe (&label("cbc_dec_tail")); 1908 &movaps (&QWP(0,"esp"),$ivec); # save IV 1909 &sub ($len,0x50); 1910 &jmp (&label("cbc_dec_loop6_enter")); 1911 1912&set_label("cbc_dec_loop6",16); 1913 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1914 &movups (&QWP(0,$out),$inout5); 1915 &lea ($out,&DWP(0x10,$out)); 1916&set_label("cbc_dec_loop6_enter"); 1917 &movdqu ($inout0,&QWP(0,$inp)); 1918 &movdqu ($inout1,&QWP(0x10,$inp)); 1919 &movdqu ($inout2,&QWP(0x20,$inp)); 1920 &movdqu ($inout3,&QWP(0x30,$inp)); 1921 &movdqu ($inout4,&QWP(0x40,$inp)); 1922 &movdqu ($inout5,&QWP(0x50,$inp)); 1923 1924 &call ("_aesni_decrypt6"); 1925 1926 &movups ($rndkey1,&QWP(0,$inp)); 1927 &movups ($rndkey0,&QWP(0x10,$inp)); 1928 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1929 &xorps ($inout1,$rndkey1); 1930 &movups ($rndkey1,&QWP(0x20,$inp)); 1931 &xorps ($inout2,$rndkey0); 1932 &movups ($rndkey0,&QWP(0x30,$inp)); 1933 &xorps ($inout3,$rndkey1); 1934 &movups ($rndkey1,&QWP(0x40,$inp)); 1935 &xorps ($inout4,$rndkey0); 1936 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1937 &xorps ($inout5,$rndkey1); 1938 &movups (&QWP(0,$out),$inout0); 1939 &movups (&QWP(0x10,$out),$inout1); 1940 &lea ($inp,&DWP(0x60,$inp)); 1941 &movups (&QWP(0x20,$out),$inout2); 1942 &mov ($rounds,$rounds_); # restore $rounds 1943 &movups (&QWP(0x30,$out),$inout3); 1944 &mov ($key,$key_); # restore $key 1945 &movups (&QWP(0x40,$out),$inout4); 1946 &lea ($out,&DWP(0x50,$out)); 1947 &sub ($len,0x60); 1948 &ja (&label("cbc_dec_loop6")); 1949 1950 &movaps ($inout0,$inout5); 1951 &movaps ($ivec,$rndkey0); 1952 &add ($len,0x50); 1953 &jle (&label("cbc_dec_clear_tail_collected")); 1954 &movups (&QWP(0,$out),$inout0); 1955 &lea ($out,&DWP(0x10,$out)); 1956&set_label("cbc_dec_tail"); 1957 &movups ($inout0,&QWP(0,$inp)); 1958 &movaps ($in0,$inout0); 1959 &cmp ($len,0x10); 1960 &jbe (&label("cbc_dec_one")); 1961 1962 &movups ($inout1,&QWP(0x10,$inp)); 1963 &movaps ($in1,$inout1); 1964 &cmp ($len,0x20); 1965 &jbe (&label("cbc_dec_two")); 1966 1967 &movups ($inout2,&QWP(0x20,$inp)); 1968 &cmp ($len,0x30); 1969 &jbe (&label("cbc_dec_three")); 1970 1971 &movups ($inout3,&QWP(0x30,$inp)); 1972 &cmp ($len,0x40); 1973 &jbe (&label("cbc_dec_four")); 1974 1975 &movups ($inout4,&QWP(0x40,$inp)); 1976 &movaps (&QWP(0,"esp"),$ivec); # save IV 1977 &movups ($inout0,&QWP(0,$inp)); 1978 &xorps ($inout5,$inout5); 1979 &call ("_aesni_decrypt6"); 1980 &movups ($rndkey1,&QWP(0,$inp)); 1981 &movups ($rndkey0,&QWP(0x10,$inp)); 1982 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 1983 &xorps ($inout1,$rndkey1); 1984 &movups ($rndkey1,&QWP(0x20,$inp)); 1985 &xorps ($inout2,$rndkey0); 1986 &movups ($rndkey0,&QWP(0x30,$inp)); 1987 &xorps ($inout3,$rndkey1); 1988 &movups ($ivec,&QWP(0x40,$inp)); # IV 1989 &xorps ($inout4,$rndkey0); 1990 &movups (&QWP(0,$out),$inout0); 1991 &movups (&QWP(0x10,$out),$inout1); 1992 &pxor ($inout1,$inout1); 1993 &movups (&QWP(0x20,$out),$inout2); 1994 &pxor ($inout2,$inout2); 1995 &movups (&QWP(0x30,$out),$inout3); 1996 &pxor ($inout3,$inout3); 1997 &lea ($out,&DWP(0x40,$out)); 1998 &movaps ($inout0,$inout4); 1999 &pxor ($inout4,$inout4); 2000 &sub ($len,0x50); 2001 &jmp (&label("cbc_dec_tail_collected")); 2002 2003&set_label("cbc_dec_one",16); 2004 if ($inline) 2005 { &aesni_inline_generate1("dec"); } 2006 else 2007 { &call ("_aesni_decrypt1"); } 2008 &xorps ($inout0,$ivec); 2009 &movaps ($ivec,$in0); 2010 &sub ($len,0x10); 2011 &jmp (&label("cbc_dec_tail_collected")); 2012 2013&set_label("cbc_dec_two",16); 2014 &call ("_aesni_decrypt2"); 2015 &xorps ($inout0,$ivec); 2016 &xorps ($inout1,$in0); 2017 &movups (&QWP(0,$out),$inout0); 2018 &movaps ($inout0,$inout1); 2019 &pxor ($inout1,$inout1); 2020 &lea ($out,&DWP(0x10,$out)); 2021 &movaps ($ivec,$in1); 2022 &sub ($len,0x20); 2023 &jmp (&label("cbc_dec_tail_collected")); 2024 2025&set_label("cbc_dec_three",16); 2026 &call ("_aesni_decrypt3"); 2027 &xorps ($inout0,$ivec); 2028 &xorps ($inout1,$in0); 2029 &xorps ($inout2,$in1); 2030 &movups (&QWP(0,$out),$inout0); 2031 &movaps ($inout0,$inout2); 2032 &pxor ($inout2,$inout2); 2033 &movups (&QWP(0x10,$out),$inout1); 2034 &pxor ($inout1,$inout1); 2035 &lea ($out,&DWP(0x20,$out)); 2036 &movups ($ivec,&QWP(0x20,$inp)); 2037 &sub ($len,0x30); 2038 &jmp (&label("cbc_dec_tail_collected")); 2039 2040&set_label("cbc_dec_four",16); 2041 &call ("_aesni_decrypt4"); 2042 &movups ($rndkey1,&QWP(0x10,$inp)); 2043 &movups ($rndkey0,&QWP(0x20,$inp)); 2044 &xorps ($inout0,$ivec); 2045 &movups ($ivec,&QWP(0x30,$inp)); 2046 &xorps ($inout1,$in0); 2047 &movups (&QWP(0,$out),$inout0); 2048 &xorps ($inout2,$rndkey1); 2049 &movups (&QWP(0x10,$out),$inout1); 2050 &pxor ($inout1,$inout1); 2051 &xorps ($inout3,$rndkey0); 2052 &movups (&QWP(0x20,$out),$inout2); 2053 &pxor ($inout2,$inout2); 2054 &lea ($out,&DWP(0x30,$out)); 2055 &movaps ($inout0,$inout3); 2056 &pxor ($inout3,$inout3); 2057 &sub ($len,0x40); 2058 &jmp (&label("cbc_dec_tail_collected")); 2059 2060&set_label("cbc_dec_clear_tail_collected",16); 2061 &pxor ($inout1,$inout1); 2062 &pxor ($inout2,$inout2); 2063 &pxor ($inout3,$inout3); 2064 &pxor ($inout4,$inout4); 2065&set_label("cbc_dec_tail_collected"); 2066 &and ($len,15); 2067 &jnz (&label("cbc_dec_tail_partial")); 2068 &movups (&QWP(0,$out),$inout0); 2069 &pxor ($rndkey0,$rndkey0); 2070 &jmp (&label("cbc_ret")); 2071 2072&set_label("cbc_dec_tail_partial",16); 2073 &movaps (&QWP(0,"esp"),$inout0); 2074 &pxor ($rndkey0,$rndkey0); 2075 &mov ("ecx",16); 2076 &mov ($inp,"esp"); 2077 &sub ("ecx",$len); 2078 &data_word(0xA4F3F689); # rep movsb 2079 &movdqa (&QWP(0,"esp"),$inout0); 2080 2081&set_label("cbc_ret"); 2082 &mov ("esp",&DWP(16,"esp")); # pull original %esp 2083 &mov ($key_,&wparam(4)); 2084 &pxor ($inout0,$inout0); 2085 &pxor ($rndkey1,$rndkey1); 2086 &movups (&QWP(0,$key_),$ivec); # output IV 2087 &pxor ($ivec,$ivec); 2088&set_label("cbc_abort"); 2089&function_end("${PREFIX}_cbc_encrypt"); 2090 2091###################################################################### 2092# Mechanical port from aesni-x86_64.pl. 2093# 2094# _aesni_set_encrypt_key is private interface, 2095# input: 2096# "eax" const unsigned char *userKey 2097# $rounds int bits 2098# $key AES_KEY *key 2099# output: 2100# "eax" return code 2101# $round rounds 2102 2103&function_begin_B("_aesni_set_encrypt_key"); 2104 &push ("ebp"); 2105 &push ("ebx"); 2106 &test ("eax","eax"); 2107 &jz (&label("bad_pointer")); 2108 &test ($key,$key); 2109 &jz (&label("bad_pointer")); 2110 2111 &call (&label("pic")); 2112&set_label("pic"); 2113 &blindpop("ebx"); 2114 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); 2115 2116 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); 2117 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 2118 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 2119 &mov ("ebp",&DWP(4,"ebp")); 2120 &lea ($key,&DWP(16,$key)); 2121 &and ("ebp",1<<28|1<<11); # AVX and XOP bits 2122 &cmp ($rounds,256); 2123 &je (&label("14rounds")); 2124 &cmp ($rounds,192); 2125 &je (&label("12rounds")); 2126 &cmp ($rounds,128); 2127 &jne (&label("bad_keybits")); 2128 2129&set_label("10rounds",16); 2130 &cmp ("ebp",1<<28); 2131 &je (&label("10rounds_alt")); 2132 2133 &mov ($rounds,9); 2134 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2135 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 2136 &call (&label("key_128_cold")); 2137 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 2138 &call (&label("key_128")); 2139 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 2140 &call (&label("key_128")); 2141 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 2142 &call (&label("key_128")); 2143 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 2144 &call (&label("key_128")); 2145 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 2146 &call (&label("key_128")); 2147 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 2148 &call (&label("key_128")); 2149 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 2150 &call (&label("key_128")); 2151 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 2152 &call (&label("key_128")); 2153 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 2154 &call (&label("key_128")); 2155 &$movekey (&QWP(0,$key),"xmm0"); 2156 &mov (&DWP(80,$key),$rounds); 2157 2158 &jmp (&label("good_key")); 2159 2160&set_label("key_128",16); 2161 &$movekey (&QWP(0,$key),"xmm0"); 2162 &lea ($key,&DWP(16,$key)); 2163&set_label("key_128_cold"); 2164 &shufps ("xmm4","xmm0",0b00010000); 2165 &xorps ("xmm0","xmm4"); 2166 &shufps ("xmm4","xmm0",0b10001100); 2167 &xorps ("xmm0","xmm4"); 2168 &shufps ("xmm1","xmm1",0b11111111); # critical path 2169 &xorps ("xmm0","xmm1"); 2170 &ret(); 2171 2172&set_label("10rounds_alt",16); 2173 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2174 &mov ($rounds,8); 2175 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2176 &movdqa ("xmm2","xmm0"); 2177 &movdqu (&QWP(-16,$key),"xmm0"); 2178 2179&set_label("loop_key128"); 2180 &pshufb ("xmm0","xmm5"); 2181 &aesenclast ("xmm0","xmm4"); 2182 &pslld ("xmm4",1); 2183 &lea ($key,&DWP(16,$key)); 2184 2185 &movdqa ("xmm3","xmm2"); 2186 &pslldq ("xmm2",4); 2187 &pxor ("xmm3","xmm2"); 2188 &pslldq ("xmm2",4); 2189 &pxor ("xmm3","xmm2"); 2190 &pslldq ("xmm2",4); 2191 &pxor ("xmm2","xmm3"); 2192 2193 &pxor ("xmm0","xmm2"); 2194 &movdqu (&QWP(-16,$key),"xmm0"); 2195 &movdqa ("xmm2","xmm0"); 2196 2197 &dec ($rounds); 2198 &jnz (&label("loop_key128")); 2199 2200 &movdqa ("xmm4",&QWP(0x30,"ebx")); 2201 2202 &pshufb ("xmm0","xmm5"); 2203 &aesenclast ("xmm0","xmm4"); 2204 &pslld ("xmm4",1); 2205 2206 &movdqa ("xmm3","xmm2"); 2207 &pslldq ("xmm2",4); 2208 &pxor ("xmm3","xmm2"); 2209 &pslldq ("xmm2",4); 2210 &pxor ("xmm3","xmm2"); 2211 &pslldq ("xmm2",4); 2212 &pxor ("xmm2","xmm3"); 2213 2214 &pxor ("xmm0","xmm2"); 2215 &movdqu (&QWP(0,$key),"xmm0"); 2216 2217 &movdqa ("xmm2","xmm0"); 2218 &pshufb ("xmm0","xmm5"); 2219 &aesenclast ("xmm0","xmm4"); 2220 2221 &movdqa ("xmm3","xmm2"); 2222 &pslldq ("xmm2",4); 2223 &pxor ("xmm3","xmm2"); 2224 &pslldq ("xmm2",4); 2225 &pxor ("xmm3","xmm2"); 2226 &pslldq ("xmm2",4); 2227 &pxor ("xmm2","xmm3"); 2228 2229 &pxor ("xmm0","xmm2"); 2230 &movdqu (&QWP(16,$key),"xmm0"); 2231 2232 &mov ($rounds,9); 2233 &mov (&DWP(96,$key),$rounds); 2234 2235 &jmp (&label("good_key")); 2236 2237&set_label("12rounds",16); 2238 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 2239 &cmp ("ebp",1<<28); 2240 &je (&label("12rounds_alt")); 2241 2242 &mov ($rounds,11); 2243 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2244 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 2245 &call (&label("key_192a_cold")); 2246 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 2247 &call (&label("key_192b")); 2248 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 2249 &call (&label("key_192a")); 2250 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 2251 &call (&label("key_192b")); 2252 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 2253 &call (&label("key_192a")); 2254 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 2255 &call (&label("key_192b")); 2256 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 2257 &call (&label("key_192a")); 2258 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 2259 &call (&label("key_192b")); 2260 &$movekey (&QWP(0,$key),"xmm0"); 2261 &mov (&DWP(48,$key),$rounds); 2262 2263 &jmp (&label("good_key")); 2264 2265&set_label("key_192a",16); 2266 &$movekey (&QWP(0,$key),"xmm0"); 2267 &lea ($key,&DWP(16,$key)); 2268&set_label("key_192a_cold",16); 2269 &movaps ("xmm5","xmm2"); 2270&set_label("key_192b_warm"); 2271 &shufps ("xmm4","xmm0",0b00010000); 2272 &movdqa ("xmm3","xmm2"); 2273 &xorps ("xmm0","xmm4"); 2274 &shufps ("xmm4","xmm0",0b10001100); 2275 &pslldq ("xmm3",4); 2276 &xorps ("xmm0","xmm4"); 2277 &pshufd ("xmm1","xmm1",0b01010101); # critical path 2278 &pxor ("xmm2","xmm3"); 2279 &pxor ("xmm0","xmm1"); 2280 &pshufd ("xmm3","xmm0",0b11111111); 2281 &pxor ("xmm2","xmm3"); 2282 &ret(); 2283 2284&set_label("key_192b",16); 2285 &movaps ("xmm3","xmm0"); 2286 &shufps ("xmm5","xmm0",0b01000100); 2287 &$movekey (&QWP(0,$key),"xmm5"); 2288 &shufps ("xmm3","xmm2",0b01001110); 2289 &$movekey (&QWP(16,$key),"xmm3"); 2290 &lea ($key,&DWP(32,$key)); 2291 &jmp (&label("key_192b_warm")); 2292 2293&set_label("12rounds_alt",16); 2294 &movdqa ("xmm5",&QWP(0x10,"ebx")); 2295 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2296 &mov ($rounds,8); 2297 &movdqu (&QWP(-16,$key),"xmm0"); 2298 2299&set_label("loop_key192"); 2300 &movq (&QWP(0,$key),"xmm2"); 2301 &movdqa ("xmm1","xmm2"); 2302 &pshufb ("xmm2","xmm5"); 2303 &aesenclast ("xmm2","xmm4"); 2304 &pslld ("xmm4",1); 2305 &lea ($key,&DWP(24,$key)); 2306 2307 &movdqa ("xmm3","xmm0"); 2308 &pslldq ("xmm0",4); 2309 &pxor ("xmm3","xmm0"); 2310 &pslldq ("xmm0",4); 2311 &pxor ("xmm3","xmm0"); 2312 &pslldq ("xmm0",4); 2313 &pxor ("xmm0","xmm3"); 2314 2315 &pshufd ("xmm3","xmm0",0xff); 2316 &pxor ("xmm3","xmm1"); 2317 &pslldq ("xmm1",4); 2318 &pxor ("xmm3","xmm1"); 2319 2320 &pxor ("xmm0","xmm2"); 2321 &pxor ("xmm2","xmm3"); 2322 &movdqu (&QWP(-16,$key),"xmm0"); 2323 2324 &dec ($rounds); 2325 &jnz (&label("loop_key192")); 2326 2327 &mov ($rounds,11); 2328 &mov (&DWP(32,$key),$rounds); 2329 2330 &jmp (&label("good_key")); 2331 2332&set_label("14rounds",16); 2333 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 2334 &lea ($key,&DWP(16,$key)); 2335 &cmp ("ebp",1<<28); 2336 &je (&label("14rounds_alt")); 2337 2338 &mov ($rounds,13); 2339 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 2340 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 2341 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 2342 &call (&label("key_256a_cold")); 2343 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 2344 &call (&label("key_256b")); 2345 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 2346 &call (&label("key_256a")); 2347 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 2348 &call (&label("key_256b")); 2349 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 2350 &call (&label("key_256a")); 2351 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 2352 &call (&label("key_256b")); 2353 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 2354 &call (&label("key_256a")); 2355 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 2356 &call (&label("key_256b")); 2357 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 2358 &call (&label("key_256a")); 2359 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 2360 &call (&label("key_256b")); 2361 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 2362 &call (&label("key_256a")); 2363 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 2364 &call (&label("key_256b")); 2365 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 2366 &call (&label("key_256a")); 2367 &$movekey (&QWP(0,$key),"xmm0"); 2368 &mov (&DWP(16,$key),$rounds); 2369 &xor ("eax","eax"); 2370 2371 &jmp (&label("good_key")); 2372 2373&set_label("key_256a",16); 2374 &$movekey (&QWP(0,$key),"xmm2"); 2375 &lea ($key,&DWP(16,$key)); 2376&set_label("key_256a_cold"); 2377 &shufps ("xmm4","xmm0",0b00010000); 2378 &xorps ("xmm0","xmm4"); 2379 &shufps ("xmm4","xmm0",0b10001100); 2380 &xorps ("xmm0","xmm4"); 2381 &shufps ("xmm1","xmm1",0b11111111); # critical path 2382 &xorps ("xmm0","xmm1"); 2383 &ret(); 2384 2385&set_label("key_256b",16); 2386 &$movekey (&QWP(0,$key),"xmm0"); 2387 &lea ($key,&DWP(16,$key)); 2388 2389 &shufps ("xmm4","xmm2",0b00010000); 2390 &xorps ("xmm2","xmm4"); 2391 &shufps ("xmm4","xmm2",0b10001100); 2392 &xorps ("xmm2","xmm4"); 2393 &shufps ("xmm1","xmm1",0b10101010); # critical path 2394 &xorps ("xmm2","xmm1"); 2395 &ret(); 2396 2397&set_label("14rounds_alt",16); 2398 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2399 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2400 &mov ($rounds,7); 2401 &movdqu (&QWP(-32,$key),"xmm0"); 2402 &movdqa ("xmm1","xmm2"); 2403 &movdqu (&QWP(-16,$key),"xmm2"); 2404 2405&set_label("loop_key256"); 2406 &pshufb ("xmm2","xmm5"); 2407 &aesenclast ("xmm2","xmm4"); 2408 2409 &movdqa ("xmm3","xmm0"); 2410 &pslldq ("xmm0",4); 2411 &pxor ("xmm3","xmm0"); 2412 &pslldq ("xmm0",4); 2413 &pxor ("xmm3","xmm0"); 2414 &pslldq ("xmm0",4); 2415 &pxor ("xmm0","xmm3"); 2416 &pslld ("xmm4",1); 2417 2418 &pxor ("xmm0","xmm2"); 2419 &movdqu (&QWP(0,$key),"xmm0"); 2420 2421 &dec ($rounds); 2422 &jz (&label("done_key256")); 2423 2424 &pshufd ("xmm2","xmm0",0xff); 2425 &pxor ("xmm3","xmm3"); 2426 &aesenclast ("xmm2","xmm3"); 2427 2428 &movdqa ("xmm3","xmm1") 2429 &pslldq ("xmm1",4); 2430 &pxor ("xmm3","xmm1"); 2431 &pslldq ("xmm1",4); 2432 &pxor ("xmm3","xmm1"); 2433 &pslldq ("xmm1",4); 2434 &pxor ("xmm1","xmm3"); 2435 2436 &pxor ("xmm2","xmm1"); 2437 &movdqu (&QWP(16,$key),"xmm2"); 2438 &lea ($key,&DWP(32,$key)); 2439 &movdqa ("xmm1","xmm2"); 2440 &jmp (&label("loop_key256")); 2441 2442&set_label("done_key256"); 2443 &mov ($rounds,13); 2444 &mov (&DWP(16,$key),$rounds); 2445 2446&set_label("good_key"); 2447 &pxor ("xmm0","xmm0"); 2448 &pxor ("xmm1","xmm1"); 2449 &pxor ("xmm2","xmm2"); 2450 &pxor ("xmm3","xmm3"); 2451 &pxor ("xmm4","xmm4"); 2452 &pxor ("xmm5","xmm5"); 2453 &xor ("eax","eax"); 2454 &pop ("ebx"); 2455 &pop ("ebp"); 2456 &ret (); 2457 2458&set_label("bad_pointer",4); 2459 &mov ("eax",-1); 2460 &pop ("ebx"); 2461 &pop ("ebp"); 2462 &ret (); 2463&set_label("bad_keybits",4); 2464 &pxor ("xmm0","xmm0"); 2465 &mov ("eax",-2); 2466 &pop ("ebx"); 2467 &pop ("ebp"); 2468 &ret (); 2469&function_end_B("_aesni_set_encrypt_key"); 2470 2471# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 2472# AES_KEY *key) 2473&function_begin_B("${PREFIX}_set_encrypt_key"); 2474 &mov ("eax",&wparam(0)); 2475 &mov ($rounds,&wparam(1)); 2476 &mov ($key,&wparam(2)); 2477 &call ("_aesni_set_encrypt_key"); 2478 &ret (); 2479&function_end_B("${PREFIX}_set_encrypt_key"); 2480 2481# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 2482# AES_KEY *key) 2483&function_begin_B("${PREFIX}_set_decrypt_key"); 2484 &mov ("eax",&wparam(0)); 2485 &mov ($rounds,&wparam(1)); 2486 &mov ($key,&wparam(2)); 2487 &call ("_aesni_set_encrypt_key"); 2488 &mov ($key,&wparam(2)); 2489 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key 2490 &test ("eax","eax"); 2491 &jnz (&label("dec_key_ret")); 2492 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 2493 2494 &$movekey ("xmm0",&QWP(0,$key)); # just swap 2495 &$movekey ("xmm1",&QWP(0,"eax")); 2496 &$movekey (&QWP(0,"eax"),"xmm0"); 2497 &$movekey (&QWP(0,$key),"xmm1"); 2498 &lea ($key,&DWP(16,$key)); 2499 &lea ("eax",&DWP(-16,"eax")); 2500 2501&set_label("dec_key_inverse"); 2502 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 2503 &$movekey ("xmm1",&QWP(0,"eax")); 2504 &aesimc ("xmm0","xmm0"); 2505 &aesimc ("xmm1","xmm1"); 2506 &lea ($key,&DWP(16,$key)); 2507 &lea ("eax",&DWP(-16,"eax")); 2508 &$movekey (&QWP(16,"eax"),"xmm0"); 2509 &$movekey (&QWP(-16,$key),"xmm1"); 2510 &cmp ("eax",$key); 2511 &ja (&label("dec_key_inverse")); 2512 2513 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 2514 &aesimc ("xmm0","xmm0"); 2515 &$movekey (&QWP(0,$key),"xmm0"); 2516 2517 &pxor ("xmm0","xmm0"); 2518 &pxor ("xmm1","xmm1"); 2519 &xor ("eax","eax"); # return success 2520&set_label("dec_key_ret"); 2521 &ret (); 2522&function_end_B("${PREFIX}_set_decrypt_key"); 2523 2524&set_label("key_const",64); 2525&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); 2526&data_word(0x04070605,0x04070605,0x04070605,0x04070605); 2527&data_word(1,1,1,1); 2528&data_word(0x1b,0x1b,0x1b,0x1b); 2529&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 2530 2531&asm_finish(); 2532 2533close STDOUT; 2534