1#!/usr/bin/env perl 2 3################################################################### 4### AES-128 [originally in CTR mode] ### 5### bitsliced implementation for Intel Core 2 processors ### 6### requires support of SSE extensions up to SSSE3 ### 7### Author: Emilia Käsper and Peter Schwabe ### 8### Date: 2009-03-19 ### 9### Public domain ### 10### ### 11### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12### further information. ### 13################################################################### 14# 15# September 2011. 16# 17# Started as transliteration to "perlasm" the original code has 18# undergone following changes: 19# 20# - code was made position-independent; 21# - rounds were folded into a loop resulting in >5x size reduction 22# from 12.5KB to 2.2KB; 23# - above was possible thanks to mixcolumns() modification that 24# allowed to feed its output back to aesenc[last], this was 25# achieved at cost of two additional inter-registers moves; 26# - some instruction reordering and interleaving; 27# - this module doesn't implement key setup subroutine, instead it 28# relies on conversion of "conventional" key schedule as returned 29# by AES_set_encrypt_key (see discussion below); 30# - first and last round keys are treated differently, which allowed 31# to skip one shiftrows(), reduce bit-sliced key schedule and 32# speed-up conversion by 22%; 33# - support for 192- and 256-bit keys was added; 34# 35# Resulting performance in CPU cycles spent to encrypt one byte out 36# of 4096-byte buffer with 128-bit key is: 37# 38# Emilia's this(*) difference 39# 40# Core 2 9.30 8.69 +7% 41# Nehalem(**) 7.63 6.88 +11% 42# Atom 17.1 16.4 +4% 43# Silvermont - 12.9 44# 45# (*) Comparison is not completely fair, because "this" is ECB, 46# i.e. no extra processing such as counter values calculation 47# and xor-ing input as in Emilia's CTR implementation is 48# performed. 
However, the CTR calculations stand for not more 49# than 1% of total time, so comparison is *rather* fair. 50# 51# (**) Results were collected on Westmere, which is considered to 52# be equivalent to Nehalem for this code. 53# 54# As for key schedule conversion subroutine. Interface to OpenSSL 55# relies on per-invocation on-the-fly conversion. This naturally 56# has impact on performance, especially for short inputs. Conversion 57# time in CPU cycles and its ratio to CPU cycles spent in 8x block 58# function is: 59# 60# conversion conversion/8x block 61# Core 2 240 0.22 62# Nehalem 180 0.20 63# Atom 430 0.20 64# 65# The ratio values mean that 128-byte blocks will be processed 66# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 67# etc. Then keep in mind that input sizes not divisible by 128 are 68# *effectively* slower, especially shortest ones, e.g. consecutive 69# 144-byte blocks are processed 44% slower than one would expect, 70# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 71# it's still faster than ["hyper-threading-safe" code path in] 72# aes-x86_64.pl on all lengths above 64 bytes... 73# 74# October 2011. 75# 76# Add decryption procedure. Performance in CPU cycles spent to decrypt 77# one byte out of 4096-byte buffer with 128-bit key is: 78# 79# Core 2 9.98 80# Nehalem 7.80 81# Atom 17.9 82# Silvermont 14.0 83# 84# November 2011. 85# 86# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 87# suboptimal, but XTS is meant to be used with larger blocks... 
88# 89# <appro@openssl.org> 90 91$flavour = shift; 92$output = shift; 93if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 94 95$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 96 97$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 98( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 99( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 100die "can't locate x86_64-xlate.pl"; 101 102open OUT,"| \"$^X\" $xlate $flavour $output"; 103*STDOUT=*OUT; 104 105my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 106my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 107my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 108 109{ 110my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 111 112sub Sbox { 113# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 114# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 115my @b=@_[0..7]; 116my @t=@_[8..11]; 117my @s=@_[12..15]; 118 &InBasisChange (@b); 119 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 120 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 121} 122 123sub InBasisChange { 124# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 125# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 126my @b=@_[0..7]; 127$code.=<<___; 128 pxor @b[6], @b[5] 129 pxor @b[1], @b[2] 130 pxor @b[0], @b[3] 131 pxor @b[2], @b[6] 132 pxor @b[0], @b[5] 133 134 pxor @b[3], @b[6] 135 pxor @b[7], @b[3] 136 pxor @b[5], @b[7] 137 pxor @b[4], @b[3] 138 pxor @b[5], @b[4] 139 pxor @b[1], @b[3] 140 141 pxor @b[7], @b[2] 142 pxor @b[5], @b[1] 143___ 144} 145 146sub OutBasisChange { 147# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 148# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 149my @b=@_[0..7]; 150$code.=<<___; 151 pxor @b[6], @b[0] 152 pxor @b[4], @b[1] 153 pxor @b[0], @b[2] 154 pxor @b[6], @b[4] 155 pxor @b[1], @b[6] 156 157 pxor @b[5], @b[1] 158 pxor @b[3], @b[5] 159 pxor @b[7], @b[3] 160 pxor @b[5], @b[7] 161 pxor @b[5], @b[2] 162 163 pxor @b[7], 
@b[4] 164___ 165} 166 167sub InvSbox { 168# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 169# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 170my @b=@_[0..7]; 171my @t=@_[8..11]; 172my @s=@_[12..15]; 173 &InvInBasisChange (@b); 174 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 175 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 176} 177 178sub InvInBasisChange { # OutBasisChange in reverse 179my @b=@_[5,1,2,6,3,7,0,4]; 180$code.=<<___ 181 pxor @b[7], @b[4] 182 183 pxor @b[5], @b[7] 184 pxor @b[5], @b[2] 185 pxor @b[7], @b[3] 186 pxor @b[3], @b[5] 187 pxor @b[5], @b[1] 188 189 pxor @b[1], @b[6] 190 pxor @b[0], @b[2] 191 pxor @b[6], @b[4] 192 pxor @b[6], @b[0] 193 pxor @b[4], @b[1] 194___ 195} 196 197sub InvOutBasisChange { # InBasisChange in reverse 198my @b=@_[2,5,7,3,6,1,0,4]; 199$code.=<<___; 200 pxor @b[5], @b[1] 201 pxor @b[7], @b[2] 202 203 pxor @b[1], @b[3] 204 pxor @b[5], @b[4] 205 pxor @b[5], @b[7] 206 pxor @b[4], @b[3] 207 pxor @b[0], @b[5] 208 pxor @b[7], @b[3] 209 pxor @b[2], @b[6] 210 pxor @b[1], @b[2] 211 pxor @b[3], @b[6] 212 213 pxor @b[0], @b[3] 214 pxor @b[6], @b[5] 215___ 216} 217 218sub Mul_GF4 { 219#;************************************************************* 220#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 221#;************************************************************* 222my ($x0,$x1,$y0,$y1,$t0)=@_; 223$code.=<<___; 224 movdqa $y0, $t0 225 pxor $y1, $t0 226 pand $x0, $t0 227 pxor $x1, $x0 228 pand $y0, $x1 229 pand $y1, $x0 230 pxor $x1, $x0 231 pxor $t0, $x1 232___ 233} 234 235sub Mul_GF4_N { # not used, see next subroutine 236# multiply and scale by N 237my ($x0,$x1,$y0,$y1,$t0)=@_; 238$code.=<<___; 239 movdqa $y0, $t0 240 pxor $y1, $t0 241 pand $x0, $t0 242 pxor $x1, $x0 243 pand $y0, $x1 244 pand $y1, $x0 245 pxor $x0, $x1 246 pxor $t0, $x0 247___ 248} 249 250sub Mul_GF4_N_GF4 { 251# interleaved Mul_GF4_N and Mul_GF4 252my ($x0,$x1,$y0,$y1,$t0, 253 $x2,$x3,$y2,$y3,$t1)=@_; 254$code.=<<___; 255 movdqa $y0, $t0 256 
movdqa $y2, $t1 257 pxor $y1, $t0 258 pxor $y3, $t1 259 pand $x0, $t0 260 pand $x2, $t1 261 pxor $x1, $x0 262 pxor $x3, $x2 263 pand $y0, $x1 264 pand $y2, $x3 265 pand $y1, $x0 266 pand $y3, $x2 267 pxor $x0, $x1 268 pxor $x3, $x2 269 pxor $t0, $x0 270 pxor $t1, $x3 271___ 272} 273sub Mul_GF16_2 { 274my @x=@_[0..7]; 275my @y=@_[8..11]; 276my @t=@_[12..15]; 277$code.=<<___; 278 movdqa @x[0], @t[0] 279 movdqa @x[1], @t[1] 280___ 281 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 282$code.=<<___; 283 pxor @x[2], @t[0] 284 pxor @x[3], @t[1] 285 pxor @y[2], @y[0] 286 pxor @y[3], @y[1] 287___ 288 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 289 @x[2], @x[3], @y[2], @y[3], @t[2]); 290$code.=<<___; 291 pxor @t[0], @x[0] 292 pxor @t[0], @x[2] 293 pxor @t[1], @x[1] 294 pxor @t[1], @x[3] 295 296 movdqa @x[4], @t[0] 297 movdqa @x[5], @t[1] 298 pxor @x[6], @t[0] 299 pxor @x[7], @t[1] 300___ 301 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 302 @x[6], @x[7], @y[2], @y[3], @t[2]); 303$code.=<<___; 304 pxor @y[2], @y[0] 305 pxor @y[3], @y[1] 306___ 307 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 308$code.=<<___; 309 pxor @t[0], @x[4] 310 pxor @t[0], @x[6] 311 pxor @t[1], @x[5] 312 pxor @t[1], @x[7] 313___ 314} 315sub Inv_GF256 { 316#;******************************************************************** 317#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 318#;******************************************************************** 319my @x=@_[0..7]; 320my @t=@_[8..11]; 321my @s=@_[12..15]; 322# direct optimizations from hardware 323$code.=<<___; 324 movdqa @x[4], @t[3] 325 movdqa @x[5], @t[2] 326 movdqa @x[1], @t[1] 327 movdqa @x[7], @s[1] 328 movdqa @x[0], @s[0] 329 330 pxor @x[6], @t[3] 331 pxor @x[7], @t[2] 332 pxor @x[3], @t[1] 333 movdqa @t[3], @s[2] 334 pxor @x[6], @s[1] 335 movdqa @t[2], @t[0] 336 pxor @x[2], @s[0] 337 movdqa @t[3], @s[3] 338 339 por @t[1], @t[2] 340 por @s[0], @t[3] 341 pxor @t[0], @s[3] 342 pand @s[0], @s[2] 343 pxor @t[1], @s[0] 344 
pand @t[1], @t[0] 345 pand @s[0], @s[3] 346 movdqa @x[3], @s[0] 347 pxor @x[2], @s[0] 348 pand @s[0], @s[1] 349 pxor @s[1], @t[3] 350 pxor @s[1], @t[2] 351 movdqa @x[4], @s[1] 352 movdqa @x[1], @s[0] 353 pxor @x[5], @s[1] 354 pxor @x[0], @s[0] 355 movdqa @s[1], @t[1] 356 pand @s[0], @s[1] 357 por @s[0], @t[1] 358 pxor @s[1], @t[0] 359 pxor @s[3], @t[3] 360 pxor @s[2], @t[2] 361 pxor @s[3], @t[1] 362 movdqa @x[7], @s[0] 363 pxor @s[2], @t[0] 364 movdqa @x[6], @s[1] 365 pxor @s[2], @t[1] 366 movdqa @x[5], @s[2] 367 pand @x[3], @s[0] 368 movdqa @x[4], @s[3] 369 pand @x[2], @s[1] 370 pand @x[1], @s[2] 371 por @x[0], @s[3] 372 pxor @s[0], @t[3] 373 pxor @s[1], @t[2] 374 pxor @s[2], @t[1] 375 pxor @s[3], @t[0] 376 377 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 378 379 # new smaller inversion 380 381 movdqa @t[3], @s[0] 382 pand @t[1], @t[3] 383 pxor @t[2], @s[0] 384 385 movdqa @t[0], @s[2] 386 movdqa @s[0], @s[3] 387 pxor @t[3], @s[2] 388 pand @s[2], @s[3] 389 390 movdqa @t[1], @s[1] 391 pxor @t[2], @s[3] 392 pxor @t[0], @s[1] 393 394 pxor @t[2], @t[3] 395 396 pand @t[3], @s[1] 397 398 movdqa @s[2], @t[2] 399 pxor @t[0], @s[1] 400 401 pxor @s[1], @t[2] 402 pxor @s[1], @t[1] 403 404 pand @t[0], @t[2] 405 406 pxor @t[2], @s[2] 407 pxor @t[2], @t[1] 408 409 pand @s[3], @s[2] 410 411 pxor @s[0], @s[2] 412___ 413# output in s3, s2, s1, t1 414 415# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 416 417# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 418 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 419 420### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 421} 422 423# AES linear components 424 425sub ShiftRows { 426my @x=@_[0..7]; 427my $mask=pop; 428$code.=<<___; 429 pxor 0x00($key),@x[0] 430 pxor 0x10($key),@x[1] 431 pxor 0x20($key),@x[2] 432 pxor 0x30($key),@x[3] 433 pshufb $mask,@x[0] 434 pshufb $mask,@x[1] 435 pxor 0x40($key),@x[4] 436 pxor 0x50($key),@x[5] 437 pshufb $mask,@x[2] 
438 pshufb $mask,@x[3] 439 pxor 0x60($key),@x[6] 440 pxor 0x70($key),@x[7] 441 pshufb $mask,@x[4] 442 pshufb $mask,@x[5] 443 pshufb $mask,@x[6] 444 pshufb $mask,@x[7] 445 lea 0x80($key),$key 446___ 447} 448 449sub MixColumns { 450# modified to emit output in order suitable for feeding back to aesenc[last] 451my @x=@_[0..7]; 452my @t=@_[8..15]; 453my $inv=@_[16]; # optional 454$code.=<<___; 455 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 456 pshufd \$0x93, @x[1], @t[1] 457 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 458 pshufd \$0x93, @x[2], @t[2] 459 pxor @t[1], @x[1] 460 pshufd \$0x93, @x[3], @t[3] 461 pxor @t[2], @x[2] 462 pshufd \$0x93, @x[4], @t[4] 463 pxor @t[3], @x[3] 464 pshufd \$0x93, @x[5], @t[5] 465 pxor @t[4], @x[4] 466 pshufd \$0x93, @x[6], @t[6] 467 pxor @t[5], @x[5] 468 pshufd \$0x93, @x[7], @t[7] 469 pxor @t[6], @x[6] 470 pxor @t[7], @x[7] 471 472 pxor @x[0], @t[1] 473 pxor @x[7], @t[0] 474 pxor @x[7], @t[1] 475 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 476 pxor @x[1], @t[2] 477 pshufd \$0x4E, @x[1], @x[1] 478 pxor @x[4], @t[5] 479 pxor @t[0], @x[0] 480 pxor @x[5], @t[6] 481 pxor @t[1], @x[1] 482 pxor @x[3], @t[4] 483 pshufd \$0x4E, @x[4], @t[0] 484 pxor @x[6], @t[7] 485 pshufd \$0x4E, @x[5], @t[1] 486 pxor @x[2], @t[3] 487 pshufd \$0x4E, @x[3], @x[4] 488 pxor @x[7], @t[3] 489 pshufd \$0x4E, @x[7], @x[5] 490 pxor @x[7], @t[4] 491 pshufd \$0x4E, @x[6], @x[3] 492 pxor @t[4], @t[0] 493 pshufd \$0x4E, @x[2], @x[6] 494 pxor @t[5], @t[1] 495___ 496$code.=<<___ if (!$inv); 497 pxor @t[3], @x[4] 498 pxor @t[7], @x[5] 499 pxor @t[6], @x[3] 500 movdqa @t[0], @x[2] 501 pxor @t[2], @x[6] 502 movdqa @t[1], @x[7] 503___ 504$code.=<<___ if ($inv); 505 pxor @x[4], @t[3] 506 pxor @t[7], @x[5] 507 pxor @x[3], @t[6] 508 movdqa @t[0], @x[3] 509 pxor @t[2], @x[6] 510 movdqa @t[6], @x[2] 511 movdqa @t[1], @x[7] 512 movdqa @x[6], @x[4] 513 movdqa @t[3], @x[6] 514___ 515} 516 517sub InvMixColumns_orig { 518my @x=@_[0..7]; 519my @t=@_[8..15]; 520 521$code.=<<___; 522 # 
multiplication by 0x0e 523 pshufd \$0x93, @x[7], @t[7] 524 movdqa @x[2], @t[2] 525 pxor @x[5], @x[7] # 7 5 526 pxor @x[5], @x[2] # 2 5 527 pshufd \$0x93, @x[0], @t[0] 528 movdqa @x[5], @t[5] 529 pxor @x[0], @x[5] # 5 0 [1] 530 pxor @x[1], @x[0] # 0 1 531 pshufd \$0x93, @x[1], @t[1] 532 pxor @x[2], @x[1] # 1 25 533 pxor @x[6], @x[0] # 01 6 [2] 534 pxor @x[3], @x[1] # 125 3 [4] 535 pshufd \$0x93, @x[3], @t[3] 536 pxor @x[0], @x[2] # 25 016 [3] 537 pxor @x[7], @x[3] # 3 75 538 pxor @x[6], @x[7] # 75 6 [0] 539 pshufd \$0x93, @x[6], @t[6] 540 movdqa @x[4], @t[4] 541 pxor @x[4], @x[6] # 6 4 542 pxor @x[3], @x[4] # 4 375 [6] 543 pxor @x[7], @x[3] # 375 756=36 544 pxor @t[5], @x[6] # 64 5 [7] 545 pxor @t[2], @x[3] # 36 2 546 pxor @t[4], @x[3] # 362 4 [5] 547 pshufd \$0x93, @t[5], @t[5] 548___ 549 my @y = @x[7,5,0,2,1,3,4,6]; 550$code.=<<___; 551 # multiplication by 0x0b 552 pxor @y[0], @y[1] 553 pxor @t[0], @y[0] 554 pxor @t[1], @y[1] 555 pshufd \$0x93, @t[2], @t[2] 556 pxor @t[5], @y[0] 557 pxor @t[6], @y[1] 558 pxor @t[7], @y[0] 559 pshufd \$0x93, @t[4], @t[4] 560 pxor @t[6], @t[7] # clobber t[7] 561 pxor @y[0], @y[1] 562 563 pxor @t[0], @y[3] 564 pshufd \$0x93, @t[0], @t[0] 565 pxor @t[1], @y[2] 566 pxor @t[1], @y[4] 567 pxor @t[2], @y[2] 568 pshufd \$0x93, @t[1], @t[1] 569 pxor @t[2], @y[3] 570 pxor @t[2], @y[5] 571 pxor @t[7], @y[2] 572 pshufd \$0x93, @t[2], @t[2] 573 pxor @t[3], @y[3] 574 pxor @t[3], @y[6] 575 pxor @t[3], @y[4] 576 pshufd \$0x93, @t[3], @t[3] 577 pxor @t[4], @y[7] 578 pxor @t[4], @y[5] 579 pxor @t[7], @y[7] 580 pxor @t[5], @y[3] 581 pxor @t[4], @y[4] 582 pxor @t[5], @t[7] # clobber t[7] even more 583 584 pxor @t[7], @y[5] 585 pshufd \$0x93, @t[4], @t[4] 586 pxor @t[7], @y[6] 587 pxor @t[7], @y[4] 588 589 pxor @t[5], @t[7] 590 pshufd \$0x93, @t[5], @t[5] 591 pxor @t[6], @t[7] # restore t[7] 592 593 # multiplication by 0x0d 594 pxor @y[7], @y[4] 595 pxor @t[4], @y[7] 596 pshufd \$0x93, @t[6], @t[6] 597 pxor @t[0], @y[2] 598 pxor @t[5], @y[7] 599 pxor 
@t[2], @y[2] 600 pshufd \$0x93, @t[7], @t[7] 601 602 pxor @y[1], @y[3] 603 pxor @t[1], @y[1] 604 pxor @t[0], @y[0] 605 pxor @t[0], @y[3] 606 pxor @t[5], @y[1] 607 pxor @t[5], @y[0] 608 pxor @t[7], @y[1] 609 pshufd \$0x93, @t[0], @t[0] 610 pxor @t[6], @y[0] 611 pxor @y[1], @y[3] 612 pxor @t[1], @y[4] 613 pshufd \$0x93, @t[1], @t[1] 614 615 pxor @t[7], @y[7] 616 pxor @t[2], @y[4] 617 pxor @t[2], @y[5] 618 pshufd \$0x93, @t[2], @t[2] 619 pxor @t[6], @y[2] 620 pxor @t[3], @t[6] # clobber t[6] 621 pxor @y[7], @y[4] 622 pxor @t[6], @y[3] 623 624 pxor @t[6], @y[6] 625 pxor @t[5], @y[5] 626 pxor @t[4], @y[6] 627 pshufd \$0x93, @t[4], @t[4] 628 pxor @t[6], @y[5] 629 pxor @t[7], @y[6] 630 pxor @t[3], @t[6] # restore t[6] 631 632 pshufd \$0x93, @t[5], @t[5] 633 pshufd \$0x93, @t[6], @t[6] 634 pshufd \$0x93, @t[7], @t[7] 635 pshufd \$0x93, @t[3], @t[3] 636 637 # multiplication by 0x09 638 pxor @y[1], @y[4] 639 pxor @y[1], @t[1] # t[1]=y[1] 640 pxor @t[5], @t[0] # clobber t[0] 641 pxor @t[5], @t[1] 642 pxor @t[0], @y[3] 643 pxor @y[0], @t[0] # t[0]=y[0] 644 pxor @t[6], @t[1] 645 pxor @t[7], @t[6] # clobber t[6] 646 pxor @t[1], @y[4] 647 pxor @t[4], @y[7] 648 pxor @y[4], @t[4] # t[4]=y[4] 649 pxor @t[3], @y[6] 650 pxor @y[3], @t[3] # t[3]=y[3] 651 pxor @t[2], @y[5] 652 pxor @y[2], @t[2] # t[2]=y[2] 653 pxor @t[7], @t[3] 654 pxor @y[5], @t[5] # t[5]=y[5] 655 pxor @t[6], @t[2] 656 pxor @t[6], @t[5] 657 pxor @y[6], @t[6] # t[6]=y[6] 658 pxor @y[7], @t[7] # t[7]=y[7] 659 660 movdqa @t[0],@XMM[0] 661 movdqa @t[1],@XMM[1] 662 movdqa @t[2],@XMM[2] 663 movdqa @t[3],@XMM[3] 664 movdqa @t[4],@XMM[4] 665 movdqa @t[5],@XMM[5] 666 movdqa @t[6],@XMM[6] 667 movdqa @t[7],@XMM[7] 668___ 669} 670 671sub InvMixColumns { 672my @x=@_[0..7]; 673my @t=@_[8..15]; 674 675# Thanks to Jussi Kivilinna for providing pointer to 676# 677# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 678# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 679# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 680# | 0b 0d 09 
0e | | 03 01 01 02 | | 00 04 00 05 | 681 682$code.=<<___; 683 # multiplication by 0x05-0x00-0x04-0x00 684 pshufd \$0x4E, @x[0], @t[0] 685 pshufd \$0x4E, @x[6], @t[6] 686 pxor @x[0], @t[0] 687 pshufd \$0x4E, @x[7], @t[7] 688 pxor @x[6], @t[6] 689 pshufd \$0x4E, @x[1], @t[1] 690 pxor @x[7], @t[7] 691 pshufd \$0x4E, @x[2], @t[2] 692 pxor @x[1], @t[1] 693 pshufd \$0x4E, @x[3], @t[3] 694 pxor @x[2], @t[2] 695 pxor @t[6], @x[0] 696 pxor @t[6], @x[1] 697 pshufd \$0x4E, @x[4], @t[4] 698 pxor @x[3], @t[3] 699 pxor @t[0], @x[2] 700 pxor @t[1], @x[3] 701 pshufd \$0x4E, @x[5], @t[5] 702 pxor @x[4], @t[4] 703 pxor @t[7], @x[1] 704 pxor @t[2], @x[4] 705 pxor @x[5], @t[5] 706 707 pxor @t[7], @x[2] 708 pxor @t[6], @x[3] 709 pxor @t[6], @x[4] 710 pxor @t[3], @x[5] 711 pxor @t[4], @x[6] 712 pxor @t[7], @x[4] 713 pxor @t[7], @x[5] 714 pxor @t[5], @x[7] 715___ 716 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 717} 718 719sub aesenc { # not used 720my @b=@_[0..7]; 721my @t=@_[8..15]; 722$code.=<<___; 723 movdqa 0x30($const),@t[0] # .LSR 724___ 725 &ShiftRows (@b,@t[0]); 726 &Sbox (@b,@t); 727 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 728} 729 730sub aesenclast { # not used 731my @b=@_[0..7]; 732my @t=@_[8..15]; 733$code.=<<___; 734 movdqa 0x40($const),@t[0] # .LSRM0 735___ 736 &ShiftRows (@b,@t[0]); 737 &Sbox (@b,@t); 738$code.=<<___ 739 pxor 0x00($key),@b[0] 740 pxor 0x10($key),@b[1] 741 pxor 0x20($key),@b[4] 742 pxor 0x30($key),@b[6] 743 pxor 0x40($key),@b[3] 744 pxor 0x50($key),@b[7] 745 pxor 0x60($key),@b[2] 746 pxor 0x70($key),@b[5] 747___ 748} 749 750sub swapmove { 751my ($a,$b,$n,$mask,$t)=@_; 752$code.=<<___; 753 movdqa $b,$t 754 psrlq \$$n,$b 755 pxor $a,$b 756 pand $mask,$b 757 pxor $b,$a 758 psllq \$$n,$b 759 pxor $t,$b 760___ 761} 762sub swapmove2x { 763my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 764$code.=<<___; 765 movdqa $b0,$t0 766 psrlq \$$n,$b0 767 movdqa $b1,$t1 768 psrlq \$$n,$b1 769 pxor $a0,$b0 770 pxor $a1,$b1 771 pand $mask,$b0 772 pand $mask,$b1 773 pxor $b0,$a0 
774 psllq \$$n,$b0 775 pxor $b1,$a1 776 psllq \$$n,$b1 777 pxor $t0,$b0 778 pxor $t1,$b1 779___ 780} 781 782sub bitslice { 783my @x=reverse(@_[0..7]); 784my ($t0,$t1,$t2,$t3)=@_[8..11]; 785$code.=<<___; 786 movdqa 0x00($const),$t0 # .LBS0 787 movdqa 0x10($const),$t1 # .LBS1 788___ 789 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 790 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 791$code.=<<___; 792 movdqa 0x20($const),$t0 # .LBS2 793___ 794 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 795 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 796 797 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 798 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 799} 800 801$code.=<<___; 802.text 803 804.extern asm_AES_encrypt 805.extern asm_AES_decrypt 806 807.type _bsaes_encrypt8,\@abi-omnipotent 808.align 64 809_bsaes_encrypt8: 810 lea .LBS0(%rip), $const # constants table 811 812 movdqa ($key), @XMM[9] # round 0 key 813 lea 0x10($key), $key 814 movdqa 0x50($const), @XMM[8] # .LM0SR 815 pxor @XMM[9], @XMM[0] # xor with round0 key 816 pxor @XMM[9], @XMM[1] 817 pxor @XMM[9], @XMM[2] 818 pxor @XMM[9], @XMM[3] 819 pshufb @XMM[8], @XMM[0] 820 pshufb @XMM[8], @XMM[1] 821 pxor @XMM[9], @XMM[4] 822 pxor @XMM[9], @XMM[5] 823 pshufb @XMM[8], @XMM[2] 824 pshufb @XMM[8], @XMM[3] 825 pxor @XMM[9], @XMM[6] 826 pxor @XMM[9], @XMM[7] 827 pshufb @XMM[8], @XMM[4] 828 pshufb @XMM[8], @XMM[5] 829 pshufb @XMM[8], @XMM[6] 830 pshufb @XMM[8], @XMM[7] 831_bsaes_encrypt8_bitslice: 832___ 833 &bitslice (@XMM[0..7, 8..11]); 834$code.=<<___; 835 dec $rounds 836 jmp .Lenc_sbox 837.align 16 838.Lenc_loop: 839___ 840 &ShiftRows (@XMM[0..7, 8]); 841$code.=".Lenc_sbox:\n"; 842 &Sbox (@XMM[0..7, 8..15]); 843$code.=<<___; 844 dec $rounds 845 jl .Lenc_done 846___ 847 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 848$code.=<<___; 849 movdqa 0x30($const), @XMM[8] # .LSR 850 jnz .Lenc_loop 851 movdqa 0x40($const), @XMM[8] # .LSRM0 852 jmp .Lenc_loop 853.align 16 854.Lenc_done: 855___ 856 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 857 &bitslice 
(@XMM[0,1,4,6,3,7,2,5, 8..11]); 858$code.=<<___; 859 movdqa ($key), @XMM[8] # last round key 860 pxor @XMM[8], @XMM[4] 861 pxor @XMM[8], @XMM[6] 862 pxor @XMM[8], @XMM[3] 863 pxor @XMM[8], @XMM[7] 864 pxor @XMM[8], @XMM[2] 865 pxor @XMM[8], @XMM[5] 866 pxor @XMM[8], @XMM[0] 867 pxor @XMM[8], @XMM[1] 868 ret 869.size _bsaes_encrypt8,.-_bsaes_encrypt8 870 871.type _bsaes_decrypt8,\@abi-omnipotent 872.align 64 873_bsaes_decrypt8: 874 lea .LBS0(%rip), $const # constants table 875 876 movdqa ($key), @XMM[9] # round 0 key 877 lea 0x10($key), $key 878 movdqa -0x30($const), @XMM[8] # .LM0ISR 879 pxor @XMM[9], @XMM[0] # xor with round0 key 880 pxor @XMM[9], @XMM[1] 881 pxor @XMM[9], @XMM[2] 882 pxor @XMM[9], @XMM[3] 883 pshufb @XMM[8], @XMM[0] 884 pshufb @XMM[8], @XMM[1] 885 pxor @XMM[9], @XMM[4] 886 pxor @XMM[9], @XMM[5] 887 pshufb @XMM[8], @XMM[2] 888 pshufb @XMM[8], @XMM[3] 889 pxor @XMM[9], @XMM[6] 890 pxor @XMM[9], @XMM[7] 891 pshufb @XMM[8], @XMM[4] 892 pshufb @XMM[8], @XMM[5] 893 pshufb @XMM[8], @XMM[6] 894 pshufb @XMM[8], @XMM[7] 895___ 896 &bitslice (@XMM[0..7, 8..11]); 897$code.=<<___; 898 dec $rounds 899 jmp .Ldec_sbox 900.align 16 901.Ldec_loop: 902___ 903 &ShiftRows (@XMM[0..7, 8]); 904$code.=".Ldec_sbox:\n"; 905 &InvSbox (@XMM[0..7, 8..15]); 906$code.=<<___; 907 dec $rounds 908 jl .Ldec_done 909___ 910 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 911$code.=<<___; 912 movdqa -0x10($const), @XMM[8] # .LISR 913 jnz .Ldec_loop 914 movdqa -0x20($const), @XMM[8] # .LISRM0 915 jmp .Ldec_loop 916.align 16 917.Ldec_done: 918___ 919 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 920$code.=<<___; 921 movdqa ($key), @XMM[8] # last round key 922 pxor @XMM[8], @XMM[6] 923 pxor @XMM[8], @XMM[4] 924 pxor @XMM[8], @XMM[2] 925 pxor @XMM[8], @XMM[7] 926 pxor @XMM[8], @XMM[3] 927 pxor @XMM[8], @XMM[5] 928 pxor @XMM[8], @XMM[0] 929 pxor @XMM[8], @XMM[1] 930 ret 931.size _bsaes_decrypt8,.-_bsaes_decrypt8 932___ 933} 934{ 935my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 
936 937sub bitslice_key { 938my @x=reverse(@_[0..7]); 939my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 940 941 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 942$code.=<<___; 943 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 944 movdqa @x[0], @x[2] 945 movdqa @x[1], @x[3] 946___ 947 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 948 949 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 950$code.=<<___; 951 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 952 movdqa @x[0], @x[4] 953 movdqa @x[2], @x[6] 954 movdqa @x[1], @x[5] 955 movdqa @x[3], @x[7] 956___ 957 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 958 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 959} 960 961$code.=<<___; 962.type _bsaes_key_convert,\@abi-omnipotent 963.align 16 964_bsaes_key_convert: 965 lea .Lmasks(%rip), $const 966 movdqu ($inp), %xmm7 # load round 0 key 967 lea 0x10($inp), $inp 968 movdqa 0x00($const), %xmm0 # 0x01... 969 movdqa 0x10($const), %xmm1 # 0x02... 970 movdqa 0x20($const), %xmm2 # 0x04... 971 movdqa 0x30($const), %xmm3 # 0x08... 972 movdqa 0x40($const), %xmm4 # .LM0 973 pcmpeqd %xmm5, %xmm5 # .LNOT 974 975 movdqu ($inp), %xmm6 # load round 1 key 976 movdqa %xmm7, ($out) # save round 0 key 977 lea 0x10($out), $out 978 dec $rounds 979 jmp .Lkey_loop 980.align 16 981.Lkey_loop: 982 pshufb %xmm4, %xmm6 # .LM0 983 984 movdqa %xmm0, %xmm8 985 movdqa %xmm1, %xmm9 986 987 pand %xmm6, %xmm8 988 pand %xmm6, %xmm9 989 movdqa %xmm2, %xmm10 990 pcmpeqb %xmm0, %xmm8 991 psllq \$4, %xmm0 # 0x10... 992 movdqa %xmm3, %xmm11 993 pcmpeqb %xmm1, %xmm9 994 psllq \$4, %xmm1 # 0x20... 995 996 pand %xmm6, %xmm10 997 pand %xmm6, %xmm11 998 movdqa %xmm0, %xmm12 999 pcmpeqb %xmm2, %xmm10 1000 psllq \$4, %xmm2 # 0x40... 1001 movdqa %xmm1, %xmm13 1002 pcmpeqb %xmm3, %xmm11 1003 psllq \$4, %xmm3 # 0x80... 
1004 1005 movdqa %xmm2, %xmm14 1006 movdqa %xmm3, %xmm15 1007 pxor %xmm5, %xmm8 # "pnot" 1008 pxor %xmm5, %xmm9 1009 1010 pand %xmm6, %xmm12 1011 pand %xmm6, %xmm13 1012 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1013 pcmpeqb %xmm0, %xmm12 1014 psrlq \$4, %xmm0 # 0x01... 1015 movdqa %xmm9, 0x10($out) 1016 pcmpeqb %xmm1, %xmm13 1017 psrlq \$4, %xmm1 # 0x02... 1018 lea 0x10($inp), $inp 1019 1020 pand %xmm6, %xmm14 1021 pand %xmm6, %xmm15 1022 movdqa %xmm10, 0x20($out) 1023 pcmpeqb %xmm2, %xmm14 1024 psrlq \$4, %xmm2 # 0x04... 1025 movdqa %xmm11, 0x30($out) 1026 pcmpeqb %xmm3, %xmm15 1027 psrlq \$4, %xmm3 # 0x08... 1028 movdqu ($inp), %xmm6 # load next round key 1029 1030 pxor %xmm5, %xmm13 # "pnot" 1031 pxor %xmm5, %xmm14 1032 movdqa %xmm12, 0x40($out) 1033 movdqa %xmm13, 0x50($out) 1034 movdqa %xmm14, 0x60($out) 1035 movdqa %xmm15, 0x70($out) 1036 lea 0x80($out),$out 1037 dec $rounds 1038 jnz .Lkey_loop 1039 1040 movdqa 0x50($const), %xmm7 # .L63 1041 #movdqa %xmm6, ($out) # don't save last round key 1042 ret 1043.size _bsaes_key_convert,.-_bsaes_key_convert 1044___ 1045} 1046 1047if (0 && !$win64) { # following four functions are unsupported interface 1048 # used for benchmarking... 
1049$code.=<<___; 1050.globl bsaes_enc_key_convert 1051.type bsaes_enc_key_convert,\@function,2 1052.align 16 1053bsaes_enc_key_convert: 1054 mov 240($inp),%r10d # pass rounds 1055 mov $inp,%rcx # pass key 1056 mov $out,%rax # pass key schedule 1057 call _bsaes_key_convert 1058 pxor %xmm6,%xmm7 # fix up last round key 1059 movdqa %xmm7,(%rax) # save last round key 1060 ret 1061.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1062 1063.globl bsaes_encrypt_128 1064.type bsaes_encrypt_128,\@function,4 1065.align 16 1066bsaes_encrypt_128: 1067.Lenc128_loop: 1068 movdqu 0x00($inp), @XMM[0] # load input 1069 movdqu 0x10($inp), @XMM[1] 1070 movdqu 0x20($inp), @XMM[2] 1071 movdqu 0x30($inp), @XMM[3] 1072 movdqu 0x40($inp), @XMM[4] 1073 movdqu 0x50($inp), @XMM[5] 1074 movdqu 0x60($inp), @XMM[6] 1075 movdqu 0x70($inp), @XMM[7] 1076 mov $key, %rax # pass the $key 1077 lea 0x80($inp), $inp 1078 mov \$10,%r10d 1079 1080 call _bsaes_encrypt8 1081 1082 movdqu @XMM[0], 0x00($out) # write output 1083 movdqu @XMM[1], 0x10($out) 1084 movdqu @XMM[4], 0x20($out) 1085 movdqu @XMM[6], 0x30($out) 1086 movdqu @XMM[3], 0x40($out) 1087 movdqu @XMM[7], 0x50($out) 1088 movdqu @XMM[2], 0x60($out) 1089 movdqu @XMM[5], 0x70($out) 1090 lea 0x80($out), $out 1091 sub \$0x80,$len 1092 ja .Lenc128_loop 1093 ret 1094.size bsaes_encrypt_128,.-bsaes_encrypt_128 1095 1096.globl bsaes_dec_key_convert 1097.type bsaes_dec_key_convert,\@function,2 1098.align 16 1099bsaes_dec_key_convert: 1100 mov 240($inp),%r10d # pass rounds 1101 mov $inp,%rcx # pass key 1102 mov $out,%rax # pass key schedule 1103 call _bsaes_key_convert 1104 pxor ($out),%xmm7 # fix up round 0 key 1105 movdqa %xmm6,(%rax) # save last round key 1106 movdqa %xmm7,($out) 1107 ret 1108.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1109 1110.globl bsaes_decrypt_128 1111.type bsaes_decrypt_128,\@function,4 1112.align 16 1113bsaes_decrypt_128: 1114.Ldec128_loop: 1115 movdqu 0x00($inp), @XMM[0] # load input 1116 movdqu 0x10($inp), @XMM[1] 
movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

# ECB entry points; emitted only when $ecb is set.
#
# Both routines take (arg1=input, arg2=output, arg3=number of 16-byte
# blocks, arg4=key schedule with the round count at byte offset 240).
#
# - Fewer than 8 blocks: every block is handed to asm_AES_[en|de]crypt
#   one at a time and nothing extra is allocated on the stack.
# - 8 or more blocks: the key schedule is converted to bit-sliced form
#   in a stack buffer, blocks are processed 8 at a time through
#   _bsaes_[en|de]crypt8, and a 1..7 block tail goes through the same
#   8-way routine with only the valid outputs written back.
#
# Before returning, the stack between %rsp and %rbp (i.e. the
# bit-sliced key schedule, if one was built) is wiped.
if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	# Continue while %rbp is still above %rax, as the other modes do.
	# The previous jb stopped after 32 bytes when a key schedule was
	# present and never terminated on the short path (%rsp == %rbp).
	ja	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	# Continue while %rbp is still above %rax, as the other modes do.
	# The previous jb stopped after 32 bytes when a key schedule was
	# present and never terminated on the short path (%rsp == %rbp).
	ja	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern
asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
# bsaes_cbc_encrypt: CBC entry point.
# A non-zero direction flag (6th argument) or a length below 128 bytes
# is handed straight to asm_AES_cbc_encrypt by a jump taken before
# anything is pushed.  Everything else is handled here with the 8-way
# bit-sliced _bsaes_decrypt8 (all local labels are .Lcbc_dec_*).
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx		# IV pointer, used again at .Lcbc_dec_done
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
	# Main loop: 8 blocks per pass.  The chaining value is parked at
	# 0x20(%rbp) across the call; the input is re-read afterwards to
	# provide the blocks each output is xored with, and the last input
	# block becomes the next chaining value.
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len		# undo the bias: 0..7 blocks remain
	jz	.Lcbc_dec_done

	# Tail: load only the blocks that exist, then branch on the count.
	# Each cmp feeds two branches (jb = odd count below, je = equal).
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
	# Single block: use the plain (non bit-sliced) routine with the
	# original key, decrypting into the scratch slot at 0x20(%rbp).
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero		# until %rax reaches %rbp

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
# bsaes_ctr32_encrypt_blocks: CTR mode with a 32-bit counter.
# The third argument counts 16-byte blocks; the fifth points at the
# counter block, which is copied to 0x20(%rbp).  Fewer than 8 blocks
# take the one-block-at-a-time path at .Lctr_enc_short.
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11),
@XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
	# Main loop: derive 8 consecutive counter values from the saved
	# one, encrypt them 8-way and xor the result into the input.
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done	# fewer than 8 blocks were left

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	# ZF is still the one set by the sub above: none of the SSE, lea
	# or mov instructions in between modify the flags.
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
	# Tail: 8 keystream blocks were produced but only 1..7 are needed;
	# xor and store them one by one until the count is exhausted.
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
	# Short path (fewer than 8 blocks): encrypt the counter block at
	# 0x20(%rbp) into 0x30(%rbp) with the plain routine, xor it into
	# one input block, then bump the big-endian 32-bit counter that
	# lives in the last four bytes of the counter block.
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	# Store through %rbp, like the load above and the two buffers.
	# %rsp equals %rbp on this path only because nothing has been
	# allocated below it, so %rsp-relative addressing was fragile.
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	# Encrypt the 16 bytes at the sixth argument under the fifth
	# argument (key2); the result lands in the scratch slot at
	# 0x20(%rbp) and serves as the first tweak.
	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len		# whole blocks only; %rbx keeps the full length
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    # Unrolled 7 times: each pass copies the current tweak into register
    # $i, saves it in stack slot $i and advances the tweak (doubling via
    # paddq plus the masked carry/residue).  Input loads and the
    # input-xor-tweak step trail by one and two passes respectively.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)	# eighth tweak
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len		# undo the bias: 0..0x70 bytes of whole blocks remain
	jz	.Lxts_enc_done
___
    # Same unrolled tweak generation as the main loop, but after each
    # input load the remaining length is compared with 0x10*$i and, on a
    # match, control leaves for the matching .Lxts_enc_$i tail.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	# Fall-through: exactly seven blocks remain.
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
	# Single block: encrypt in place in the scratch slot at 0x20(%rbp)
	# with the plain routine instead of the 8-way one (the 8-way
	# variant is kept below, commented out).
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx		# trailing bytes beyond the last whole block
	jz	.Lxts_enc_ret
	mov	$out, %rdx

	# Ciphertext stealing, one byte per pass: the input byte replaces
	# the byte 16 positions back in the output and the byte it
	# displaced is appended to the output.
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	# Re-encrypt the patched block in place under the pending tweak.
	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero		# until %rax reaches %rbp

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
# XTS decryption entry point.  Arguments in ABI order: input pointer,
# output pointer, length in bytes, data key, tweak key ("key2") and a
# pointer to the initial tweak input ("ivp").
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
# Win64 only: fetch the fifth and sixth arguments from the caller's stack
# and spill %xmm6-%xmm15 into the frame.  .Lxts_dec_body marks the end of
# the prologue for the SEH handler below.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# Common body: %rbp becomes the frame base, the initial tweak is produced
# with asm_AES_encrypt, the key is converted into a bit-sliced schedule
# placed on the stack, and a further 0x80 bytes are reserved for tweak[8].
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    # Main loop, eight blocks per pass: derive tweak[0..6] and save each on
    # the stack, loading input block i-1 and xoring block i-2 with its
    # tweak as the pipeline advances.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Finish the eight-block pass: load the last two inputs, decrypt, xor the
# saved tweaks back in, store, and prepare the tweak for the next pass.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    # Fewer than eight blocks remain: run the same tweak pipeline, but after
    # each load compare the remaining length against 0x10*i and branch to
    # the matching .Lxts_dec_<i> tail.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Fall-through handles seven blocks; .Lxts_dec_6 down to .Lxts_dec_1 handle
# the shorter tails.  Each tail leaves the next tweak in place for the
# ciphertext-stealing code at .Lxts_dec_done.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	# A single block goes through asm_AES_decrypt, using the scratch
	# slot at 0x20(%rbp) as both input and output.
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	# Ciphertext stealing: the current tweak is kept aside, the last
	# complete block is decrypted with the following tweak, and the
	# block reassembled by the byte swap loop uses the one kept aside.
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
# Win64 only: reload %xmm6-%xmm15 from the frame and drop the save area.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
# Reload the general-purpose registers pushed in the prologue; %rbp is
# reloaded last because it is the frame base until then.
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
# Constant pool: byte-shuffle masks, bit-slice masks, counter increments,
# the mask used when doubling the XTS tweak (.Lxts_magic), and the
# identification string.
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 language-specific handler.  HandlerData[0] is the function's
# .L*_body label and HandlerData[1] its .L*_epilogue label.
#
# - Rip at or before the body label: the frame is not established yet, so
#   the stack pointer at entry is taken from context->Rax.  The compare
#   is inclusive because the body label sits directly before the
#   instruction that loads %rbp; at that address %rbp still holds the
#   caller's value and must not be used as the frame base.
# - Rip at or past the epilogue label: context->Rsp is already correct.
# - Otherwise: context->Rbp is the frame base; %xmm6-%xmm15 and the
#   pushed general-purpose registers are copied back into the context.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata: one (begin, end, unwind-info) RVA triple per function.  The ECB
# entries are emitted only when the ECB routines are enabled.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: each record names se_handler as the handler and passes the
# function's body and epilogue labels as HandlerData[].
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

# Replace every backtick-quoted expression in the generated text with its
# evaluated value.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Output is buffered, so a failed write may only be reported when the
# handle is closed; treat that as fatal rather than leaving a truncated
# assembly file behind.
close STDOUT or die "error closing STDOUT: $!";