#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by a dedicated code path for the 128-bit CBC encrypt
# case. On Cortex-A57 performance of the parallelizable modes seems to
# be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module.

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is written mostly with 32-bit mnemonics, integer code mostly
# with 64-bit ones. The goal is to maintain both 32- and 64-bit codes
# within a single module and transliterate common code to either
# flavour with regex voodoo.
#
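# For reference, a couple of representative lines traced through the
# transliteration pass at the bottom of this file (documentation only,
# derived by walking the substitutions below):
#
#	vld1.32	{q8},[x6]	->	ld1	{v16.4s},[x6]	(64-bit)
#				->	vld1.32	{q8},[r6]	(32-bit)
#	cclr	x8,eq		->	csel	x8,xzr,x8,eq	(64-bit)
#				->	moveq	r8,#0		(32-bit)
#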
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

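// ${prefix}_set_decrypt_key reuses .Lenc_key above and then converts the
// schedule in place for use with aesd/aesimc: the round keys are reversed
// in order and all but the first and last are run through AESIMC.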
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
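# One-shot single-block functions. gen_block() expands the same template
# twice: "en" yields ${prefix}_encrypt built around aese/aesmc, "de"
# yields ${prefix}_decrypt built around aesd/aesimc.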
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
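# CBC. Encryption is inherently serial and is processed one block at a
# time, with a dedicated shortcut path for 128-bit keys; decryption has no
# such dependency and is interleaved three blocks at a time.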
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
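# CTR with a 32-bit counter. The big-endian counter held in the last word
# of the IV is incremented per block; the main loop keeps three counter
# blocks in flight and .Lctr32_tail finishes the last one or two blocks.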
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
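# Everything accumulated in $code above is now transliterated, line by
# line, into the flavour requested on the command line. For 64-bit output
# the aes* mnemonics are emitted as such (".arch armv8-a+crypto" above
# covers them); for 32-bit they are hand-encoded as .byte sequences so
# the module assembles even with toolchains that predate the crypto
# extensions.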
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian.
	    # The correct solution is to use the .inst directive, but
	    # older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;