1480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org#!/usr/bin/env perl 2480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 3480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# ==================================================================== 4480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# project. The module is, however, dual licensed under OpenSSL and 6480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# CRYPTOGAMS licenses depending on where you obtain it. For further 7480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# details see http://www.openssl.org/~appro/cryptogams/. 8480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# ==================================================================== 9c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org 10c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# At some point it became apparent that the original SSLeay RC4 11480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# assembler implementation performs suboptimally on latest IA-32 12c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# microarchitectures. 
After re-tuning performance has changed as 13c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# following: 14c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# 15480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# Pentium -10% 16480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# Pentium III +12% 17480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# AMD +50%(*) 18480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# P4 +250%(**) 19c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# 20c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# (*) This number is actually a trade-off:-) It's possible to 21c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# achieve +72%, but at the cost of -48% off PIII performance. 22c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# In other words code performing further 13% faster on AMD 23c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# would perform almost 2 times slower on Intel PIII... 24c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# For reference! This code delivers ~80% of rc4-amd64.pl 25c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# performance on the same Opteron machine. 26c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# (**) This number requires compressed key schedule set up by 27480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# RC4_set_key [see commentary below for further details]. 28c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# 29c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org# <appro@fy.chalmers.se> 30c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org 312c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# May 2011 322c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# 332c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Optimize for Core2 and Westmere [and incidentally Opteron]. 
Current 342c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# performance in cycles per processed byte (less is better) and 352c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# improvement relative to previous version of this module is: 362c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# 372c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Pentium 10.2 # original numbers 382c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Pentium III 7.8(*) 392c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Intel P4 7.5 402c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# 412c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Opteron 6.1/+20% # new MMX numbers 422c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Core2 5.3/+67%(**) 432c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Westmere 5.1/+94%(**) 442c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Sandy Bridge 5.0/+8% 452c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# Atom 12.6/+6% 462c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# 472c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# (*) PIII can actually deliver 6.6 cycles per byte with MMX code, 482c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# but this specific code performs poorly on Core2. And vice 492c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs 502c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU 512c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# [anymore], I chose to discard PIII-specific code path and opt 522c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# for original IALU-only code, which is why MMX/SSE code path 532c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# is guarded by SSE2 bit (see below), not MMX/SSE. 
542c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# (**) Performance vs. block size on Core2 and Westmere had a maximum 552c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# at ... 64 bytes block size. And it was quite a maximum, 40-60% 562c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# in comparison to largest 8KB block size. Above improvement 572c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org# coefficients are for the largest block size. 582c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 59480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 60480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgpush(@INC,"${dir}","${dir}../../perlasm"); 61c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.orgrequire "x86asm.pl"; 62c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org 63c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org&asm_init($ARGV[0],"rc4-586.pl"); 64c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org 65480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$xx="eax"; 66480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$yy="ebx"; 67c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$tx="ecx"; 68c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org$ty="edx"; 69480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$inp="esi"; 70480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$out="ebp"; 71480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$dat="edi"; 72480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 73480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.orgsub RC4_loop { 74480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org my $i=shift; 75480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org my $func = ($i==0)?*mov:*or; 76480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 77480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($yy),&LB($tx)); 
78480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($ty,&DWP(0,$dat,$yy,4)); 79480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$dat,$yy,4),$tx); 80480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$dat,$xx,4),$ty); 81480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add ($ty,$tx); 82480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &inc (&LB($xx)); 83480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &and ($ty,0xff); 84480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &ror ($out,8) if ($i!=0); 85480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org if ($i<3) { 86480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($tx,&DWP(0,$dat,$xx,4)); 87480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org } else { 88480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($tx,&wparam(3)); # reload [re-biased] out 89c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org } 90480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &$func ($out,&DWP(0,$dat,$ty,4)); 91480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org} 92480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 932c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.orgif ($alt=0) { 942c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron, 952c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # but ~40% slower on Core2 and Westmere... Attempt to add movz 962c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet 972c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # on Core2 with movz it's almost 20% slower than below alternative 982c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # code... Yes, it's a total mess... 
992c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org my @XX=($xx,$out); 1002c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org $RC4_loop_mmx = sub { # SSE actually... 1012c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org my $i=shift; 1022c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org my $j=$i<=0?0:$i>>1; 1032c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org my $mm=$i<=0?"mm0":"mm".($i&1); 1042c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1052c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &add (&LB($yy),&LB($tx)); 1062c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &lea (@XX[1],&DWP(1,@XX[0])); 1072c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm2","mm0") if ($i==0); 1082c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &psllq ("mm1",8) if ($i==0); 1092c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &and (@XX[1],0xff); 1102c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm0","mm0") if ($i<=0); 1112c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($ty,&DWP(0,$dat,$yy,4)); 1122c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(0,$dat,$yy,4),$tx); 1132c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm1","mm2") if ($i==0); 1142c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(0,$dat,$XX[0],4),$ty); 1152c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &add (&LB($ty),&LB($tx)); 1162c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movd (@XX[0],"mm7") if ($i==0); 1172c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($tx,&DWP(0,$dat,@XX[1],4)); 1182c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm1","mm1") if ($i==1); 1192c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq ("mm2",&QWP(0,$inp)) if ($i==1); 1202c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0); 
1212c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j); 1222c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1232c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org push (@XX,shift(@XX)) if ($i>=0); 1242c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org } 1252c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org} else { 1262c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # Using pinsrw here improves performane on Intel CPUs by 2-3%, but 1272c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # brings down AMD by 7%... 1282c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org $RC4_loop_mmx = sub { 1292c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org my $i=shift; 1302c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1312c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &add (&LB($yy),&LB($tx)); 1322c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); 1332c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($ty,&DWP(0,$dat,$yy,4)); 1342c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(0,$dat,$yy,4),$tx); 1352c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(0,$dat,$xx,4),$ty); 1362c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &inc ($xx); 1372c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &add ($ty,$tx); 1382c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movz ($xx,&LB($xx)); # (*) 1392c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movz ($ty,&LB($ty)); # (*) 1402c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); 1412c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq ("mm0",&QWP(0,$inp)) if ($i<=0); 1422c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); 
1432c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($tx,&DWP(0,$dat,$xx,4)); 1442c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); 1452c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1462c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # (*) This is the key to Core2 and Westmere performance. 1472c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # Whithout movz out-of-order execution logic confuses 1482c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # itself and fails to reorder loads and stores. Problem 1492c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org # appears to be fixed in Sandy Bridge... 1502c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org } 1512c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org} 1522c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1532c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org&external_label("OPENSSL_ia32cap_P"); 1542c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 155480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); 156480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&function_begin("RC4"); 157480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($dat,&wparam(0)); # load key schedule pointer 158480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($ty, &wparam(1)); # load len 159480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($inp,&wparam(2)); # load inp 160480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($out,&wparam(3)); # load out 161480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 162480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($xx,$xx); # avoid partial register stalls 163480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($yy,$yy); 164480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 
165480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp ($ty,0); # safety net 166480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &je (&label("abort")); 167480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 168480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&LB($xx),&BP(0,$dat)); # load key->x 169480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&LB($yy),&BP(4,$dat)); # load key->y 170480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add ($dat,8); 171480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 172480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($tx,&DWP(0,$inp,$ty)); 173480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &sub ($out,$inp); # re-bias out 174480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&wparam(1),$tx); # save input+len 175480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 176480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &inc (&LB($xx)); 177480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 178480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org # detect compressed key schedule... 179480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp (&DWP(256,$dat),-1); 180480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &je (&label("RC4_CHAR")); 181480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 182480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($tx,&DWP(0,$dat,$xx,4)); 183480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 184480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &and ($ty,-4); # how many 4-byte chunks? 
185480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jz (&label("loop1")); 186480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 1872c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &test ($ty,-8); 1882c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&wparam(3),$out); # $out as accumulator in these loops 1892c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &jz (&label("go4loop4")); 1902c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1912c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &picmeup($out,"OPENSSL_ia32cap_P"); 1922c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] 1932c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &jnc (&label("go4loop4")); 1942c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 1952c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($out,&wparam(3)) if (!$alt); 1962c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movd ("mm7",&wparam(3)) if ($alt); 1972c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &and ($ty,-8); 1982c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &lea ($ty,&DWP(-8,$inp,$ty)); 1992c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 2002c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 2012c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &$RC4_loop_mmx(-1); 2022c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &jmp(&label("loop_mmx_enter")); 2032c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 2042c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &set_label("loop_mmx",16); 2052c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &$RC4_loop_mmx(0); 2062c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &set_label("loop_mmx_enter"); 2072c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org for ($i=1;$i<8;$i++) { 
&$RC4_loop_mmx($i); } 2082c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov ($ty,$yy); 2092c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &xor ($yy,$yy); # this is second key to Core2 2102c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&LB($yy),&LB($ty)); # and Westmere performance... 2112c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &cmp ($inp,&DWP(-4,$dat)); 2122c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &lea ($inp,&DWP(8,$inp)); 2132c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &jb (&label("loop_mmx")); 2142c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 2152c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org if ($alt) { 2162c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movd ($out,"mm7"); 2172c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm2","mm0"); 2182c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &psllq ("mm1",8); 2192c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm1","mm2"); 2202c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq (&QWP(-8,$out,$inp),"mm1"); 2212c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org } else { 2222c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &psllq ("mm1",56); 2232c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &pxor ("mm2","mm1"); 2242c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &movq (&QWP(-8,$out,$inp),"mm2"); 2252c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org } 2262c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &emms (); 2272c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 2282c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &cmp ($inp,&wparam(1)); # compare to input+len 2292c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &je (&label("done")); 2302c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &jmp (&label("loop1")); 
2312c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org 2322c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org&set_label("go4loop4",16); 233480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($ty,&DWP(-4,$inp,$ty)); 234480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&wparam(2),$ty); # save input+(len/4)*4-4 235480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 2362c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &set_label("loop4"); 237480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org for ($i=0;$i<4;$i++) { RC4_loop($i); } 238480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &ror ($out,8); 239480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($out,&DWP(0,$inp)); 240480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4 241480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here 242480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($inp,&DWP(4,$inp)); 243480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($tx,&DWP(0,$dat,$xx,4)); 244480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jb (&label("loop4")); 245480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 246480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp ($inp,&wparam(1)); # compare to input+len 247480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &je (&label("done")); 248480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($out,&wparam(3)); # restore $out 249480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 250480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &set_label("loop1",16); 251480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($yy),&LB($tx)); 252480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($ty,&DWP(0,$dat,$yy,4)); 253480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov 
(&DWP(0,$dat,$yy,4),$tx); 254480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$dat,$xx,4),$ty); 255480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add ($ty,$tx); 256480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &inc (&LB($xx)); 257480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &and ($ty,0xff); 258480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($ty,&DWP(0,$dat,$ty,4)); 259480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor (&LB($ty),&BP(0,$inp)); 260480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($inp,&DWP(1,$inp)); 261480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($tx,&DWP(0,$dat,$xx,4)); 262480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp ($inp,&wparam(1)); # compare to input+len 263480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(-1,$out,$inp),&LB($ty)); 264480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jb (&label("loop1")); 265480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 266480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jmp (&label("done")); 267480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 268480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# this is essentially Intel P4 specific codepath... 269480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("RC4_CHAR",16); 270480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &movz ($tx,&BP(0,$dat,$xx)); 271c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org # strangely enough unrolled loop performs over 20% slower... 
272480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &set_label("cloop1"); 273480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($yy),&LB($tx)); 274480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &movz ($ty,&BP(0,$dat,$yy)); 275480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(0,$dat,$yy),&LB($tx)); 276480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(0,$dat,$xx),&LB($ty)); 277c9490d33b98b7affb729b5f1db13cb0a348471aagl@chromium.org &add (&LB($ty),&LB($tx)); 278480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &movz ($ty,&BP(0,$dat,$ty)); 279480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($xx),1); 280480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor (&LB($ty),&BP(0,$inp)); 281480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($inp,&DWP(1,$inp)); 282480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &movz ($tx,&BP(0,$dat,$xx)); 283480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &cmp ($inp,&wparam(1)); 284480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(-1,$out,$inp),&LB($ty)); 285480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jb (&label("cloop1")); 286480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 287480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("done"); 288480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &dec (&LB($xx)); 2892c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org &mov (&DWP(-4,$dat),$yy); # save key->y 290480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(-8,$dat),&LB($xx)); # save key->x 291480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("abort"); 292480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&function_end("RC4"); 293480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 294480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org######################################################################## 
295480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 296480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$inp="esi"; 297480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$out="edi"; 298480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$idi="ebp"; 299480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$ido="ecx"; 300480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org$idx="edx"; 301480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 302480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); 3032c4508dfe2bc5b6296c01114ed11ddc64b7718c6digit@chromium.org&function_begin("private_RC4_set_key"); 304480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($out,&wparam(0)); # load key 305480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($idi,&wparam(1)); # load len 306480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($inp,&wparam(2)); # load data 307480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &picmeup($idx,"OPENSSL_ia32cap_P"); 308480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 309480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($out,&DWP(2*4,$out)); # &key->data 310480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end 311480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &neg ($idi); 312480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ("eax","eax"); 313480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(-4,$out),$idi); # borrow key->y 314480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 315480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &bt (&DWP(0,$idx),20); # check for bit#20 316480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jc (&label("c1stloop")); 317480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 
318480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("w1stloop",16); 319480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i; 320480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB("eax"),1); # i++; 321480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnc (&label("w1stloop")); 322480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 323480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($ido,$ido); 324480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($idx,$idx); 325480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 326480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("w2ndloop",16); 327480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ("eax",&DWP(0,$out,$ido,4)); 328480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($idx),&BP(0,$inp,$idi)); 329480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($idx),&LB("eax")); 330480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add ($idi,1); 331480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ("ebx",&DWP(0,$out,$idx,4)); 332480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnz (&label("wnowrap")); 333480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($idi,&DWP(-4,$out)); 334480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &set_label("wnowrap"); 335480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$out,$idx,4),"eax"); 336480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&DWP(0,$out,$ido,4),"ebx"); 337480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($ido),1); 338480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnc (&label("w2ndloop")); 339480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&jmp (&label("exit")); 340480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 341480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# 
Unlike all other x86 [and x86_64] implementations, Intel P4 core 342480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# [including EM64T] was found to perform poorly with above "32-bit" key 343480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded 344480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# assembler turned out to be 3.5x if re-coded for compressed 8-bit one, 345480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit 346480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# schedule for x86[_64], because non-P4 implementations suffer from 347480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# significant performance losses then, e.g. PIII exhibits >2x 348480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# deterioration, and so does Opteron. In order to assure optimal 349480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# all-round performance, we detect P4 at run-time and set up compressed 350480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org# key schedule, which is recognized by RC4 procedure. 
351480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 352480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("c1stloop",16); 353480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i; 354480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB("eax"),1); # i++; 355480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnc (&label("c1stloop")); 356480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 357480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($ido,$ido); 358480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ($idx,$idx); 359480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &xor ("ebx","ebx"); 360480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org 361480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org&set_label("c2ndloop",16); 362480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&LB("eax"),&BP(0,$out,$ido)); 363480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($idx),&BP(0,$inp,$idi)); 364480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($idx),&LB("eax")); 365480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add ($idi,1); 366480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&LB("ebx"),&BP(0,$out,$idx)); 367480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnz (&label("cnowrap")); 368480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov ($idi,&DWP(-4,$out)); 369480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &set_label("cnowrap"); 370480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(0,$out,$idx),&LB("eax")); 371480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &mov (&BP(0,$out,$ido),&LB("ebx")); 372480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &add (&LB($ido),1); 373480da75abf485e7e2a6be5acc0f71842368792c0jnd@chromium.org &jnc (&label("c2ndloop")); 
# End of the compressed (8-bit) key-schedule path of private_RC4_set_key:
# store -1 in the dword at offset 256 from $out to tag the schedule.
	&mov	(&DWP(256,$out),-1);		# mark schedule as compressed

# Epilogue of private_RC4_set_key: the code above falls through to "exit",
# which zeroes the two dwords just below $out before returning.
&set_label("exit");
	&xor	("eax","eax");
	&mov	(&DWP(-8,$out),"eax");		# key->x=0;
	&mov	(&DWP(-4,$out),"eax");		# key->y=0;
&function_end("private_RC4_set_key");

# const char *RC4_options(void);
#
# Returns in eax a pointer into the string table at "opts", chosen from
# the first dword of OPENSSL_ia32cap_P:
#
#   bit 20 set                -> opts+12  "rc4(1x,char)"
#   bit 20 clear, bit 26 set  -> opts+25  "rc4(8x,mmx)"
#   both clear                -> opts+0   "rc4(4x,int)"
#
# The constants 12 and 25 are the byte offsets of the 2nd and 3rd strings,
# i.e. the lengths of the preceding strings counting one terminator byte
# each (11+1 and 11+1+12+1); this assumes &asciz emits a trailing NUL.
# They must be updated by hand if the strings are ever changed.
#
# NOTE(review): bit 20 is presumably the run-time "P4" flag that selects
# the compressed key schedule, and bit 26 presumably SSE2 -- confirm
# against the OPENSSL_ia32cap documentation.
&function_begin_B("RC4_options");
	# Position-independent address of "opts": call the next instruction,
	# pop the return address (== pic_point), then add the link-time
	# distance opts-pic_point.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&lea	("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
	&picmeup("edx","OPENSSL_ia32cap_P");	# presumably edx = &OPENSSL_ia32cap_P
	&mov	("edx",&DWP(0,"edx"));		# edx = first capability dword
	&bt	("edx",20);			# CF = bit 20
	&jc	(&label("1xchar"));
	&bt	("edx",26);			# CF = bit 26
	&jnc	(&label("ret"));		# neither bit: return opts+0
	&add	("eax",25);			# -> "rc4(8x,mmx)"
	&ret	();
&set_label("1xchar");
	&add	("eax",12);			# -> "rc4(1x,char)"
&set_label("ret");
	&ret	();
# String table; order and lengths are relied upon by the offsets above.
&set_label("opts",64);
&asciz	("rc4(4x,int)");
&asciz	("rc4(1x,char)");
&asciz	("rc4(8x,mmx)");
&asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align	(64);
&function_end_B("RC4_options");

&asm_finish();