#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.

# Usage: rc4-s390x.pl <flavour> [...] [<output-file>]
#
# The first argument selects the flavour; a flavour containing "31" or
# "32" selects 4-byte save slots and the stm/lm register save/restore
# forms, anything else selects 8-byte slots and stmg/lmg.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip remaining arguments until one looks like a plain "name.ext" file
# name; $output ends up false when no such argument is present.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named; otherwise
# the generated code goes to the inherited STDOUT. A failure to create
# the file is fatal instead of being silently ignored.
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

$rp="%r14";	# register branched to on return
$sp="%r15";	# base register of the register save area
$code=<<___;
.text

___

# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
{
# Register assignment; $key/$len/$inp/$out follow the prototype order.
$acc="%r0";
$cnt="%r1";
$key="%r2";
$len="%r3";
$inp="%r4";
$out="%r5";

# Two-element register pairs that are "rotated" after every unrolled
# round, so that each round's look-ahead values become the next round's
# current values without any register-to-register moves.
@XX=("%r6","%r7");
@TX=("%r8","%r9");
$YY="%r10";
$TY="%r11";

$code.=<<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___
# Emitted for 31/32-bit flavours only, ahead of the 64-bit operations
# (srlg/ltgr/ngr) that the common code applies to $len.
$code.=<<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
# Prologue: bytes at offsets 0 and 1 of the key structure are loaded as
# the two indices, the table itself is addressed from offset 2.
# $cnt = $len>>3 is the number of 8-byte iterations.
$code.=<<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___
# Body of .Loop8: eight unrolled rounds. The table byte selected by $TY
# in round N is collected into $acc at the start of round N+1 (plain load
# for the first one, shift-and-insert for the others); the eighth byte is
# collected by the loop tail emitted after this Perl loop.
for ($i=0;$i<8;$i++) {
$code.=<<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___
$code.=<<___ if ($i==1);
	llgc	$acc,2($TY,$key)
___
$code.=<<___ if ($i>1);
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___
# .Lcmov$i: when the look-ahead index equals $YY, the value preloaded
# into $TX[1] precedes the store to that slot, so it is replaced by
# $TX[0], the value just stored there.
$code.=<<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}

# After eight rotations @TX and @XX are back in their initial order, so
# the loop tail, .Loop1 (one byte per iteration for the remaining
# $len&7 bytes) and .Lexit use the same registers as the prologue.
$code.=<<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}

# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
{
# Register assignment; $key/$len/$inp follow the prototype order.
$cnt="%r0";
$idx="%r1";
$key="%r2";
$len="%r3";
$inp="%r4";
$acc="%r5";
$dat="%r6";
$ikey="%r7";
$iinp="%r8";

# .L1stloop stores a zero halfword at offset 0 of the key structure and
# fills the 256 table bytes at offset 2 with 0..255.
# .L2ndloop runs $ikey from -256 up to 0 (so 2+256($ikey,$key) walks the
# table front to back), swapping each entry with the one selected by
# $idx; $iinp/$cnt wrap around so the key material is reused every $len
# bytes.
$code.=<<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}

# const char *RC4_options()
# Returns the address of the constant string placed in .rodata below.
$code.=<<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.Loptions:
.align	8
.string	"rc4(8x,char)"
___

print $code;
# Closing flushes the buffered output; a failure here means the generated
# file is incomplete, so report it instead of exiting successfully.
close STDOUT or die "error closing STDOUT: $!";