1221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#!/usr/bin/env perl 2221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 3221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ==================================================================== 4221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Written by David Mosberger <David.Mosberger@acm.org> based on the 5221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Itanium optimized Crypto code which was released by HP Labs at 6221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# http://www.hpl.hp.com/research/linux/crypto/. 7221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 8221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Copyright (c) 2005 Hewlett-Packard Development Company, L.P. 9221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 10221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Permission is hereby granted, free of charge, to any person obtaining 11221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# a copy of this software and associated documentation files (the 12221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# "Software"), to deal in the Software without restriction, including 13221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# without limitation the rights to use, copy, modify, merge, publish, 14221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# distribute, sublicense, and/or sell copies of the Software, and to 15221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# permit persons to whom the Software is furnished to do so, subject to 16221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# the following conditions: 17221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 18221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# The above copyright notice and this permission notice shall be 19221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# included in all copies or substantial portions of the Software. 
20221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 21221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 24221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 25221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 26221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 27221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ 28221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 29221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 30221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 31221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# This is a little helper program which generates a software-pipelined 32221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# loop for RC4 encryption. 
The basic algorithm looks like this: 33221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 34221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# for (counter = 0; counter < len; ++counter) 35221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# { 36221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# in = inp[counter]; 37221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SI = S[I]; 38221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# J = (SI + J) & 0xff; 39221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SJ = S[J]; 40221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = (SI + SJ) & 0xff; 41221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# S[I] = SJ, S[J] = SI; 42221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ST = S[T]; 43221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# outp[counter] = in ^ ST; 44221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# I = (I + 1) & 0xff; 45221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# } 46221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 47221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Pipelining this loop isn't easy, because the stores to the S[] array 48221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# need to be observed in the right order. 
The loop generated by the 49221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# code below has the following pipeline diagram: 50221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 51221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# cycle 52221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 | 53221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iter 54221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx 55221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx 56221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx 57221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 58221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# where: 59221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# LDI = load of S[I] 60221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# LDJ = load of S[J] 61221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SWP = swap of S[I] and S[J] 62221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# LDT = load of S[T] 63221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 64221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Note that in the above diagram, the major trouble-spot is that LDI 65221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# of the 2nd iteration is performed BEFORE the SWP of the first 66221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration. Fortunately, this is easy to detect (I of the 1st 67221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration will be equal to J of the 2nd iteration) and when this 68221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# happens, we simply forward the proper value from the 1st iteration 69221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# to the 2nd one. 
The proper value in this case is simply the value 70221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# of S[I] from the first iteration (thanks to the fact that SWP 71221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# simply swaps the contents of S[I] and S[J]). 72221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 73221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Another potential trouble-spot is in cycle 7, where SWP of the 1st 74221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration issues at the same time as the LDI of the 3rd iteration. 75221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# However, thanks to IA-64 execution semantics, this can be taken 76221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# care of simply by placing LDI later in the instruction-group than 77221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SWP. IA-64 CPUs will automatically forward the value if they 78221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# detect that the SWP and LDI are accessing the same memory-location. 
79221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 80221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# The core-loop that can be pipelined then looks like this (annotated 81221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# with McKinley/Madison issue port & latency numbers, assuming L1 82221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# cache hits for the most part): 83221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 84221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# operation: instruction: issue-ports: latency 85221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ------------------ ----------------------------- ------------- ------- 86221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 87221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0 88221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc 89221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc 90221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 91221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP! 92221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 93221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# cmp.eq.unc pBypass = I, J * after J is valid! 
94221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2 95221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# (pBypass) br.cond.spnt Bypass 96221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 97221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# --------------------------------------------------------------------------------------- 98221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3 99221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 100221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4 101221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 102221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5 103221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 104221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# --------------------------------------------------------------------------------------- 105221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6 106221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 107221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = T & 0xff zxt1 T = T I0, I1 1 cyc 108221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7 109221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# S[J] = SI st8 [Jptr] = SI M2-M3 110221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 111221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8 112221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 113221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# --------------------------------------------------------------------------------------- 114221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9 
115221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 116221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10 117221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 118221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11 119221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ;; 120221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# --------------------------------------------------------------------------------------- 121221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 122221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# There are several points worth making here: 123221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 124221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - Note that due to the bypass/forwarding-path, the first two 125221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# phases of the loop are strangely mingled together. In 126221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# particular, note that the first stage of the pipeline is 127221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# using the value of "J", as calculated by the second stage. 128221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - Each bundle-pair will have exactly 6 instructions. 129221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - Pipelined, the loop can execute in 3 cycles/iteration and 130221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 4 stages. However, McKinley/Madison can issue "st1" to 131221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# the same bank at a rate of at most one per 4 cycles. 
Thus, 132221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# instead of storing each byte, we accumulate them in a word 133221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# and then write them back at once with a single "st8" (this 134221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# implies that the setup code needs to ensure that the output 135221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# buffer is properly aligned, if need be, by encoding the 136221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# first few bytes separately). 137221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - There is no space for a "br.ctop" instruction. For this 138221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# reason we can't use modulo-loop support in IA-64 and have 139221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# to do a traditional, purely software-pipelined loop. 140221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - We can't replace any of the remaining "add/zxt1" pairs with 141221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# "padd1" because the latency for that instruction is too high 142221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# and would push the loop to the point where more bypasses 143221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# would be needed, which we don't have space for. 144221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - The above loop runs at around 3.26 cycles/byte, or roughly 145221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 440 MByte/sec on a 1.5GHz Madison. This is well below the 146221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# system bus bandwidth and hence with judicious use of 147221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# "lfetch" this loop can run at (almost) peak speed even when 148221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# the input and output data reside in memory. 
The 149221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# max. latency that can be tolerated is (PREFETCH_DISTANCE * 150221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at 151221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# least) 1-ahead prefetching of 128 byte cache-lines. Note 152221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# that we do NOT prefetch into L1, since that would only 153221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# interfere with the S[] table values stored there. This is 154221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# acceptable because there is a 10 cycle latency between 155221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# load and first use of the input data. 156221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# - We use a branch to out-of-line bypass-code because of cycle-pressure: 157221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# we calculate the next J, check for the need to activate the 158221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# bypass path, and activate the bypass path ALL IN THE SAME 159221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# CYCLE. If we didn't have these constraints, we could do 160221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# the bypass with a simple conditional move instruction. 161221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Fortunately, the bypass paths get activated relatively 162221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# infrequently, so the extra branches don't cost all that much 163221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# (about 0.04 cycles/byte, measured on a 16396 byte file with 164221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# random input data). 
#

# Shape of the software pipeline.
$phases       = 4;	# number of stages/phases in the pipelined-loop
$unroll_count = 6;	# number of times we unrolled it

# Phase-mask bits, one per pipeline stage (bit 0 .. bit 3).  emit_body()
# tests its mask argument against these to decide which stage's
# instructions it emits.
($pComI, $pComJ, $pComT, $pOut) = map { 1 << $_ } 0 .. 3;

# Wrap-around counts: emit_body() reduces iteration-relative indices modulo
# these values when it names Data[], IPr[], JP[], I[], SI[], SJ[], T[] and
# OutWord[] operands.
($NData, $NIP, $NJP, $NI)    = (4, 3, 2, 2);
($NSI, $NSJ, $NT, $NOutWord) = (3, 2, 2, 2);

#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop.
# It MUST be greater-or-equal to:
#
#		PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
# 7 bytes separately before the output pointer is aligned.
# (The value assigned below is three times the pipeline term plus 7,
# so it satisfies this bound.)
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);

# I(\$buf, $format, @args)
#
# Append one instruction line to the scalar referenced by the first
# argument: two tabs, the sprintf-expanded $format, then a newline.
sub I {
    my ($buf, $format, @args) = @_;
    $$buf .= sprintf("\t\t" . $format . "\n", @args);
}

# P(\$buf, $format, @args)
#
# Same as I() but without the leading tabs; emit_body() uses it for labels
# and for the predicated branch line.
sub P {
    my ($buf, $format, @args) = @_;
    $$buf .= sprintf($format . "\n", @args);
}

# STOP(\$buf)
#
# Append a ";;" line to the referenced scalar (emit_body() calls this at the
# end of each of its three cycles).
sub STOP {
    my ($buf) = @_;
    # NOTE(review): the whitespace in front of ";;" could not be recovered
    # from the captured source; two tabs are used here to match I().
    $$buf .= "\t\t;;\n";
}

# emit_body(\$code, \$bypass, $iteration, $p)
#
# Append the instructions for one pass through the three-cycle loop body.
#
#   \$code      scalar ref receiving the main instruction stream
#   \$bypass    scalar ref receiving the out-of-line ".rc4Bypass<N>" stubs
#   $iteration  iteration number; operand indices are derived from it
#               (iteration, iteration-1, -2, -3, each taken modulo the
#               matching $N* count)
#   $p          mask built from $pComI/$pComJ/$pComT/$pOut selecting which
#               stages' instructions are emitted
#
# When none of the four mask bits is set, only the trailing "st4" of
# OutWord (preceded by a big-endian-only "shr.u") is emitted.
#
# All working values are lexical ("my"); nothing here is visible to, or
# clobbered by, the helpers it calls.
sub emit_body {
    my ($out, $bypass_out, $iteration, $p) = @_;

    # The stage at distance k works on iteration - k.
    my $i0 = $iteration;
    my $i1 = $iteration - 1;
    my $i2 = $iteration - 2;
    my $i3 = $iteration - 3;

    # Output-word selectors and byte position for the byte deposited by this
    # pass.  $iw0/$iw1 can be fractional; they are only ever used through
    # "% $NOutWord", and Perl's % works on the integer parts.
    my $iw0      = ($iteration - 3) / 8;
    my $iw1      = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
    my $byte_num = ($iteration - 3) % 8;
    my $label    = $iteration + 1;

    my $doI  = $p & $pComI;
    my $doJ  = $p & $pComJ;
    my $doT  = $p & $pComT;
    my $doO  = $p & $pOut;
    my $pAny = ($p & 0xf) == 0xf;              # all four stages active
    my $pByp = ($doI && ($iteration > 0));     # bypass check needed

    $$out .= "//////////////////////////////////////////////////\n";

    if (($p & 0xf) == 0) {
        $$out .= "#ifdef HOST_IS_BIG_ENDIAN\n";
        I($out, "shr.u OutWord[%u] = OutWord[%u], 32;;",
            $iw1 % $NOutWord, $iw1 % $NOutWord);
        $$out .= "#endif\n";
        I($out, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
        return;
    }

    # Cycle 0
    I($out, "{ .mmi") if $pAny;
    I($out, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if $doI;
    I($out, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI) if $doI;
    I($out, "zxt1 J = J") if $doJ;
    I($out, "}") if $pAny;
    I($out, "{ .mmi") if $pAny;
    I($out, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if $doO;
    I($out, "add T[%u] = SI[%u], SJ[%u]",
        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if $doT;
    I($out, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if $doI;
    I($out, "}") if $pAny;
    STOP($out);

    # Cycle 1
    I($out, "{ .mmi") if $pAny;
    I($out, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1 % $NSJ) if $doT;
    I($out, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2 % $NSI) if $doT;
    I($out, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if $doT;
    I($out, "}") if $pAny;
    I($out, "{ .mmi") if $pAny;
    I($out, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0 % $NIP) if $doI;
    I($out, "KEYADDR(JP[%u], J)", $i0 % $NJP) if $doJ;
    I($out, "xor Data[%u] = Data[%u], T[%u]",
        $i3 % $NData, $i3 % $NData, $i1 % $NT) if $doO;
    I($out, "}") if $pAny;
    STOP($out);

    # Cycle 2
    I($out, "{ .mmi") if $pAny;
    I($out, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0 % $NJP) if $doJ;
    I($out, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if $pByp;
    I($out, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
        $iw0 % $NOutWord, $i3 % $NData, $iw1 % $NOutWord, $byte_num) if $doO;
    I($out, "}") if $pAny;
    I($out, "{ .mmb") if $pAny;
    I($out, "add J = J, SI[%u]", $i0 % $NSI) if $doI;
    I($out, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if $doT;
    P($out, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u", $label) if $pByp;
    I($out, "}") if $pAny;
    STOP($out);

    # The bypass stub jumps back to this label.
    P($out, ".rc4Resume%u:", $label) if $pByp;

    if ($byte_num == 0 && $iteration >= $phases) {
        I($out, "st8 [OutPtr] = OutWord[%u], 8", $iw1 % $NOutWord) if $doO;

        # Last pass of the unrolled loop: issue the prefetches and the
        # loop-closing branch.
        if ($iteration == (1 + $unroll_count) * $phases - 1) {
            if ($unroll_count == 6) {
                I($out, "mov OutWord[%u] = OutWord[%u]",
                    $iw1 % $NOutWord, $iw0 % $NOutWord);
            }
            I($out, "lfetch.nt1 [InPrefetch], %u", $unroll_count * $phases);
            I($out, "lfetch.excl.nt1 [OutPrefetch], %u",
                $unroll_count * $phases);
            I($out, "br.cloop.sptk.few .rc4Loop");
        }
    }

    if ($pByp) {
        # Out-of-line stub: undo the "add J = J, SI[i0]" issued in cycle 2,
        # redo it with SI[i1], copy SI[i1] into SI[i0], then resume.
        P($bypass_out, ".rc4Bypass%u:", $label);
        I($bypass_out, "sub J = J, SI[%u]", $i0 % $NSI);
        I($bypass_out, "nop 0");
        I($bypass_out, "nop 0");
        I($bypass_out, ";;");
        I($bypass_out, "add J = J, SI[%u]", $i1 % $NSI);
        I($bypass_out, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
        I($bypass_out, "br.sptk.many .rc4Resume%u\n", $label);
        I($bypass_out, ";;");
    }
}

# Start of the generated-assembly template; this here-document continues
# past this point.
$code=<<___;
.ident \"rc4-ia64.s, version 3.0\"
.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"

#define LCSave r8
#define PRSave r9

/* Inputs become invalid once rotation begins!
*/ 322221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 323221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define StateTable in0 324221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define DataLen in1 325221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InputBuffer in2 326221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutputBuffer in3 327221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 328221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define KTable r14 329221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define J r15 330221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InPtr r16 331221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutPtr r17 332221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InPrefetch r18 333221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutPrefetch r19 334221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define One r20 335221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define LoopCount r21 336221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define Remainder r22 337221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define IFinal r23 338221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define EndPtr r24 339221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 340221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define tmp0 r25 341221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define tmp1 r26 342221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 343221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pBypass p6 344221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pDone p7 345221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pSmall p8 346221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pAligned p9 347221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pUnaligned p10 
348221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 349221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeI pPhase[0] 350221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeJ pPhase[1] 351221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeT pPhase[2] 352221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pOutput pPhase[3] 353221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 354221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define RetVal r8 355221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define L_OK p7 356221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define L_NOK p8 357221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 358221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define _NINPUTS 4 359221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define _NOUTPUT 0 360221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 361221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define _NROTATE 24 362221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT) 363221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 364221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#ifndef SZ 365221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SZ 4 // this must be set to sizeof(RC4_INT) 366221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 367221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 368221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ == 1 369221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY ld1 370221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY st1 371221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i) add dst = i, KTable 372221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 2 373221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY ld2 
374221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY st2 375221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i) shladd dst = i, 1, KTable 376221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 4 377221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY ld4 378221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY st4 379221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i) shladd dst = i, 2, KTable 380221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else 381221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY ld8 382221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY st8 383221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i) shladd dst = i, 3, KTable 384221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 385221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 386221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if defined(_HPUX_SOURCE) && !defined(_LP64) 387221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define ADDP addp4 388221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else 389221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define ADDP add 390221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 391221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 392221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Define a macro for the bit number of the n-th byte: */ 393221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 394221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if defined(_HPUX_SOURCE) || defined(B_ENDIAN) 395221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define HOST_IS_BIG_ENDIAN 396221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define BYTE_POS(n) (56 - (8 * (n))) 397221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else 398221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 
define BYTE_POS(n) (8 * (n)) 399221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 400221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 401221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* 402221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom We must perform the first phase of the pipeline explicitly since 403221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom we will always load from the stable the first time. The br.cexit 404221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom will never be taken since regardless of the number of bytes because 405221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom the epilogue count is 4. 406221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom*/ 407221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX 408221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom assembler failed on original macro with syntax error. <appro> */ 409221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define MODSCHED_RC4_PROLOGUE \\ 410221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { \\ 411221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ld1 Data[0] = [InPtr], 1; \\ 412221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add IFinal = 1, I[1]; \\ 413221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom KEYADDR(IPr[0], I[1]); \\ 414221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; \\ 415221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { \\ 416221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom LKEY SI[0] = [IPr[0]]; \\ 417221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov pr.rot = 0x10000; \\ 418221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov ar.ec = 4; \\ 419221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; \\ 420221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { \\ 421221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add J = J, SI[0]; \\ 
422221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom zxt1 I[0] = IFinal; \\ 423221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom br.cexit.spnt.few .+16; /* never taken */ \\ 424221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 425221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define MODSCHED_RC4_LOOP(label) \\ 426221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromlabel: \\ 427221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { .mmi; \\ 428221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) ld1 Data[0] = [InPtr], 1; \\ 429221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) add IFinal = 1, I[1]; \\ 430221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeJ) zxt1 J = J; \\ 431221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom }{ .mmi; \\ 432221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pOutput) LKEY T[1] = [T[1]]; \\ 433221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeT) add T[0] = SI[2], SJ[1]; \\ 434221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) KEYADDR(IPr[0], I[1]); \\ 435221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; \\ 436221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { .mmi; \\ 437221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeT) SKEY [IPr[2]] = SJ[1]; \\ 438221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeT) SKEY [JP[1]] = SI[2]; \\ 439221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeT) zxt1 T[0] = T[0]; \\ 440221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom }{ .mmi; \\ 441221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) LKEY SI[0] = [IPr[0]]; \\ 442221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeJ) KEYADDR(JP[0], J); \\ 443221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\ 444221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; \\ 
445221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { .mmi; \\ 446221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeJ) LKEY SJ[0] = [JP[0]]; \\ 447221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pOutput) xor Data[3] = Data[3], T[1]; \\ 448221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0; \\ 449221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom }{ .mmi; \\ 450221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeT) KEYADDR(T[0], T[0]); \\ 451221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pBypass) mov SI[0] = SI[1]; \\ 452221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) zxt1 I[0] = IFinal; \\ 453221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; \\ 454221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { .mmb; \\ 455221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pOutput) st1 [OutPtr] = Data[3], 1; \\ 456221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (pComputeI) add J = J, SI[0]; \\ 457221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom br.ctop.sptk.few label; \\ 458221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 459221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 460221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .text 461221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 462221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .align 32 463221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 464221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .type RC4, \@function 465221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .global RC4 466221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 467221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .proc RC4 468221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .prologue 469221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 470221304ee937bc0910948a8be1320cb8cc4eb6d36Brian CarlstromRC4: 471221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom { 472221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 473221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE 474221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 475221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\ 476221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom OutWord[2] 477221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .rotp pPhase[4] 478221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 479221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP InPrefetch = 0, InputBuffer 480221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP KTable = 0, StateTable 481221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 482221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 483221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 484221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP InPtr = 0, InputBuffer 485221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP OutPtr = 0, OutputBuffer 486221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov RetVal = r0 487221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 488221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; 489221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 490221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 491221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.nt1 [InPrefetch], 0x80 492221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP OutPrefetch = 0, OutputBuffer 493221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 494221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { // Return 0 if the input length is nonsensical 495221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 496221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP StateTable = 0, StateTable 497221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom cmp.ge.unc L_NOK, L_OK = r0, DataLen 498221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (L_NOK) br.ret.sptk.few rp 499221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 500221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; 501221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 502221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 503221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.eq.or L_NOK, L_OK = r0, InPtr 504221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.eq.or L_NOK, L_OK = r0, OutPtr 505221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 506221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 507221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 508221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 509221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.eq.or L_NOK, L_OK = r0, StateTable 510221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 511221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom (L_NOK) br.ret.sptk.few rp 512221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 513221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; 514221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom LKEY I[1] = [KTable], SZ 515221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Prefetch the state-table. 
It contains 256 elements of size SZ */ 516221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 517221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ == 1 518221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp0 = 1*128, StateTable 519221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 2 520221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp0 = 3*128, StateTable 521221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp1 = 2*128, StateTable 522221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 4 523221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp0 = 7*128, StateTable 524221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp1 = 6*128, StateTable 525221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 8 526221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp0 = 15*128, StateTable 527221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP tmp1 = 14*128, StateTable 528221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 529221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; 530221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 8 531221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 15 532221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 533221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 13 534221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 535221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 11 536221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 537221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 9 538221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 539221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom#endif 540221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 4 541221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 7 542221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 543221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 5 544221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 545221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 546221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 2 547221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0], -256 // 3 548221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp1], -256;; 549221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif 550221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 551221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mii 552221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.fault.nt1 [tmp0] // 1 553221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add I[1]=1,I[1];; 554221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom zxt1 I[1]=I[1] 555221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 556221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 557221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 558221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.nt1 [InPrefetch], 0x80 559221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.excl.nt1 [OutPrefetch], 0x80 560221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .save pr, PRSave 561221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov PRSave = pr 562221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 563221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 564221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 565221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.excl.nt1 
[OutPrefetch], 0x80 566221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom LKEY J = [KTable], SZ 567221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP EndPtr = DataLen, InPtr 568221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 569221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 570221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 571221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ADDP EndPtr = -1, EndPtr // Make it point to 572221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom // last data byte. 573221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov One = 1 574221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .save ar.lc, LCSave 575221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov LCSave = ar.lc 576221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .body 577221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 578221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 579221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmb 580221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom sub Remainder = 0, OutPtr 581221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.gtu pSmall, p0 = $threshold, DataLen 582221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pSmall) br.cond.dpnt .rc4Remainder // Data too small for 583221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom // big loop. 
584221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 585221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 586221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 587221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom and Remainder = 0x7, Remainder 588221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; 589221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.eq pAligned, pUnaligned = Remainder, r0 590221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 591221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 592221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 593221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmb 594221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.pred.rel "mutex",pUnaligned,pAligned 595221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pUnaligned) add Remainder = -1, Remainder 596221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pAligned) sub Remainder = EndPtr, InPtr 597221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pAligned) br.cond.dptk.many .rc4Aligned 598221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 599221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 600221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 601221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 602221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 603221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov.i ar.lc = Remainder 604221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 605221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 606221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Do the initial few bytes via the compact, modulo-scheduled loop 607221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom until the output pointer is 8-byte-aligned. 
*/ 608221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 609221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom MODSCHED_RC4_PROLOGUE 610221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom MODSCHED_RC4_LOOP(.RC4AlignLoop) 611221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 612221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 613221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 614221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom sub Remainder = EndPtr, InPtr 615221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom zxt1 IFinal = IFinal 616221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom clrrrb // Clear CFM.rrb.pr so 617221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom ;; // next "mov pr.rot = N" 618221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom // does the right thing. 619221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 620221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 621221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 622221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov I[1] = IFinal 623221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 624221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 625221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 626221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 627221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 628221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Aligned: 629221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 630221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* 631221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases) 632221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom */ 633221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 634221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 635221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom .mlx 636221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder 637221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom movl Remainder = 0xaaaaaaaaaaaaaaab 638221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 639221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 640221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 641221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom setf.sig f6 = LoopCount // M2, M3 6 cyc 642221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom setf.sig f7 = Remainder // M2, M3 6 cyc 643221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 644221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 645221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 646221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mfb 647221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 648221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom xmpy.hu f6 = f6, f7 649221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 650221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 651221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 652221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 653221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom getf.sig LoopCount = f6;; // M2 5 cyc 654221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 655221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom shr.u LoopCount = LoopCount, 4 656221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 657221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 658221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 659221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 660221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 661221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov.i ar.lc = LoopCount 662221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 
663221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 664221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Now comes the unrolled loop: */ 665221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 666221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Prologue: 667221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___ 668221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 669221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$iteration = 0; 670221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 671221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the prologue: 672221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$predicates = 1; 673221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $phases; ++$i) { 674221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom &emit_body (\$code, \$bypass, $iteration++, $predicates); 675221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom $predicates = ($predicates << 1) | 1; 676221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom} 677221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 678221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___; 679221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Loop: 680221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___ 681221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 682221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the body: 683221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $unroll_count*$phases; ++$i) { 684221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom &emit_body (\$code, \$bypass, $iteration++, $predicates); 685221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom} 686221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 687221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___; 688221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Epilogue: 689221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom___ 690221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 691221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the epilogue: 692221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $phases; ++$i) { 693221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom $predicates <<= 1; 694221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom &emit_body (\$code, \$bypass, $iteration++, $predicates); 695221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom} 696221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 697221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___; 698221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 699221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 700221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom lfetch.nt1 [EndPtr] // fetch line with last byte 701221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov IFinal = I[1] 702221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 703221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 704221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 705221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Remainder: 706221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 707221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 708221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom sub Remainder = EndPtr, InPtr // Calculate 709221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom // # of bytes 710221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom // left - 1 711221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 712221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom nop 0x0 713221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 714221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 715221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 716221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom cmp.eq pDone, p0 = -1, Remainder // done 
already? 717221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov.i ar.lc = Remainder 718221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pDone) br.cond.dptk.few .rc4Complete 719221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } 720221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 721221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Do the remaining bytes via the compact, modulo-scheduled loop */ 722221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 723221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom MODSCHED_RC4_PROLOGUE 724221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom MODSCHED_RC4_LOOP(.RC4RestLoop) 725221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 726221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Complete: 727221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 728221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mmi 729221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add KTable = -SZ, KTable 730221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add IFinal = -1, IFinal 731221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov ar.lc = LCSave 732221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 733221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 734221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mii 735221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom SKEY [KTable] = J,-SZ 736221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom zxt1 IFinal = IFinal 737221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom mov pr = PRSave, 0x1FFFF 738221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 739221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom { 740221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .mib 741221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom SKEY [KTable] = IFinal 742221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom add RetVal = 1, r0 743221304ee937bc0910948a8be1320cb8cc4eb6d36Brian 
Carlstrom br.ret.sptk.few rp 744221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom } ;; 745221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___ 746221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 747221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Last but not least, emit the code for the bypass-code of the unrolled loop: 748221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 749221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=$bypass; 750221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 751221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___; 752221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom .endp RC4 753221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___ 754221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom 755221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromprint $code; 756