1221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#!/usr/bin/env perl
2221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
3221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ====================================================================
4221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Written by David Mosberger <David.Mosberger@acm.org> based on the
5221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Itanium optimized Crypto code which was released by HP Labs at
6221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# http://www.hpl.hp.com/research/linux/crypto/.
7221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
8221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
9221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
10221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Permission is hereby granted, free of charge, to any person obtaining
11221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# a copy of this software and associated documentation files (the
12221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# "Software"), to deal in the Software without restriction, including
13221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# without limitation the rights to use, copy, modify, merge, publish,
14221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# distribute, sublicense, and/or sell copies of the Software, and to
15221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# permit persons to whom the Software is furnished to do so, subject to
16221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# the following conditions:
17221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
18221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# The above copyright notice and this permission notice shall be
19221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# included in all copies or substantial portions of the Software.
20221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
21221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
28221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
29221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
30221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# This is a little helper program which generates a software-pipelined
# loop for RC4 encryption.  The basic algorithm looks like this:
33221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
34221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   for (counter = 0; counter < len; ++counter)
35221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     {
36221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       in = inp[counter];
37221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       SI = S[I];
38221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       J = (SI + J) & 0xff;
39221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       SJ = S[J];
40221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       T = (SI + SJ) & 0xff;
41221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       S[I] = SJ, S[J] = SI;
42221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       ST = S[T];
43221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       outp[counter] = in ^ ST;
44221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#       I = (I + 1) & 0xff;
45221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     }
46221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
47221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Pipelining this loop isn't easy, because the stores to the S[] array
48221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# need to be observed in the right order.  The loop generated by the
49221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# code below has the following pipeline diagram:
50221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
51221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#      cycle
52221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
53221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iter
54221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
55221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
56221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
57221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
58221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   where:
59221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 	LDI = load of S[I]
60221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 	LDJ = load of S[J]
61221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 	SWP = swap of S[I] and S[J]
62221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# 	LDT = load of S[T]
63221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
64221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Note that in the above diagram, the major trouble-spot is that LDI
65221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# of the 2nd iteration is performed BEFORE the SWP of the first
66221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration.  Fortunately, this is easy to detect (I of the 1st
67221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration will be equal to J of the 2nd iteration) and when this
68221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# happens, we simply forward the proper value from the 1st iteration
69221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# to the 2nd one.  The proper value in this case is simply the value
70221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# of S[I] from the first iteration (thanks to the fact that SWP
71221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# simply swaps the contents of S[I] and S[J]).
72221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
73221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Another potential trouble-spot is in cycle 7, where SWP of the 1st
74221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# iteration issues at the same time as the LDI of the 3rd iteration.
75221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# However, thanks to IA-64 execution semantics, this can be taken
76221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# care of simply by placing LDI later in the instruction-group than
77221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SWP.  IA-64 CPUs will automatically forward the value if they
78221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# detect that the SWP and LDI are accessing the same memory-location.
79221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
80221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# The core-loop that can be pipelined then looks like this (annotated
81221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# with McKinley/Madison issue port & latency numbers, assuming L1
82221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# cache hits for the most part):
83221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
84221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# operation:	    instruction:		    issue-ports:  latency
85221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ------------------  -----------------------------   ------------- -------
86221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
87221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
88221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
89221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
90221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
91221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
92221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
93221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
94221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
95221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     (pBypass) br.cond.spnt Bypass
96221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
97221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ---------------------------------------------------------------------------------------
98221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
99221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
100221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
101221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
102221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
103221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
104221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ---------------------------------------------------------------------------------------
105221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
106221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
107221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
108221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
109221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# S[J] = SI           st8 [Jptr] = SI                 M2-M3
110221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
111221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
112221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
113221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ---------------------------------------------------------------------------------------
114221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
115221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
116221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
117221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
118221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
119221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#                     ;;
120221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# ---------------------------------------------------------------------------------------
121221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
122221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# There are several points worth making here:
123221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
#   - Note that due to the bypass/forwarding-path, the first two
#     phases of the loop are strangely mingled together.  In
#     particular, note that the first stage of the pipeline is
#     using the value of "J", as calculated by the second stage.
128221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   - Each bundle-pair will have exactly 6 instructions.
129221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   - Pipelined, the loop can execute in 3 cycles/iteration and
130221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     4 stages.  However, McKinley/Madison can issue "st1" to
131221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     the same bank at a rate of at most one per 4 cycles.  Thus,
132221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     instead of storing each byte, we accumulate them in a word
133221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     and then write them back at once with a single "st8" (this
134221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     implies that the setup code needs to ensure that the output
135221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     buffer is properly aligned, if need be, by encoding the
136221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     first few bytes separately).
#   - There is no space for a "br.ctop" instruction.  For this
#     reason we can't use modulo-loop support in IA-64 and have
#     to do a traditional, purely software-pipelined loop.
140221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   - We can't replace any of the remaining "add/zxt1" pairs with
141221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     "padd1" because the latency for that instruction is too high
142221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     and would push the loop to the point where more bypasses
143221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     would be needed, which we don't have space for.
144221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#   - The above loop runs at around 3.26 cycles/byte, or roughly
145221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     440 MByte/sec on a 1.5GHz Madison.  This is well below the
146221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     system bus bandwidth and hence with judicious use of
147221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     "lfetch" this loop can run at (almost) peak speed even when
148221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     the input and output data reside in memory.  The
149221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     max. latency that can be tolerated is (PREFETCH_DISTANCE *
150221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
151221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     least) 1-ahead prefetching of 128 byte cache-lines.  Note
152221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     that we do NOT prefetch into L1, since that would only
153221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     interfere with the S[] table values stored there.  This is
154221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     acceptable because there is a 10 cycle latency between
155221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     load and first use of the input data.
#   - We use a branch to out-of-line bypass-code because of
#     cycle-pressure: we calculate the next J, check for the need to
#     activate the bypass path, and activate the bypass path ALL IN
#     THE SAME CYCLE.  If we didn't have these constraints, we could
#     do the bypass with a simple conditional move instruction.
161221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     Fortunately, the bypass paths get activated relatively
162221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     infrequently, so the extra branches don't cost all that much
163221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     (about 0.04 cycles/byte, measured on a 16396 byte file with
164221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#     random input data).
165221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#
166221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Shape of the generated software-pipelined loop.
$phases = 4;		# number of stages/phases in the pipelined-loop
$unroll_count = 6;	# number of times we unrolled it

# Bit flags that make up the phase mask ($p) handed to emit_body.
# Each bit enables the instructions of one pipeline stage:
$pComI = (1 << 0);	# load input byte, advance I, load S[I], add SI to J
$pComJ = (1 << 1);	# truncate J to 8 bits, form its address, load S[J]
$pComT = (1 << 2);	# T = SI + SJ, store the swapped S[I]/S[J], address of S[T]
$pOut  = (1 << 3);	# load S[T], xor with input byte, deposit into OutWord

# Element counts of the operand arrays named in the emitted code.
# emit_body reduces every iteration-derived index modulo these values.
$NData = 4;		# Data[]
$NIP = 3;		# IPr[]
$NJP = 2;		# JP[]
$NI = 2;		# I[]
$NSI = 3;		# SI[]
$NSJ = 2;		# SJ[]
$NT = 2;		# T[]
$NOutWord = 2;		# OutWord[]

#
# $threshold is the minimum length before we attempt to use the
# big software-pipelined loop.  It MUST be greater-or-equal
# to:
#  		PHASES * (UNROLL_COUNT + 1) + 7
#
# The "+ 7" comes from the fact we may have to encode up to
#   7 bytes separately before the output pointer is aligned.
#
# The value assigned below uses three times the
# PHASES * (UNROLL_COUNT + 1) term, which satisfies that bound.
#
$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
193221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# I(\$buffer, $format, @args)
#
# Append one indented line to the string that the first argument
# refers to.  The line is sprintf($format, @args) prefixed with two
# tabs and terminated with a newline.
#
#   $buffer - reference to the scalar that accumulates generated text
#   $format - sprintf format string for the line
#   @args   - values substituted into $format
#
# Returns the updated contents of the buffer (the value of the append).
sub I {
    my ($buffer, $format, @args) = @_;
    $$buffer .= sprintf("\t\t" . $format . "\n", @args);
}
199221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# P(\$buffer, $format, @args)
#
# Append one unindented line to the string that the first argument
# refers to.  The line is sprintf($format, @args) terminated with a
# newline; unlike I() no leading tabs are added, so this is what the
# generator uses for labels and predicated branches.
#
#   $buffer - reference to the scalar that accumulates generated text
#   $format - sprintf format string for the line
#   @args   - values substituted into $format
#
# Returns the updated contents of the buffer (the value of the append).
sub P {
    my ($buffer, $format, @args) = @_;
    $$buffer .= sprintf($format . "\n", @args);
}
205221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# STOP(\$buffer)
#
# Append a line holding ";;" (indented with two tabs) to the string
# that the argument refers to.  In the header commentary of this file
# ";;" is what separates one cycle's instructions from the next.
#
#   $buffer - reference to the scalar that accumulates generated text
#
# Returns the updated contents of the buffer (the value of the append).
sub STOP {
    my ($buffer) = @_;
    $$buffer .= "\t\t;;\n";
}
212221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# emit_body(\$c, \$bypass, $iteration, $p)
#
# Append the code for one pass through the three-cycle loop body to $c
# and, when a bypass check is generated, the matching out-of-line
# fix-up code to $bypass.
#
#   \$c         - reference to the string receiving the main-line code
#   \$bypass    - reference to the string receiving out-of-line bypass code
#   $iteration  - 0-based iteration number; every operand-array index and
#                 the label suffix ($iteration + 1) is derived from it
#   $p          - phase mask built from $pComI/$pComJ/$pComT/$pOut; each
#                 bit enables the instructions of one pipeline stage
#
# Special cases visible in the body:
#   * ($p & 0xf) == 0: no stage is active, so only the trailing "st4"
#     flush of OutWord is emitted (preceded by a 32-bit right shift under
#     HOST_IS_BIG_ENDIAN) and the sub returns early.
#   * Explicit bundle braces and templates ("{ .mmi" ... "}") are emitted
#     only when all four stage bits are set ($pAny).
#   * A bypass test is emitted when the I-stage is active and this is not
#     iteration 0 ($pByp).
sub emit_body {
    # Alias $c and $bypass to the caller's buffers; the rest of the body
    # passes \$c / \$bypass on to I(), P() and STOP().
    local *c = shift;
    local *bypass = shift;
    my ($iteration, $p) = @_;

    # Iteration numbers of the instances currently in each later stage.
    my $i0 = $iteration;
    my $i1 = $iteration - 1;
    my $i2 = $iteration - 2;
    my $i3 = $iteration - 3;
    # Output-word numbers (8 bytes per word).  int() makes explicit the
    # truncation that "%" would otherwise apply to a fractional operand.
    my $iw0 = int(($iteration - 3) / 8);
    my $iw1 = ($iteration > 3) ? int(($iteration - 4) / 8) : 1;
    my $byte_num = ($iteration - 3) % 8;   # byte slot inside the output word
    my $label = $iteration + 1;            # suffix of .rc4Bypass/.rc4Resume
    my $pAny = ($p & 0xf) == 0xf;          # all four stages active
    my $pByp = (($p & $pComI) && ($iteration > 0));

    $c.=<<___;
//////////////////////////////////////////////////
___

    if (($p & 0xf) == 0) {
        # Nothing left in the pipeline: write out the remaining 4 bytes.
        $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
        &I(\$c,"shr.u\tOutWord[%u] = OutWord[%u], 32;;",
                                $iw1 % $NOutWord, $iw1 % $NOutWord);
        $c.="#endif\n";
        &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
        return;
    }

    # Cycle 0
    &I(\$c, "{ .mmi")                                         if ($pAny);
    &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
    &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
    &I(\$c, "zxt1   J = J")                                   if ($p & $pComJ);
    &I(\$c, "}")                                              if ($pAny);
    &I(\$c, "{ .mmi")                                         if ($pAny);
    &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
    &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
       $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                     if ($p & $pComT);
    &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
    &I(\$c, "}")                                              if ($pAny);
    &STOP(\$c);

    # Cycle 1
    # The two SKEY stores (the swap) are emitted ahead of the LKEY of
    # SI so that, as the header commentary explains, the load comes
    # later in the instruction group than the swap.
    &I(\$c, "{ .mmi")                                         if ($pAny);
    &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
    &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
    &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
    &I(\$c, "}")                                              if ($pAny);
    &I(\$c, "{ .mmi")                                         if ($pAny);
    &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
    &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                 if ($p & $pComJ);
    &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
       $i3 % $NData, $i3 % $NData, $i1 % $NT)                 if ($p & $pOut);
    &I(\$c, "}")                                              if ($pAny);
    &STOP(\$c);

    # Cycle 2
    &I(\$c, "{ .mmi")                                         if ($pAny);
    &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
    &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)       if ($pByp);
    &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
       $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
    &I(\$c, "}")                                              if ($pAny);
    &I(\$c, "{ .mmb")                                         if ($pAny);
    &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)               if ($p & $pComI);
    &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
    &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
    &I(\$c, "}") if ($pAny);
    &STOP(\$c);

    # The out-of-line bypass code branches back to this label.
    &P(\$c, ".rc4Resume%u:", $label)                          if ($pByp);
    if ($byte_num == 0 && $iteration >= $phases) {
        # The dep above targeted byte slot 0, i.e. it began a new
        # output word, so store the word indexed by $iw1.
        &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
           $iw1 % $NOutWord)                                  if ($p & $pOut);
        if ($iteration == (1 + $unroll_count) * $phases - 1) {
            # Last body of the unrolled loop: prefetch and loop back.
            if ($unroll_count == 6) {
                # NOTE(review): presumably needed because 6 * 4 = 24 bytes
                # is an odd number of 8-byte words while OutWord[] has two
                # entries - confirm against the loop setup code.
                &I(\$c, "mov OutWord[%u] = OutWord[%u]",
                   $iw1 % $NOutWord, $iw0 % $NOutWord);
            }
            &I(\$c, "lfetch.nt1 [InPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
               $unroll_count * $phases);
            &I(\$c, "br.cloop.sptk.few .rc4Loop");
        }
    }

    if ($pByp) {
        # Out-of-line fix-up reached when I[i1] equals J: take the
        # SI[i0] added in cycle 2 back out of J, add SI[i1] instead,
        # copy SI[i1] into SI[i0], then branch back to the resume label.
        &P(\$bypass, ".rc4Bypass%u:", $label);
        &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
        &I(\$bypass, "nop 0");
        &I(\$bypass, "nop 0");
        &I(\$bypass, ";;");
        &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
        &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
        &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
        &I(\$bypass, ";;");
    }
}
313221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
314221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code=<<___;
/*
 * Fixed preamble of the generated assembler file.  The #define lines below
 * are C-preprocessor aliases for the registers, predicates and frame sizes
 * used by the RC4 routine further down.
 */
315221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.ident \"rc4-ia64.s, version 3.0\"
316221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
317221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* LCSave and PRSave receive copies of ar.lc and pr in the RC4 prologue;
   both are restored from them before the routine returns.  */
318221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define LCSave		r8
319221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define PRSave		r9
320221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
321221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Inputs become invalid once rotation begins!  */
322221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
323221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define StateTable	in0
324221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define DataLen		in1
325221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InputBuffer	in2
326221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutputBuffer	in3
327221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Working registers r14-r26: copies of the argument pointers, the running
   J index, loop counters and two temporaries.  */
328221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define KTable		r14
329221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define J		r15
330221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InPtr		r16
331221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutPtr		r17
332221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define InPrefetch	r18
333221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define OutPrefetch	r19
334221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define One		r20
335221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define LoopCount	r21
336221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define Remainder	r22
337221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define IFinal		r23
338221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define EndPtr		r24
339221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
340221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define tmp0		r25
341221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define tmp1		r26
342221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Branch-control predicates, followed by names for the four entries of the
   pPhase[] predicate array that RC4 declares with .rotp.  */
343221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pBypass		p6
344221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pDone		p7
345221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pSmall		p8
346221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pAligned	p9
347221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pUnaligned	p10
348221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
349221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeI	pPhase[0]
350221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeJ	pPhase[1]
351221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pComputeT	pPhase[2]
352221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define pOutput		pPhase[3]
353221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Names used by the argument checks.  RetVal is the same register as LCSave
   (r8); L_OK and L_NOK are the same predicates as pDone and pSmall.  */
354221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define RetVal		r8
355221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define L_OK		p7
356221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define L_NOK		p8
357221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Frame sizes passed to alloc: 4 inputs, no outputs and 24 rotating
   registers, which leaves 24 - 4 - 0 = 20 locals.  */
358221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define	_NINPUTS	4
359221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define	_NOUTPUT	0
360221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
361221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define	_NROTATE	24
362221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define	_NLOCALS	(_NROTATE - _NINPUTS - _NOUTPUT)
363221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* SZ is the size in bytes of one state-table element; it defaults to 4 when
   the build does not define it.  */
364221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#ifndef SZ
365221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SZ	4	// this must be set to sizeof(RC4_INT)
366221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
367221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* LKEY and SKEY load and store one table element; KEYADDR(dst, i) forms
   KTable + i * SZ.  Any SZ other than 1, 2 or 4 selects the 8-byte forms.  */
368221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ == 1
369221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY			ld1
370221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY			st1
371221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i)	add dst = i, KTable
372221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 2
373221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY			ld2
374221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY			st2
375221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i)	shladd dst = i, 1, KTable
376221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 4
377221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY			ld4
378221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY			st4
379221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i)	shladd dst = i, 2, KTable
380221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else
381221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define LKEY			ld8
382221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define SKEY			st8
383221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define KEYADDR(dst, i)	shladd dst = i, 3, KTable
384221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
385221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* ADDP is the instruction used to copy pointer arguments: addp4 on HP-UX
   builds without _LP64, a plain add everywhere else.  */
386221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if defined(_HPUX_SOURCE) && !defined(_LP64)
387221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define ADDP	addp4
388221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else
389221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define ADDP	add
390221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
391221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
392221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Define a macro for the bit number of the n-th byte: */
393221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Big-endian hosts count bytes from the top of the 64-bit word (byte 0 is
   at bit 56); little-endian hosts count from bit 0.  */
394221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
395221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define HOST_IS_BIG_ENDIAN
396221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define BYTE_POS(n)	(56 - (8 * (n)))
397221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#else
398221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# define BYTE_POS(n)	(8 * (n))
399221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
400221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
401221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/*
402221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   We must perform the first phase of the pipeline explicitly since
403221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   we will always load from the state table the first time. The br.cexit
404221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   will never be taken, regardless of the number of bytes, because
405221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   the epilogue count is 4.
406221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom*/
407221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
408221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   assembler failed on original macro with syntax error. <appro> */
/* Prologue steps, one instruction group each:
     1. load the first input byte into Data[0], form IFinal = I[1] + 1 and
        the address of table element I[1] in IPr[0];
     2. load that element into SI[0], set pr.rot to 0x10000 and ar.ec to 4;
     3. add SI[0] to J and put the low 8 bits of IFinal into I[0].  */
409221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define MODSCHED_RC4_PROLOGUE						   \\
410221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{								   \\
411221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				ld1		Data[0] = [InPtr], 1;	   \\
412221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				add		IFinal = 1, I[1];	   \\
413221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				KEYADDR(IPr[0], I[1]);			   \\
414221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;								   \\
415221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{								   \\
416221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				LKEY		SI[0] = [IPr[0]];	   \\
417221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				mov		pr.rot = 0x10000;	   \\
418221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				mov		ar.ec = 4;		   \\
419221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;								   \\
420221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{								   \\
421221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				add		J = J, SI[0];		   \\
422221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				zxt1		I[0] = IFinal;		   \\
423221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				br.cexit.spnt.few .+16; /* never taken */  \\
424221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
/* One pass of the compact modulo-scheduled loop; label names the loop top.
   The work is split across the four stage predicates:
     pComputeI: load the next input byte, form IFinal = I[1] + 1, load table
                element I[1] into SI[0], compare I[1] with J to set pBypass,
                put the low 8 bits of IFinal into I[0], add SI[0] to J;
     pComputeJ: reduce J to 8 bits and load table element J into SJ[0];
     pComputeT: store SJ[1] to the IPr[2] slot and SI[2] to the JP[1] slot,
                then form the table address for (SI[2] + SJ[1]) mod 256;
     pOutput:   load that element, xor it into Data[3] and store the byte
                through OutPtr.
   When pBypass is set, SI[0] is overwritten with SI[1] instead of keeping
   the value just loaded -- presumably because the freshly loaded element
   may be stale with respect to the pending swap store.
   NOTE(review): confirm the exact hazard the bypass covers.
   br.ctop closes the loop under control of ar.lc and ar.ec.  */
425221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#define MODSCHED_RC4_LOOP(label)					   \\
426221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromlabel:									   \\
427221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{	.mmi;							   \\
428221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	ld1		Data[0] = [InPtr], 1;	   \\
429221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	add		IFinal = 1, I[1];	   \\
430221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeJ)	zxt1		J = J;			   \\
431221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}{	.mmi;							   \\
432221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pOutput)	LKEY		T[1] = [T[1]];		   \\
433221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeT)	add		T[0] = SI[2], SJ[1];	   \\
434221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	KEYADDR(IPr[0], I[1]);			   \\
435221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;								   \\
436221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{	.mmi;							   \\
437221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeT)	SKEY		[IPr[2]] = SJ[1];	   \\
438221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeT)	SKEY		[JP[1]] = SI[2];	   \\
439221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeT)	zxt1		T[0] = T[0];		   \\
440221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}{	.mmi;							   \\
441221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	LKEY		SI[0] = [IPr[0]];	   \\
442221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeJ)	KEYADDR(JP[0], J);			   \\
443221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	cmp.eq.unc	pBypass, p0 = I[1], J;	   \\
444221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;								   \\
445221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{	.mmi;							   \\
446221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeJ)	LKEY		SJ[0] = [JP[0]];	   \\
447221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pOutput)	xor		Data[3] = Data[3], T[1];   \\
448221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				nop		0x0;			   \\
449221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}{	.mmi;							   \\
450221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeT)	KEYADDR(T[0], T[0]);			   \\
451221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pBypass)	mov		SI[0] = SI[1];		   \\
452221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	zxt1		I[0] = IFinal;		   \\
453221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;								   \\
454221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{	.mmb;							   \\
455221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pOutput)	st1		[OutPtr] = Data[3], 1;	   \\
456221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		(pComputeI)	add		J = J, SI[0];		   \\
457221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom				br.ctop.sptk.few label;			   \\
458221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
459221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/*
 * RC4 entry point.
 *   in0 = StateTable, in1 = DataLen, in2 = InputBuffer, in3 = OutputBuffer
 * Returns early with r8 = 0 when DataLen <= 0 (signed compare) or when any
 * of the three pointers is zero; the normal exit path returns r8 = 1.
 */
460221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.text
461221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
462221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.align	32
463221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
464221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.type	RC4, \@function
465221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.global	RC4
466221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
467221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.proc	RC4
468221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.prologue
469221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
470221304ee937bc0910948a8be1320cb8cc4eb6d36Brian CarlstromRC4:
/* Allocate the register frame, declare the rotating register and predicate
   arrays, and copy the argument pointers into working registers via ADDP.  */
471221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
472221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	  	.mmi
473221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		alloc	r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
474221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
475221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
476221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		      OutWord[2]
477221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.rotp pPhase[4]
478221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
479221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		InPrefetch = 0, InputBuffer
480221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		KTable = 0, StateTable
481221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
482221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
483221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
484221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		InPtr = 0, InputBuffer
485221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		OutPtr = 0, OutputBuffer
486221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		RetVal = r0
487221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
488221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	;;
489221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
490221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
491221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.nt1	[InPrefetch], 0x80
492221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		OutPrefetch = 0, OutputBuffer
493221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
494221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{               // Return 0 if the input length is nonsensical
495221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	.mib
496221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		StateTable = 0, StateTable
497221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	cmp.ge.unc  	L_NOK, L_OK = r0, DataLen
498221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	(L_NOK) br.ret.sptk.few rp
499221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
500221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	;;
/* Return 0 as well if the input, output or state-table pointer is zero.  */
501221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
502221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	.mib
503221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	cmp.eq.or  	L_NOK, L_OK = r0, InPtr
504221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	cmp.eq.or  	L_NOK, L_OK = r0, OutPtr
505221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
506221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
507221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
508221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mib
509221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom        	cmp.eq.or  	L_NOK, L_OK = r0, StateTable
510221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
511221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	(L_NOK) br.ret.sptk.few rp
512221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
513221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	;;
/* Load the first word of the state structure into I[1]; the post-increment
   advances KTable by SZ.  */
514221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		LKEY		I[1] = [KTable], SZ
515221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Prefetch the state-table. It contains 256 elements of size SZ */
/* tmp0 starts at an odd multiple of 128 bytes past StateTable and tmp1 at
   the even multiple just below it; every lfetch then steps its pointer
   down by 256 bytes.  The trailing number on each tmp0 line is the
   128-byte multiple being fetched.  tmp1 is not used when SZ is 1.  */
516221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
517221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ == 1
518221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp0 = 1*128, StateTable
519221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 2
520221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp0 = 3*128, StateTable
521221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp1 = 2*128, StateTable
522221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 4
523221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp0 = 7*128, StateTable
524221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp1 = 6*128, StateTable
525221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#elif SZ == 8
526221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp0 = 15*128, StateTable
527221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		tmp1 = 14*128, StateTable
528221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
529221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		;;
530221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 8
531221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	// 15
532221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
533221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	// 13
534221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
535221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	// 11
536221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
537221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	//  9
538221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
539221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
540221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 4
541221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	//  7
542221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
543221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	//  5
544221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
545221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
546221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#if SZ >= 2
547221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0], -256	//  3
548221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp1], -256;;
549221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom#endif
/* Final table prefetch, then I[1] = (I[1] + 1) mod 256.  */
550221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
551221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mii
552221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.fault.nt1		[tmp0]		//  1
553221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		add		I[1]=1,I[1];;
554221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		zxt1		I[1]=I[1]
555221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
/* Prefetch further into the input and output buffers and save the
   predicate registers in PRSave.  */
556221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
557221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
558221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.nt1	[InPrefetch], 0x80
559221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.excl.nt1	[OutPrefetch], 0x80
560221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.save		pr, PRSave
561221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		PRSave = pr
562221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
/* Load the second word of the state structure into J; after this second
   post-increment KTable is the base address used by KEYADDR.
   EndPtr = InPtr + DataLen.  */
563221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
564221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
565221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.excl.nt1	[OutPrefetch], 0x80
566221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		LKEY		J = [KTable], SZ
567221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		EndPtr = DataLen, InPtr
568221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}  ;;
/* Finish the setup (EndPtr is moved back to the last data byte and ar.lc is
   saved in LCSave), then pick one of three paths:
     - DataLen is below the generator-supplied threshold (unsigned compare):
       branch straight to .rc4Remainder;
     - the low three bits of OutPtr are zero: Remainder = EndPtr - InPtr
       and branch to .rc4Aligned;
     - otherwise ar.lc = ((0 - OutPtr) & 7) - 1 and fall through into the
       alignment loop.  */
569221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
570221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
571221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		ADDP		EndPtr = -1, EndPtr	// Make it point to
572221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom							// last data byte.
573221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		One = 1
574221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.save		ar.lc, LCSave
575221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		LCSave = ar.lc
576221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.body
577221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
578221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
579221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmb
580221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		sub		Remainder = 0, OutPtr
581221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		cmp.gtu		pSmall, p0 = $threshold, DataLen
582221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pSmall)	br.cond.dpnt	.rc4Remainder		// Data too small for
583221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom							// big loop.
584221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
585221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
586221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
587221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		and		Remainder = 0x7, Remainder
588221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		;;
589221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		cmp.eq		pAligned, pUnaligned = Remainder, r0
590221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
591221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
592221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
593221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmb
594221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.pred.rel	"mutex",pUnaligned,pAligned
595221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pUnaligned)	add		Remainder = -1, Remainder
596221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pAligned)	sub		Remainder = EndPtr, InPtr
597221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pAligned)	br.cond.dptk.many .rc4Aligned
598221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
599221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
600221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
601221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
602221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
603221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov.i		ar.lc = Remainder
604221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
605221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
606221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Do the initial few bytes via the compact, modulo-scheduled loop
607221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   until the output pointer is 8-byte-aligned.  */
608221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
609221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		MODSCHED_RC4_PROLOGUE
610221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		MODSCHED_RC4_LOOP(.RC4AlignLoop)
611221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* After the loop: recompute Remainder = EndPtr - InPtr, reduce IFinal to
   8 bits, reset register rotation with clrrrb, and place the index in I[1]
   for the code that follows.  */
612221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
613221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mib
614221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		sub		Remainder = EndPtr, InPtr
615221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		zxt1		IFinal = IFinal
616221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		clrrrb				// Clear CFM.rrb.pr so
617221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		;;				// next "mov pr.rot = N"
618221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom						// does the right thing.
619221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
620221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
621221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
622221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		I[1] = IFinal
623221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
624221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
625221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
626221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
627221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
628221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Aligned:
629221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
630221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/*
631221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom   Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
632221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom */
/* Remainder holds bytes-left minus one on entry, hence the extra 1 in the
   add below.  The division is carried out as an unsigned multiply by
   0xaaaaaaaaaaaaaaab that keeps the high 64 bits (xmpy.hu), followed by a
   right shift of 4, which together divide by 24.
   NOTE(review): the divisor 24 is hard-coded here; confirm it matches the
   unroll count times the phase count configured in the generator.  */
633221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
634221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
635221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mlx
636221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		add	LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
637221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		movl		Remainder = 0xaaaaaaaaaaaaaaab
638221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
639221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
640221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
641221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		setf.sig	f6 = LoopCount		// M2, M3	6 cyc
642221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		setf.sig	f7 = Remainder		// M2, M3	6 cyc
643221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
644221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
645221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
646221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mfb
647221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
648221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		xmpy.hu		f6 = f6, f7
649221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
650221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
651221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
652221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
653221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		getf.sig	LoopCount = f6;;	// M2		5 cyc
654221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
655221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		shr.u		LoopCount = LoopCount, 4
656221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
657221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
658221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
659221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
660221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
661221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov.i		ar.lc = LoopCount
662221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
663221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
664221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Now comes the unrolled loop: */
665221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
666221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Prologue:
667221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
668221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Emit the unrolled loop in three sections (prologue, body, epilogue).
# $iteration is a running counter passed to every emit_body call, so the
# numbering continues across the three sections.
669221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$iteration = 0;
670221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
671221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the prologue:
# $predicates is a bit mask handed to emit_body -- presumably one bit per
# pipeline phase; emit_body is defined outside this part of the file, so
# confirm there.  It starts as 1 and gains one more low bit after each
# prologue call (1, 3, 7, ...).
672221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$predicates = 1;
673221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $phases; ++$i) {
674221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom    &emit_body (\$code, \$bypass, $iteration++, $predicates);
675221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom    $predicates = ($predicates << 1) | 1;
676221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom}
677221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
678221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___;
679221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Loop:
680221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
681221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
682221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the body:
# The mask is left unchanged for all $unroll_count*$phases body copies.
683221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $unroll_count*$phases; ++$i) {
684221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom    &emit_body (\$code, \$bypass, $iteration++, $predicates);
685221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom}
686221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
687221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___;
688221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Epilogue:
689221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
690221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
691221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Generate the epilogue:
# The mask is shifted left once before each call, so its low bits are
# cleared one at a time.
692221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromfor ($i = 0; $i < $phases; ++$i) {
693221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom    $predicates <<= 1;
694221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom    &emit_body (\$code, \$bypass, $iteration++, $predicates);
695221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom}
696221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
697221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___;
/* End of the unrolled loop: prefetch the line holding the last input byte
   and copy the current index from I[1] into IFinal.  */
698221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
699221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
700221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		lfetch.nt1	[EndPtr]	// fetch line with last byte
701221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		IFinal = I[1]
702221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
703221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
704221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* .rc4Remainder is reached by fall-through from above and directly from the
   small-input branch.  If EndPtr - InPtr is -1 nothing is left and control
   goes to .rc4Complete; otherwise that value is the loop count for the
   compact loop below.  */
705221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Remainder:
706221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
707221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
708221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		sub		Remainder = EndPtr, InPtr	// Calculate
709221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom								// # of bytes
710221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom								// left - 1
711221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
712221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		nop		0x0
713221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
714221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
715221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mib
716221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		cmp.eq		pDone, p0 = -1, Remainder // done already?
717221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov.i		ar.lc = Remainder
718221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom(pDone)		br.cond.dptk.few .rc4Complete
719221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	}
720221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
721221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom/* Do the remaining bytes via the compact, modulo-scheduled loop */
722221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
723221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		MODSCHED_RC4_PROLOGUE
724221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		MODSCHED_RC4_LOOP(.RC4RestLoop)
725221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
/* Write the indices back and return 1.  KTable is stepped back by SZ to the
   second leading word, which receives J; the post-decrement then moves it
   to the first leading word, which receives (IFinal - 1) mod 256.  ar.lc is
   restored from LCSave and the predicates from PRSave (mask 0x1FFFF)
   before the return.  */
726221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom.rc4Complete:
727221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
728221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mmi
729221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		add		KTable = -SZ, KTable
730221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		add		IFinal = -1, IFinal
731221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		ar.lc = LCSave
732221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
733221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
734221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mii
735221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		SKEY		[KTable] = J,-SZ
736221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		zxt1		IFinal = IFinal
737221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		mov		pr = PRSave, 0x1FFFF
738221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
739221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	{
740221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		.mib
741221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		SKEY		[KTable] = IFinal
742221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		add		RetVal = 1, r0
743221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom		br.ret.sptk.few	rp
744221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	} ;;
745221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
746221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
747221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom# Last but not least, emit the code for the bypass-code of the unrolled loop:
748221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# $bypass holds the .rc4Bypass stubs accumulated by emit_body.  Appending it
# here places them after the routine's return bundle but before .endp, so
# they remain inside the RC4 procedure.
749221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=$bypass;
750221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
751221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom$code.=<<___;
752221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom	.endp RC4
753221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom___
754221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstrom
# Write the complete generated assembler source to standard output.
755221304ee937bc0910948a8be1320cb8cc4eb6d36Brian Carlstromprint $code;
756