#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a 128-byte shared
# table]. Even though the loops are aggressively modulo-scheduled with
# respect to Htbl references and Z.hi updates, targeting 8 cycles per
# processed byte, measured performance is ~12 cycles per processed
# byte on a 21264 CPU. This appears to be a dynamic scheduling
# "glitch," because uprofile(1) indicates a uniform sample
# distribution, as if all instruction bundles executed in 1.5 cycles.
# In other words it could have been even faster; even so, 12 cycles is
# ~60% better than gcc-generated code and ~80% better than code
# generated by the vendor compiler.
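
# For reference, a plain-Perl model of the "4-bit" algorithm the code
# below implements (it mirrors gcm_gmult_4bit from ghash.c). This sub
# is never called by this script; it is only a sketch, and it assumes
# a 64-bit perl and a precomputed table, passed as two 16-entry array
# references with entry i holding the two 64-bit halves of i*H in the
# bit-reflected GCM representation.
sub _gmult_4bit_model {
    my ($Xi, $Hhi, $Hlo) = @_;		# $Xi is the 16-byte hash value
    my @rem_4bit = map { $_ << 48 }
	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
    my @x = unpack("C16",$Xi);
    my $nlo = $x[15];			# start with the last byte,
    my $nhi = $nlo>>4;			# low nibble first
    $nlo &= 0xf;
    my ($Zhi,$Zlo) = ($Hhi->[$nlo],$Hlo->[$nlo]);
    my $cnt = 15;
    while (1) {
	my $rem = $Zlo & 0xf;		# bits shifted out of Z are
	$Zlo = (($Zhi<<60)|($Zlo>>4)) & 0xFFFFFFFFFFFFFFFF;
	$Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];	# folded back in via rem_4bit
	$Zhi ^= $Hhi->[$nhi];
	$Zlo ^= $Hlo->[$nhi];
	last if --$cnt < 0;
	$nlo = $x[$cnt];
	$nhi = $nlo>>4;
	$nlo &= 0xf;
	$rem = $Zlo & 0xf;
	$Zlo = (($Zhi<<60)|($Zlo>>4)) & 0xFFFFFFFFFFFFFFFF;
	$Zhi = ($Zhi>>4) ^ $rem_4bit[$rem];
	$Zhi ^= $Hhi->[$nlo];
	$Zlo ^= $Hlo->[$nlo];
    }
    return ($Zhi,$Zlo);			# the assembly byte-swaps these before
}					# storing, since Alpha is little-endian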

$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

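# loop() emits the shared inner body of gcm_gmult_4bit and
# gcm_ghash_4bit: for each byte of Xi it performs two Htbl lookups (low
# nibble, then high nibble), each followed by a 4-bit right shift of
# Z=(Zhi,Zlo) whose shifted-out bits are reduced through the rem_4bit
# table. .Looplo walks the bytes of $Xlo (Xi[15] down to Xi[8]) and
# .Loophi the bytes of $Xhi (Xi[7] down to Xi[0]); $N merely makes the
# emitted labels unique per call site.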
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

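	# Each .Looplo iteration retires one byte of the low half of
	# Xi: two reduction steps and two Htbl lookups, with the table
	# loads for the next nibble pair overlapped with the XORs that
	# fold in the current one.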
.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


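	# .Loophi mirrors .Looplo, walking the bytes of the high half
	# of Xi; the tail after the loop performs the final two
	# reduction steps with no further lookups left to schedule.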
.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

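	# Position-independent address computation without a GP
	# register: br deposits the address of .Lpic1 (the next
	# instruction) in AT, and lda adds the link-time displacement
	# rem_4bit-.Lpic1 to form the address of the rem_4bit table.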
	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___

	&loop();

$code.=<<___;
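	# Convert Z to big-endian byte order for storage: each 32-bit
	# half is byte-swapped with shift/zapnot pairs (the zapnot
	# masks 0x11/0x22/0x44/0x88 each keep one byte per 32-bit
	# half), then the srl/sll/or sequence exchanges the halves.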
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

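	# Standard Alpha unaligned-load idiom: each ldq_u pair fetches
	# the aligned quadwords covering bytes 0-7 and 8-15 of a
	# possibly unaligned inp; extql/extqh in .Louter shift the two
	# fragments into place and or merges them.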
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
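	# Merge each ldq_u pair into an aligned input half, advance
	# inp, and fold the 16-byte block into Xi before multiplying.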
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
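	# Same byte swap as in gcm_gmult_4bit, here interleaved with
	# fetching the next input block (skipped once len reaches 0).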
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
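# Reduction table: entry n holds what must be XOR-ed into the top of
# Z.hi when the nibble n drops off the low end of Z during a 4-bit
# shift; only the top 16 bits of each quadword are used.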
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
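
# The rem_4bit entries above are linear over GF(2): entry n is the XOR
# of 0xE100>>(3-i) for every bit i set in n, where 0xE1 encodes the
# polynomial x^128+x^7+x^2+x+1 in the reflected bit order. A generator
# sketch, for reference only (not used by this script):
#
#	for my $n (0..15) {
#	    my $r = 0;
#	    for my $i (0..3) {
#		$r ^= 0xE100>>(3-$i) if $n&(1<<$i);
#	    }
#	    printf "0x%04X<<48\n", $r;
#	}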
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";