#!/usr/bin/env perl

# Specific mode implementations for SPARC Architecture 2011. There
# is a T4 dependency, though: an ASI value that is not specified in
# the Architecture Manual. But as the SPARC universe is rather
# monocultural, we assume that a processor capable of executing the
# crypto instructions can handle the ASI in question as well. This
# means that we ought to keep our eyes open when new processors
# emerge...
#
# As for the above-mentioned ASI: it's the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# may notice that single-thread performance for parallelizable modes
# is ~1.5% worse for the largest block sizes [though a few percent
# better for shorter ones]. All this is based on suggestions from
# David Miller.
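#
# For reference, the store in question appears below as
# "stda %fN, [$out]0xe2". The 0xe2 ASI is, as far as we can tell, the
# one the Linux kernel headers call ASI_BLK_INIT_QUAD_LDD_P.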

sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}
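
# A minimal usage sketch (hypothetical, for illustration only): a driver
# script along the lines of OpenSSL's aest4-sparcv9.pl would set $::evp,
# require this file and invoke the generators roughly like so:
#
#	$::evp = 1;				# EVP-style calling convention
#	require "sparcv9_modes.pl";		# this file
#	&asm_init(@ARGV);			# pick 32- vs 64-bit ABI
#	&alg_cbc_encrypt_implement("aes", 128);	# accumulate into $::code
#	&emit_assembler();			# post-process and print it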

# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
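
# For the record, C callers (e.g. OpenSSL's EVP glue) are expected to
# declare the entry point generated above along these lines (an assumed
# prototype for illustration, not a verbatim quote from any header):
#
#	void aes128_t4_cbc_encrypt(const unsigned char *in,
#				   unsigned char *out, size_t len,
#				   const AES_KEY *key,
#				   unsigned char ivec[16]);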

sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15,  $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16,  %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value:
	addcc		%g2, %g2, %g2		! tweak = (tweak<<1) ^
	and		%l7, 0x87, %l7		!	  (msb ? 0x87 : 0),
	addxc		%g3, %g3, %g3		! i.e. multiplication by x
	xor		%l7, %g2, %g2		! in GF(2^128)

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[1]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[1]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option of producing a "universal" binary,
# and to let the programmer detect at run-time whether the current CPU
# is VIS-capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
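
# A tiny illustration (an assumption-laden sketch, not used by the build):
# given an upper-bank double register, unvis() cannot rely on a pre-VIS
# assembler and falls back to a raw .word with the register re-encoded:
#
#	print unvis("faligndata","%f32","%f2","%f4"), "\n";
#
# should print something like
#
#	.word	0x89b04902 !faligndata	%f32,%f2,%f4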

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}
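
# Another illustrative sketch (hypothetical, not part of the build): after
# the substitutions above, a line such as "movxtod %o0, %f12" is rewritten
# by unmovxtox() into a raw word, roughly:
#
#	.word	0x99b02308 !movxtod	%o0,%f12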

1;