1/* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
2
3Permission is hereby granted, free of charge, to any person obtaining
4a copy of this software and associated documentation files (the
5"Software"), to deal in the Software without restriction, including
6without limitation the rights to use, copy, modify, merge, publish,
7distribute, sublicense, and/or sell copies of the Software, and to
8permit persons to whom the Software is furnished to do so, subject to
9the following conditions:
10
11The above copyright notice and this permission notice shall be
12included in all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
21
22//	Common registers are assigned as follows:
23//
24//	COMMON
25//
26//	t0		Const Tbl Ptr	TPtr
27//	t1		Round Constant	TRound
28//	t4		Block residual	LenResid
29//	t5		Residual Data	DTmp
30//
31//	{in,out}0	Block 0 Cycle	RotateM0
32//	{in,out}1	Block Value 12	M12
33//	{in,out}2	Block Value 8	M8
34//	{in,out}3	Block Value 4	M4
35//	{in,out}4	Block Value 0	M0
36//	{in,out}5	Block 1 Cycle	RotateM1
37//	{in,out}6	Block Value 13	M13
38//	{in,out}7	Block Value 9	M9
39//	{in,out}8	Block Value 5	M5
40//	{in,out}9	Block Value 1	M1
41//	{in,out}10	Block 2 Cycle	RotateM2
42//	{in,out}11	Block Value 14	M14
43//	{in,out}12	Block Value 10	M10
44//	{in,out}13	Block Value 6	M6
45//	{in,out}14	Block Value 2	M2
46//	{in,out}15	Block 3 Cycle	RotateM3
47//	{in,out}16	Block Value 15	M15
48//	{in,out}17	Block Value 11	M11
49//	{in,out}18	Block Value 7	M7
50//	{in,out}19	Block Value 3	M3
51//	{in,out}20	Scratch			Z
52//	{in,out}21	Scratch			Y
53//	{in,out}22	Scratch			X
54//	{in,out}23	Scratch			W
55//	{in,out}24	Digest A		A
56//	{in,out}25	Digest B		B
57//	{in,out}26	Digest C		C
58//	{in,out}27	Digest D		D
59//	{in,out}28	Active Data Ptr	DPtr
60//	in28		Dummy Value		-
61//	out28		Dummy Value		-
62//	bt0			Coroutine Link	QUICK_RTN
63//
64///	These predicates are used for computing the padding block(s) and
65///	are shared between the driver and digest co-routines
66//
67//	pt0			Extra Pad Block	pExtra
68//	pt1			Load next word	pLoad
69//	pt2			Skip next word	pSkip
70//	pt3			Search for Pad	pNoPad
71//	pt4			Pad Word 0		pPad0
72//	pt5			Pad Word 1		pPad1
73//	pt6			Pad Word 2		pPad2
74//	pt7			Pad Word 3		pPad3
75
//	Aliases for the scratch registers and predicates that the driver and
//	the digest helpers share.  All of them are static registers
//	(r14-r19, b6, p6-p13), so their values carry across the br.call
//	from the driver into the helpers and back.
//
//	In the code visible in this file only DTmp, QUICK_RTN, TPtr and
//	TRound are used by instructions; pLoad and pSkip appear only in a
//	.pred.rel annotation, and LenResid, pExtra, pNoPad and pPad0-pPad3
//	are not referenced at all.
76#define	DTmp		r19
77#define	LenResid	r18
78#define	QUICK_RTN	b6
79#define	TPtr		r14
80#define	TRound		r15
81#define	pExtra		p6
82#define	pLoad		p7
83#define	pNoPad		p9
84#define	pPad0		p10
85#define	pPad1		p11
86#define	pPad2		p12
87#define	pPad3		p13
88#define	pSkip		p8
89
//	Driver-side names (trailing underscore) for the stacked registers.
//	The driver allocates these as output registers (outN); every name
//	here has an unsuffixed twin further down that maps the same index
//	N to inN for use inside the digest helpers.
90#define	A_		out24
91#define	B_		out25
92#define	C_		out26
93#define	D_		out27
94#define	DPtr_		out28
95#define	M0_		out4
96#define	M1_		out9
97#define	M10_		out12
98#define	M11_		out17
99#define	M12_		out1
100#define	M13_		out6
101#define	M14_		out11
102#define	M15_		out16
103#define	M2_		out14
104#define	M3_		out19
105#define	M4_		out3
106#define	M5_		out8
107#define	M6_		out13
108#define	M7_		out18
109#define	M8_		out2
110#define	M9_		out7
111#define	RotateM0_	out0
112#define	RotateM1_	out5
113#define	RotateM2_	out10
114#define	RotateM3_	out15
115#define	W_		out23
116#define	X_		out22
117#define	Y_		out21
118#define	Z_		out20
119
//	Helper-side names for the same stacked registers.  Inside the digest
//	helpers these are input registers (inN); index N matches the outN
//	used by the driver-side alias of the same name with a trailing
//	underscore.
120#define	A		in24
121#define	B		in25
122#define	C		in26
123#define	D		in27
124#define	DPtr		in28
125#define	M0		in4
126#define	M1		in9
127#define	M10		in12
128#define	M11		in17
129#define	M12		in1
130#define	M13		in6
131#define	M14		in11
132#define	M15		in16
133#define	M2		in14
134#define	M3		in19
135#define	M4		in3
136#define	M5		in8
137#define	M6		in13
138#define	M7		in18
139#define	M8		in2
140#define	M9		in7
141#define	RotateM0	in0
142#define	RotateM1	in5
143#define	RotateM2	in10
144#define	RotateM3	in15
145#define	W		in23
146#define	X		in22
147#define	Y		in21
148#define	Z		in20
149
/* Operands for the alloc instructions.  The driver frame has 3 inputs
   and 29 outputs; the helper frame treats all 29 of those registers
   as inputs (_NINPUTS is defined as MD5_NOUT) and makes the first 24
   of them rotating. */
150/* register stack configuration for md5_block_asm_data_order(): */
151#define	MD5_NINP	3
152#define	MD5_NLOC	0
153#define MD5_NOUT	29
154#define MD5_NROT	0
155
156/* register stack configuration for helpers: */
157#define	_NINPUTS	MD5_NOUT
158#define	_NLOCALS	0
159#define _NOUTPUT	0
160#define	_NROTATE	24	/* this must be <= _NINPUTS */
161
/* ADDP is the instruction used on pointer arguments at function entry:
   addp4 when building for HP-UX without _LP64 (32-bit pointers), a
   plain add otherwise. */
162#if defined(_HPUX_SOURCE) && !defined(_LP64)
163#define	ADDP	addp4
164#else
165#define	ADDP	add
166#endif
167
/* HOST_IS_BIG_ENDIAN selects the rum/sum psr.be instructions that
   bracket the body of md5_block_asm_data_order. */
168#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
169#define HOST_IS_BIG_ENDIAN
170#endif
171
172//	Macros for getting the left and right portions of little-endian words
//
//	Both take a byte misalignment `align` (instantiated with 1, 2 or 3
//	in this file).
//
//	GETLW: dst = low 8*align bits of src, deposited at bit position
//	       32 - 8*align with all other bits zeroed (dep.z).
//	GETRW: dst = the 32 - 8*align bits of src starting at bit 8*align,
//	       zero-extended (extr.u).
//
//	OR-ing GETRW of one aligned word with GETLW of the following
//	aligned word gives the 32-bit word that starts `align` bytes into
//	the first one.
173
174#define	GETLW(dst, src, align)	dep.z dst = src, 32 - 8 * align, 8 * align
175#define	GETRW(dst, src, align)	extr.u dst = src, 8 * align, 32 - 8 * align
176
177//	MD5 driver
178//
179//		Reads an input block, then calls the digest block
180//		subroutine and adds the results to the accumulated
181//		digest.  It allocates 29 outs (MD5_NOUT) which the subroutine
182//		uses as its inputs and rotating
183//		registers. Initializes the round constant pointer and
184//		takes care of saving/restoring ar.lc
185//
186///	INPUT
187//
188//	in0		Context Ptr		CtxPtr0
189//	in1		Input Data Ptr		DPtrIn
190//	in2		Integral Blocks		BlockCount
191//	rp		Return Address		-
192//
193///	CODE
194//
195//	v2		Input Align		InAlign
196//	t0		Shared w/digest		-
197//	t1		Shared w/digest		-
198//	t2		Shared w/digest		-
199//	t3		Shared w/digest		-
200//	t4		Shared w/digest		-
201//	t5		Shared w/digest		-
202//	t6		PFS Save		PFSSave
203//	t7		ar.lc Save		LCSave
204//	t8		Saved PR		PRSave
205//	t9		2nd CtxPtr		CtxPtr1
206//	t10		Table Base		CTable
207//	t11		Table[0]		CTable0
208//	t13		Accumulator A		AccumA
209//	t14		Accumulator B		AccumB
210//	t15		Accumulator C		AccumC
211//	t16		Accumulator D		AccumD
212//	pt0		Shared w/digest		-
213//	pt1		Shared w/digest		-
214//	pt2		Shared w/digest		-
215//	pt3		Shared w/digest		-
216//	pt4		Shared w/digest		-
217//	pt5		Shared w/digest		-
218//	pt6		Shared w/digest		-
219//	pt7		Shared w/digest		-
220//	pt8		Not Aligned		pOff
221//	pt8		Blocks Left		pAgain
222
//	Driver-private register aliases.  The Accum* registers hold the
//	running digest across blocks; LCSave, PFSSave and PRSave keep the
//	caller's ar.lc, ar.pfs and predicates until .md5_exit restores them.
//	pAgain and pOff deliberately share p63: pOff is only needed while
//	selecting the alignment path, pAgain only at the bottom of a block
//	loop.
//	InAlign is r10, which is also the scratch register U defined for
//	the GHI rounds; InAlign is only read before the first helper call.
223#define	AccumA		r27
224#define	AccumB		r28
225#define	AccumC		r29
226#define	AccumD		r30
227#define	CTable		r24
228#define	CTable0		r25
229#define	CtxPtr0		in0
230#define	CtxPtr1		r23
231#define	DPtrIn		in1
232#define	BlockCount	in2
233#define	InAlign		r10
234#define	LCSave		r21
235#define	PFSSave		r20
236#define	PRSave		r22
237#define	pAgain		p63
238#define	pOff		p63
239
240	.text
241
242/* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
243
244     where:
245      c: a pointer to a structure of this type:
246
247	   typedef struct MD5state_st
248	     {
249	       MD5_LONG A,B,C,D;
250	       MD5_LONG Nl,Nh;
251	       MD5_LONG data[MD5_LBLOCK];
252	       unsigned int num;
253	     }
254	   MD5_CTX;
255
256      data: a pointer to the input data (may be misaligned)
257      num:  the number of 64-byte blocks to hash (i.e., the length
258            of DATA is 64*NUM).
259
260   */
261
//	md5_block_asm_data_order: driver entry, aligned block loop and
//	common exit.
//
//	Inputs:  in0 = context pointer (CtxPtr0)
//	         in1 = input data pointer (DPtrIn)
//	         in2 = number of blocks (BlockCount)
//
//	Flow:
//	  1. Allocate the frame, save ar.pfs / ar.lc / pr, and compute the
//	     address of the round-constant table relative to ip.
//	  2. Load A, B, C, D from the context into AccumA..AccumD.
//	  3. Clear the low two bits of the data pointer into DPtr_ and branch
//	     to .md5_unaligned when those bits were non-zero.
//	  4. Per block: load the first four words, seed A_..D_ from the
//	     accumulators, call md5_digest_block0 through QUICK_RTN, then
//	     add the returned A_..D_ back into the accumulators.
//	  5. At .md5_exit store the accumulators back into the context,
//	     restore pr, ar.lc and ar.pfs, and return.
//
//	NOTE(review): the block loop tests BlockCount only after the
//	decrement at the bottom of an iteration, so a count of 0 on entry is
//	not handled here; presumably callers always pass num >= 1 -- confirm.
262	.type	md5_block_asm_data_order, @function
263	.global	md5_block_asm_data_order
264	.align	32
265	.proc	md5_block_asm_data_order
266md5_block_asm_data_order:
267.md5_block:
268	.prologue
//	CtxPtr1 points 8 bytes past CtxPtr0 so that the A/B and C/D pairs
//	can be loaded (and later stored) in parallel.
269{	.mmi
270	.save	ar.pfs, PFSSave
271	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
272	ADDP	CtxPtr1 = 8, CtxPtr0
273	mov	CTable = ip
274}
275{	.mmi
276	ADDP	DPtrIn = 0, DPtrIn
277	ADDP	CtxPtr0 = 0, CtxPtr0
278	.save	ar.lc, LCSave
279	mov	LCSave = ar.lc
280}
281;;
//	CTable = address of the constant table (ip-relative);
//	InAlign = low two bits of the data pointer.
282{	.mmi
283	add	CTable = .md5_tbl_data_order#-.md5_block#, CTable
284	and	InAlign = 0x3, DPtrIn
285}
286
287{	.mmi
288	ld4	AccumA = [CtxPtr0], 4
289	ld4	AccumC = [CtxPtr1], 4
290	.save pr, PRSave
291	mov	PRSave = pr
292	.body
293}
294;;
//	DPtr_ = DPtrIn with its low two bits cleared (word-aligned pointer).
295{	.mmi
296	ld4	AccumB = [CtxPtr0]
297	ld4	AccumD = [CtxPtr1]
298	dep	DPtr_ = 0, DPtrIn, 0, 2
299} ;;
300#ifdef HOST_IS_BIG_ENDIAN
301	rum	psr.be;;	// switch to little-endian
302#endif
//	CTable0 caches the first table entry; CTable is left pointing at the
//	second entry, which is where every block restarts TPtr.
303{	.mmb
304	ld4	CTable0 = [CTable], 4
305	cmp.ne	pOff, p0 = 0, InAlign
306(pOff)	br.cond.spnt.many .md5_unaligned
307} ;;
308
309//	The FF load/compute loop rotates values three times, so that
310//	loading into M12 here produces the M0 value, M13 -> M1, etc.
311
312.md5_block_loop0:
313{	.mmi
314	ld4	M12_ = [DPtr_], 4
315	mov	TPtr = CTable
316	mov	TRound = CTable0
317} ;;
318{	.mmi
319	ld4	M13_ = [DPtr_], 4
320	mov	A_ = AccumA
321	mov	B_ = AccumB
322} ;;
323{	.mmi
324	ld4	M14_ = [DPtr_], 4
325	mov	C_ = AccumC
326	mov	D_ = AccumD
327} ;;
328{	.mmb
329	ld4	M15_ = [DPtr_], 4
330	add	BlockCount = -1, BlockCount
331	br.call.sptk.many QUICK_RTN = md5_digest_block0
332} ;;
333
334//	Now, we add the new digest values and do some clean-up
335//	before checking if there's another full block to process
336
337{	.mmi
338	add	AccumA = AccumA, A_
339	add	AccumB = AccumB, B_
340	cmp.ne	pAgain, p0 = 0, BlockCount
341}
342{	.mib
343	add	AccumC = AccumC, C_
344	add	AccumD = AccumD, D_
345(pAgain) br.cond.dptk.many .md5_block_loop0
346} ;;
347
//	Common exit, also reached from the unaligned loops.  CtxPtr0 and
//	CtxPtr1 still point at B and D, so B/D are stored first with a
//	post-decrement of 4, then A/C.
348.md5_exit:
349#ifdef HOST_IS_BIG_ENDIAN
350	sum	psr.be;;	// switch back to big-endian mode
351#endif
352{	.mmi
353	st4	[CtxPtr0] = AccumB, -4
354	st4	[CtxPtr1] = AccumD, -4
355	mov	pr = PRSave, 0x1ffff ;;
356}
357{	.mmi
358	st4	[CtxPtr0] = AccumA
359	st4	[CtxPtr1] = AccumC
360	mov	ar.lc = LCSave
361} ;;
362{	.mib
363	mov	ar.pfs = PFSSave
364	br.ret.sptk.few	rp
365} ;;
366
//	MD5UNALIGNED(offset): driver block loop for input that starts
//	`offset` bytes (1, 2 or 3) past a word boundary.
//
//	On entry at .md5_process<offset>, DTmp holds the aligned word that
//	contains the first input bytes; GETRW reduces it to those bytes.
//	Each loop iteration then:
//	  - loads four aligned words into Y_, M13_, M14_ and M15_;
//	  - builds the first real message word in M12_ from the carried
//	    bytes in DTmp and the low bytes of Y_ (GETLW), and carries the
//	    remaining bytes of Y_ forward in DTmp;
//	  - seeds A_..D_, decrements BlockCount and calls
//	    md5_digest_block<offset>;
//	  - adds the returned A_..D_ into the accumulators and repeats
//	    while BlockCount is non-zero, otherwise branches to .md5_exit.
//	M13_..M15_ are passed as loaded; the realigning FF macros of
//	md5_digest_block<offset> combine them with DTmp.
367#define	MD5UNALIGNED(offset)						\
368.md5_process##offset:							\
369{	.mib ;								\
370	nop	0x0	;						\
371	GETRW(DTmp, DTmp, offset) ;					\
372} ;;									\
373.md5_block_loop##offset:						\
374{	.mmi ;								\
375	ld4	Y_ = [DPtr_], 4 ;					\
376	mov	TPtr = CTable ;						\
377	mov	TRound = CTable0 ;					\
378} ;;									\
379{	.mmi ;								\
380	ld4	M13_ = [DPtr_], 4 ;					\
381	mov	A_ = AccumA ;						\
382	mov	B_ = AccumB ;						\
383} ;;									\
384{	.mii ;								\
385	ld4	M14_ = [DPtr_], 4 ;					\
386	GETLW(W_, Y_, offset) ;						\
387	mov	C_ = AccumC ;						\
388}									\
389{	.mmi ;								\
390	mov	D_ = AccumD ;;						\
391	or	M12_ = W_, DTmp ;					\
392	GETRW(DTmp, Y_, offset) ;					\
393}									\
394{	.mib ;								\
395	ld4	M15_ = [DPtr_], 4 ;					\
396	add	BlockCount = -1, BlockCount ;				\
397	br.call.sptk.many QUICK_RTN = md5_digest_block##offset;		\
398} ;;									\
399{	.mmi ;								\
400	add	AccumA = AccumA, A_ ;					\
401	add	AccumB = AccumB, B_ ;					\
402	cmp.ne	pAgain, p0 = 0, BlockCount ;				\
403}									\
404{	.mib ;								\
405	add	AccumC = AccumC, C_ ;					\
406	add	AccumD = AccumD, D_ ;					\
407(pAgain) br.cond.dptk.many .md5_block_loop##offset ;			\
408} ;;									\
409{	.mib ;								\
410	nop	0x0 ;							\
411	nop	0x0 ;							\
412	br.cond.sptk.many .md5_exit ;					\
413} ;;
414
415	.align	32
416.md5_unaligned:
417//
418//	Because variable shifts are expensive, we special case each of
419//	the four alignments. In practice, this won't hurt too much
420//	since only one working set of code will be loaded.
421//
//	Dispatch on InAlign.  The first bundle also loads the aligned word
//	containing the first input bytes into DTmp.  InAlign == 1 and
//	InAlign == 2 branch to their own copies of the loop; the remaining
//	case (3) falls through into MD5UNALIGNED(3), which is therefore
//	expanded first.
422{	.mib
423	ld4	DTmp = [DPtr_], 4
424	cmp.eq	pOff, p0 = 1, InAlign
425(pOff)	br.cond.dpnt.many .md5_process1
426} ;;
427{	.mib
428	cmp.eq	pOff, p0 = 2, InAlign
429	nop	0x0
430(pOff)	br.cond.dpnt.many .md5_process2
431} ;;
432	MD5UNALIGNED(3)
433	MD5UNALIGNED(1)
434	MD5UNALIGNED(2)
435
436	.endp md5_block_asm_data_order
437
438
439// MD5 Perform the F function and load
440//
441// Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
442// computes the FF() round of functions, then branches to the common
443// digest code to finish up with GG(), HH, and II().
444//
445// INPUT
446//
447// rp Return Address -
448//
449// CODE
450//
451// v0 PFS bit bucket PFS
452// v1 Loop Trip Count LTrip
453// pt0 Load next word pMore
454
/* Helper-side scratch aliases.  LTrip and T both name r9: LTrip is only
   used by the FF loop and T only by the shuffle before the HH rounds.
   pMore is p6, the same predicate as pExtra in the shared list. */
455/* For F round: */
456#define LTrip	r9
457#define PFS	r8
458#define pMore	p6
459
460/* For GHI rounds: */
461#define T	r9
462#define U	r10
463#define V	r11
464
// COMPUTE(a, b, s, M, R): second half of one G/H/I step.
//
// Expects Z to hold the sum built by the G, H or I macro.  It
//   - loads the next round constant into TRound (TPtr advances by 4);
//   - rotates the low 32 bits of Z left by s: dep.z copies them into the
//     upper half of Y, and shrp shifts the Z:Y pair right by 64 - s;
//   - sets a = Z + b;
//   - copies message word M into the rotating slot R.
// Only the low 32 bits of the results are meaningful.
465#define COMPUTE(a, b, s, M, R)			\
466{						\
467	.mii ;					\
468	ld4 TRound = [TPtr], 4 ;		\
469	dep.z Y = Z, 32, 32 ;;			\
470	shrp Z = Z, Y, 64 - s ;			\
471} ;;						\
472{						\
473	.mmi ;					\
474	add a = Z, b ;				\
475	mov R = M ;				\
476	nop 0x0 ;				\
477} ;;
478
// LOOP(a, b, s, M, R, label): same work as COMPUTE, but the final bundle
// ends in br.ctop to `label`, closing one pass of a four-step round loop.
479#define LOOP(a, b, s, M, R, label)		\
480{	.mii ;					\
481	ld4 TRound = [TPtr], 4 ;		\
482	dep.z Y = Z, 32, 32 ;;			\
483	shrp Z = Z, Y, 64 - s ;			\
484} ;;						\
485{	.mib ;					\
486	add a = Z, b ;				\
487	mov R = M ;				\
488	br.ctop.sptk.many label ;		\
489} ;;
490
491// G(B, C, D) = (B & D) | (C & ~D)
//
// The macro leaves Z = M + TRound + a + G(b, c, d) for COMPUTE/LOOP to
// rotate; X and Y are clobbered as temporaries.
492
493#define G(a, b, c, d, M)			\
494{	.mmi ;					\
495	add Z = M, TRound ;			\
496	and Y = b, d ;				\
497	andcm X = c, d ;			\
498} ;;						\
499{	.mii ;					\
500	add Z = Z, a ;				\
501	or Y = Y, X ;;				\
502	add Z = Z, Y ;				\
503} ;;
504
505// H(B, C, D) = B ^ C ^ D
//
// The macro leaves Z = M + TRound + a + H(b, c, d) for COMPUTE/LOOP to
// rotate; Y is clobbered as a temporary.
506
507#define H(a, b, c, d, M)			\
508{	.mmi ;					\
509	add Z = M, TRound ;			\
510	xor Y = b, c ;				\
511	nop 0x0 ;				\
512} ;;						\
513{	.mii ;					\
514	add Z = Z, a ;				\
515	xor Y = Y, d ;;				\
516	add Z = Z, Y ;				\
517} ;;
518
519// I(B, C, D) = C ^ (B | ~D)
520//
521// However, since we have an andcm operator, we use the fact that
522//
523// Y ^ Z == ~Y ^ ~Z
524//
525// to rewrite the expression as
526//
527// I(B, C, D) = ~C ^ (~B & D)
//
// In the macro, Y = d & ~b and X = ~c (andcm with -1 as first operand);
// the result is Z = M + TRound + a + (Y ^ X), left for COMPUTE/LOOP to
// rotate.
528
529#define I(a, b, c, d, M)			\
530{	.mmi ;					\
531	add Z = M, TRound ;			\
532	andcm Y = d, b ;			\
533	andcm X = -1, c ;			\
534} ;;						\
535{	.mii ;					\
536	add Z = Z, a ;				\
537	xor Y = Y, X ;;				\
538	add Z = Z, Y ;				\
539} ;;
540
// GG4 / HH4 / II4 (label): one loop pass of four steps for the G, H and
// I rounds respectively.  Each step pairs the round function with
// COMPUTE (LOOP for the fourth step, which branches back to `label`).
// The digest registers take the roles (A,B,C,D), (D,A,B,C), (C,D,A,B),
// (B,C,D,A) in turn, and each step copies the message word it consumed
// (M0..M3) into the matching RotateM0..RotateM3 slot.
// Rotate amounts: GG 5/9/14/20, HH 4/11/16/23, II 6/10/15/21.
541#define GG4(label)				\
542	G(A, B, C, D, M0)			\
543	COMPUTE(A, B, 5, M0, RotateM0)		\
544	G(D, A, B, C, M1)			\
545	COMPUTE(D, A, 9, M1, RotateM1)		\
546	G(C, D, A, B, M2)			\
547	COMPUTE(C, D, 14, M2, RotateM2)		\
548	G(B, C, D, A, M3)			\
549	LOOP(B, C, 20, M3, RotateM3, label)
550
551#define HH4(label)				\
552	H(A, B, C, D, M0)			\
553	COMPUTE(A, B, 4, M0, RotateM0)		\
554	H(D, A, B, C, M1)			\
555	COMPUTE(D, A, 11, M1, RotateM1)		\
556	H(C, D, A, B, M2)			\
557	COMPUTE(C, D, 16, M2, RotateM2)		\
558	H(B, C, D, A, M3)			\
559	LOOP(B, C, 23, M3, RotateM3, label)
560
561#define II4(label)				\
562	I(A, B, C, D, M0)			\
563	COMPUTE(A, B, 6, M0, RotateM0)		\
564	I(D, A, B, C, M1)			\
565	COMPUTE(D, A, 10, M1, RotateM1)		\
566	I(C, D, A, B, M2)			\
567	COMPUTE(C, D, 15, M2, RotateM2)		\
568	I(B, C, D, A, M3)			\
569	LOOP(B, C, 21, M3, RotateM3, label)
570
// FFLOAD(a, b, c, d, M, N, s): one F-round step for word-aligned input.
//
//   - While pMore is set, loads the next input word into N (DPtr
//     advances by 4).
//   - Builds Z = M + TRound + a + ((c & b) | (d & ~b)), i.e. the F
//     function with b as the selector.
//   - Loads the next round constant into TRound (TPtr advances by 4).
//   - Rotates the low 32 bits of Z left by s (dep.z / shrp pair) and
//     sets a = Z + b.
// X and Y are clobbered as temporaries.
571#define FFLOAD(a, b, c, d, M, N, s)		\
572{	.mii ;					\
573(pMore) ld4 N = [DPtr], 4 ;			\
574	add Z = M, TRound ;			\
575	and Y = c, b ;				\
576}						\
577{	.mmi ;					\
578	andcm X = d, b ;;			\
579	add Z = Z, a ;				\
580	or Y = Y, X ;				\
581} ;;						\
582{	.mii ;					\
583	ld4 TRound = [TPtr], 4 ;		\
584	add Z = Z, Y ;;				\
585	dep.z Y = Z, 32, 32 ;			\
586} ;;						\
587{	.mii ;					\
588	nop 0x0 ;				\
589	shrp Z = Z, Y, 64 - s ;;		\
590	add a = Z, b ;				\
591} ;;
592
// FFLOOP(a, b, c, d, M, N, s, dest): FFLOAD plus the loop bookkeeping
// for the fourth step of an F-round pass.  The extra bundle sets
// pMore = (LTrip != 0), decrements LTrip and closes the pass with
// br.ctop to `dest`.
593#define FFLOOP(a, b, c, d, M, N, s, dest)	\
594{	.mii ;					\
595(pMore)	ld4 N = [DPtr], 4 ;			\
596	add Z = M, TRound ;			\
597	and Y = c, b ;				\
598}						\
599{	.mmi ;					\
600	andcm X = d, b ;;			\
601	add Z = Z, a ;				\
602	or Y = Y, X ;				\
603} ;;						\
604{	.mii ;					\
605	ld4 TRound = [TPtr], 4 ;		\
606	add Z = Z, Y ;;				\
607	dep.z Y = Z, 32, 32 ;			\
608} ;;						\
609{	.mii ;					\
610	nop 0x0 ;				\
611	shrp Z = Z, Y, 64 - s ;;		\
612	add a = Z, b ;				\
613}						\
614{	.mib ;					\
615	cmp.ne pMore, p0 = 0, LTrip ;		\
616	add LTrip = -1, LTrip ;			\
617	br.ctop.dptk.many dest ;		\
618} ;;
619
// md5_digest_block0: F round for word-aligned input.
//
// Called from the driver through QUICK_RTN with the first four message
// words in M12..M15, the digest state in A..D, the data pointer in DPtr
// and TPtr/TRound primed with the round constants.
//
// ar.lc = 3 makes the br.ctop loop body run four times.  pMore starts
// true and is recomputed from LTrip (initially 2) at the end of every
// pass, so the first three passes each load four more input words and
// the last pass loads none: 12 words here plus the 4 loaded by the
// driver make up the 16-word block.
//
// Execution falls through into md5_digest_GHI, which performs the
// return.
620	.type md5_digest_block0, @function
621	.align 32
622
623	.proc md5_digest_block0
624	.prologue
625md5_digest_block0:
626	.altrp QUICK_RTN
627	.body
628{	.mmi
629	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
630	mov LTrip = 2
631	mov ar.lc = 3
632} ;;
// cmp.eq r0, r0 is always true: pMore starts out set.
633{	.mii
634	cmp.eq pMore, p0 = r0, r0
635	mov ar.ec = 0
636	nop 0x0
637} ;;
638
639.md5_FF_round0:
640	FFLOAD(A, B, C, D, M12, RotateM0, 7)
641	FFLOAD(D, A, B, C, M13, RotateM1, 12)
642	FFLOAD(C, D, A, B, M14, RotateM2, 17)
643	FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
644	//
645	// !!! Fall through to md5_digest_GHI
646	//
647	.endp md5_digest_block0
648
// md5_digest_GHI: G, H and I rounds, shared by every F-round variant.
//
// Reached by fall-through from md5_digest_block0 and by branch from the
// md5_digest_block1..3 variants, so it runs in the register frame the
// F-round code allocated (hence .regstk rather than alloc).
//
// Each of the three rounds is a br.ctop loop of four passes (ar.lc = 3,
// ar.ec = 1) over GG4, HH4 or II4.  Before each loop the sixteen
// message-word registers are permuted so that the words are consumed in
// the order that round needs; Z, Y, X, W, V, U (and T before the HH
// round) hold the values that would otherwise be overwritten during the
// permutation.
//
// Returns to the driver through QUICK_RTN with the updated state in
// A..D.
649	.type md5_digest_GHI, @function
650	.align 32
651
652	.proc md5_digest_GHI
653	.prologue
654	.regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
655md5_digest_GHI:
656	.altrp QUICK_RTN
657	.body
658//
659// The following sequence shuffles the block constants round for the
660// next round:
661//
662// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
663// 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
664//
665{	.mmi
666	mov Z = M0
667	mov Y = M15
668	mov ar.lc = 3
669}
670{	.mmi
671	mov X = M2
672	mov W = M9
673	mov V = M4
674} ;;
675
676{	.mmi
677	mov M0 = M1
678	mov M15 = M12
679	mov ar.ec = 1
680}
681{	.mmi
682	mov M2 = M11
683	mov M9 = M14
684	mov M4 = M5
685} ;;
686
687{	.mmi
688	mov M1 = M6
689	mov M12 = M13
690	mov U = M3
691}
692{	.mmi
693	mov M11 = M8
694	mov M14 = M7
695	mov M5 = M10
696} ;;
697
698{	.mmi
699	mov M6 = Y
700	mov M13 = X
701	mov M3 = Z
702}
703{	.mmi
704	mov M8 = W
705	mov M7 = V
706	mov M10 = U
707} ;;
708
709.md5_GG_round:
710	GG4(.md5_GG_round)
711
712// The following sequence shuffles the block constants round for the
713// next round:
714//
715// 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
716// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
717
718{	.mmi
719	mov Z = M0
720	mov Y = M1
721	mov ar.lc = 3
722}
723{	.mmi
724	mov X = M3
725	mov W = M5
726	mov V = M6
727} ;;
728
729{	.mmi
730	mov M0 = M4
731	mov M1 = M11
732	mov ar.ec = 1
733}
734{	.mmi
735	mov M3 = M9
736	mov U = M8
737	mov T = M13
738} ;;
739
740{	.mmi
741	mov M4 = Z
742	mov M11 = Y
743	mov M5 = M7
744}
745{	.mmi
746	mov M6 = M14
747	mov M8 = M12
748	mov M13 = M15
749} ;;
750
751{	.mmi
752	mov M7 = W
753	mov M14 = V
754	nop 0x0
755}
756{	.mmi
757	mov M9 = X
758	mov M12 = U
759	mov M15 = T
760} ;;
761
762.md5_HH_round:
763	HH4(.md5_HH_round)
764
765// The following sequence shuffles the block constants round for the
766// next round:
767//
768// 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
769// 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
770
771{	.mmi
772	mov Z = M0
773	mov Y = M15
774	mov ar.lc = 3
775}
776{	.mmi
777	mov X = M10
778	mov W = M1
779	mov V = M4
780} ;;
781
782{	.mmi
783	mov M0 = M9
784	mov M15 = M12
785	mov ar.ec = 1
786}
787{	.mmi
788	mov M10 = M11
789	mov M1 = M6
790	mov M4 = M13
791} ;;
792
793{	.mmi
794	mov M9 = M14
795	mov M12 = M5
796	mov U = M3
797}
798{	.mmi
799	mov M11 = M8
800	mov M6 = M7
801	mov M13 = M2
802} ;;
803
804{	.mmi
805	mov M14 = Y
806	mov M5 = X
807	mov M3 = Z
808}
809{	.mmi
810	mov M8 = W
811	mov M7 = V
812	mov M2 = U
813} ;;
814
815.md5_II_round:
816	II4(.md5_II_round)
817
// Hand the updated A..D back to the driver.
818{	.mib
819	nop 0x0
820	nop 0x0
821	br.ret.sptk.many QUICK_RTN
822} ;;
823
824	.endp md5_digest_GHI
825
// FFLOADU(a, b, c, d, M, P, N, s, offset): F-round step for input that
// is `offset` bytes past a word boundary.
//
// Does everything FFLOAD does (optional load of the next aligned word
// into N, F function on M, constant fetch, rotate by s, a = Z + b) and
// in the same bundles realigns the following message word P:
//   W    = GETLW(P) | DTmp    (low bytes of P joined with carried bytes)
//   DTmp = GETRW(P)           (remaining bytes of P, carried forward)
//   P    = W
// P must hold a raw aligned word on entry and holds a real message word
// afterwards.  W, X and Y are clobbered.
826#define FFLOADU(a, b, c, d, M, P, N, s, offset)	\
827{	.mii ;					\
828(pMore) ld4 N = [DPtr], 4 ;			\
829	add Z = M, TRound ;			\
830	and Y = c, b ;				\
831}						\
832{	.mmi ;					\
833	andcm X = d, b ;;			\
834	add Z = Z, a ;				\
835	or Y = Y, X ;				\
836} ;;						\
837{	.mii ;					\
838	ld4 TRound = [TPtr], 4 ;		\
839	GETLW(W, P, offset) ;			\
840	add Z = Z, Y ;				\
841} ;;						\
842{	.mii ;					\
843	or W = W, DTmp ;			\
844	dep.z Y = Z, 32, 32 ;;			\
845	shrp Z = Z, Y, 64 - s ;			\
846} ;;						\
847{	.mii ;					\
848	add a = Z, b ;				\
849	GETRW(DTmp, P, offset) ;		\
850	mov P = W ;				\
851} ;;
852
// FFLOOPU(a, b, c, d, M, P, N, s, offset): fourth step of an unaligned
// F-round pass.
//
// Same as FFLOADU, except that the realignment of P (GETLW, the OR with
// DTmp, GETRW and the copy back into P) only executes while pMore is
// set, i.e. only in passes that loaded a fresh word.  The final bundle
// sets pMore = (LTrip != 0), decrements LTrip and closes the pass with
// br.ctop to .md5_FF_round<offset>.
853#define FFLOOPU(a, b, c, d, M, P, N, s, offset)		\
854{	.mii ;						\
855(pMore) ld4 N = [DPtr], 4 ;				\
856	add Z = M, TRound ;				\
857	and Y = c, b ;					\
858}							\
859{	.mmi ;						\
860	andcm X = d, b ;;				\
861	add Z = Z, a ;					\
862	or Y = Y, X ;					\
863} ;;							\
864{	.mii ;						\
865	ld4 TRound = [TPtr], 4 ;			\
866(pMore) GETLW(W, P, offset) 	;			\
867	add Z = Z, Y ;					\
868} ;;							\
869{	.mii ;						\
870(pMore) or W = W, DTmp ;				\
871	dep.z Y = Z, 32, 32 ;;				\
872	shrp Z = Z, Y, 64 - s ;				\
873} ;;							\
874{	.mii ;						\
875	add a = Z, b ;					\
876(pMore) GETRW(DTmp, P, offset) 	;			\
877(pMore) mov P = W ;					\
878}							\
879{	.mib ;						\
880	cmp.ne pMore, p0 = 0, LTrip ;			\
881	add LTrip = -1, LTrip ;				\
882	br.ctop.sptk.many .md5_FF_round##offset ;	\
883} ;;
884
// MD5FBLOCK(offset): defines md5_digest_block<offset>, the F round for
// input misaligned by `offset` bytes.
//
// Setup is the same as md5_digest_block0 (helper frame, LTrip = 2,
// ar.lc = 3, ar.ec = 0, pMore initially true).  The four steps of each
// pass use FFLOADU/FFLOOPU so that the raw aligned words in M13, M14,
// M15 and RotateM0 are realigned while the previous word is being
// digested.  After the loop it branches to md5_digest_GHI, which
// finishes the block and returns to the driver.
//
// Instantiated below for offsets 1, 2 and 3, matching the three
// MD5UNALIGNED driver loops.
885#define MD5FBLOCK(offset)						\
886	.type md5_digest_block##offset, @function ;			\
887									\
888	.align 32 ;							\
889	.proc md5_digest_block##offset ;				\
890	.prologue ;							\
891	.altrp QUICK_RTN ;						\
892	.body ;								\
893md5_digest_block##offset:						\
894{	.mmi ;								\
895	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ;	\
896	mov LTrip = 2 ;							\
897	mov ar.lc = 3 ;							\
898} ;;									\
899{	.mii ;								\
900	cmp.eq pMore, p0 = r0, r0 ;					\
901	mov ar.ec = 0 ;							\
902	nop 0x0 ;							\
903} ;;									\
904									\
905	.pred.rel "mutex", pLoad, pSkip ;				\
906.md5_FF_round##offset:							\
907	FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset)		\
908	FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset)		\
909	FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset)		\
910	FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset)	\
911									\
912{	.mib ;								\
913	nop 0x0 ;							\
914	nop 0x0 ;							\
915	br.cond.sptk.many md5_digest_GHI ;				\
916} ;;									\
917	.endp md5_digest_block##offset
918
919MD5FBLOCK(1)
920MD5FBLOCK(2)
921MD5FBLOCK(3)
922
// md5_constants: the 64 MD5 round constants, one 32-bit value per row in
// the order the rounds consume them (the trailing comment is the index).
// Each value is spelled out as four bytes, least significant first, so
// the table has the same memory layout whatever byte order the
// assembler targets.  The driver locates the table ip-relative via the
// .md5_tbl_data_order label and reads it with ld4.
923	.align 64
924	.type md5_constants, @object
925md5_constants:
926.md5_tbl_data_order:			// To ensure little-endian data
927					// order, code as bytes.
928	data1 0x78, 0xa4, 0x6a, 0xd7	//     0
929	data1 0x56, 0xb7, 0xc7, 0xe8	//     1
930	data1 0xdb, 0x70, 0x20, 0x24	//     2
931	data1 0xee, 0xce, 0xbd, 0xc1	//     3
932	data1 0xaf, 0x0f, 0x7c, 0xf5	//     4
933	data1 0x2a, 0xc6, 0x87, 0x47	//     5
934	data1 0x13, 0x46, 0x30, 0xa8	//     6
935	data1 0x01, 0x95, 0x46, 0xfd	//     7
936	data1 0xd8, 0x98, 0x80, 0x69	//     8
937	data1 0xaf, 0xf7, 0x44, 0x8b	//     9
938	data1 0xb1, 0x5b, 0xff, 0xff	//    10
939	data1 0xbe, 0xd7, 0x5c, 0x89	//    11
940	data1 0x22, 0x11, 0x90, 0x6b	//    12
941	data1 0x93, 0x71, 0x98, 0xfd	//    13
942	data1 0x8e, 0x43, 0x79, 0xa6	//    14
943	data1 0x21, 0x08, 0xb4, 0x49	//    15
944	data1 0x62, 0x25, 0x1e, 0xf6	//    16
945	data1 0x40, 0xb3, 0x40, 0xc0	//    17
946	data1 0x51, 0x5a, 0x5e, 0x26	//    18
947	data1 0xaa, 0xc7, 0xb6, 0xe9	//    19
948	data1 0x5d, 0x10, 0x2f, 0xd6	//    20
949	data1 0x53, 0x14, 0x44, 0x02	//    21
950	data1 0x81, 0xe6, 0xa1, 0xd8	//    22
951	data1 0xc8, 0xfb, 0xd3, 0xe7	//    23
952	data1 0xe6, 0xcd, 0xe1, 0x21	//    24
953	data1 0xd6, 0x07, 0x37, 0xc3	//    25
954	data1 0x87, 0x0d, 0xd5, 0xf4	//    26
955	data1 0xed, 0x14, 0x5a, 0x45	//    27
956	data1 0x05, 0xe9, 0xe3, 0xa9	//    28
957	data1 0xf8, 0xa3, 0xef, 0xfc	//    29
958	data1 0xd9, 0x02, 0x6f, 0x67	//    30
959	data1 0x8a, 0x4c, 0x2a, 0x8d	//    31
960	data1 0x42, 0x39, 0xfa, 0xff	//    32
961	data1 0x81, 0xf6, 0x71, 0x87	//    33
962	data1 0x22, 0x61, 0x9d, 0x6d	//    34
963	data1 0x0c, 0x38, 0xe5, 0xfd	//    35
964	data1 0x44, 0xea, 0xbe, 0xa4	//    36
965	data1 0xa9, 0xcf, 0xde, 0x4b	//    37
966	data1 0x60, 0x4b, 0xbb, 0xf6	//    38
967	data1 0x70, 0xbc, 0xbf, 0xbe	//    39
968	data1 0xc6, 0x7e, 0x9b, 0x28	//    40
969	data1 0xfa, 0x27, 0xa1, 0xea	//    41
970	data1 0x85, 0x30, 0xef, 0xd4	//    42
971	data1 0x05, 0x1d, 0x88, 0x04	//    43
972	data1 0x39, 0xd0, 0xd4, 0xd9	//    44
973	data1 0xe5, 0x99, 0xdb, 0xe6	//    45
974	data1 0xf8, 0x7c, 0xa2, 0x1f	//    46
975	data1 0x65, 0x56, 0xac, 0xc4	//    47
976	data1 0x44, 0x22, 0x29, 0xf4	//    48
977	data1 0x97, 0xff, 0x2a, 0x43	//    49
978	data1 0xa7, 0x23, 0x94, 0xab	//    50
979	data1 0x39, 0xa0, 0x93, 0xfc	//    51
980	data1 0xc3, 0x59, 0x5b, 0x65	//    52
981	data1 0x92, 0xcc, 0x0c, 0x8f	//    53
982	data1 0x7d, 0xf4, 0xef, 0xff	//    54
983	data1 0xd1, 0x5d, 0x84, 0x85	//    55
984	data1 0x4f, 0x7e, 0xa8, 0x6f	//    56
985	data1 0xe0, 0xe6, 0x2c, 0xfe	//    57
986	data1 0x14, 0x43, 0x01, 0xa3	//    58
987	data1 0xa1, 0x11, 0x08, 0x4e	//    59
988	data1 0x82, 0x7e, 0x53, 0xf7	//    60
989	data1 0x35, 0xf2, 0x3a, 0xbd	//    61
990	data1 0xbb, 0xd2, 0xd7, 0x2a	//    62
991	data1 0x91, 0xd3, 0x86, 0xeb	//    63
992.size	md5_constants#,64*4
993