1d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen########################################################################
2d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Implement fast SHA-256 with AVX2 instructions. (x86_64)
3d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
4d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Copyright (C) 2013 Intel Corporation.
5d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
6d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Authors:
7d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     James Guilford <james.guilford@intel.com>
8d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     Kirk Yap <kirk.s.yap@intel.com>
9d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     Tim Chen <tim.c.chen@linux.intel.com>
10d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
11d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# This software is available to you under a choice of one of two
12d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# licenses.  You may choose to be licensed under the terms of the GNU
13d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# General Public License (GPL) Version 2, available from the file
14d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# COPYING in the main directory of this source tree, or the
15d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# OpenIB.org BSD license below:
16d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
17d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     Redistribution and use in source and binary forms, with or
18d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     without modification, are permitted provided that the following
19d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#     conditions are met:
20d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
21d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#      - Redistributions of source code must retain the above
22d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#        copyright notice, this list of conditions and the following
23d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#        disclaimer.
24d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
25d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#      - Redistributions in binary form must reproduce the above
26d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#        copyright notice, this list of conditions and the following
27d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#        disclaimer in the documentation and/or other materials
28d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#        provided with the distribution.
29d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
30d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# SOFTWARE.
38d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
39d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen########################################################################
40d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
41d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# This code is described in an Intel White-Paper:
42d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# "Fast SHA-256 Implementations on Intel Architecture Processors"
43d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
44d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# To find it, surf to http://www.intel.com/p/en_US/embedded
45d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# and search for that title.
46d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#
47d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen########################################################################
48d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# This code schedules 2 blocks at a time, with 4 lanes per block
49d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen########################################################################
50d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
51d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#ifdef CONFIG_AS_AVX2
52d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#include <linux/linkage.h>
53d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
54d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen## Input buffers are not assumed to be aligned, so the VMOVDQ alias maps to
## the unaligned vector move (vmovdqu).
55d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#define	VMOVDQ vmovdqu
56d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
57d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################ Define Macros
58d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
59d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# addm [mem], reg
60d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Add reg to mem using reg-mem add and store
# Operands are written in AT&T order (source first, destination second):
#   p1 - memory operand; read by the add, then overwritten with the sum
#   p2 - register; receives p1 + p2 and is left holding that sum
# The add also updates the flags.
61d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.macro addm p1 p2
62d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	\p1, \p2	# p2 = p2 + p1
63d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	\p2, \p1	# p1 = p2 (store the sum back)
64d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm
65d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
66d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################
67d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Message-schedule registers.  Going by the W[] annotations in
# FOUR_ROUNDS_AND_SCHED, X0 holds the oldest four schedule words (starting
# at W[-16]) and X3 the newest four (ending at W[-1]).  rotate_Xs renames
# these symbols after every group of four rounds.
68d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenX0 = %ymm4
69d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenX1 = %ymm5
70d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenX2 = %ymm6
71d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenX3 = %ymm7
72d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
73d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# XMM versions of above
# (the low 128 bits of ymm4..ymm7).  These names are fixed: rotate_Xs does
# not rename them.
74d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXWORD0 = %xmm4
75d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXWORD1 = %xmm5
76d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXWORD2 = %xmm6
77d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXWORD3 = %xmm7
78d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Scratch vector registers.  XTMP0..XTMP5 are the temporaries of the
# schedule computation in FOUR_ROUNDS_AND_SCHED.
# NOTE(review): XFER is not used by the macros visible here; presumably it
# stages k + w values on their way to the stack - confirm at its use site.
79d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP0 = %ymm0
80d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP1 = %ymm1
81d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP2 = %ymm2
82d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP3 = %ymm3
83d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP4 = %ymm8
84d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXFER  = %ymm9
85d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenXTMP5 = %ymm11
86d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Registers holding vpshufb control masks.  The two SHUF masks pack the
# valid s1 lanes in FOUR_ROUNDS_AND_SCHED.
# NOTE(review): BYTE_FLIP_MASK presumably byte-swaps the input words;
# confirm where it is loaded and used.
87d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenSHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
88d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenSHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
89d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenBYTE_FLIP_MASK = %ymm13
90d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
91d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenX_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
92d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# General-purpose register map.
# The three argument registers double as working registers of the round
# code: e is the low half of the NUM_BLKS register, y3 is the low half of
# the INP register, and SRND is the same register as CTX.
# NOTE(review): this implies the arguments are spilled before the rounds
# run, presumably to the _INP_END/_INP/_CTX stack slots - confirm in the
# function body.
93d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenNUM_BLKS = %rdx	# 3rd arg
94d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenCTX	= %rsi  # 2nd arg
95d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenINP	= %rdi	# 1st arg
96d34a460092d857f1616e39eed7eac6f40cea2225Tim Chenc	= %ecx
97d34a460092d857f1616e39eed7eac6f40cea2225Tim Chend	= %r8d
98d34a460092d857f1616e39eed7eac6f40cea2225Tim Chene       = %edx	# clobbers NUM_BLKS
99d34a460092d857f1616e39eed7eac6f40cea2225Tim Cheny3	= %edi	# clobbers INP
100d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
101d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# NOTE(review): TBL presumably points at the round-constant table; it is
# not used by the macros visible here.
# SRND is added to %rsp when the round macros fetch their k + w values.
102d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenTBL	= %rbp
103d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenSRND	= CTX	# SRND is same register as CTX
104d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Remaining SHA-256 working variables (c, d and e are assigned above).
# ROTATE_ARGS renames a..h after every round.  old_h starts out equal to h
# and afterwards names the register that held h in the previous round.
105d34a460092d857f1616e39eed7eac6f40cea2225Tim Chena = %eax
106d34a460092d857f1616e39eed7eac6f40cea2225Tim Chenb = %ebx
107d34a460092d857f1616e39eed7eac6f40cea2225Tim Chenf = %r9d
108d34a460092d857f1616e39eed7eac6f40cea2225Tim Cheng = %r10d
109d34a460092d857f1616e39eed7eac6f40cea2225Tim Chenh = %r11d
110d34a460092d857f1616e39eed7eac6f40cea2225Tim Chenold_h = %r11d
111d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Scalar temporaries of the round code (y3 is assigned above).
# Under the System V AMD64 ABI, rbx, rbp and r12-r15 are callee-saved, so
# the enclosing function has to preserve them (not visible in this part).
112d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenT1 = %r12d
113d34a460092d857f1616e39eed7eac6f40cea2225Tim Cheny0 = %r13d
114d34a460092d857f1616e39eed7eac6f40cea2225Tim Cheny1 = %r14d
115d34a460092d857f1616e39eed7eac6f40cea2225Tim Cheny2 = %r15d
116d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
117d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# Stack frame layout: field sizes in bytes first, then the byte offset of
# each field, built by chaining offset + size of the previous field.
# NOTE(review): the offsets are presumably relative to %rsp once the frame
# is set up (the round macros address the _XFER area via %rsp) - confirm.
118d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
119d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_XMM_SAVE_SIZE	= 0		# zero-sized: no XMM save area is reserved
120d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_INP_END_SIZE	= 8
121d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_INP_SIZE	= 8
122d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_CTX_SIZE	= 8
123d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_RSP_SIZE	= 8
124d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
125d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_XFER		= 0
126d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_XMM_SAVE	= _XFER     + _XFER_SIZE	# = 512
127d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE	# = 512 (same offset as _XMM_SAVE)
128d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_INP		= _INP_END  + _INP_END_SIZE	# = 520
129d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_CTX		= _INP      + _INP_SIZE	# = 528
130d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen_RSP		= _CTX      + _CTX_SIZE	# = 536
131d34a460092d857f1616e39eed7eac6f40cea2225Tim ChenSTACK_SIZE	= _RSP      + _RSP_SIZE	# = 544 bytes in total
132d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
133d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# rotate_Xs
134d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Rotate values of symbols X0...X3
# Assembly-time renaming only: no instructions are emitted.  Afterwards
#   X0 = old X1, X1 = old X2, X2 = old X3, X3 = old X0
# FOUR_ROUNDS_AND_SCHED writes the new schedule words into X0 and then
# invokes this macro, so that register becomes the newest entry, X3.
# X_ is a scratch symbol and is left set to the old X0.
# XWORD0..XWORD3 are not renamed by this macro.
135d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.macro rotate_Xs
136d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	X_ = X0
137d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	X0 = X1
138d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	X1 = X2
139d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	X2 = X3
140d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	X3 = X_
141d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm
142d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
143d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# ROTATE_ARGS
144d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen# Rotate values of symbols a...h
# Assembly-time renaming only: no instructions are emitted.  Afterwards
#   a = old h, b = old a, c = old b, d = old c,
#   e = old d, f = old e, g = old f, h = old g
# The round code accumulates its results in the h and d registers, which
# this renaming turns into the a and e of the following round.
# old_h is set to the register that held h before the rotation (the same
# register as the new a), so a following round can still add to it under
# that name.  TMP_ is a scratch symbol.
145d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.macro ROTATE_ARGS
146d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	old_h = h
147d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	TMP_ = h
148d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	h = g
149d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	g = f
150d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	f = e
151d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	e = d
152d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	d = c
153d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	c = b
154d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	b = a
155d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	a = TMP_
156d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm
157d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
# FOUR_ROUNDS_AND_SCHED disp
# Emits four SHA-256 rounds interleaved with the vector computation of the
# next four message-schedule words.
#   disp - byte displacement; round N+i adds the 32-bit value found at
#          disp + 4*i + %rsp + SRND to h (annotated below as k + w)
# Scalar side: each round updates d and h from a..h, then ROTATE_ARGS
# renames the state so that the next round sees them as e and a.
# Vector side: W[0..3] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) with
#   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
#   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
# s1 is computed in two halves ({BBAA} first, {DDCC} second) because W[2]
# and W[3] take the freshly computed W[0] and W[1] as their W[-2] input.
# The result is written to X0, which rotate_Xs then renames to X3.
# Clobbers y0-y3, T1, XTMP0-XTMP5 and the flags, and (re)defines the
# assembler symbol named offset.
# Notation: the >> written next to a rorx instruction is a rotate right,
# not a shift.
158d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.macro FOUR_ROUNDS_AND_SCHED disp
159d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 0 ############################
160d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
161d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
162d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
163d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
164d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
165d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
166d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
167d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
168d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
169d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
170d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
171d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
172d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
173d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
174d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
175d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
176d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
177d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
178d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
179d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
180d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
181d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
182d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
183d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
184d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
185d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
186d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
187d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
188d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
189d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
190d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
191d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
192d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
193d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << 25
194d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
195d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
196d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
197d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
198d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
199d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
200d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
201d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
202d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, h		# h = t1 + S0 + MAJ                     # --
203d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
204d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
205d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
206d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
207d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 1 ############################
208d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
209d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
210d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
211d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
212d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = \disp + 1*4
213d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
214d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
215d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
216d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
217d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
218d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
219d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
220d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
221d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
222d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
223d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
224d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
225d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
226d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
227d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
228d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
229d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
230d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << 14
231d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
232d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
233d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
234d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] << 14)
235d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
236d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
237d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
238d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
239d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
240d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
241d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
242d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
243d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
244d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
245d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
246d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
247d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
248d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
249d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
250d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
251d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
252d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, h		# h = t1 + S0 + MAJ                     # --
253d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
254d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
255d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
256d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
257d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
258d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
259d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 2 ############################
260d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
261d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
262d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
263d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = \disp + 2*4
264d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
265d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
266d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
267d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
268d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
269d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
270d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
271d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
272d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
273d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
274d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
275d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
276d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
277d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
278d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xBxA}
279d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
280d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
281d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
282d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
283d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
284d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
285d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
286d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
287d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
288d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
289d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a ,T1	# T1 = (a >> 2)				# S0
290d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
291d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
292d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
293d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
294d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
295d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
296d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
297d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
298d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
299d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1,h		# h = k + w + h + S0                    # --
300d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
301d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
302d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
303d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3,h		# h = t1 + S0 + MAJ                     # --
304d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
305d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
306d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
307d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
308d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 3 ############################
309d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
310d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
311d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
312d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
313d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = \disp + 3*4
314d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
315d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
316d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
317d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
318d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
319d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
320d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
321d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
322d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
323d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
324d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
325d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
326d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
327d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
328d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
329d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
330d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
331d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
332d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
333d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
334d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
335d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xDxC}
336d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
337d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
338d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
339d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
340d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
341d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
342d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
343d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
344d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
345d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
346d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
347d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
348d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
349d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
350d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
351d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
352d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
353d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
354d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, h		# h = t1 + S0 + MAJ                     # --
355d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
356d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
357d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rotate_Xs
358d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm
359d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
360d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.macro DO_4ROUNDS disp
361d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 0 ###########################
362d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
363d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
364d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
365d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
366d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
367d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
368d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
369d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
370d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
371d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
372d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
373d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
374d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
375d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
376d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
377d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
378d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
379d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
380d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
381d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
382d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
383d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
384d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
385d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
386d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
387d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
388d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
389d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
390d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
391d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
392d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
393d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
394d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
395d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
396d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
397d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 1 ###########################
398d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
399d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
400d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
401d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
402d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
403d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
404d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
405d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
406d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
407d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
408d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
409d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
410d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
411d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
412d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
413d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
414d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
415d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
416d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
417d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
418d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = 4*1 + \disp
419d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h		# h = k + w + h # --
420d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
421d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
422d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
423d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
424d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
425d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
426d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
427d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
428d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
429d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
430d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
431d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
432d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
433d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
434d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
435d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
436d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
437d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 2 ##############################
438d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
439d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
440d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
441d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
442d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
443d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
444d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
445d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
446d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
447d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
448d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
449d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
450d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
451d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
452d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
453d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
454d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
455d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
456d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
457d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
458d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = 4*2 + \disp
459d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h		# h = k + w + h # --
460d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
461d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
462d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
463d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
464d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
465d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
466d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
467d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
468d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
469d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
470d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
471d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
472d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
473d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
474d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
475d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
476d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
477d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 3 ###########################
478d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
479d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
480d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	f, y2		# y2 = f                                # CH
481d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$25, e, y0	# y0 = e >> 25				# S1A
482d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$11, e, y1	# y1 = e >> 11				# S1B
483d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = f^g                              # CH
484d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
485d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
486d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
487d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
488d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
489d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
490d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
491d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$13, a, T1	# T1 = a >> 13				# S0B
492d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
493d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$22, a, y1	# y1 = a >> 22				# S0A
494d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, y3		# y3 = a                                # MAJA
495d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
496d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
497d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
498d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	offset = 4*3 + \disp
499d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	addl	offset(%rsp, SRND), h		# h = k + w + h # --
500d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	c, y3		# y3 = a|c                              # MAJA
501d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
502d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
503d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	mov	a, T1		# T1 = a                                # MAJB
504d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
505d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	and	c, T1		# T1 = a&c                              # MAJB
506d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y0, y2		# y2 = S1 + CH                          # --
507d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
508d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
509d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	h, d		# d = k + w + h + d                     # --
510d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
511d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y1, h		# h = k + w + h + S0                    # --
512d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
513d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
514d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
515d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
516d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
517d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
518d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	add	y3, h		# h = t1 + S0 + MAJ                     # --
519d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
520d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen	ROTATE_ARGS
521d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
522d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm
523d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
########################################################################
## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
## arg 1 : pointer to input data
## arg 2 : pointer to digest (loaded on entry, accumulated via addm
##         after every processed block)
## arg 3 : Num blocks (count of 64-byte blocks; 0 returns immediately)
##
## rbx, rbp and r12-r15 are pushed on entry and popped on exit.  The
## stack pointer is lowered by STACK_SIZE and rounded down to a 32-byte
## boundary; the value it had after the pushes is kept in _RSP(%rsp).
## The upper halves of the YMM registers are cleared with vzeroupper
## before returning.
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
	## preserve the registers that are restored at done_hash
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	## carve out the aligned scratch frame; rax carries the old rsp
	## until it can be stored inside the new frame
	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash	# nothing to hash, frame is already unwindable
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block	# first block is also the last one

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# reloaded from here after the rounds

loop0:
	## entered with at least two blocks left at INP
	lea     K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (0x20 pairs the two low 128-bit lanes, 0x31 the two high ones,
	## so each Xn holds four dwords of the first block in its low lane
	## and the matching four dwords of the second block in its high lane)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)	# address of the block after the current one

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	## each step stores k + w into the transfer area of the frame and
	## then runs four rounds on it while scheduling further dwords
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	1*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	2*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	3*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	## (two passes of eight rounds; X2/X3 are moved into X0/X1 so the
	## second pass consumes the remaining dwords)
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32
	vpaddd	1*32(TBL, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	## bring back the pointers saved in the frame before the rounds
	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## fold the working variables into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	## INP is one block past the block just hashed; beyond the last
	## block pointer means there is no second block to process
	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	## (the +16 displacement selects the upper 16 bytes of every
	## 32-byte transfer slot written by loop1 and loop2)
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP	# step over the second block of the pair

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0		# two or more blocks remain
	ja	done_hash	# input exhausted
				# equal: exactly one block remains

do_last_block:
	#### do last block
	lea	K256(%rip), TBL

	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	## reuse the shared round code; its end-of-input test leaves
	## through done_hash once this block has been folded in
	jmp	last_block_enter

only_one_block:
	## same setup as the multi-block path, performed here because
	## that path was bypassed

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp	# drop the aligned scratch frame

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	vzeroupper		# clear upper YMM halves so later SSE code
				# does not pay the AVX/SSE transition cost
	ret
ENDPROC(sha256_transform_rorx)
725d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen
## Constant tables.  They are only ever read, so they are placed in
## .rodata instead of the writable .data section.
.section	.rodata
.align 64
## SHA-256 round constants.  Every row of four constants is emitted
## twice in a row, so a 32-byte access at K256 + n*32 sees the same four
## constants in both of its 16-byte halves.
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## The masks below are fetched with vmovdqa, which requires an aligned
## address; the alignment is stated explicitly so it does not depend on
## the size of the table placed in front of them.

# reverse the byte order inside every 32-bit word
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
772d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen#endif
773