1ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
2ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Implement fast SHA-256 with AVX1 instructions. (x86_64)
3ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
4ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Copyright (C) 2013 Intel Corporation.
5ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
6ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Authors:
7ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     James Guilford <james.guilford@intel.com>
8ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     Kirk Yap <kirk.s.yap@intel.com>
9ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     Tim Chen <tim.c.chen@linux.intel.com>
10ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
11ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# This software is available to you under a choice of one of two
12ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# licenses.  You may choose to be licensed under the terms of the GNU
13ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# General Public License (GPL) Version 2, available from the file
14ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# COPYING in the main directory of this source tree, or the
15ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# OpenIB.org BSD license below:
16ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
17ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     Redistribution and use in source and binary forms, with or
18ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     without modification, are permitted provided that the following
19ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#     conditions are met:
20ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
21ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#      - Redistributions of source code must retain the above
22ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#        copyright notice, this list of conditions and the following
23ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#        disclaimer.
24ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
25ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#      - Redistributions in binary form must reproduce the above
26ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#        copyright notice, this list of conditions and the following
27ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#        disclaimer in the documentation and/or other materials
28ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#        provided with the distribution.
29ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
30ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# SOFTWARE.
38ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
39ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
40ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# This code is described in an Intel White-Paper:
41ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# "Fast SHA-256 Implementations on Intel Architecture Processors"
42ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
43ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# To find it, surf to http://www.intel.com/p/en_US/embedded
44ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# and search for that title.
45ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#
46ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
47ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# This code schedules 1 block at a time, with 4 lanes per block
48ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
49ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
50ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#ifdef CONFIG_AS_AVX
51ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#include <linux/linkage.h>
52ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
53ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## Buffers are not assumed to be aligned, so VMOVDQ is the unaligned move (vmovdqu)
54ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#define    VMOVDQ vmovdqu
55ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
56ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen################################ Define Macros
57ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
58ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# addm [mem], reg
59ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Add mem into reg, then store reg back to mem (the sum ends up in both operands)
60ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro addm p1 p2
61ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     \p1, \p2		# reg = reg + mem
62ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     \p2, \p1		# mem = reg
63ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
64ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
65ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# MY_ROR count, reg32: rotate the 32-bit register right by count bits
66ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro MY_ROR p1 p2
67ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	shld    $(32-(\p1)), \p2, \p2	# shld of reg with itself = rotate left by 32-count
68ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
69ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
70ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen################################
71ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
72ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
73ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Load xmm from mem with VMOVDQ, then byte swap each dword by shuffling with byte_flip_mask
74ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro COPY_XMM_AND_BSWAP p1 p2 p3
75ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	VMOVDQ \p2, \p1			# xmm = 16 bytes at mem (unaligned load)
76ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpshufb \p3, \p1, \p1		# permute the bytes of xmm through the mask, in place
77ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
78ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
79ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen################################
80ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
81ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX0 = %xmm4		# schedule dwords W[-16..-13] (oldest); X0..X3 are renamed by rotate_Xs
82ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX1 = %xmm5		# schedule dwords W[-12..-9]
83ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX2 = %xmm6		# schedule dwords W[-8..-5]
84ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX3 = %xmm7		# schedule dwords W[-4..-1] (newest)
85ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
86ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP0 = %xmm0		# XTMP0..XTMP5: scratch for FOUR_ROUNDS_AND_SCHED
87ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP1 = %xmm1
88ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP2 = %xmm2
89ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP3 = %xmm3
90ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP4 = %xmm8
91ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXFER = %xmm9		# NOTE(review): presumably stages k + w before the store to _XFER(%rsp) - confirm in the round loop
92ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenXTMP5 = %xmm11
93ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
94ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenSHUF_00BA = %xmm10      # shuffle xBxA -> 00BA (loaded from _SHUF_00BA)
95ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenSHUF_DC00 = %xmm12      # shuffle xDxC -> DC00 (loaded from _SHUF_DC00)
96ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenBYTE_FLIP_MASK = %xmm13	# loaded from PSHUFFLE_BYTE_FLIP_MASK
97ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
98ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenNUM_BLKS = %rdx   # 3rd arg: block count; turned into an end-of-data pointer and saved at _INP_END(%rsp)
99ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenCTX = %rsi        # 2nd arg: digest, eight dwords loaded into a..h
100ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenINP = %rdi        # 1st arg: input data pointer
101ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
102ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenSRND = %rdi       # clobbers INP (same register)
103ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenc = %ecx		# a..h: working variables, renamed every round by ROTATE_ARGS
104ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chend = %r8d
105ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chene = %edx		# low half of %rdx, so loading e overwrites NUM_BLKS
106ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenTBL = %rbp		# set to the address of K256 at loop0
107ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chena = %eax
108ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenb = %ebx
109ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
110ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenf = %r9d
111ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Cheng = %r10d
112ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenh = %r11d
113ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
114ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Cheny0 = %r13d		# y0..y2: per-round scratch (S1, S0, CH and MAJ intermediates)
115ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Cheny1 = %r14d
116ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Cheny2 = %r15d
117ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
118ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
119ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_INP_END_SIZE = 8	# bytes for the saved end-of-data pointer
120ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_INP_SIZE = 8		# bytes for the _INP slot (presumably the saved input pointer - confirm in the block loop)
121de614e561b9c633073caae8f86399aa8923ef85dJussi Kivilinna_XFER_SIZE = 16	# four dwords of k + w read by the round macros
122ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_XMM_SAVE_SIZE = 0	# no xmm save area is reserved
123ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
124ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_INP_END = 0		# offsets below are relative to the realigned %rsp
125ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_INP            = _INP_END  + _INP_END_SIZE	# = 8
126ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_XFER           = _INP      + _INP_SIZE	# = 16, a multiple of the 16-byte stack alignment
127ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen_XMM_SAVE       = _XFER     + _XFER_SIZE	# = 32
128ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenSTACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE	# = 32 bytes in total
129ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
130ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# rotate_Xs
131ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Rename X0...X3 at assembly time (symbol assignments only, no instructions are emitted)
132ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro rotate_Xs
133ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX_ = X0		# remember the register currently named X0
134ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX0 = X1		# each name moves to the register of the next one
135ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX1 = X2
136ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX2 = X3
137ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenX3 = X_		# the old X0 register becomes X3
138ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
139ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
140ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# ROTATE_ARGS
141ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen# Rename a...h at assembly time (symbol assignments only): b takes the register of a, ..., h of g, and a of the old h
142ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro ROTATE_ARGS
143ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenTMP_ = h		# remember the register currently named h
144ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenh = g
145ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Cheng = f
146ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenf = e
147ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chene = d
148ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chend = c
149ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenc = b
150ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenb = a
151ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chena = TMP_		# the register that accumulated the round result in h is the new a
152ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
153ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
154ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro FOUR_ROUNDS_AND_SCHED
155ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## Four rounds interleaved with computing the next four schedule dwords into X0:
156ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## s0 is computed four at a time, s1 two at a time, W[-16] + W[-7] 4 at a time.
157ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## Rounds read k + w from _XFER(%rsp); in the scalar comments >> means rotate right (MY_ROR).
158ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     e, y0			# y0 = e
159ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
160ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y1                   # y1 = a
161ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
162ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
163ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (25-11))
164ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     f, y2                   # y2 = f
165ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
166ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (22-13))
167ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = f^g
168ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
169ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
170ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     e, y2                   # y2 = (f^g)&e
171ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
172ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## compute s0 = (W[-15] MY_ROR 7) ^ (W[-15] MY_ROR 18) ^ (W[-15] >> 3)
173ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
174ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
175ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
176ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
177ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
178ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, y2                  # y2 = S1 + CH
179ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
180ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y0                   # y0 = a
181ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y2, h                   # h = h + S1 + CH + k + w
182ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y2                   # y2 = a
183ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
184ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      c, y0                   # y0 = a|c
185ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     h, d                    # d = d + h + S1 + CH + k + w
186ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     c, y2                   # y2 = a&c
187ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
188ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     b, y0                   # y0 = (a|c)&b
189ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y1, h                   # h = h + S1 + CH + k + w + S0
190ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
191ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
192ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
193ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ROTATE_ARGS
194ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     e, y0                   # y0 = e
195ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y1                   # y1 = a
196ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
197ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (25-11))
198ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     f, y2                   # y2 = f
199ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
200ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
201ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (22-13))
202ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
203ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = f^g
204ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
205ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
206ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
207ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     e, y2                   # y2 = (f^g)&e
208ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
209ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
210ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
211ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
212ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 = (W[-15] MY_ROR 7) ^ (W[-15] << (32-18))
213ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, y2                  # y2 = S1 + CH
214ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
215ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
216ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
217ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y0                   # y0 = a
218ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y2, h                   # h = h + S1 + CH + k + w
219ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y2                   # y2 = a
220ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
221ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      c, y0                   # y0 = a|c
222ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     h, d                    # d = d + h + S1 + CH + k + w
223ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     c, y2                   # y2 = a&c
224ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## compute low s1 = (W[-2] MY_ROR 17) ^ (W[-2] MY_ROR 19) ^ (W[-2] >> 10) for W[0], W[1]
225ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
226ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     b, y0                   # y0 = (a|c)&b
227ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y1, h                   # h = h + S1 + CH + k + w + S0
228ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
229ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
230ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
231ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ROTATE_ARGS
232ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     e, y0                   # y0 = e
233ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y1                   # y1 = a
234ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
235ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (25-11))
236ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
237ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     f, y2                   # y2 = f
238ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (22-13))
239ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
240ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
241ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = f^g
242ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
243ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
244ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     e, y2                   # y2 = (f^g)&e
245ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
246ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
247ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
248ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
249ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
250ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
251ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, y2                  # y2 = S1 + CH
252ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
253ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
254ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
255ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y0                   # y0 = a
256ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y2, h                   # h = h + S1 + CH + k + w
257ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y2                   # y2 = a
258ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
259ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      c, y0                   # y0 = a|c
260ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     h, d                    # d = d + h + S1 + CH + k + w
261ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     c, y2                   # y2 = a&c
262ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
263ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     b, y0                   # y0 = (a|c)&b
264ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y1, h                   # h = h + S1 + CH + k + w + S0
265ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## compute high s1 for W[2], W[3], whose W[-2] are the W[0], W[1] just produced
266ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
267ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
268ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
269ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ROTATE_ARGS
270ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     e, y0                   # y0 = e
271ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
272ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y1                   # y1 = a
273ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
274ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (25-11))
275ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     f, y2                   # y2 = f
276ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
277ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
278ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (22-13))
279ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = f^g
280ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
281ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
282ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     e, y2                   # y2 = (f^g)&e
283ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
284ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
285ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
286ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
287ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
288ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
289ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
290ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, y2                  # y2 = S1 + CH
291ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
292ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
293ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y0                   # y0 = a
294ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y2, h                   # h = h + S1 + CH + k + w
295ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y2                   # y2 = a
296ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
297ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      c, y0                   # y0 = a|c
298ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     h, d                    # d = d + h + S1 + CH + k + w
299ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     c, y2                   # y2 = a&c
300ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
301ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     b, y0                   # y0 = (a|c)&b
302ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y1, h                   # h = h + S1 + CH + k + w + S0
303ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
304ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
305ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ROTATE_ARGS
306ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	rotate_Xs
307ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
308ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
309ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## DO_ROUND round: one round without message scheduling; input k + w is [rsp + _XFER + round * 4]
310ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.macro DO_ROUND round
311ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     e, y0                   # y0 = e  (in these comments >> means rotate right, MY_ROR)
312ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
313ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y1                   # y1 = a
314ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (25-11))
315ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
316ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     f, y2                   # y2 = f
317ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (22-13))
318ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
319ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = f^g
320ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
321ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
322ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     e, y2                   # y2 = (f^g)&e
323ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
324ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
325ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
326ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, y2                  # y2 = S1 + CH
327ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
328ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	offset = (\round) * 4 + _XFER   # argument parenthesized so an expression argument is scaled as a whole
329ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     offset(%rsp), y2        # y2 = k + w + S1 + CH
330ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y0                   # y0 = a
331ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y2, h                   # h = h + S1 + CH + k + w
332ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     a, y2                   # y2 = a
333ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      c, y0                   # y0 = a|c
334ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     h, d                    # d = d + h + S1 + CH + k + w
335ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     c, y2                   # y2 = a&c
336ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and     b, y0                   # y0 = (a|c)&b
337ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y1, h                   # h = h + S1 + CH + k + w + S0
338ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
339ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
340ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ROTATE_ARGS
341ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.endm
342ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
343ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
344ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
345ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## arg 1 : pointer to input data
346ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## arg 2 : pointer to digest
347ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen## arg 3 : Num blocks
##
## Consumes num_blks consecutive 64-byte blocks starting at input_data and
## folds each one into digest[0..7] in place.  When the byte count derived
## from num_blks is zero the routine returns without touching the digest.
##
## INP, CTX, NUM_BLKS, SRND, TBL, the working variables a-h, X0-X3, XFER,
## the mask registers, STACK_SIZE and the _INP/_INP_END/_XFER frame offsets
## are defined earlier in the file, outside this excerpt.
##
## rbx, rbp and r12-r15 are saved on entry and restored on exit; r12 holds
## the pre-allocation stack pointer for the whole call.
##
## NOTE(review): loop0/loop1/loop2/done_hash are plain (non .L) labels, so
## the assembler keeps them in the object symbol table; .L names would
## keep them file-private.
## NOTE(review): the .align 32 below follows the ENTRY label, so any padding
## it emits lands after the entry symbol - presumably executed as NOPs on
## every call; confirm against the definition of ENTRY.
348ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen########################################################################
349ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.text
350ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenENTRY(sha256_transform_avx)
351ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.align 32
	## save the registers this routine restores before returning
352ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %rbx
353ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %rbp
354ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %r13
355ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %r14
356ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %r15
357ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	pushq   %r12
358ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## r12 remembers the unaligned rsp; the frame is released at done_hash
	## by copying it back, so the amount lost to alignment never matters
359ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov	%rsp, %r12
360ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	subq    $STACK_SIZE, %rsp	# allocate stack space
	## round rsp down to a 16-byte boundary; the vmovdqa stores to
	## _XFER(%rsp) below need an aligned address (assumes _XFER is itself
	## a multiple of 16 - TODO confirm)
361ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	and	$~15, %rsp		# align stack pointer
362ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## NUM_BLKS * 64 = byte count; the shift sets ZF, so the jz below is
	## taken when that count is zero and nothing is hashed
363ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	shl     $6, NUM_BLKS		# convert to bytes
364ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	jz      done_hash
365ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     INP, NUM_BLKS		# pointer to end of data
366ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     NUM_BLKS, _INP_END(%rsp)
367ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## a-h start out as the eight consecutive 4-byte digest words at CTX
368ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## load initial digest
369ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*0(CTX), a
370ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*1(CTX), b
371ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*2(CTX), c
372ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*3(CTX), d
373ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*4(CTX), e
374ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*5(CTX), f
375ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*6(CTX), g
376ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     4*7(CTX), h
377ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## the three shuffle masks are loaded once, before the per-block loop;
	## vmovdqa requires the constants to be 16-byte aligned
378ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
379ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
380ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
	## ---- one pass of loop0 per 64-byte block; TBL restarts at K256 ----
381ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenloop0:
382ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	lea     K256(%rip), TBL
383ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## X0-X3 receive the four 16-byte quarters of the block
384ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## byte swap first 16 dwords
385ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
386ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
387ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
388ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
389ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## spill INP; it is reloaded after the rounds.  Presumably the register
	## behind INP is reused in between (e.g. by SRND) - confirm against the
	## register defines earlier in the file
390ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     INP, _INP(%rsp)
391ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## loop1 runs 3 times; each pass issues four FOUR_ROUNDS_AND_SCHED steps
	## and moves TBL forward by 64 bytes (16 constants)
392ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	## schedule 48 input dwords, by doing 3 rounds of 16 each
393ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     $3, SRND
394ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen.align 16
395ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenloop1:
	## XFER = X0 + next four constants, parked at _XFER(%rsp) for the macro.
	## Every step names X0; presumably FOUR_ROUNDS_AND_SCHED rotates the
	## X0-X3 assignment so X0 is always the next four words - confirm in
	## the macro definition
396ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  (TBL), X0, XFER
397ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
398ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	FOUR_ROUNDS_AND_SCHED
399ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
400ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  1*16(TBL), X0, XFER
401ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
402ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	FOUR_ROUNDS_AND_SCHED
403ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
404ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  2*16(TBL), X0, XFER
405ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
406ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	FOUR_ROUNDS_AND_SCHED
407ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## TBL is bumped only after the last constant load of this pass, so the
	## 3*16(TBL) operand above still refers to the current 64-byte group
408ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  3*16(TBL), X0, XFER
409ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
410ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add	$4*16, TBL
411ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	FOUR_ROUNDS_AND_SCHED
412ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
413ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	sub     $1, SRND
414ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	jne     loop1
415ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## loop2 runs 2 times with 8 DO_ROUND steps per pass (16 in total) and
	## no scheduling macro; it consumes the remaining 64 bytes of K256.
	## The DO_ROUND argument 0-3 presumably selects a dword of the value
	## parked at _XFER(%rsp) - confirm in the macro definition
416ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     $2, SRND
417ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chenloop2:
418ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  (TBL), X0, XFER
419ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
420ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        0
421ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        1
422ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        2
423ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        3
424ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
425ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vpaddd  1*16(TBL), X1, XFER
426ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa XFER, _XFER(%rsp)
427ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     $2*16, TBL
428ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        0
429ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        1
430ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        2
431ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	DO_ROUND        3
432ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## move X2/X3 down so the second pass reuses the X0/X1 code above
433ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa X2, X0
434ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	vmovdqa X3, X1
435ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
436ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	sub     $1, SRND
437ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	jne     loop2
438ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## fold the working variables into the digest words at CTX.  addm is
	## defined outside this excerpt; presumably it adds the memory word into
	## the register and stores the sum back, so a-h also carry the updated
	## digest into the next block - confirm in the macro definition
439ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*0)(CTX),a
440ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*1)(CTX),b
441ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*2)(CTX),c
442ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*3)(CTX),d
443ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*4)(CTX),e
444ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*5)(CTX),f
445ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*6)(CTX),g
446ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	addm    (4*7)(CTX),h
447ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## reload the spilled input pointer, step to the next 64-byte block and
	## loop until it equals the end pointer stored before the first block
448ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov     _INP(%rsp), INP
449ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	add     $64, INP
450ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	cmp     _INP_END(%rsp), INP
451ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	jne     loop0
452ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## common exit, also reached directly when there was nothing to hash
453ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chendone_hash:
454ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## drop the aligned frame by restoring the rsp saved in r12
455ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	mov	%r12, %rsp
456ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
	## pops mirror the pushes at entry, in reverse order
457ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq	%r12
458ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq    %r15
459ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq    %r14
460ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq    %r13
461ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq    %rbp
462ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	popq    %rbx
463ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen	ret
464ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim ChenENDPROC(sha256_transform_avx)
465ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen
########################################################################
## Constant tables used by sha256_transform_avx.
##
## The transform only reads these (lea K256(%rip) and vmovdqa loads of the
## three masks), so they are placed in .rodata rather than writable .data.
## NOTE(review): confirm nothing elsewhere in the file stores to them.
##
## Each mask is fetched with vmovdqa, which needs a 16-byte aligned
## address; the alignment is stated explicitly for every mask so it does
## not depend on K256 being exactly 256 bytes long.
########################################################################
.section .rodata

# SHA-256 round constants: 64 dwords (16 rows of 4), consumed 16 bytes at a
# time through TBL.
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# byte-shuffle control that reverses the four bytes inside each dword
# (index bytes 3,2,1,0 / 7,6,5,4 / ...)
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
# (source dwords 0 and 2 go to the low half; the 0xFF index bytes have the
# high bit set, which a pshufb-style shuffle turns into zero bytes)
.align 16
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
# (source dwords 0 and 2 go to the high half; the low half is zeroed)
.align 16
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
496ec2b4c851f4da48a51b79a69843beb135e3db8c2Tim Chen#endif
497