########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
515663535b69eef3940dcdb3110f95651304fe41afTim Chen
#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

.text
565663535b69eef3940dcdb3110f95651304fe41afTim Chen
# Virtual Registers
#
# Assembly-time symbols naming the registers used by the round macros.
# Y_0..Y_3 are re-bound by rotate_Ys and a..h/old_h by RotateState, so
# the assignments below are only the initial bindings.

# Message schedule vectors (four 64-bit words per ymm register)
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

# Vector scratch registers
YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0			# same register as YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg
INP         = %rdi
# 2nd arg
CTX         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

# Working state a..h.
# NOTE: e starts out on %rdx (same register as NUM_BLKS) and the scratch
# symbol y3 is %rdi (same register as INP), so neither argument survives
# in its register once the round macros run.  The frame has an INP slot
# (frame_INP); the code that spills the arguments is presumably in the
# function body, outside this part of the file.
c           = %rcx
d           = %r8
e           = %rdx
y3          = %rdi

TBL   = %rbp

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11			# same register as h

# Scalar scratch registers
T1    = %r12
y0    = %r13
y1    = %r14
y2    = %r15

y4    = %r12			# same register as T1
1005663535b69eef3940dcdb3110f95651304fe41afTim Chen
# Local variables (stack frame)
#
# Sizes and offsets are in bytes.  The frame_* offsets are used relative
# to %rsp (for example frame_XFER(%rsp) in FOUR_ROUNDS_AND_SCHED).
XFER_SIZE = 4*8			# four qwords, one k + w value per round
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
RSPSAVE_SIZE = 1*8
GPRSAVE_SIZE = 6*8		# six qwords (presumably callee-saved GPRs;
				# the save code is outside this excerpt)

# Each slot starts where the previous one ends.
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_RSPSAVE = frame_INPEND + INPEND_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
1165663535b69eef3940dcdb3110f95651304fe41afTim Chen
# Buffers are not assumed to be aligned, so vector loads go through the
# unaligned move instruction.
#define	VMOVDQ vmovdqu
1195663535b69eef3940dcdb3110f95651304fe41afTim Chen
# addm mem, reg
# Add mem into reg, then store the sum back to mem.
# On exit both operands hold (old mem + old reg).
#   p1 - memory operand (read, then overwritten with the sum)
#   p2 - register operand (accumulates the sum)
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
1265663535b69eef3940dcdb3110f95651304fe41afTim Chen
1275663535b69eef3940dcdb3110f95651304fe41afTim Chen
# COPY_YMM_AND_BSWAP ymm, mem, byte_flip_mask
# Load ymm from mem with an unaligned move, then permute its bytes with
# vpshufb using byte_flip_mask.
#   p1 - destination ymm register
#   p2 - source memory operand
#   p3 - shuffle mask
# NOTE(review): the swap granularity is decided entirely by the mask
# contents, which are defined outside this excerpt.  The previous comment
# said "each dword"; SHA-512 words are 64-bit, so this is presumably a
# per-qword byte swap.  Confirm against the mask definition.
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate the values of the symbols Y_0...Y_3 by one position:
#   Y_0 <- Y_1, Y_1 <- Y_2, Y_2 <- Y_3, Y_3 <- old Y_0
# Only symbol bindings change; no instructions are emitted.
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm
1435663535b69eef3940dcdb3110f95651304fe41afTim Chen
# RotateState
# Rotate the symbols a..h right by one position.  Only symbol bindings
# change; no instructions are emitted.
# Afterwards:
#   a     names the register that was h (each round leaves its final sum
#         in h just before calling this macro)
#   b..h  name the registers that were a..g
#   old_h names the same register as the new a
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm
1585663535b69eef3940dcdb3110f95651304fe41afTim Chen
# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
# vpalignr only shifts within each 128-bit lane, so the lanes are first
# rearranged with vperm2f128 so that every lane sees the right neighbour.
# YDST is written before YSRC2 is read for the second time.
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YDST, YS2} >> RVAL*8
.endm
1655663535b69eef3940dcdb3110f95651304fe41afTim Chen
# FOUR_ROUNDS_AND_SCHED
# Run four rounds (N+0 .. N+3) on the scalar state a..h and, interleaved
# with them, compute the next four message schedule words into Y_0.
#
# Inputs:
#   a..h                  current working state (symbols, see RotateState)
#   Y_0..Y_3              the previous sixteen schedule words, Y_0 oldest
#   i*8+frame_XFER(%rsp)  k + w for round N+i, i = 0..3; read here, stored
#                         by code outside this excerpt
#   MASK_YMM_LO           constant defined outside this excerpt
# Outputs:
#   a..h are updated and rotated four times.
#   Y_0 = {W[3], W[2], W[1], W[0]} (the new words), then rotate_Ys
#   re-binds the symbols so the new words are reachable as Y_3.
# Clobbers:
#   y0, y1, y2, y3, T1, YTMP0..YTMP4.  y3 is %rdi, the same register
#   as INP.
#
# In the scalar comments "ror" is a rotate right (rorx), and S1/S0/CH/MAJ
# name the round functions being accumulated.  The scalar and vector
# instruction streams use disjoint registers.
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e ror 41				# S1A
	rorx	$18, e, y1	# y1 = e ror 18				# S1B
	add	frame_XFER(%rsp),h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a ror 34				# S0B

	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	rorx	$14, e, y1	# y1 = (e ror 14)			# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14)	# S1
	rorx	$39, a, y1	# y1 = a ror 39				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34)		# S0
	rorx	$28, a, T1	# T1 = (a ror 28)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e ror 41				# S1A
	rorx	$18, e, y1	# y1 = e ror 18				# S1B
	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a ror 34				# S0B
	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e ror 14)			# S1
	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14)	# S1
	rorx	$39, a, y1	# y1 = a ror 39				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34)		# S0

	rorx	$28, a, T1	# T1 = (a ror 28)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState


################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-19) {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-61) {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e ror 41				# S1A
	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --

	rorx	$18, e, y1	# y1 = e ror 18				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$34, a, T1	# T1 = a ror 34				# S0B
	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18)		# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$14, e, y1	# y1 = (e ror 14)			# S1
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14)	# S1
	rorx	$39, a, y1	# y1 = a ror 39				# S0A
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34)		# S0
	rorx	$28, a, T1	# T1 = (a ror 28)			# S0

	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << (64-19) {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << (64-61) {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$41, e, y0	# y0 = e ror 41				# S1A
	rorx	$18, e, y1	# y1 = e ror 18				# S1B
	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	mov	f, y2		# y2 = f                                # CH
	rorx	$34, a, T1	# T1 = a ror 34				# S0B
	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$14, e, y1	# y1 = (e ror 14)			# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	rorx	$39, a, y1	# y1 = a ror 39				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$28, a, T1	# T1 = (a ror 28)			# S0

	xor	T1, y1		# y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	RotateState

	rotate_Ys
.endm
4045663535b69eef3940dcdb3110f95651304fe41afTim Chen
# DO_4ROUNDS: four SHA-512 rounds without message scheduling.
#
# Round N+i (i = 0..3) adds its precomputed (k + w) term from the stack slot
# at 8*i + frame_XFER(%rsp) and ends with RotateState.
#
# Notation in the per-line comments: ">>" stands for a 64-bit rotate right
# (the instruction is rorx), not a logical shift.
#
# The last two additions into h for rounds N+0..N+2 (S1 + CH, then MAJ) are
# not issued inside the round.  They appear in the following round as
# "add y2, old_h" and "add y3, old_h", interleaved with the first
# instructions of that round.  Round N+3 finishes h itself, so no addition
# is left pending when the macro ends.
# NOTE(review): RotateState and old_h are defined outside this chunk; old_h
# is presumably the name that the h register of the previous round carries
# after RotateState - confirm against the macro definition.
4055663535b69eef3940dcdb3110f95651304fe41afTim Chen.macro DO_4ROUNDS
4065663535b69eef3940dcdb3110f95651304fe41afTim Chen
4075663535b69eef3940dcdb3110f95651304fe41afTim Chen################################### RND N + 0 #########################################
4085663535b69eef3940dcdb3110f95651304fe41afTim Chen
4095663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	f, y2		# y2 = f                                # CH
4105663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$41, e, y0	# y0 = e >> 41				# S1A
4115663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$18, e, y1	# y1 = e >> 18				# S1B
4125663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = f^g                              # CH
4135663535b69eef3940dcdb3110f95651304fe41afTim Chen
4145663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
4155663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
4165663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
4175663535b69eef3940dcdb3110f95651304fe41afTim Chen
4185663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
4195663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$34, a, T1	# T1 = a >> 34				# S0B
4205663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
4215663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$39, a, y1	# y1 = a >> 39				# S0A
4225663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, y3		# y3 = a                                # MAJA
4235663535b69eef3940dcdb3110f95651304fe41afTim Chen
4245663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
4255663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
4265663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	frame_XFER(%rsp), h		# h = k + w + h         # --
4275663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	c, y3		# y3 = a|c                              # MAJA
4285663535b69eef3940dcdb3110f95651304fe41afTim Chen
4295663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
4305663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, T1		# T1 = a                                # MAJB
4315663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
4325663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	c, T1		# T1 = a&c                              # MAJB
4335663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y0, y2		# y2 = S1 + CH                          # --
4345663535b69eef3940dcdb3110f95651304fe41afTim Chen
4355663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	h, d		# d = k + w + h + d                     # --
4365663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
4375663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y1, h		# h = k + w + h + S0                    # --
4385663535b69eef3940dcdb3110f95651304fe41afTim Chen
4395663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
4405663535b69eef3940dcdb3110f95651304fe41afTim Chen
4415663535b69eef3940dcdb3110f95651304fe41afTim Chen	RotateState
4425663535b69eef3940dcdb3110f95651304fe41afTim Chen
4435663535b69eef3940dcdb3110f95651304fe41afTim Chen################################### RND N + 1 #########################################
4445663535b69eef3940dcdb3110f95651304fe41afTim Chen
4455663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
4465663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	f, y2		# y2 = f                                # CH
4475663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$41, e, y0	# y0 = e >> 41				# S1A
4485663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$18, e, y1	# y1 = e >> 18				# S1B
4495663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = f^g                              # CH
4505663535b69eef3940dcdb3110f95651304fe41afTim Chen
4515663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
4525663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
4535663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
4545663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
4555663535b69eef3940dcdb3110f95651304fe41afTim Chen
4565663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
4575663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$34, a, T1	# T1 = a >> 34				# S0B
4585663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
4595663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$39, a, y1	# y1 = a >> 39				# S0A
4605663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, y3		# y3 = a                                # MAJA
4615663535b69eef3940dcdb3110f95651304fe41afTim Chen
4625663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
4635663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
4645663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
4655663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	c, y3		# y3 = a|c                              # MAJA
4665663535b69eef3940dcdb3110f95651304fe41afTim Chen
4675663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
4685663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, T1		# T1 = a                                # MAJB
4695663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
4705663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	c, T1		# T1 = a&c                              # MAJB
4715663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y0, y2		# y2 = S1 + CH                          # --
4725663535b69eef3940dcdb3110f95651304fe41afTim Chen
4735663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	h, d		# d = k + w + h + d                     # --
4745663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
4755663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y1, h		# h = k + w + h + S0                    # --
4765663535b69eef3940dcdb3110f95651304fe41afTim Chen
4775663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
4785663535b69eef3940dcdb3110f95651304fe41afTim Chen
4795663535b69eef3940dcdb3110f95651304fe41afTim Chen	RotateState
4805663535b69eef3940dcdb3110f95651304fe41afTim Chen
4815663535b69eef3940dcdb3110f95651304fe41afTim Chen################################### RND N + 2 #########################################
4825663535b69eef3940dcdb3110f95651304fe41afTim Chen
4835663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
4845663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	f, y2		# y2 = f                                # CH
4855663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$41, e, y0	# y0 = e >> 41				# S1A
4865663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$18, e, y1	# y1 = e >> 18				# S1B
4875663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = f^g                              # CH
4885663535b69eef3940dcdb3110f95651304fe41afTim Chen
4895663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
4905663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
4915663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
4925663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
4935663535b69eef3940dcdb3110f95651304fe41afTim Chen
4945663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
4955663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$34, a, T1	# T1 = a >> 34				# S0B
4965663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
4975663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$39, a, y1	# y1 = a >> 39				# S0A
4985663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, y3		# y3 = a                                # MAJA
4995663535b69eef3940dcdb3110f95651304fe41afTim Chen
5005663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
5015663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
5025663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
5035663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	c, y3		# y3 = a|c                              # MAJA
5045663535b69eef3940dcdb3110f95651304fe41afTim Chen
5055663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
5065663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, T1		# T1 = a                                # MAJB
5075663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
5085663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	c, T1		# T1 = a&c                              # MAJB
5095663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y0, y2		# y2 = S1 + CH                          # --
5105663535b69eef3940dcdb3110f95651304fe41afTim Chen
5115663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	h, d		# d = k + w + h + d                     # --
5125663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
5135663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y1, h		# h = k + w + h + S0                    # --
5145663535b69eef3940dcdb3110f95651304fe41afTim Chen
5155663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
5165663535b69eef3940dcdb3110f95651304fe41afTim Chen
5175663535b69eef3940dcdb3110f95651304fe41afTim Chen	RotateState
5185663535b69eef3940dcdb3110f95651304fe41afTim Chen
5195663535b69eef3940dcdb3110f95651304fe41afTim Chen################################### RND N + 3 #########################################
5205663535b69eef3940dcdb3110f95651304fe41afTim Chen
5215663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
5225663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	f, y2		# y2 = f                                # CH
5235663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$41, e, y0	# y0 = e >> 41				# S1A
5245663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$18, e, y1	# y1 = e >> 18				# S1B
5255663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = f^g                              # CH
5265663535b69eef3940dcdb3110f95651304fe41afTim Chen
5275663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
5285663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
5295663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	e, y2		# y2 = (f^g)&e                          # CH
5305663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
5315663535b69eef3940dcdb3110f95651304fe41afTim Chen
5325663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
5335663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$34, a, T1	# T1 = a >> 34				# S0B
5345663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
5355663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$39, a, y1	# y1 = a >> 39				# S0A
5365663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, y3		# y3 = a                                # MAJA
5375663535b69eef3940dcdb3110f95651304fe41afTim Chen
5385663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
5395663535b69eef3940dcdb3110f95651304fe41afTim Chen	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
5405663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
5415663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	c, y3		# y3 = a|c                              # MAJA
5425663535b69eef3940dcdb3110f95651304fe41afTim Chen
5435663535b69eef3940dcdb3110f95651304fe41afTim Chen	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
5445663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	a, T1		# T1 = a                                # MAJB
5455663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	b, y3		# y3 = (a|c)&b                          # MAJA
5465663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	c, T1		# T1 = a&c                              # MAJB
5475663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y0, y2		# y2 = S1 + CH                          # --
5485663535b69eef3940dcdb3110f95651304fe41afTim Chen
5495663535b69eef3940dcdb3110f95651304fe41afTim Chen
5505663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	h, d		# d = k + w + h + d                     # --
5515663535b69eef3940dcdb3110f95651304fe41afTim Chen	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
5525663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y1, h		# h = k + w + h + S0                    # --
5535663535b69eef3940dcdb3110f95651304fe41afTim Chen
5545663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
5555663535b69eef3940dcdb3110f95651304fe41afTim Chen
5565663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
5575663535b69eef3940dcdb3110f95651304fe41afTim Chen
5585663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	y3, h		# h = t1 + S0 + MAJ                     # --
5595663535b69eef3940dcdb3110f95651304fe41afTim Chen
5605663535b69eef3940dcdb3110f95651304fe41afTim Chen	RotateState
5615663535b69eef3940dcdb3110f95651304fe41afTim Chen
5625663535b69eef3940dcdb3110f95651304fe41afTim Chen.endm
5635663535b69eef3940dcdb3110f95651304fe41afTim Chen
5645663535b69eef3940dcdb3110f95651304fe41afTim Chen########################################################################
5655663535b69eef3940dcdb3110f95651304fe41afTim Chen# void sha512_transform_rorx(const void* M, void* D, uint64_t L)
5665663535b69eef3940dcdb3110f95651304fe41afTim Chen# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
5675663535b69eef3940dcdb3110f95651304fe41afTim Chen# The size of the message pointed to by M must be an integer multiple of SHA512
5685663535b69eef3940dcdb3110f95651304fe41afTim Chen#   message blocks.
5695663535b69eef3940dcdb3110f95651304fe41afTim Chen# L is the message length in SHA512 blocks
#
# Each block is 128 bytes (L is shifted left by 7 to obtain a byte count and
# the input pointer advances by 128 per iteration).  With L == 0 the digest
# is left untouched and the function only sets up and tears down its frame.
#
# Per block: 80 rounds = 4 iterations of 16 rounds with message scheduling
# (loop1) followed by 2 iterations of 8 rounds without scheduling (loop2).
#
# The stack frame is aligned down to 32 bytes; the caller %rsp and the
# registers %rbp, %rbx, %r12-%r15 are saved in it and restored on exit.
# vzeroupper is executed before returning whenever a block was processed.
# NOTE(review): INP, CTX and NUM_BLKS are register aliases defined earlier in
# the file; presumably they correspond to M, D and L respectively - confirm.
5705663535b69eef3940dcdb3110f95651304fe41afTim Chen########################################################################
5715663535b69eef3940dcdb3110f95651304fe41afTim ChenENTRY(sha512_transform_rorx)
5725663535b69eef3940dcdb3110f95651304fe41afTim Chen	# Allocate Stack Space
5735663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%rsp, %rax
5745663535b69eef3940dcdb3110f95651304fe41afTim Chen	sub	$frame_size, %rsp
5755663535b69eef3940dcdb3110f95651304fe41afTim Chen	and	$~(0x20 - 1), %rsp
5765663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%rax, frame_RSPSAVE(%rsp)
5775663535b69eef3940dcdb3110f95651304fe41afTim Chen
5785663535b69eef3940dcdb3110f95651304fe41afTim Chen	# Save GPRs
5795663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%rbp, frame_GPRSAVE(%rsp)
5805663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
5815663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
5825663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
5835663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
5845663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	%r15, 8*5+frame_GPRSAVE(%rsp)
5855663535b69eef3940dcdb3110f95651304fe41afTim Chen
5865663535b69eef3940dcdb3110f95651304fe41afTim Chen	shl	$7, NUM_BLKS	# convert to bytes
5875663535b69eef3940dcdb3110f95651304fe41afTim Chen	jz	done_hash
5885663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	INP, NUM_BLKS	# pointer to end of data
5895663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	NUM_BLKS, frame_INPEND(%rsp)
5905663535b69eef3940dcdb3110f95651304fe41afTim Chen
5915663535b69eef3940dcdb3110f95651304fe41afTim Chen	## load initial digest
5925663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*0(CTX),a
5935663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*1(CTX),b
5945663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*2(CTX),c
5955663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*3(CTX),d
5965663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*4(CTX),e
5975663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*5(CTX),f
5985663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*6(CTX),g
5995663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*7(CTX),h
6005663535b69eef3940dcdb3110f95651304fe41afTim Chen
6015663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
6025663535b69eef3940dcdb3110f95651304fe41afTim Chen
	## one iteration of loop0 per 128-byte message block
6035663535b69eef3940dcdb3110f95651304fe41afTim Chenloop0:
6045663535b69eef3940dcdb3110f95651304fe41afTim Chen	lea	K512(%rip), TBL
6055663535b69eef3940dcdb3110f95651304fe41afTim Chen
6065663535b69eef3940dcdb3110f95651304fe41afTim Chen	## load the 128-byte block and byte swap its 16 qwords
6075663535b69eef3940dcdb3110f95651304fe41afTim Chen	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
6085663535b69eef3940dcdb3110f95651304fe41afTim Chen	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
6095663535b69eef3940dcdb3110f95651304fe41afTim Chen	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
6105663535b69eef3940dcdb3110f95651304fe41afTim Chen	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK
6115663535b69eef3940dcdb3110f95651304fe41afTim Chen
	# INP is spilled here and reloaded after the rounds.
6125663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	INP, frame_INP(%rsp)
6135663535b69eef3940dcdb3110f95651304fe41afTim Chen
6145663535b69eef3940dcdb3110f95651304fe41afTim Chen	## rounds 0-63 with message scheduling: 4 iterations of 16 rounds
6155663535b69eef3940dcdb3110f95651304fe41afTim Chen	movq	$4, frame_SRND(%rsp)
6165663535b69eef3940dcdb3110f95651304fe41afTim Chen
6175663535b69eef3940dcdb3110f95651304fe41afTim Chen.align 16
6185663535b69eef3940dcdb3110f95651304fe41afTim Chenloop1:
	# Every step adds Y_0 to the next four constants; FOUR_ROUNDS_AND_SCHED
	# ends with rotate_Ys, which presumably renames the Y registers so that
	# Y_0 designates the next group of four words - confirm.
6195663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	(TBL), Y_0, XFER
6205663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6215663535b69eef3940dcdb3110f95651304fe41afTim Chen	FOUR_ROUNDS_AND_SCHED
6225663535b69eef3940dcdb3110f95651304fe41afTim Chen
6235663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	1*32(TBL), Y_0, XFER
6245663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6255663535b69eef3940dcdb3110f95651304fe41afTim Chen	FOUR_ROUNDS_AND_SCHED
6265663535b69eef3940dcdb3110f95651304fe41afTim Chen
6275663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	2*32(TBL), Y_0, XFER
6285663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6295663535b69eef3940dcdb3110f95651304fe41afTim Chen	FOUR_ROUNDS_AND_SCHED
6305663535b69eef3940dcdb3110f95651304fe41afTim Chen
6315663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	3*32(TBL), Y_0, XFER
6325663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6335663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	$(4*32), TBL
6345663535b69eef3940dcdb3110f95651304fe41afTim Chen	FOUR_ROUNDS_AND_SCHED
6355663535b69eef3940dcdb3110f95651304fe41afTim Chen
6365663535b69eef3940dcdb3110f95651304fe41afTim Chen	subq	$1, frame_SRND(%rsp)
6375663535b69eef3940dcdb3110f95651304fe41afTim Chen	jne	loop1
6385663535b69eef3940dcdb3110f95651304fe41afTim Chen
	## rounds 64-79 without scheduling: 2 iterations of 8 rounds
6395663535b69eef3940dcdb3110f95651304fe41afTim Chen	movq	$2, frame_SRND(%rsp)
6405663535b69eef3940dcdb3110f95651304fe41afTim Chenloop2:
6415663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	(TBL), Y_0, XFER
6425663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6435663535b69eef3940dcdb3110f95651304fe41afTim Chen	DO_4ROUNDS
6445663535b69eef3940dcdb3110f95651304fe41afTim Chen	vpaddq	1*32(TBL), Y_1, XFER
6455663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa XFER, frame_XFER(%rsp)
6465663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	$(2*32), TBL
6475663535b69eef3940dcdb3110f95651304fe41afTim Chen	DO_4ROUNDS
6485663535b69eef3940dcdb3110f95651304fe41afTim Chen
	# Second iteration consumes the words held in Y_2 and Y_3.
6495663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa	Y_2, Y_0
6505663535b69eef3940dcdb3110f95651304fe41afTim Chen	vmovdqa	Y_3, Y_1
6515663535b69eef3940dcdb3110f95651304fe41afTim Chen
6525663535b69eef3940dcdb3110f95651304fe41afTim Chen	subq	$1, frame_SRND(%rsp)
6535663535b69eef3940dcdb3110f95651304fe41afTim Chen	jne	loop2
6545663535b69eef3940dcdb3110f95651304fe41afTim Chen
	## fold the working variables back into the digest
6555663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*0(CTX),a
6565663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*1(CTX),b
6575663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*2(CTX),c
6585663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*3(CTX),d
6595663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*4(CTX),e
6605663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*5(CTX),f
6615663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*6(CTX),g
6625663535b69eef3940dcdb3110f95651304fe41afTim Chen	addm	8*7(CTX),h
6635663535b69eef3940dcdb3110f95651304fe41afTim Chen
6645663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	frame_INP(%rsp), INP
6655663535b69eef3940dcdb3110f95651304fe41afTim Chen	add	$128, INP
6665663535b69eef3940dcdb3110f95651304fe41afTim Chen	cmp	frame_INPEND(%rsp), INP
6675663535b69eef3940dcdb3110f95651304fe41afTim Chen	jne	loop0
6685663535b69eef3940dcdb3110f95651304fe41afTim Chen
	# Clear the upper halves of the YMM registers so that legacy SSE code
	# running after this function does not pay the AVX/SSE transition
	# penalty.  The L == 0 path jumps straight to done_hash and skips this;
	# it has not executed any AVX instruction in this function.
	vzeroupper

6695663535b69eef3940dcdb3110f95651304fe41afTim Chendone_hash:
6705663535b69eef3940dcdb3110f95651304fe41afTim Chen
6715663535b69eef3940dcdb3110f95651304fe41afTim Chen# Restore GPRs
6725663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	frame_GPRSAVE(%rsp)     ,%rbp
6735663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
6745663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
6755663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
6765663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
6775663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	8*5+frame_GPRSAVE(%rsp) ,%r15
6785663535b69eef3940dcdb3110f95651304fe41afTim Chen
6795663535b69eef3940dcdb3110f95651304fe41afTim Chen	# Restore Stack Pointer
6805663535b69eef3940dcdb3110f95651304fe41afTim Chen	mov	frame_RSPSAVE(%rsp), %rsp
6815663535b69eef3940dcdb3110f95651304fe41afTim Chen	ret
6825663535b69eef3940dcdb3110f95651304fe41afTim ChenENDPROC(sha512_transform_rorx)
6835663535b69eef3940dcdb3110f95651304fe41afTim Chen
6845663535b69eef3940dcdb3110f95651304fe41afTim Chen########################################################################
6855663535b69eef3940dcdb3110f95651304fe41afTim Chen### Binary Data
6865663535b69eef3940dcdb3110f95651304fe41afTim Chen
6875663535b69eef3940dcdb3110f95651304fe41afTim Chen.data
6885663535b69eef3940dcdb3110f95651304fe41afTim Chen
6895663535b69eef3940dcdb3110f95651304fe41afTim Chen.align 64
6905663535b69eef3940dcdb3110f95651304fe41afTim Chen# K[t] used in SHA512 hashing
# 80 64-bit round constants, two per line, in round order (K[0] first).
# sha512_transform_rorx reads them 32 bytes (four constants) at a time with
# vpaddq through the TBL pointer.
6915663535b69eef3940dcdb3110f95651304fe41afTim ChenK512:
6925663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
6935663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
6945663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x3956c25bf348b538,0x59f111f1b605d019
6955663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
6965663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xd807aa98a3030242,0x12835b0145706fbe
6975663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
6985663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
6995663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
7005663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
7015663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
7025663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
7035663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
7045663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
7055663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
7065663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
7075663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x06ca6351e003826f,0x142929670a0e6e70
7085663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
7095663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
7105663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
7115663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x81c2c92e47edaee6,0x92722c851482353b
7125663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
7135663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
7145663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xd192e819d6ef5218,0xd69906245565a910
7155663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
7165663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
7175663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
7185663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
7195663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
7205663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
7215663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
7225663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x90befffa23631e28,0xa4506cebde82bde9
7235663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
7245663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xca273eceea26619c,0xd186b8c721c0c207
7255663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
7265663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
7275663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x113f9804bef90dae,0x1b710b35131c471b
7285663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x28db77f523047d84,0x32caab7b40c72493
7295663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
7305663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
7315663535b69eef3940dcdb3110f95651304fe41afTim Chen	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
7325663535b69eef3940dcdb3110f95651304fe41afTim Chen
7335663535b69eef3940dcdb3110f95651304fe41afTim Chen.align 32
7345663535b69eef3940dcdb3110f95651304fe41afTim Chen
7355663535b69eef3940dcdb3110f95651304fe41afTim Chen# Mask for byte-swapping the four qwords of a YMM register using vpshufb.
# 32 bytes: each 8-byte group lists its source byte indices in reverse order
# (7..0, 15..8, ...), so every qword has its byte order reversed.
7365663535b69eef3940dcdb3110f95651304fe41afTim ChenPSHUFFLE_BYTE_FLIP_MASK:
7375663535b69eef3940dcdb3110f95651304fe41afTim Chen	.octa 0x08090a0b0c0d0e0f0001020304050607
7385663535b69eef3940dcdb3110f95651304fe41afTim Chen	.octa 0x18191a1b1c1d1e1f1011121314151617
7395663535b69eef3940dcdb3110f95651304fe41afTim Chen
# 32-byte constant: the low 16 bytes are zero, the high 16 bytes are all ones.
# NOTE(review): not referenced in this part of the file; presumably combined
# with a vector AND to keep only the upper 128-bit lane - confirm at the use.
7405663535b69eef3940dcdb3110f95651304fe41afTim ChenMASK_YMM_LO:
7415663535b69eef3940dcdb3110f95651304fe41afTim Chen	.octa 0x00000000000000000000000000000000
7425663535b69eef3940dcdb3110f95651304fe41afTim Chen	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
7435663535b69eef3940dcdb3110f95651304fe41afTim Chen#endif
744