########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
#   p1 - first operand; the usage line above passes a memory location
#   p2 - second operand; the usage line above passes a register
# Expands to an add of p1 into p2 followed by a store of p2 back to p1,
# so both operands end up holding the sum.
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

# Message schedule vectors. The rotate_Xs macro reassigns these four
# names at assembly time.
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Vector temporaries. XTMP0...XTMP5 are scratch for the schedule
# computation in FOUR_ROUNDS_AND_SCHED. XFER is not referenced by the
# macros in this part of the file (presumably it stages k + w before the
# store to the stack; confirm against the main loop).
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11

# Registers reserved for shuffle control masks
SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

# Function arguments, mixed with some of the round variables. NUM_BLKS
# and INP share a register with e and y3 respectively, so writing e or
# y3 overwrites them.
NUM_BLKS = %rdx			# 3rd arg
CTX = %rsi			# 2nd arg
INP = %rdi			# 1st arg
c = %ecx
d = %r8d
e = %edx			# clobbers NUM_BLKS
y3 = %edi			# clobbers INP


TBL = %rbp
SRND = CTX			# SRND is same register as CTX

# Remaining round variables. old_h starts out naming the same register
# as h; ROTATE_ARGS re-points it on every rotation.
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

# Scalar scratch registers used inside the round macros
T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Sizes in bytes of the stack frame fields. _XMM_SAVE_SIZE is zero, so
# that field occupies no space.
_XFER_SIZE = 2*64*4		# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
_RSP_SIZE = 8

# Byte offsets of the stack frame fields, each placed right after the
# previous one. With the sizes defined above this gives _XFER = 0,
# _XMM_SAVE = _INP_END = 512, _INP = 520, _CTX = 528, _RSP = 536 and
# STACK_SIZE = 544.
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Assembly-time renaming only, no instructions are emitted: X0 takes the
# register X1 named, X1 takes X2, X2 takes X3, and X3 takes the register
# X0 named before. X_ is the temporary symbol used for the swap.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Assembly-time renaming only: each of b...h takes the register of its
# predecessor (b <- a, c <- b, ..., h <- g) and a takes the register h
# named before. old_h is pointed at that same former h register, so after
# the rotation old_h and a name the same register.
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

# FOUR_ROUNDS_AND_SCHED disp
# Expands to four rounds on the scalar working variables a...h, with the
# vector computation of the next four message schedule words interleaved
# between the scalar instructions.
#
#   disp - byte displacement of the first of four consecutive dword
#          slots, each read as disp(%rsp, SRND); per the round comments
#          every slot already holds k + w for its round.
#
# X0...X3 are read as the previous sixteen schedule words (X0 oldest).
# The four new words are written to the register X0 names, and the
# closing rotate_Xs renames the vectors so that this register becomes X3.
# ROTATE_ARGS renames a...h after every round.
#
# Scratch: y0, y1, y2, y3, T1 and XTMP0...XTMP5. y3 is %edi, so INP is
# overwritten. SHUF_00BA and SHUF_DC00 are used as vpshufb control masks
# and must be loaded by the caller.
#
# Notation in the line comments: on scalar lines, x >> n stands for a
# rotate right by n (the instruction is rorx). On vector lines, >> and <<
# are logical shifts, and {..} lists the dwords of one 128-bit lane. The
# trailing tag (S0, S1, CH, MAJ) names the round term being built.
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3				# y3 = a				# MAJA
	rorx	$25, e, y0			# y0 = e >> 25				# S1A
	rorx	$11, e, y1			# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h				# --
	or	c, y3				# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0		# XTMP0 = W[-7]
	mov	f, y2				# y2 = f				# CH
	rorx	$13, a, T1			# T1 = a >> 13				# S0B

	xor	y1, y0				# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2				# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0		# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1			# y1 = (e >> 6)				# S1

	and	e, y2				# y2 = (f^g)&e				# CH
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1			# y1 = a >> 22				# S0A
	add	h, d				# d = k + w + h + d			# --

	and	b, y3				# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1		# XTMP1 = W[-15]
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1			# T1 = (a >> 2)				# S0

	xor	g, y2				# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2		# XTMP2 = W[-15] >> 7
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1				# T1 = a				# MAJB
	and	c, T1				# T1 = a&c				# MAJB

	add	y0, y2				# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3		# XTMP3 = W[-15] << 25
	or	T1, y3				# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h				# h = k + w + h + S0			# --

	add	y2, d				# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3		# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2		# XTMP2 = W[-15] >> 18
	add	y2, h				# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h				# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3				# y3 = a				# MAJA
	rorx	$25, e, y0			# y0 = e >> 25				# S1A
	rorx	$11, e, y1			# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h		# h = k + w + h				# --
	or	c, y3				# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4		# XTMP4 = W[-15] >> 3
	mov	f, y2				# y2 = f				# CH
	rorx	$13, a, T1			# T1 = a >> 13				# S0B
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2				# y2 = f^g				# CH


	rorx	$6, e, y1			# y1 = (e >> 6)				# S1
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1			# y1 = a >> 22				# S0A
	and	e, y2				# y2 = (f^g)&e				# CH
	add	h, d				# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1		# XTMP1 = W[-15] << 14
	and	b, y3				# y3 = (a|c)&b				# MAJA
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3		# XTMP3 = W[-15] ror 7 ^ W[-15] << 14
	rorx	$2, a, T1			# T1 = (a >> 2)				# S0
	xor	g, y2				# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3		# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1				# T1 = a				# MAJB
	and	c, T1				# T1 = a&c				# MAJB
	add	y0, y2				# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1		# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2		# XTMP2 = W[-2] {BBAA}
	or	T1, y3				# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h				# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d				# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h				# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h				# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3				# y3 = a				# MAJA
	rorx	$25, e, y0			# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h		# h = k + w + h				# --

	vpsrlq	$19, XTMP2, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1			# y1 = e >> 11				# S1B
	or	c, y3				# y3 = a|c				# MAJA
	mov	f, y2				# y2 = f				# CH
	xor	g, y2				# y2 = f^g				# CH

	rorx	$13, a, T1			# T1 = a >> 13				# S0B
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2				# y2 = (f^g)&e				# CH

	rorx	$6, e, y1			# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add	h, d				# d = k + w + h + d			# --
	and	b, y3				# y3 = (a|c)&b				# MAJA

	xor	y1, y0				# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1			# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4		# XTMP4 = s1 {xBxA}
	xor	g, y2				# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4		# XTMP4 = s1 {00BA}
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1			# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1				# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1				# T1 = a				# MAJB
	and	c, T1				# T1 = a&c				# MAJB
	add	y0, y2				# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3				# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h				# h = k + w + h + S0			# --
	add	y2, d				# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h				# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h				# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3				# y3 = a				# MAJA
	rorx	$25, e, y0			# y0 = e >> 25				# S1A
	rorx	$11, e, y1			# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h		# h = k + w + h				# --
	or	c, y3				# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5		# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2				# y2 = f				# CH
	rorx	$13, a, T1			# T1 = a >> 13				# S0B
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2				# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1			# y1 = (e >> 6)				# S1
	and	e, y2				# y2 = (f^g)&e				# CH
	add	h, d				# d = k + w + h + d			# --
	and	b, y3				# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0				# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2				# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	rorx	$22, a, y1			# y1 = a >> 22				# S0A
	add	y0, y2				# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5		# XTMP5 = s1 {xDxC}
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d				# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1			# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5		# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0		# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1				# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1				# T1 = a				# MAJB
	and	c, T1				# T1 = a&c				# MAJB
	or	T1, y3				# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h				# h = k + w + h + S0			# --
	add	y2, h				# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add	y3, h				# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2				# y2 = f				# CH
	rorx	$25, e, y0			# y0 = e >> 25				# S1A
	rorx	$11, e, y1			# y1 = e >> 11				# S1B
	xor	g, y2				# y2 = f^g				# CH

	xor	y1, y0				# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1			# y1 = (e >> 6)				# S1
	and	e, y2				# y2 = (f^g)&e				# CH

	xor	y1, y0				# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1			# T1 = a >> 13				# S0B
	xor	g, y2				# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1			# y1 = a >> 22				# S0A
376d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, y3 # y3 = a # MAJA 377d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 378d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 379d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $2, a, T1 # T1 = (a >> 2) # S0 380d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen addl \disp(%rsp, SRND), h # h = k + w + h # -- 381d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or c, y3 # y3 = a|c # MAJA 382d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 383d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 384d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, T1 # T1 = a # MAJB 385d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and b, y3 # y3 = (a|c)&b # MAJA 386d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and c, T1 # T1 = a&c # MAJB 387d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y0, y2 # y2 = S1 + CH # -- 388d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 389d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 390d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add h, d # d = k + w + h + d # -- 391d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 392d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y1, h # h = k + w + h + S0 # -- 393d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 394d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 395d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen ROTATE_ARGS 396d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 397d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 1 ########################### 398d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 399d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 400d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov f, y2 # y2 = f # CH 
401d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $25, e, y0 # y0 = e >> 25 # S1A 402d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $11, e, y1 # y1 = e >> 11 # S1B 403d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = f^g # CH 404d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 405d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 406d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $6, e, y1 # y1 = (e >> 6) # S1 407d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and e, y2 # y2 = (f^g)&e # CH 408d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y3, old_h # h = t1 + S0 + MAJ # -- 409d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 410d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 411d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $13, a, T1 # T1 = a >> 13 # S0B 412d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 413d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $22, a, y1 # y1 = a >> 22 # S0A 414d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, y3 # y3 = a # MAJA 415d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 416d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 417d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $2, a, T1 # T1 = (a >> 2) # S0 418d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen offset = 4*1 + \disp 419d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen addl offset(%rsp, SRND), h # h = k + w + h # -- 420d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or c, y3 # y3 = a|c # MAJA 421d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 422d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 423d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, T1 # T1 = a # MAJB 424d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and b, y3 # y3 = (a|c)&b # MAJA 
425d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and c, T1 # T1 = a&c # MAJB 426d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y0, y2 # y2 = S1 + CH # -- 427d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 428d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 429d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add h, d # d = k + w + h + d # -- 430d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 431d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y1, h # h = k + w + h + S0 # -- 432d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 433d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 434d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 435d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen ROTATE_ARGS 436d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 437d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 2 ############################## 438d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 439d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 440d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov f, y2 # y2 = f # CH 441d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $25, e, y0 # y0 = e >> 25 # S1A 442d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $11, e, y1 # y1 = e >> 11 # S1B 443d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = f^g # CH 444d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 445d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 446d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $6, e, y1 # y1 = (e >> 6) # S1 447d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and e, y2 # y2 = (f^g)&e # CH 448d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y3, old_h # h = t1 + S0 + MAJ # -- 449d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 450d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor 
y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 451d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $13, a, T1 # T1 = a >> 13 # S0B 452d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 453d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $22, a, y1 # y1 = a >> 22 # S0A 454d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, y3 # y3 = a # MAJA 455d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 456d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 457d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $2, a, T1 # T1 = (a >> 2) # S0 458d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen offset = 4*2 + \disp 459d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen addl offset(%rsp, SRND), h # h = k + w + h # -- 460d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or c, y3 # y3 = a|c # MAJA 461d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 462d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 463d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, T1 # T1 = a # MAJB 464d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and b, y3 # y3 = (a|c)&b # MAJA 465d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and c, T1 # T1 = a&c # MAJB 466d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y0, y2 # y2 = S1 + CH # -- 467d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 468d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 469d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add h, d # d = k + w + h + d # -- 470d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 471d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y1, h # h = k + w + h + S0 # -- 472d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 473d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 474d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 475d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 
ROTATE_ARGS 476d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 477d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen################################### RND N + 3 ########################### 478d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 479d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 480d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov f, y2 # y2 = f # CH 481d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $25, e, y0 # y0 = e >> 25 # S1A 482d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $11, e, y1 # y1 = e >> 11 # S1B 483d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = f^g # CH 484d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 485d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 486d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $6, e, y1 # y1 = (e >> 6) # S1 487d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and e, y2 # y2 = (f^g)&e # CH 488d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y3, old_h # h = t1 + S0 + MAJ # -- 489d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 490d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 491d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $13, a, T1 # T1 = a >> 13 # S0B 492d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 493d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $22, a, y1 # y1 = a >> 22 # S0A 494d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, y3 # y3 = a # MAJA 495d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 496d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 497d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen rorx $2, a, T1 # T1 = (a >> 2) # S0 498d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen offset = 4*3 + \disp 499d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen addl offset(%rsp, SRND), h # h = k + w + h # -- 
500d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or c, y3 # y3 = a|c # MAJA 501d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 502d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 503d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen mov a, T1 # T1 = a # MAJB 504d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and b, y3 # y3 = (a|c)&b # MAJA 505d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen and c, T1 # T1 = a&c # MAJB 506d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y0, y2 # y2 = S1 + CH # -- 507d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 508d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 509d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add h, d # d = k + w + h + d # -- 510d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 511d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y1, h # h = k + w + h + S0 # -- 512d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 513d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 514d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 515d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 516d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 517d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 518d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen add y3, h # h = t1 + S0 + MAJ # -- 519d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 520d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen ROTATE_ARGS 521d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 522d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen.endm 523d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen 524d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen######################################################################## 525d34a460092d857f1616e39eed7eac6f40cea2225Tim Chen## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) 
## arg 1 : pointer to input data
## arg 2 : pointer to digest
## arg 3 : Num blocks
##
## Overview
##  - Each block is 64 bytes.  A block count of zero returns right away,
##    before the digest or the input is touched.
##  - The main loop (loop0) loads 128 bytes, i.e. two blocks, per pass.
##    After the transpose the low 128-bit lane of X0..X3 carries the
##    first block and the high lane carries the second block.
##  - K + W is computed for both lanes at once and stored in the _XFER
##    area of the stack frame.  The first block consumes the low 16 bytes
##    of every 32-byte slot while it is being scheduled; the second block
##    (loop3) replays the high 16 bytes, so it needs no scheduling work.
##  - A trailing odd block, or a request for exactly one block, goes
##    through do_last_block, which loads 64 bytes only.
##  - vzeroupper is executed before returning because the ymm registers
##    are left dirty by this routine; without it, SSE code that runs
##    afterwards can pay AVX/SSE transition penalties.
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
	## preserve rbx, rbp and r12-r15 (callee-saved in the SysV x86-64 ABI)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	## carve out a 32-byte aligned frame and remember the entry rsp in it
	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash	# zero blocks: nothing to hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block	# first block is also the last one

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	## CTX is spilled here and reloaded from _CTX before each digest update
	mov	CTX, _CTX(%rsp)

loop0:
	lea	K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (0x20 pairs the two low lanes, 0x31 pairs the two high lanes)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	## INP now points one block past the block being hashed
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords: 3 loop iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	1*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	2*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	3*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	## (two passes of 8 rounds; X2/X3 are moved into X0/X1 in between)
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32
	vpaddd	1*32(TBL, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## fold the working variables back into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	## INP is past the last block only when the block just finished came
	## in through do_last_block; no second lane is pending in that case
	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	## + 16 selects the high half of each 32-byte _XFER slot
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	## below  : at least two blocks remain, take another two-block pass
	## above  : all blocks consumed
	## equal  : exactly one block remains, fall through
	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	#### do last block
	lea	K256(%rip), TBL

	## NOTE(review): XWORD0..3 and X_BYTE_FLIP_MASK look like the 128-bit
	## views of X0..X3 and BYTE_FLIP_MASK; their definitions are outside
	## this part of the file, confirm there.
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	## drop the aligned frame by restoring the rsp saved at entry
	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	## Clear the upper halves of the ymm registers.  vzeroupper changes
	## neither general purpose registers nor flags.
	vzeroupper
	ret
ENDPROC(sha256_transform_rorx)

## Constant tables.  They are data that is only loaded from, so they are
## placed in the read-only data section instead of writable .data.
.section	.rodata

## SHA-256 round constants.  Every group of four constants is stored twice
## in a row so that a single 32-byte load hands the same constants to both
## 128-bit lanes (one lane per block).
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## The three masks below are fetched with vmovdqa, which faults on a
## misaligned address.  Each one gets an explicit alignment so that
## correctness does not hinge on the size of whatever precedes it.

## vpshufb control that reverses the byte order inside every dword
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

## shuffle xBxA -> 00BA
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

## shuffle xDxC -> DC00
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif