#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
# granted.
# ====================================================================
13e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Bit-sliced AES for ARM NEON
#
# February 2012.
#
# This implementation is a direct adaptation of the bsaes-x86_64 module
# for ARM NEON, except that this module is endian-neutral [in the sense
# that it can be compiled for either endianness] by courtesy of vld1.8's
# neutrality. The initial version doesn't implement an interface to
# OpenSSL, only low-level primitives and unsupported entry points, just
# enough to collect performance results, which for the Cortex-A8 core are:
#
# encrypt	19.5 cycles per byte processed with 128-bit key
# decrypt	22.1 cycles per byte processed with 128-bit key
# key conv.	440  cycles per 128-bit key/0.18 of 8x block
#
# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
# which is [much] worse than anticipated (for further details see
# http://www.openssl.org/~appro/Snapdragon-S4.html).
#
# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
# manages in 20.0 cycles].
#
# When comparing to x86_64 results keep in mind that the NEON unit is
# [mostly] single-issue and thus can't [fully] benefit from
# instruction-level parallelism. And when comparing to aes-armv4
# results keep in mind the key schedule conversion overhead (see
# bsaes-x86_64.pl for further details)...
#
#						<appro@openssl.org>
43e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# April-August 2013
#
# Add CBC, CTR and XTS subroutines, adapt for kernel use.
#
#					<ard.biesheuvel@linaro.org>
49e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Consume command-line arguments until one looks like a plain output file
# name (word characters and hyphens, a dot, an extension).  $output is left
# false when the argument list runs out (or a false argument is hit) first.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect the generated text to $output only when a name was actually
# found; otherwise keep writing to the STDOUT we inherited.  Three-argument
# open keeps the file name from being parsed as a mode, and a failure to
# create the file is reported instead of being silently ignored.
if (defined($output) && $output ne "") {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Core register names bound to the symbolic names used by the emitters.
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
# NEON quad-word register names "q0".."q15".
my @XMM=map("q$_",(0..15));
55e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
56e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel{
57e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($key,$rounds,$const)=("r4","r5","r6");
58e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Dlo(QREG): translate a NEON quad register name into the name of the
# double-word register numbered twice as high, "qN" -> "d(2*N)" (the low
# half of qN in the Advanced SIMD register file).  Returns "" when QREG
# does not contain "q" followed by a number the pattern accepts (0..19).
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was only harmless because callers use the "&" form.
sub Dlo {
	my ($qreg) = @_;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi(QREG): translate a NEON quad register name into the name of the
# double-word register numbered 2*N+1, "qN" -> "d(2*N+1)" (the high half
# of qN in the Advanced SIMD register file).  Returns "" when QREG does
# not contain "q" followed by a number the pattern accepts (0..19).
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was only harmless because callers use the "&" form.
sub Dhi {
	my ($qreg) = @_;
	return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
61e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Sbox: emit the S-box instruction sequence by chaining three emitters:
# InBasisChange, Inv_GF256 and OutBasisChange.
#
# Arguments: 16 register names -- eight bit-slice registers followed by
# two groups of four scratch registers that are handed to Inv_GF256.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
sub Sbox {
	my @bit     = @_[0..7];
	my @tmp     = @_[8..11];
	my @scratch = @_[12..15];

	&InBasisChange(@bit);
	# Each stage receives the registers in the order the previous stage
	# left its result in.
	&Inv_GF256(@bit[6,5,0,3,7,1,4,2], @tmp, @scratch);
	&OutBasisChange(@bit[7,1,4,2,6,5,0,3]);
}
72e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InBasisChange: append 13 veor instructions (in-place XORs between the
# eight registers passed in) to $code.  Only the first eight arguments
# are used.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
sub InBasisChange {
	my @r = @_[0..7];
	$code .= <<___;
	veor	$r[2], $r[2], $r[1]
	veor	$r[5], $r[5], $r[6]
	veor	$r[3], $r[3], $r[0]
	veor	$r[6], $r[6], $r[2]
	veor	$r[5], $r[5], $r[0]

	veor	$r[6], $r[6], $r[3]
	veor	$r[3], $r[3], $r[7]
	veor	$r[7], $r[7], $r[5]
	veor	$r[3], $r[3], $r[4]
	veor	$r[4], $r[4], $r[5]

	veor	$r[2], $r[2], $r[7]
	veor	$r[3], $r[3], $r[1]
	veor	$r[1], $r[1], $r[5]
___
}
95e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# OutBasisChange: append 11 veor instructions (in-place XORs between the
# eight registers passed in) to $code.  Only the first eight arguments
# are used.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
sub OutBasisChange {
	my @r = @_[0..7];
	$code .= <<___;
	veor	$r[0], $r[0], $r[6]
	veor	$r[1], $r[1], $r[4]
	veor	$r[4], $r[4], $r[6]
	veor	$r[2], $r[2], $r[0]
	veor	$r[6], $r[6], $r[1]

	veor	$r[1], $r[1], $r[5]
	veor	$r[5], $r[5], $r[3]
	veor	$r[3], $r[3], $r[7]
	veor	$r[7], $r[7], $r[5]
	veor	$r[2], $r[2], $r[5]

	veor	$r[4], $r[4], $r[7]
___
}
116e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InvSbox: emit the inverse S-box instruction sequence by chaining three
# emitters: InvInBasisChange, Inv_GF256 and InvOutBasisChange.
#
# Arguments: 16 register names -- eight bit-slice registers followed by
# two groups of four scratch registers that are handed to Inv_GF256.
#
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
sub InvSbox {
	my @bit     = @_[0..7];
	my @tmp     = @_[8..11];
	my @scratch = @_[12..15];

	&InvInBasisChange(@bit);
	# Each stage receives the registers in the order the previous stage
	# left its result in.
	&Inv_GF256(@bit[5,1,2,6,3,7,0,4], @tmp, @scratch);
	&InvOutBasisChange(@bit[3,7,0,4,5,1,2,6]);
}
127e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InvInBasisChange: OutBasisChange in reverse (with twist).
#
# Appends 12 veor instructions (in-place XORs) to $code.  The first eight
# arguments are register names; they are permuted to [5,1,2,6,3,7,0,4]
# before being referenced as b[0]..b[7] in the sequence below.
#
# The here-document assignment is terminated with a semicolon like every
# other emitter, so a statement can be added after it without breaking
# the sub.
sub InvInBasisChange {
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	 veor	$b[1], $b[1], $b[7]
	veor	$b[4], $b[4], $b[7]

	veor	$b[7], $b[7], $b[5]
	 veor	$b[1], $b[1], $b[3]
	veor	$b[2], $b[2], $b[5]
	veor	$b[3], $b[3], $b[7]

	veor	$b[6], $b[6], $b[1]
	veor	$b[2], $b[2], $b[0]
	 veor	$b[5], $b[5], $b[3]
	veor	$b[4], $b[4], $b[6]
	veor	$b[0], $b[0], $b[6]
	veor	$b[1], $b[1], $b[4]
___
}
147e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InvOutBasisChange: InBasisChange in reverse.
#
# Appends 13 veor instructions (in-place XORs) to $code.  The first eight
# arguments are register names; they are permuted to [2,5,7,3,6,1,0,4]
# before being referenced as r[0]..r[7] in the sequence below.
sub InvOutBasisChange {
	my @r = @_[2,5,7,3,6,1,0,4];
	$code .= <<___;
	veor	$r[1], $r[1], $r[5]
	veor	$r[2], $r[2], $r[7]

	veor	$r[3], $r[3], $r[1]
	veor	$r[4], $r[4], $r[5]
	veor	$r[7], $r[7], $r[5]
	veor	$r[3], $r[3], $r[4]
	 veor 	$r[5], $r[5], $r[0]
	veor	$r[3], $r[3], $r[7]
	 veor	$r[6], $r[6], $r[2]
	 veor	$r[2], $r[2], $r[1]
	veor	$r[6], $r[6], $r[3]

	veor	$r[3], $r[3], $r[0]
	veor	$r[5], $r[5], $r[6]
___
}
168e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0-t1        *
#;*************************************************************
# Appends a 7-instruction veor/vand sequence to $code.  Arguments are six
# register names: the operand/result pair (overwritten), the second
# operand pair (only read) and two temporaries (both overwritten).
sub Mul_GF4 {
	my ($p0, $p1, $m0, $m1, $tmp0, $tmp1) = @_;
	$code .= <<___;
	veor 	$tmp0, $m0, $m1
	vand	$tmp0, $tmp0, $p0
	veor	$p0, $p0, $p1
	vand	$tmp1, $p1, $m0
	vand	$p0, $p0, $m1
	veor	$p1, $tmp1, $tmp0
	veor	$p0, $p0, $tmp1
___
}
184e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Mul_GF4_N: multiply and scale by N.
# Marked "not used" by the original author; Mul_GF4_N_GF4 below emits the
# same sequence interleaved with a second one.
#
# Appends a 7-instruction veor/vand sequence to $code.  Arguments are five
# register names: the operand/result pair (overwritten), the second
# operand pair (only read) and one temporary (overwritten).
sub Mul_GF4_N {
	my ($p0, $p1, $m0, $m1, $tmp) = @_;
	$code .= <<___;
	veor	$tmp, $m0, $m1
	vand	$tmp, $tmp, $p0
	veor	$p0, $p0, $p1
	vand	$p1, $p1, $m0
	vand	$p0, $p0, $m1
	veor	$p1, $p1, $p0
	veor	$p0, $p0, $tmp
___
}
198e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Mul_GF4_N_GF4: interleaved Mul_GF4_N and Mul_GF4.
#
# Appends 14 instructions to $code, alternating between two independent
# streams.  Arguments are ten register names, five per stream: operand/
# result pair (overwritten), second operand pair (only read), temporary
# (overwritten).  The unindented lines are the Mul_GF4_N sequence on the
# first five registers; the lines indented by one extra space operate on
# the last five.
sub Mul_GF4_N_GF4 {
	my ($nx0, $nx1, $ny0, $ny1, $nt,
	    $gx0, $gx1, $gy0, $gy1, $gt) = @_;
	$code .= <<___;
	veor	$nt, $ny0, $ny1
	 veor 	$gt, $gy0, $gy1
	vand	$nt, $nt, $nx0
	 vand	$gt, $gt, $gx0
	veor	$nx0, $nx0, $nx1
	 veor	$gx0, $gx0, $gx1
	vand	$nx1, $nx1, $ny0
	 vand	$gx1, $gx1, $gy0
	vand	$nx0, $nx0, $ny1
	 vand	$gx0, $gx0, $gy1
	veor	$nx1, $nx1, $nx0
	 veor	$gx0, $gx0, $gx1
	veor	$nx0, $nx0, $nt
	 veor	$gx1, $gx1, $gt
___
}
# Mul_GF16_2: emit two four-register multiplications that share one
# second operand, built from Mul_GF4 and Mul_GF4_N_GF4.
#
# Arguments (16 register names):
#   x[0..7]  two four-register operands, x[0..3] and x[4..7]; both are
#            overwritten with their results
#   y[0..3]  the shared second operand; y[0] and y[1] are XORed with
#            y[2]/y[3] part-way through and XORed back later, so all four
#            hold their original values again on exit
#   t[0..3]  temporaries, overwritten
#
# All emitter calls use the "&" form used throughout this file, and single
# array elements are written as scalars ($x[0]) rather than one-element
# slices.
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	veor	$t[0], $x[0], $x[2]
	veor	$t[1], $x[1], $x[3]
___
	&Mul_GF4	($x[0], $x[1], $y[0], $y[1], @t[2..3]);
$code.=<<___;
	veor	$y[0], $y[0], $y[2]
	veor	$y[1], $y[1], $y[3]
___
	&Mul_GF4_N_GF4	($t[0], $t[1], $y[0], $y[1], $t[3],
			 $x[2], $x[3], $y[2], $y[3], $t[2]);
$code.=<<___;
	veor	$x[0], $x[0], $t[0]
	veor	$x[2], $x[2], $t[0]
	veor	$x[1], $x[1], $t[1]
	veor	$x[3], $x[3], $t[1]

	veor	$t[0], $x[4], $x[6]
	veor	$t[1], $x[5], $x[7]
___
	&Mul_GF4_N_GF4	($t[0], $t[1], $y[0], $y[1], $t[3],
			 $x[6], $x[7], $y[2], $y[3], $t[2]);
$code.=<<___;
	veor	$y[0], $y[0], $y[2]
	veor	$y[1], $y[1], $y[3]
___
	# y[0]/y[1] are back to their original values here.
	&Mul_GF4	($x[4], $x[5], $y[0], $y[1], @t[2..3]);
$code.=<<___;
	veor	$x[4], $x[4], $t[0]
	veor	$x[6], $x[6], $t[0]
	veor	$x[5], $x[5], $t[1]
	veor	$x[7], $x[7], $t[1]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# Arguments are 16 register names: x[0..7] (operand, overwritten with the
# result by the trailing Mul_GF16_2 call), then temporaries t[0..3] and
# s[0..3], all of which are overwritten.
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
#
# The "@ Inv_GF16 ..." line below is an assembler comment.  Its backslashes
# are doubled because this here-document interpolates: a single "\t" would
# be emitted as a TAB character and "\s" would lose its backslash.
$code.=<<___;
	veor	@t[3], @x[4], @x[6]
	veor	@t[2], @x[5], @x[7]
	veor	@t[1], @x[1], @x[3]
	veor	@s[1], @x[7], @x[6]
	 vmov	@t[0], @t[2]
	veor	@s[0], @x[0], @x[2]

	vorr	@t[2], @t[2], @t[1]
	veor	@s[3], @t[3], @t[0]
	vand	@s[2], @t[3], @s[0]
	vorr	@t[3], @t[3], @s[0]
	veor	@s[0], @s[0], @t[1]
	vand	@t[0], @t[0], @t[1]
	veor	@t[1], @x[3], @x[2]
	vand	@s[3], @s[3], @s[0]
	vand	@s[1], @s[1], @t[1]
	veor	@t[1], @x[4], @x[5]
	veor	@s[0], @x[1], @x[0]
	veor	@t[3], @t[3], @s[1]
	veor	@t[2], @t[2], @s[1]
	vand	@s[1], @t[1], @s[0]
	vorr	@t[1], @t[1], @s[0]
	veor	@t[3], @t[3], @s[3]
	veor	@t[0], @t[0], @s[1]
	veor	@t[2], @t[2], @s[2]
	veor	@t[1], @t[1], @s[3]
	veor	@t[0], @t[0], @s[2]
	vand	@s[0], @x[7], @x[3]
	veor	@t[1], @t[1], @s[2]
	vand	@s[1], @x[6], @x[2]
	vand	@s[2], @x[5], @x[1]
	vorr	@s[3], @x[4], @x[0]
	veor	@t[3], @t[3], @s[0]
	veor	@t[1], @t[1], @s[2]
	veor	@t[0], @t[0], @s[3]
	veor	@t[2], @t[2], @s[1]

	@ Inv_GF16 \\t0, \\t1, \\t2, \\t3, \\s0, \\s1, \\s2, \\s3

	@ new smaller inversion

	vand	@s[2], @t[3], @t[1]
	vmov	@s[0], @t[0]

	veor	@s[1], @t[2], @s[2]
	veor	@s[3], @t[0], @s[2]
	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]

	vbsl	@s[1], @t[1], @t[0]
	vbsl	@s[3], @t[3], @t[2]
	veor	@t[3], @t[3], @t[2]

	vbsl	@s[0], @s[1], @s[2]
	vbsl	@t[0], @s[2], @s[1]

	vand	@s[2], @s[0], @s[3]
	veor	@t[1], @t[1], @t[0]

	veor	@s[2], @s[2], @t[3]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	# The intermediate result (s3, s2, s1, t1) is the second operand; the
	# remaining four registers serve as Mul_GF16_2's temporaries.
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
336e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
337e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# AES linear components
338e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# ShiftRows: append to $code a sequence that, for each of eight state
# registers, XORs it with a quad-word loaded from [$key] and then permutes
# the bytes of the result with vtbl.8, using the last argument as the
# index register.
#
# Arguments: state registers (first eight), temporaries (next four) and,
# as the LAST argument, the register holding the vtbl index table.
# $key is the register name bound in the enclosing scope; every vldmia
# uses writeback ("!"), so the emitted code advances it past the eight
# quad-words it loads.
#
# The `&Dlo(...)`/`&Dhi(...)` back-tick expressions are emitted verbatim
# here; they are not evaluated by this sub.
sub ShiftRows {
	my @st   = @_[0..7];
	my @tmp  = @_[8..11];
	my $perm = $_[-1];
	$code .= <<___;
	vldmia	$key!, {$tmp[0]-$tmp[3]}
	veor	$tmp[0], $tmp[0], $st[0]
	veor	$tmp[1], $tmp[1], $st[1]
	vtbl.8	`&Dlo($st[0])`, {$tmp[0]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[0])`, {$tmp[0]}, `&Dhi($perm)`
	vldmia	$key!, {$tmp[0]}
	veor	$tmp[2], $tmp[2], $st[2]
	vtbl.8	`&Dlo($st[1])`, {$tmp[1]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[1])`, {$tmp[1]}, `&Dhi($perm)`
	vldmia	$key!, {$tmp[1]}
	veor	$tmp[3], $tmp[3], $st[3]
	vtbl.8	`&Dlo($st[2])`, {$tmp[2]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[2])`, {$tmp[2]}, `&Dhi($perm)`
	vldmia	$key!, {$tmp[2]}
	vtbl.8	`&Dlo($st[3])`, {$tmp[3]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[3])`, {$tmp[3]}, `&Dhi($perm)`
	vldmia	$key!, {$tmp[3]}
	veor	$tmp[0], $tmp[0], $st[4]
	veor	$tmp[1], $tmp[1], $st[5]
	vtbl.8	`&Dlo($st[4])`, {$tmp[0]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[4])`, {$tmp[0]}, `&Dhi($perm)`
	veor	$tmp[2], $tmp[2], $st[6]
	vtbl.8	`&Dlo($st[5])`, {$tmp[1]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[5])`, {$tmp[1]}, `&Dhi($perm)`
	veor	$tmp[3], $tmp[3], $st[7]
	vtbl.8	`&Dlo($st[6])`, {$tmp[2]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[6])`, {$tmp[2]}, `&Dhi($perm)`
	vtbl.8	`&Dlo($st[7])`, {$tmp[3]}, `&Dlo($perm)`
	vtbl.8	`&Dhi($st[7])`, {$tmp[3]}, `&Dhi($perm)`
___
}
375e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# MixColumns(@x[0..7], @t[0..7] [, $inv])
#
# Appends the bit-sliced MixColumns instruction sequence to $code.
#
#   @_[0..7]   names of the eight state registers (@x); rewritten in place
#   @_[8..15]  names of eight scratch registers (@t); all are overwritten
#   $_[16]     optional flag; when true the alternative output tail is emitted
#
# The shared part rotates every x[i] with vext.8 #12 (annotated below as
# `<<< 32`) and vext.8 #8 (`<<< 64`) and XORs the pieces together.  The two
# tails differ only in where four of the results land: with $inv set, the
# values that the forward tail leaves in x[2]/x[3] and in x[4]/x[6] are
# exchanged (x[6] is then produced via t[3] and an extra vmov).
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=$_[16];	# optional; plain element access, not a one-element slice
$code.=<<___;
	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
	vext.8	@t[1], @x[1], @x[1], #12
	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
	vext.8	@t[2], @x[2], @x[2], #12
	 veor	@x[1], @x[1], @t[1]
	vext.8	@t[3], @x[3], @x[3], #12
	 veor	@x[2], @x[2], @t[2]
	vext.8	@t[4], @x[4], @x[4], #12
	 veor	@x[3], @x[3], @t[3]
	vext.8	@t[5], @x[5], @x[5], #12
	 veor	@x[4], @x[4], @t[4]
	vext.8	@t[6], @x[6], @x[6], #12
	 veor	@x[5], @x[5], @t[5]
	vext.8	@t[7], @x[7], @x[7], #12
	 veor	@x[6], @x[6], @t[6]

	veor	@t[1], @t[1], @x[0]
	 veor	@x[7], @x[7], @t[7]
	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
	veor	@t[2], @t[2], @x[1]
	veor	@t[0], @t[0], @x[7]
	veor	@t[1], @t[1], @x[7]
	 vext.8	@x[1], @x[1], @x[1], #8
	veor	@t[5], @t[5], @x[4]
	 veor	@x[0], @x[0], @t[0]
	veor	@t[6], @t[6], @x[5]
	 veor	@x[1], @x[1], @t[1]
	 vext.8	@t[0], @x[4], @x[4], #8
	veor	@t[4], @t[4], @x[3]
	 vext.8	@t[1], @x[5], @x[5], #8
	veor	@t[7], @t[7], @x[6]
	 vext.8	@x[4], @x[3], @x[3], #8
	veor	@t[3], @t[3], @x[2]
	 vext.8	@x[5], @x[7], @x[7], #8
	veor	@t[4], @t[4], @x[7]
	 vext.8	@x[3], @x[6], @x[6], #8
	veor	@t[3], @t[3], @x[7]
	 vext.8	@x[6], @x[2], @x[2], #8
	veor	@x[7], @t[1], @t[5]
___
$code.=<<___ if (!$inv);
	veor	@x[2], @t[0], @t[4]
	veor	@x[4], @x[4], @t[3]
	veor	@x[5], @x[5], @t[7]
	veor	@x[3], @x[3], @t[6]
	 @ vmov	@x[2], @t[0]
	veor	@x[6], @x[6], @t[2]
	 @ vmov	@x[7], @t[1]
___
$code.=<<___ if ($inv);
	veor	@t[3], @t[3], @x[4]
	veor	@x[5], @x[5], @t[7]
	veor	@x[2], @x[3], @t[6]
	veor	@x[3], @t[0], @t[4]
	veor	@x[4], @x[6], @t[2]
	vmov	@x[6], @t[3]
	 @ vmov	@x[7], @t[1]
___
}
441e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InvMixColumns_orig(@x[0..7], @t[0..7])
#
# Appends to $code a direct inverse-MixColumns sequence whose emitted sections
# are labelled as multiplications by 0x0e, 0x0b, 0x0d and 0x09.
#
#   @_[0..7]   names of the eight state registers (@x)
#   @_[8..15]  names of eight scratch registers (@t)
#
# The @t registers start out as rotated (vext.8 #12) or plain (vmov) copies of
# the inputs and are rotated by a further #12 between sections; t[6] and t[7]
# are temporarily overwritten and restored, as the emitted comments note.
#
# NOTE(review): the closing instructions write their results to @XMM[0..7]
# (the file-level register array) rather than to the registers named in @x,
# so the output location does not follow the arguments -- confirm this is
# intended before reusing the routine with any other register assignment.
# NOTE(review): no call to this sub appears in the part of the file reviewed;
# the decrypt loop there calls InvMixColumns instead.
442e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelsub InvMixColumns_orig {
443e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @x=@_[0..7];
444e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @t=@_[8..15];
445e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Section 1 (0x0e).  The bracketed tags [0]..[7] in the trailing comments mark
# the register that @y, declared after this heredoc, exposes under that index.
446e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
447e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ multiplication by 0x0e
448e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[7], @x[7], @x[7], #12
449e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@t[2], @x[2]
450e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[2], @x[2], @x[5]		@ 2 5
451e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[7], @x[7], @x[5]		@ 7 5
452e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[0], @x[0], @x[0], #12
453e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@t[5], @x[5]
454e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
455e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[0], @x[0], @x[1]		@ 0 1
456e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[1], @x[1], @x[1], #12
457e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[1], @x[1], @x[2]		@ 1 25
458e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
459e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[3], @x[3], @x[3], #12
460e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
461e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
462e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[3], @x[3], @x[7]		@ 3 75
463e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
464e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[6], @x[6], @x[6], #12
465e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@t[4], @x[4]
466e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[6], @x[6], @x[4]		@ 6 4
467e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
468e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[3], @x[3], @x[7]		@ 375 756=36
469e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
470e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[3], @x[3], @t[2]		@ 36 2
471e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[5], @t[5], @t[5], #12
472e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
473e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# @y is a reordered copy of the register names in @x (no instruction is
# emitted for it); the remaining sections address the state through @y.
474e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel					my @y = @x[7,5,0,2,1,3,4,6];
# Sections 2-4 (0x0b, 0x0d, 0x09) and the final moves into @XMM[0..7].
475e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
476e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ multiplication by 0x0b
477e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @y[0]
478e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[0]
479e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[2], @t[2], @t[2], #12
480e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @t[1]
481e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[5]
482e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[4], @t[4], @t[4], #12
483e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @t[6]
484e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[7]
485e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
486e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
487e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[0]
488e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@y[1], @y[1], @y[0]
489e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[0], @t[0], @t[0], #12
490e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[1]
491e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[1]
492e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[1], @t[1], @t[1], #12
493e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[2]
494e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[2]
495e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[2]
496e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[7]
497e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[2], @t[2], @t[2], #12
498e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[3]
499e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[3]
500e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[3]
501e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[4]
502e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[3], @t[3], @t[3], #12
503e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[4]
504e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[7]
505e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
506e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[5]
507e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[4]
508e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
509e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[7]
510e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[4], @t[4], @t[4], #12
511e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[7]
512e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[7]
513e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
514e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[7], @t[7], @t[5]
515e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[5], @t[5], @t[5], #12
516e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
517e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ multiplication by 0x0d
518e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @y[7]
519e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
520e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[4]
521e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[6], @t[6], @t[6], #12
522e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[0]
523e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[5]
524e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[7], @t[7], @t[7], #12
525e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[2]
526e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
527e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @y[1]
528e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @t[1]
529e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[0]
530e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[0]
531e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @t[5]
532e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[5]
533e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[0], @t[0], @t[0], #12
534e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[1], @y[1], @t[7]
535e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[0], @y[0], @t[6]
536e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @y[1]
537e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[1]
538e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[1], @t[1], @t[1], #12
539e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
540e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[7]
541e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[2]
542e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[2]
543e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[2], @y[2], @t[6]
544e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
545e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[2], @t[2], @t[2], #12
546e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @y[7]
547e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[6]
548e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
549e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[6]
550e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[5]
551e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[5], @t[5], @t[5], #12
552e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[4]
553e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[4], @t[4], @t[4], #12
554e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[6]
555e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[7]
556e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[7], @t[7], @t[7], #12
557e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[6], @t[6], @t[3]		@ restore t[6]
558e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[3], @t[3], @t[3], #12
559e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
560e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ multiplication by 0x09
561e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @y[1]
562e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
563e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
564e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[6], @t[6], @t[6], #12
565e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[1], @t[1], @t[5]
566e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[3], @y[3], @t[0]
567e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
568e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[1], @t[1], @t[6]
569e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
570e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[4], @y[4], @t[1]
571e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[7], @y[7], @t[4]
572e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[6], @y[6], @t[3]
573e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@y[5], @y[5], @t[2]
574e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
575e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
576e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
577e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
578e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[3], @t[3], @t[7]
579e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[5], @t[5], @t[6]
580e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
581e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @t[2], @t[6]
582e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
583e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
584e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[0], @t[0]
585e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[1], @t[1]
586e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ vmov	@XMM[2], @t[2]
587e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[3], @t[3]
588e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[4], @t[4]
589e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ vmov	@XMM[5], @t[5]
590e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ vmov	@XMM[6], @t[6]
591e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ vmov	@XMM[7], @t[7]
592e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
593e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
594e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# InvMixColumns(@x[0..7], @t[0..7])
#
# Appends the inverse MixColumns step to $code.
#
#   @_[0..7]   names of the eight state registers (@x); rewritten in place
#   @_[8..15]  names of eight scratch registers (@t); overwritten
#
# Instead of emitting the 0e/0b/0d/09 multiplication directly, the routine
# uses the matrix factorisation shown in the comment inside the sub: it emits
# a pre-multiplication by the 05-00-04-00 circulant and then reuses the
# forward MixColumns generator on the same registers.
595e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelsub InvMixColumns {
596e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @x=@_[0..7];
597e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @t=@_[8..15];
598e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
599e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# Thanks to Jussi Kivilinna for providing pointer to
600e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#
601e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
602e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
603e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
604e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
605e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# Stage 1: for every i, t[i] = x[i] ^ rot(x[i]), where rot is vext.8 #8 with
# both sources equal (the form MixColumns annotates as `<<< 64`); selected
# t[] values are then XORed into x[].
606e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
607e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ multiplication by 0x05-0x00-0x04-0x00
608e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[0], @x[0], @x[0], #8
609e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[6], @x[6], @x[6], #8
610e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[7], @x[7], @x[7], #8
611e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[0], @t[0], @x[0]
612e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[1], @x[1], @x[1], #8
613e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[6], @t[6], @x[6]
614e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[2], @x[2], @x[2], #8
615e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[7], @t[7], @x[7]
616e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[3], @x[3], @x[3], #8
617e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[1], @t[1], @x[1]
618e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[4], @x[4], @x[4], #8
619e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[2], @t[2], @x[2]
620e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8	@t[5], @x[5], @x[5], #8
621e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[3], @t[3], @x[3]
622e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[4], @t[4], @x[4]
623e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@t[5], @t[5], @x[5]
624e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
625e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[0], @x[0], @t[6]
626e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[1], @x[1], @t[6]
627e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[2], @x[2], @t[0]
628e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[4], @x[4], @t[2]
629e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[3], @x[3], @t[1]
630e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[1], @x[1], @t[7]
631e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[2], @x[2], @t[7]
632e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[4], @x[4], @t[6]
633e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[5], @x[5], @t[3]
634e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[3], @x[3], @t[6]
635e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[6], @x[6], @t[4]
636e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[4], @x[4], @t[7]
637e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[5], @x[5], @t[7]
638e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 veor	@x[7], @x[7], @t[5]
639e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Stage 2: reuse the forward generator.  The trailing 1 is MixColumns'
# optional third argument; it selects the output tail in which results 2/3 and
# 4/6 trade places.  MixColumns rewrites every @t register before reading it,
# so nothing left in @t by stage 1 is carried over.
640e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
641e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
642e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# swapmove($lo, $hi, $shift, $bits, $tmp)
#
# Appends six instructions to $code that exchange bit groups between two
# registers, using $tmp as scratch:
#
#     tmp = ((hi >> shift) ^ lo) & bits
#     lo ^= tmp
#     hi ^= tmp << shift
#
# All arguments are interpolated verbatim: four register names and one
# immediate shift count.  The shifts are the .u64 forms, i.e. they act on
# each 64-bit lane separately.
sub swapmove {
my ($lo,$hi,$shift,$bits,$tmp)=@_;
$code.=<<___;
	vshr.u64	$tmp, $hi, #$shift
	veor		$tmp, $tmp, $lo
	vand		$tmp, $tmp, $bits
	veor		$lo, $lo, $tmp
	vshl.u64	$tmp, $tmp, #$shift
	veor		$hi, $hi, $tmp
___
}
# swapmove2x($lo0, $hi0, $lo1, $hi1, $shift, $bits, $tmp0, $tmp1)
#
# Two swapmove exchanges -- ($lo0,$hi0) with scratch $tmp0 and ($lo1,$hi1)
# with scratch $tmp1 -- sharing one shift count and one mask register, emitted
# with their instructions interleaved.  For each pair k:
#
#     tmpk = ((hik >> shift) ^ lok) & bits
#     lok ^= tmpk
#     hik ^= tmpk << shift
#
# The text is appended to $code; arguments are interpolated verbatim.
sub swapmove2x {
my ($lo0,$hi0,$lo1,$hi1,$shift,$bits,$tmp0,$tmp1)=@_;
$code.=<<___;
	vshr.u64	$tmp0, $hi0, #$shift
	 vshr.u64	$tmp1, $hi1, #$shift
	veor		$tmp0, $tmp0, $lo0
	 veor		$tmp1, $tmp1, $lo1
	vand		$tmp0, $tmp0, $bits
	 vand		$tmp1, $tmp1, $bits
	veor		$lo0, $lo0, $tmp0
	vshl.u64	$tmp0, $tmp0, #$shift
	 veor		$lo1, $lo1, $tmp1
	 vshl.u64	$tmp1, $tmp1, #$shift
	veor		$hi0, $hi0, $tmp0
	 veor		$hi1, $hi1, $tmp1
___
}
671e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# bitslice(@regs[0..7], $m0, $m1, $s0, $s1)
#
# Appends a three-stage swapmove network over eight registers to $code.
#
#   @_[0..7]   the eight data registers; they are processed in reversed order
#   @_[8..9]   two registers that receive the byte-replicated mask constants
#   @_[10..11] two scratch registers handed to swapmove2x
#
# Stage masks and shifts, as emitted: 0x55 with shift 1, 0x33 with shift 2,
# 0x0f with shift 4.  The 0x0f constant is loaded into the first mask
# register as soon as the shift-1 stage no longer needs 0x55 there, i.e.
# ahead of the shift-2 stage that uses the second mask register.
sub bitslice {
my ($m0,$m1,$s0,$s1)=@_[8..11];
my @r=reverse(@_[0..7]);
my @scratch=($s0,$s1);
$code.=<<___;
	vmov.i8	$m0,#0x55			@ compose .LBS0
	vmov.i8	$m1,#0x33			@ compose .LBS1
___
	# shift 1: neighbouring registers
	swapmove2x(@r[0..3],1,$m0,@scratch);
	swapmove2x(@r[4..7],1,$m0,@scratch);
$code.=<<___;
	vmov.i8	$m0,#0x0f			@ compose .LBS2
___
	# shift 2: registers two apart
	swapmove2x(@r[0,2,1,3],2,$m1,@scratch);
	swapmove2x(@r[4,6,5,7],2,$m1,@scratch);

	# shift 4: registers four apart
	swapmove2x(@r[0,4,1,5],4,$m0,@scratch);
	swapmove2x(@r[2,6,3,7],4,$m0,@scratch);
}
690e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# ---------------------------------------------------------------------
# Start of the emitted assembly: build-configuration preamble, then the
# first part of _bsaes_decrypt8.
#
# Preamble, as spelled out in the heredoc below:
#   - outside __KERNEL__: arm_arch.h is included and VFP_ABI_PUSH/POP
#     store/reload d8-d15 (VFP_ABI_FRAME 0x40);
#   - under __KERNEL__: the VFP_ABI macros expand to nothing and
#     BSAES_ASM_EXTENDED_KEY, XTS_CHAIN_TWEAK and __ARM_ARCH__ 7 are defined;
#   - under __thumb__, adrl is aliased to adr;
#   - an `#if __ARM_ARCH__>=7` section is opened (its #endif lies further on).
#
# _bsaes_decrypt8 prologue: loads the round-0 key into @XMM[9], points $const
# at .LM0ISR and loads that vector into @XMM[8] (with write-back on $const),
# then for each of @XMM[0..7] XORs the key in via a scratch register and
# permutes the result back into place with vtbl.8 indexed by @XMM[8].
#
# $const, $key and $rounds are interpolated as-is; presumably they hold
# register names assigned earlier in the file -- confirm there.  The
# `&Dlo(...)`/`&Dhi(...)` backtick expressions are emitted literally by these
# heredocs; presumably a later pass outside this part of the file expands them.
691e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
692e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef __KERNEL__
693e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# include "arm_arch.h"
694e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
695e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
696e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
697e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_FRAME	0x40
698e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
699e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_PUSH
700e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_POP
701e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define VFP_ABI_FRAME	0
702e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define BSAES_ASM_EXTENDED_KEY
703e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define XTS_CHAIN_TWEAK
704d2eca20d77d9d42f3163a0a3d6ead75ee3635f99Russell King# define __ARM_ARCH__	7
705e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
706e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
707e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __thumb__
708e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# define adrl adr
709e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
710e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
711e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#if __ARM_ARCH__>=7
712e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.text
713e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
714e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __thumb2__
715e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.thumb
716e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
717e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.code   32
718e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
719e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
720e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.fpu	neon
721e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
722e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	_bsaes_decrypt8,%function
723e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
724e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_decrypt8:
725e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adr	$const,_bsaes_decrypt8
726e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$key!, {@XMM[9]}		@ round 0 key
727e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
728e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
729e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
730e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
731e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[11], @XMM[1], @XMM[9]
732e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
733e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
734e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[12], @XMM[2], @XMM[9]
735e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
736e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
737e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[13], @XMM[3], @XMM[9]
738e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
739e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
740e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[14], @XMM[4], @XMM[9]
741e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
742e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
743e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[15], @XMM[5], @XMM[9]
744e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
745e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
746e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[10], @XMM[6], @XMM[9]
747e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
748e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
749e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[11], @XMM[7], @XMM[9]
750e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
751e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
752e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
753e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
754e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# First bitslice pass over @XMM[0..7]; @XMM[8..11] are the two mask and two
# scratch registers that bitslice takes as its last four arguments.
755e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&bitslice	(@XMM[0..7, 8..11]);
# One is taken off $rounds and the first pass enters at .Ldec_sbox, skipping
# the ShiftRows code placed at .Ldec_loop.
756e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
757e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$rounds,$rounds,#1
758e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Ldec_sbox
759e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
760e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Ldec_loop:
761e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Loop body.  ShiftRows receives @XMM[0..7] plus @XMM[8..12]; @XMM[12] is the
# register reloaded from $const (.LISR / .LISRM0) before every branch back to
# .Ldec_loop -- presumably ShiftRows' permutation-mask argument (its
# definition starts above the part of the file reviewed; confirm there).
762e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&ShiftRows	(@XMM[0..7, 8..12]);
763e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=".Ldec_sbox:\n";
764e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&InvSbox	(@XMM[0..7, 8..15]);
# subs sets the flags; bcc (carry clear, i.e. the subtraction borrowed) leaves
# the loop once the counter was already zero before this decrement.
765e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
766e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$rounds,$rounds,#1
767e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bcc	.Ldec_done
768e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# The state is handed over in the register order 0,1,6,4,2,7,3,5 -- presumably
# the order in which InvSbox leaves its outputs (confirm against InvSbox).
# The same order is used for the second bitslice call below.
769e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
# The eq/ne conditions below consume the flags left by the subs above; the
# InvMixColumns heredocs in between consist of vext.8/veor/vmov only.
#   ne: branch back with the .LISR vector just loaded into @XMM[12];
#   eq: counter reached zero -> step $const forward by 0x10 and load the next
#       vector (.LISRM0) for the final pass, then branch back.
770e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
771e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const, {@XMM[12]}		@ .LISR
772e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ite	eq				@ Thumb2 thing, sanity check in ARM
773e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	addeq	$const,$const,#0x10
774e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne	.Ldec_loop
775e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const, {@XMM[12]}		@ .LISRM0
776e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Ldec_loop
777e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
778e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Ldec_done:
779e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Second bitslice pass, emitted at .Ldec_done over the permuted register
# order; the statement that follows XORs the last round key into the result.
780e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
781e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
782e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$key, {@XMM[8]}			@ last round key
783e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[8]
784e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[8]
785e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[8]
786e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[8]
787e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[3], @XMM[3], @XMM[8]
788e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[5], @XMM[5], @XMM[8]
789e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[8]
790e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
791e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bx	lr
792e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	_bsaes_decrypt8,.-_bsaes_decrypt8
793e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
@ Constant pool for the bit-sliced AES routines. Every pair of .quad
@ values below forms one 16-byte vector that the code loads whole into a
@ vector register; .LM0SR and .LM0 are used as vtbl.8 byte-permutation
@ index vectors.
@ The order and the 16-byte size of the entries matter: the code computes
@ the address of one entry relative to a function label and then reaches
@ the following entry with a post-incremented load or by adding 0x10, so
@ .LISRM0 must directly follow .LISR, and .LSR then .LSRM0 must directly
@ follow .LM0SR.
794e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	_bsaes_const,%object
795e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	6
796e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_const:
797e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LM0ISR:	@ InvShiftRows constants
798e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
799e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LISR:
800e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
801e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LISRM0:
802e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
803e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LM0SR:		@ ShiftRows constants
804e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
805e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LSR:
806e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
807e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LSRM0:
808e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
809e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LM0:
810e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
811e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.LREVM0SR:
812e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	0x090d01050c000408, 0x03070b0f060a0e02
813e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
814e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	6
815e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	_bsaes_const,.-_bsaes_const
816e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
@ _bsaes_encrypt8: bit-sliced AES encryption of eight blocks in parallel.
@ In:  eight input blocks in generator registers XMM 0-7; the key register
@      points at the key schedule, round 0 key first (presumably the
@      layout written by _bsaes_key_convert, TODO confirm); the rounds
@      register holds the round count.
@ Out: eight results, each xored with the last round key, in registers
@      XMM 0, 1, 4, 6, 3, 7, 2, 5 in that block order.
@ The key pointer, round counter and constant pointer are modified.
817e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	_bsaes_encrypt8,%function
818e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
819e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_encrypt8:
	@ form the PC-relative address of .LM0SR in the constant pool
820e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adr	$const,_bsaes_encrypt8
821e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$key!, {@XMM[9]}		@ round 0 key
822e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
823e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
824e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const!, {@XMM[8]}		@ .LM0SR
@ Second entry label: execution arriving here skips the loads above, so
@ XMM 9 must already hold the round 0 key and XMM 8 the permutation
@ vector. NOTE(review): presumably used by callers outside this chunk.
825e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_encrypt8_alt:
826e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
827e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[11], @XMM[1], @XMM[9]
828e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
829e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
830e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[12], @XMM[2], @XMM[9]
831e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
832e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
833e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[13], @XMM[3], @XMM[9]
834e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
835e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
836e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[14], @XMM[4], @XMM[9]
837e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
838e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
839e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[15], @XMM[5], @XMM[9]
840e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
841e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
842e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[10], @XMM[6], @XMM[9]
843e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
844e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
845e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[11], @XMM[7], @XMM[9]
846e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
847e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
848e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
849e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
@ Third entry label, placed directly before the bitslice step.
@ NOTE(review): presumably for callers that prepare XMM 0-7 themselves.
850e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_encrypt8_bitslice:
851e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
852e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&bitslice	(@XMM[0..7, 8..11]);
853e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
	@ enter the round loop at the S-box step, bypassing the ShiftRows
	@ code that the generator emits at the loop head
854e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$rounds,$rounds,#1
855e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lenc_sbox
856e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
857e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lenc_loop:
858e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
859e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&ShiftRows	(@XMM[0..7, 8..12]);
860e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=".Lenc_sbox:\n";
861e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&Sbox		(@XMM[0..7, 8..15]);
862e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
	@ count down; once the counter borrows, leave before MixColumns
863e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$rounds,$rounds,#1
864e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bcc	.Lenc_done
865e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
866e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
867e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
	@ Pick the ShiftRows permutation for the next pass: .LSR normally, or
	@ the following table entry .LSRM0 when the subs above reached zero
	@ (this relies on the flags surviving the code emitted in between).
868e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const, {@XMM[12]}		@ .LSR
869e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ite	eq				@ Thumb2 thing, sanity check in ARM
870e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	addeq	$const,$const,#0x10
871e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne	.Lenc_loop
872e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const, {@XMM[12]}		@ .LSRM0
873e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lenc_loop
874e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
875e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lenc_done:
876e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
877e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
878e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
879e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
880e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$key, {@XMM[8]}			@ last round key
881e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[8]
882e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[8]
883e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[3], @XMM[3], @XMM[8]
884e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[8]
885e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[8]
886e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[5], @XMM[5], @XMM[8]
887e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[8]
888e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
889e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bx	lr
890e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	_bsaes_encrypt8,.-_bsaes_encrypt8
891e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
892e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
893e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel{
# Core registers used by _bsaes_key_convert in this scope: destination
# buffer pointer, source key pointer, round counter, constant-pool pointer.
894e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
895e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
# bitslice_key(r0..r7, bs0, bs1, bs2, t2, t3)
#
# Appends to $code a register-only bit-slice sequence built from
# swapmove/swapmove2x steps with shift counts 1, 2 and 4. The first eight
# arguments are the data registers (handled in reverse order); the other
# five are handed to swapmove/swapmove2x unchanged -- presumably three
# mask registers and two temporaries, TODO confirm against those helpers,
# which are defined outside this chunk.
#
# Only one swapmove is emitted for the first stage and one swapmove2x for
# the second; the remaining registers are filled with vmov copies in place
# of the swaps that are left commented out.
# NOTE(review): that shortcut looks valid only if all eight registers hold
# the same value on entry -- confirm with the caller.
# NOTE(review): $t0 and $t1 inside the emitted "@ &swapmove..." comment
# lines are not lexicals of this sub; they interpolate whatever an outer
# scope provides.
896e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelsub bitslice_key {
897e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @x=reverse(@_[0..7]);
898e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
899e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
	# stage 1 (shift count 1): swap only the x[0]/x[1] pair, then copy
900e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
901e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
902e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
903e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[2], @x[0]
904e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[3], @x[1]
905e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
906e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
907e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
	# stage 2 (shift count 2): swap x[0..3], then copy into x[4..7]
908e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
909e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
910e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
911e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[4], @x[0]
912e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[6], @x[2]
913e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[5], @x[1]
914e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@x[7], @x[3]
915e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
	# stage 3 (shift count 4): emitted in full for both register groups
916e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
917e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
918e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
919e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
920e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
@ _bsaes_key_convert: rewrite an expanded AES key schedule in the
@ bit-sliced layout.
@ In:  r4 = source key schedule, r5 = number of rounds,
@      r12 = destination buffer.
@ Out: the round 0 key stored as 16 bytes, followed by 128 bytes for each
@      of the (rounds - 1) following round keys. The last round key is NOT
@      stored: it is returned in XMM 15, with XMM 7 set to 0x63 in every
@      byte and r12 pointing just past the data written, so that the
@      caller can finish the schedule.
@ Writes XMM 0-15 and r6, and updates r4, r5 and r12.
921e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	_bsaes_key_convert,%function
922e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
923e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel_bsaes_key_convert:
924e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adr	$const,_bsaes_key_convert
925e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
926e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$const,$const,#_bsaes_key_convert-.LM0
927e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
928e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
929e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[8],  #0x01			@ bit masks
930e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[9],  #0x02
931e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[10], #0x04
932e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[11], #0x08
933e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[12], #0x10
934e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[13], #0x20
935e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$const, {@XMM[14]}		@ .LM0
936e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
937e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __ARMEL__
938e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vrev32.8	@XMM[7],  @XMM[7]
939e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vrev32.8	@XMM[15], @XMM[15]
940e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
941e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$rounds,$rounds,#1
942e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$out!, {@XMM[7]}		@ save round 0 key
	@ jump across any padding inserted by the alignment directive below
943e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lkey_loop
944e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
945e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
946e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lkey_loop:
	@ permute the bytes of the current round key through .LM0
947e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
948e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
	@ the 0x40 and 0x80 masks are rebuilt on every pass because their
	@ registers are overwritten further down
949e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[6],  #0x40
950e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[15], #0x80
951e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
	@ expand bit i of every key byte into an all-ones or all-zeros byte
	@ in XMM i
952e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[0], @XMM[7], @XMM[8]
953e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[1], @XMM[7], @XMM[9]
954e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[2], @XMM[7], @XMM[10]
955e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[3], @XMM[7], @XMM[11]
956e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[4], @XMM[7], @XMM[12]
957e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[5], @XMM[7], @XMM[13]
958e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[6], @XMM[7], @XMM[6]
959e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vtst.8	@XMM[7], @XMM[7], @XMM[15]
960e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
	@ invert planes 0, 1, 5 and 6, i.e. the bit positions set in 0x63
961e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmvn	@XMM[0], @XMM[0]		@ "pnot"
962e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmvn	@XMM[1], @XMM[1]
963e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmvn	@XMM[5], @XMM[5]
964e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmvn	@XMM[6], @XMM[6]
965e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __ARMEL__
966e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vrev32.8	@XMM[15], @XMM[15]
967e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
968e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$rounds,$rounds,#1
969e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
970e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne	.Lkey_loop
971e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
972e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i8	@XMM[7],#0x63			@ compose .L63
973e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ don't save last round key
974e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bx	lr
975e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	_bsaes_key_convert,.-_bsaes_key_convert
976e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
977e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
978e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
979e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelif (0) {		# following four functions are unsupported interface
980e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel			# used for benchmarking...
981e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
@ bsaes_enc_key_convert: benchmarking-only helper (the generator wraps
@ this section in a disabled if block, so it is not emitted).
@ Builds the bit-sliced encryption key schedule: the round count is read
@ from offset 240 of the source key structure, _bsaes_key_convert does
@ the bulk of the work, and the last round key it returns in XMM 15 is
@ xored with the 0x63 vector in XMM 7 and appended at r12.
@ NOTE(review): the inp and out register names used here resolve in a
@ generator scope that is not visible in this chunk - confirm them.
982e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_enc_key_convert
983e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_enc_key_convert,%function
984e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
985e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_enc_key_convert:
986e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!,{r4-r6,lr}
987e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmdb	sp!,{d8-d15}		@ ABI specification says so
988e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
989e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r5,[$inp,#240]			@ pass rounds
990e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4,$inp				@ pass key
991e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r12,$out			@ pass key schedule
992e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
993e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
994e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[7]}			@ save last round key
995e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
996e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	sp!,{d8-d15}
997e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!,{r4-r6,pc}
998e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
999e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
@ bsaes_encrypt_128: benchmarking-only helper (the generator wraps this
@ section in a disabled if block, so it is not emitted).
@ Each pass loads eight 16-byte blocks from inp, runs _bsaes_encrypt8
@ with a fixed round count of 10, and writes the results to out in the
@ register order 0, 1, 4, 6, 3, 7, 2, 5. len is reduced by 0x80 per pass
@ and the loop repeats while the remainder is above zero.
@ NOTE(review): a length that is not a multiple of 0x80 still gets a
@ full eight-block pass for the remainder - presumably callers only pass
@ multiples; confirm.
1000e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_encrypt_128
1001e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_encrypt_128,%function
1002e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1003e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_encrypt_128:
1004e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!,{r4-r6,lr}
1005e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmdb	sp!,{d8-d15}		@ ABI specification says so
1006e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lenc128_loop:
1007e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
1008e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
1009e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4,$key				@ pass the key
1010e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
1011e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5,#10				@ pass rounds
1012e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
1013e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1014e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_encrypt8
1015e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1016e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1017e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1018e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1019e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[3]}, [$out]!
1020e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[7]}, [$out]!
1021e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1022e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$len,$len,#0x80
1023e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[5]}, [$out]!
1024e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bhi	.Lenc128_loop
1025e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1026e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	sp!,{d8-d15}
1027e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!,{r4-r6,pc}
1028e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1029e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
@ bsaes_dec_key_convert: benchmarking-only helper (the generator wraps
@ this section in a disabled if block, so it is not emitted).
@ Builds the bit-sliced decryption key schedule. After
@ _bsaes_key_convert returns, the last round key from XMM 15 is appended
@ unmodified at r12, and the round 0 key at the start of the output
@ buffer is reloaded, xored with the 0x63 vector left in XMM 7, and
@ written back in place.
@ NOTE(review): the inp and out register names used here resolve in a
@ generator scope that is not visible in this chunk - confirm them.
1030e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_dec_key_convert
1031e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_dec_key_convert,%function
1032e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1033e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_dec_key_convert:
1034e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!,{r4-r6,lr}
1035e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmdb	sp!,{d8-d15}		@ ABI specification says so
1036e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1037e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r5,[$inp,#240]			@ pass rounds
1038e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4,$inp				@ pass key
1039e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r12,$out			@ pass key schedule
1040e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1041e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$out, {@XMM[6]}
1042e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12,  {@XMM[15]}		@ save last round key
1043e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
1044e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$out, {@XMM[7]}
1045e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1046e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	sp!,{d8-d15}
1047e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!,{r4-r6,pc}
1048e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1049e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
@ bsaes_decrypt_128: benchmarking-only helper (the generator wraps this
@ section in a disabled if block, so it is not emitted).
@ Each pass loads eight 16-byte blocks from inp, runs _bsaes_decrypt8
@ with a fixed round count of 10, and writes the results to out in the
@ register order 0, 1, 6, 4, 2, 7, 3, 5. len is reduced by 0x80 per pass
@ and the loop repeats while the remainder is above zero.
@ NOTE(review): a length that is not a multiple of 0x80 still gets a
@ full eight-block pass for the remainder - presumably callers only pass
@ multiples; confirm.
1050e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_decrypt_128
1051e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_decrypt_128,%function
1052e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1053e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_decrypt_128:
1054e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!,{r4-r6,lr}
1055e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmdb	sp!,{d8-d15}		@ ABI specification says so
1056e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Ldec128_loop:
1057e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
1058e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
1059e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4,$key				@ pass the key
1060e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
1061e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5,#10				@ pass rounds
1062e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
1063e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1064e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1065e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1066e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1067e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1068e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1069e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1070e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[7]}, [$out]!
1071e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[3]}, [$out]!
1072e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$len,$len,#0x80
1073e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[5]}, [$out]!
1074e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bhi	.Ldec128_loop
1075e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1076e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	sp!,{d8-d15}
1077e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!,{r4-r6,pc}
1078e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1079e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1080e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
1081e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel{
1082e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1083e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($keysched)=("sp");
1084e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1085e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1086e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.extern AES_cbc_encrypt
1087e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.extern AES_decrypt
1088e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1089e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.global	bsaes_cbc_encrypt
1090e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_cbc_encrypt,%function
1091e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	5
1092e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_cbc_encrypt:
1093e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	__KERNEL__
1094e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp	$len, #128
1095e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	__thumb__
1096e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	AES_cbc_encrypt
1097e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1098e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bhs	1f
1099e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	AES_cbc_encrypt
1100e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel1:
1101e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1102e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1103e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1104e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ it is up to the caller to make sure we are called with enc == 0
1105e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1106e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	ip, sp
1107e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!, {r4-r10, lr}
1108e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_PUSH
1109e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
1110e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$len, $len, lsr#4		@ len in 16 byte blocks
1111e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	sp, #0x10			@ scratch space to carry over the IV
1112e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$fp, sp				@ save sp
1113e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1114e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$rounds, [$key, #240]		@ get # of rounds
1115e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1116e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ allocate the key schedule on the stack
1117e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
1118e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, #`128-32`			@ size of bit-sliced key schedule
1119e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1120e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
1121e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1122e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1123e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r12				@ sp is $keysched
1124e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1125e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$keysched, {@XMM[6]}
1126e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12,  {@XMM[15]}		@ save last round key
1127e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
1128e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$keysched, {@XMM[7]}
1129e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1130e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r12, [$key, #244]
1131e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	eors	r12, #1
1132e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	0f
1133e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1134e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
1135e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r12, [$key, #244]
1136e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1137e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1138e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, $key, #248			@ pass key schedule
1139e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1140e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r4, $key, #248
1141e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	r4, {@XMM[6]}
1142e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[15]}			@ save last round key
1143e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
1144e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r4, {@XMM[7]}
1145e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1146e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	2
1147e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel0:
1148e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1149e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1150e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
1151e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_loop
1152e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1153e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1154e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_loop:				@ main loop, 8 blocks per pass
1155e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$len, $len, #0x8
1156e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bmi	.Lcbc_dec_loop_finish		@ fewer than 8 blocks left
1157e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1158e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
1159e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
1160e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1161e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $keysched			@ pass the key
1162e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1163e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r4, $key, #248
1164e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1165e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
1166e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass number of rounds
1167e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]	@ no writeback on the last pair
1168e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x60		@ rewind to first block of this batch
1169e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$fp, {@XMM[15]}			@ put aside IV
1170e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1171e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8			@ results are used in register order 0,1,6,4,2,7,3,5
1172e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1173e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1174e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1175e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1176e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
1177e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]	@ ^= previous ciphertext block
1178e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1179e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
1180e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[10]
1181e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[11]
1182e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!	@ second one is the IV for the next pass
1183e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[12]
1184e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1185e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[3], @XMM[3], @XMM[13]
1186e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1187e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[5], @XMM[5], @XMM[14]
1188e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1189e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1190e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[7]}, [$out]!
1191e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[3]}, [$out]!
1192e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[5]}, [$out]!
1193e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1194e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_loop
1195e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1196e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_loop_finish:			@ tail: 0 to 7 blocks left
1197e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adds	$len, $len, #8			@ undo the last subtraction
1198e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	.Lcbc_dec_done			@ nothing left
1199e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1200e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]}, [$inp]!		@ load input
1201e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp	$len, #2
1202e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lcbc_dec_one
1203e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[1]}, [$inp]!
1204e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1205e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $keysched			@ pass the key
1206e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1207e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r4, $key, #248
1208e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1209e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass number of rounds
1210e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$fp, {@XMM[15]}			@ put aside IV
1211e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	.Lcbc_dec_two			@ flags still from cmp with 2
1212e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[2]}, [$inp]!
1213e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp	$len, #4
1214e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lcbc_dec_three
1215e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[3]}, [$inp]!
1216e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	.Lcbc_dec_four			@ flags still from cmp with 4
1217e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[4]}, [$inp]!
1218e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp	$len, #6
1219e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lcbc_dec_five
1220e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[5]}, [$inp]!
1221e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	.Lcbc_dec_six			@ flags still from cmp with 6
1222e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[6]}, [$inp]!		@ seventh block
1223e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x70		@ rewind over the 7 blocks just loaded
1224e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1225e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8			@ only 7 of the results are stored below
1226e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1227e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1228e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1229e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1230e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
1231e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1232e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1233e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
1234e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[10]
1235e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[11]
1236e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ last ciphertext block, handed back as IV
1237e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[12]
1238e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1239e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[3], @XMM[3], @XMM[13]
1240e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1241e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1242e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1243e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[7]}, [$out]!
1244e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[3]}, [$out]!
1245e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1246e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1247e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_six:				@ tail: exactly 6 blocks left
1248e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x60		@ rewind over the 6 blocks just loaded
1249e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1250e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp,{@XMM[14]}			@ reload IV
1251e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1252e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1253e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
1254e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1255e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1256e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[12]}, [$inp]!
1257e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[10]
1258e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[11]
1259e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ last ciphertext block, handed back as IV
1260e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[12]
1261e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1262e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1263e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1264e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1265e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[7]}, [$out]!
1266e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1267e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1268e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_five:				@ tail: exactly 5 blocks left
1269e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x50		@ rewind over the 5 blocks just loaded
1270e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1271e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1272e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1273e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1274e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
1275e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1276e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1277e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ last ciphertext block, handed back as IV
1278e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[10]
1279e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1280e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[2], @XMM[2], @XMM[11]
1281e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1282e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1283e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[2]}, [$out]!
1284e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1285e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1286e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_four:				@ tail: exactly 4 blocks left
1287e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x40		@ rewind over the 4 blocks just loaded
1288e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1289e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1290e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1291e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1292e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[10]}, [$inp]!
1293e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1294e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1295e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ last ciphertext block, handed back as IV
1296e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[4], @XMM[4], @XMM[10]
1297e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1298e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1299e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[4]}, [$out]!
1300e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1301e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1302e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_three:				@ tail: exactly 3 blocks left
1303e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x30		@ rewind over the 3 blocks just loaded
1304e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1305e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1306e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
1307e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1308e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ last ciphertext block, handed back as IV
1309e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1310e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[6], @XMM[6], @XMM[9]
1311e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1312e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[6]}, [$out]!
1313e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1314e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1315e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_two:				@ tail: exactly 2 blocks left
1316e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x20		@ rewind over the 2 blocks just loaded
1317e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_decrypt8
1318e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$fp, {@XMM[14]}			@ reload IV
1319e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
1320e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
1321e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[15]}, [$inp]!		@ reload input, last block is handed back as IV
1322e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[1], @XMM[1], @XMM[8]
1323e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1324e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lcbc_dec_done
1325e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1326e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_one:				@ tail: one block left, use AES_decrypt
1327e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	$inp, $inp, #0x10		@ rewind over the block just loaded
1328e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$rounds, $out			@ save original out pointer
1329e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$out, $fp			@ use the iv scratch space as out buffer
1330e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r2, $key			@ key pointer goes in r2 for the call
1331e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
1332e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[5],@XMM[0]			@ and input are preserved
1333e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	AES_decrypt
1334e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
1335e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
1336e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
1337e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]}, [$rounds]		@ write output
1338e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1339e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_done:				@ common exit for all paths
1340e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1341e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q0, #0			@ zeros used for wiping
1342e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q1, #0
1343e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lcbc_dec_bzero:				@ wipe key schedule [if any]
1344e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		$keysched!, {q0-q1}	@ clear 32 bytes and advance
1345e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		$keysched, $fp
1346e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne		.Lcbc_dec_bzero		@ until the pointer reaches the IV scratch area
1347e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1348e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1349e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, $fp
1350e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
1351e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
1352e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_POP
1353e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!, {r4-r10, pc}		@ restore registers and return
1354e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1355e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1356e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
1357e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel{
1358e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));	# r0-r3 for the first four names, r8-r10 for the last three
1359e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy $const = "r6";	# shared with _bsaes_encrypt8_alt
1360e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy $keysched = "sp";	# the assembly below addresses the key schedule through sp
1361e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1362e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1363e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.extern	AES_encrypt
1364e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.global	bsaes_ctr32_encrypt_blocks
1365e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_ctr32_encrypt_blocks,%function
1366e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	5
1367e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_ctr32_encrypt_blocks:
1368e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp	$len, #8			@ use plain AES for
1369e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lctr_enc_short			@ small sizes
1370e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1371e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	ip, sp				@ keep entry sp to reach the stack argument
1372e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!, {r4-r10, lr}		@ save r4-r10 and the return address
1373e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_PUSH
1374e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
1375e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
1376e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$fp, sp				@ save sp
1377e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1378e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$rounds, [$key, #240]		@ get # of rounds
1379e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1380e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ allocate the key schedule on the stack
1381e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
1382e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, #`128-32`			@ size of bit-sliced key schedule
1383e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1384e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
1385e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1386e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1387e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r12				@ sp is $keysched
1388e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1389e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
1390e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[7]}			@ save last round key
1391e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1392e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
1393e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
1394e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	$keysched, {@XMM[4]}		@ load round0 key
1395e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1396e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r12, [$key, #244]		@ word at key+244: 1 means the schedule is already converted
1397e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	eors	r12, #1				@ zero only when that word was 1
1398e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	0f				@ then skip the conversion
1399e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1400e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
1401e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r12, [$key, #244]		@ store the toggled word back
1402e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1403e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1404e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, $key, #248			@ pass key schedule
1405e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1406e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
1407e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[7]}			@ save last round key
1408e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1409e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	2
1410e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel0:	add	r12, $key, #248			@ converted schedule starts at key+248
1411e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
1412e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adrl	$ctr, .LREVM0SR			@ borrow $ctr
1413e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	r12, {@XMM[4]}			@ load round0 key
1414e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	sp, #0x10			@ place for adjusted round0 key
1415e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1416e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1417e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	@XMM[8],#1		@ compose 1<<96
1418e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9],@XMM[9],@XMM[9]
1419e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vrev32.8	@XMM[0],@XMM[0]		@ byte-swap each 32-bit word of the counter
1420e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
1421e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vrev32.8	@XMM[4],@XMM[4]		@ same byte-swap for the round0 key
1422e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
1423e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
1424e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lctr_enc_loop
1425e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1426e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1427e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_loop:				@ main loop, 8 counter blocks per pass
1428e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
1429e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
1430e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
1431e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
1432e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[4], @XMM[1], @XMM[10]	@ +4
1433e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[5], @XMM[2], @XMM[10]	@ +5
1434e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[6], @XMM[3], @XMM[10]	@ +6
1435e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[7], @XMM[4], @XMM[10]	@ +7
1436e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
1437e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1438e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1439e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ to flip byte order in 32-bit counter
1440e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1441e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$keysched, {@XMM[9]}		@ load round0 key
1442e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1443e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $keysched, #0x10		@ pass next round key
1444e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1445e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #`248+16`
1446e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1447e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
1448e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1449e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		$fp, {@XMM[10]}			@ save next counter
1450e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
1451e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1452e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8_alt		@ results are used in register order 0,1,4,6,3,7,2,5
1453e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1454e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, $len, #8
1455e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo		.Lctr_enc_loop_done		@ fewer than 8 blocks left
1456e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1457e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
1458e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
1459e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[8]		@ xor with input block
1460e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[9]
1461e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
1462e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[10]
1463e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[11]
1464e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
1465e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[12]
1466e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
1467e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[7], @XMM[13]
1468e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[14]
1469e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[4]}, [$out]!
1470e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[15]
1471e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[6]}, [$out]!
1472e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	@XMM[8], #1			@ compose 1<<96
1473e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[3]}, [$out]!
1474e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[9], @XMM[9]
1475e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[7]}, [$out]!
1476e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
1477e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[2]}, [$out]!
1478e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
1479e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[5]}, [$out]!
1480e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$fp, {@XMM[0]}			@ load counter
1481e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1482e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne		.Lctr_enc_loop			@ flags from subs above: blocks remain
1483e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lctr_enc_done
1484e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1485e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1486e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_loop_done:			@ tail: 1 to 7 blocks left
1487e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		$len, $len, #8		@ undo the last subtraction
1488e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[8]}, [$inp]!	@ load input
1489e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[8]
1490e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [$out]!	@ write output
1491e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		$len, #2
1492e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo		.Lctr_enc_done		@ that was the only block
1493e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[9]}, [$inp]!
1494e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[9]
1495e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[1]}, [$out]!
1496e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq		.Lctr_enc_done		@ flags still from cmp with 2
1497e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[10]}, [$inp]!
1498e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[10]
1499e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[4]}, [$out]!
1500e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		$len, #4
1501e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo		.Lctr_enc_done
1502e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[11]}, [$inp]!
1503e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[11]
1504e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[6]}, [$out]!
1505e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq		.Lctr_enc_done		@ flags still from cmp with 4
1506e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[12]}, [$inp]!
1507e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[12]
1508e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[3]}, [$out]!
1509e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		$len, #6
1510e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo		.Lctr_enc_done
1511e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[13]}, [$inp]!
1512e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[7], @XMM[13]
1513e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[7]}, [$out]!
1514e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq		.Lctr_enc_done		@ flags still from cmp with 6
1515e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[14]}, [$inp]	@ seventh block, no writeback
1516e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[14]
1517e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[2]}, [$out]!
1518e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1519e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_done:				@ wipe scratch data, then return
1520e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q0, #0			@ zeros used for wiping
1521e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q1, #0
1522e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1523e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_bzero:			@ wipe key schedule [if any]
1524e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		$keysched!, {q0-q1}	@ clear 32 bytes and advance
1525e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		$keysched, $fp
1526e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne		.Lctr_enc_bzero		@ until sp reaches the counter scratch area
1527e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1528e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		$keysched, {q0-q1}	@ clear adjusted round0 key and the counter scratch above it
1529e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1530e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1531e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, $fp
1532e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
1533e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_POP
1534e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!, {r4-r10, pc}	@ return
1535e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1536e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1537e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_short:
1538e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	ip, [sp]		@ ctr pointer is passed on stack
1539e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!, {r4-r8, lr}
1540e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1541e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $inp		@ copy arguments
1542e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $out
1543e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r6, $len
1544e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r7, $key
1545e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r8, [ip, #12]		@ load counter LSW
1546e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
1547e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __ARMEL__
1548e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	rev	r8, r8
1549e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1550e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	sp, sp, #0x10
1551e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
1552e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	sp, sp, #0x10
1553e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1554e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lctr_enc_short_loop:
1555e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r0, sp, #0x10		@ input counter value
1556e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r1, sp			@ output on the stack
1557e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r2, r7			@ key
1558e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1559e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	AES_encrypt
1560e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1561e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[0]}, [r4]!	@ load input
1562e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
1563e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r8, r8, #1
1564e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef __ARMEL__
1565e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	rev	r0, r8
1566e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r0, [sp, #0x1c]		@ next counter value
1567e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1568e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r8, [sp, #0x1c]		@ next counter value
1569e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1570e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[0],@XMM[0],@XMM[1]
1571e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8	{@XMM[0]}, [r5]!	@ store output
1572e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	r6, r6, #1
1573e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne	.Lctr_enc_short_loop
1574e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1575e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q0, #0
1576e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q1, #0
1577e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		sp!, {q0-q1}
1578e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1579e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia	sp!, {r4-r8, pc}
1580e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1581e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1582e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
1583e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel{
1584e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel######################################################################
1585e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1586e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#	const AES_KEY *key1, const AES_KEY *key2,
1587e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#	const unsigned char iv[16]);
1588e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#
# Register roles used by the XTS code emitted below.
# map("r$_",(7..10,1..3)) expands to r7,r8,r9,r10,r1,r2,r3, giving:
#   $inp=r7  $out=r8  $len=r9  $key=r10  $rounds=r1  $magic=r2  $fp=r3
# The emitted prologue copies the incoming r0-r3 arguments into r7-r10
# before r1-r3 take on these roles; r6 holds the saved sp until it is
# moved into $fp.
1589e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
1590e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy $const="r6";		# returned by _bsaes_key_convert
# $twmask receives the constant stored at .Lxts_magic (loaded through
# $magic); @T names two scratch vector registers whose roles alternate
# between steps of the tweak-generation loops.
# NOTE(review): @XMM is defined outside this chunk -- presumably q0..q15,
# which would make $twmask q5 and @T q6/q7; confirm against its definition.
1591e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy $twmask=@XMM[5];
1592e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelmy @T=@XMM[6..7];
1593e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1594e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1595e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_xts_encrypt
1596e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_xts_encrypt,%function
1597e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1598e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_xts_encrypt:
1599e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	ip, sp
1600e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!, {r4-r10, lr}		@ 0x20
1601e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_PUSH
1602e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r6, sp				@ future $fp
1603e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1604e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$inp, r0
1605e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$out, r1
1606e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$len, r2
1607e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$key, r3
1608e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1609e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r0, sp, #0x10			@ 0x10
1610e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bic	r0, #0xf			@ align at 16 bytes
1611e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r0
1612e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1613e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
1614e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r0, [ip]			@ pointer to input tweak
1615e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1616e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ generate initial tweak
1617e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r0, [ip, #4]			@ iv[]
1618e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r1, sp
1619e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r2, [ip, #0]			@ key2
1620e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	AES_encrypt
1621e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r0,sp				@ pointer to initial tweak
1622e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1623e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1624e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$rounds, [$key, #240]		@ get # of rounds
1625e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$fp, r6
1626e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1627e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ allocate the key schedule on the stack
1628e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
1629e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
1630e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, #`32+16`			@ place for tweak[9]
1631e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1632e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
1633e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1634e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1635e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r12
1636e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, #0x90			@ pass key schedule
1637e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1638e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
1639e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[7]}			@ save last round key
1640e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1641e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r12, [$key, #244]
1642e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	eors	r12, #1
1643e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	0f
1644e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1645e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r12, [$key, #244]
1646e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
1647e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
1648e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, $key, #248			@ pass key schedule
1649e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
1650e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
1651e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12, {@XMM[7]}
1652e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1653e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	2
1654e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel0:	sub	sp, #0x90			@ place for tweak[9]
1655e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1656e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1657e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
1658e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adr	$magic, .Lxts_magic
1659e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1660e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$len, #0x80
1661e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lxts_enc_short
1662e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lxts_enc_loop
1663e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1664e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1665e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_loop:
1666e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$magic, {$twmask}	@ load XTS magic
1667e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[0], @XMM[8], #63
1668e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1669e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[0], @T[0], $twmask
1670e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Unrolled generator for the 8-block path: $i walks the seven tweak
# registers @XMM[9..15], each derived from @XMM[$i-1].  Per step the
# emitted assembly
#   - adds @XMM[$i-1] to itself lane-wise (vadd.u64) to form @XMM[$i],
#   - stores @XMM[$i-1] through r0 with post-increment,
#   - swaps the two halves of the mask in @T[0] and XORs it into @XMM[$i],
#   - builds the following step's mask in @T[1] from the sign bits of
#     @XMM[$i] (vshr.s64 #63) ANDed with $twmask.
# @T=reverse(@T) emits no instruction; it only exchanges the two Perl-side
# names so the mask just built is addressed as @T[0] by the next step.
# The loop body runs seven times, so @T is left swapped afterwards.
# From $i==10 on, each step also loads one input block into @XMM[$i-10];
# from $i==11 on it XORs the block in @XMM[$i-11] with its tweak
# @XMM[$i-3] (block k pairs with tweak register k+8).
# NOTE(review): Dhi()/Dlo() are defined outside this chunk; presumably
# they name the high/low D halves of a Q register -- confirm.
1671e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelfor($i=9;$i<16;$i++) {
1672e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1673e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
1674e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[$i-1]}, [r0,:128]!
1675e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1676e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[1], @XMM[$i], #63
1677e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i], @XMM[$i], @T[0]
1678e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[1], @T[1], $twmask
1679e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1680e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@T=reverse(@T);
1681e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1682e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=10);
1683e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[$i-10]}, [$inp]!
1684e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1685e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=11);
1686e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1687e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1688e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
1689e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1690e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
1691e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[15]}, [r0,:128]!
1692e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1693e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[8], @T[0]
1694e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1695e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1696e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
1697e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
1698e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1699e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1700e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1701e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1702e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1703e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[6], @XMM[14]
1704e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1705e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[7], @XMM[7], @XMM[15]
1706e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1707e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1708e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1709e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1710e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1711e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
1712e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1713e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
1714e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1715e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1716e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1717e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[6], @XMM[11]
1718e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
1719e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[3], @XMM[12]
1720e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
1721e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
1722e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[12], @XMM[2], @XMM[14]
1723e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
1724e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[13], @XMM[5], @XMM[15]
1725e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
1726e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1727e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1728e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1729e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #0x80
1730e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bpl		.Lxts_enc_loop
1731e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1732e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_short:
1733e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adds		$len, #0x70
1734e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bmi		.Lxts_enc_done
1735e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1736e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$magic, {$twmask}	@ load XTS magic
1737e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[0], @XMM[8], #63
1738e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1739e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[0], @T[0], $twmask
1740e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Unrolled generator for the short (fewer than 8 blocks) path.  $i walks
# the tweak registers @XMM[9..15]; each step emits the same tweak update
# as the 8-block loop: vadd.u64 of @XMM[$i-1] with itself into @XMM[$i],
# a post-incremented store of @XMM[$i-1] through r0, a half-swap of the
# mask in @T[0] that is XORed into @XMM[$i], and a new mask in @T[1]
# (sign bits of @XMM[$i] via vshr.s64 #63, ANDed with $twmask).
# @T=reverse(@T) emits no instruction; it only exchanges the two Perl-side
# scratch names for the next step.
# From $i==10 on, each step loads one input block into @XMM[$i-10], then
# subtracts 0x10 from $len and branches to .Lxts_enc_<$i-9> if the result
# is negative.  The label suffix therefore equals the number of blocks
# loaded so far (1..6).
# The branch is emitted ahead of the step's XOR of @XMM[$i-11] with
# @XMM[$i-3], so a taken branch leaves the most recently loaded blocks
# still to be XORed with their tweaks at the branch target.
# NOTE(review): Dhi()/Dlo() are defined outside this chunk; presumably
# they name the high/low D halves of a Q register -- confirm.
1741e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelfor($i=9;$i<16;$i++) {
1742e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1743e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
1744e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[$i-1]}, [r0,:128]!
1745e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1746e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[1], @XMM[$i], #63
1747e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i], @XMM[$i], @T[0]
1748e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[1], @T[1], $twmask
1749e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1750e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@T=reverse(@T);
1751e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1752e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=10);
1753e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[$i-10]}, [$inp]!
1754e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #0x10
1755e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bmi		.Lxts_enc_`$i-9`
1756e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1757e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=11);
1758e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1759e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
1760e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
1761e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
1762e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub		$len, #0x10
1763e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
1764e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1765e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[6]}, [$inp]!
1766e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
1767e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1768e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1769e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1770e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1771e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1772e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[6], @XMM[14]
1773e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1774e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1775e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1776e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1777e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1778e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1779e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
1780e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1781e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
1782e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1783e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1784e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1785e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[6], @XMM[11]
1786e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[14]}, [r0,:128]!
1787e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[3], @XMM[12]
1788e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
1789e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
1790e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[12], @XMM[2], @XMM[14]
1791e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
1792e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[12]}, [$out]!
1793e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1794e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1795e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1796e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1797e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_6:
1798e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
1799e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1800e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[4], @XMM[12]
1801e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1802e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1803e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1804e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1805e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1806e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
1807e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1808e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1809e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1810e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1811e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1812e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1813e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
1814e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1815e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
1816e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1817e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1818e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1819e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[6], @XMM[11]
1820e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[3], @XMM[12]
1821e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
1822e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
1823e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
1824e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1825e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1826e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1827e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1828e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel@ put this in range for both ARM and Thumb mode adr instructions
1829e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	5
1830e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_magic:
1831e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	.quad	1, 0x87
1832e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1833e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	5
1834e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_5:
1835e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
1836e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1837e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[3], @XMM[11]
1838e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1839e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1840e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1841e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1842e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1843e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[4], @XMM[12]
1844e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1845e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1846e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1847e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1848e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1849e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1850e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
1851e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1852e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]}, [r0,:128]!
1853e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1854e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1855e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1856e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[6], @XMM[11]
1857e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[3], @XMM[12]
1858e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
1859e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]}, [$out]!
1860e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1861e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1862e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1863e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1864e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_4:
1865e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
1866e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1867e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[2], @XMM[10]
1868e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1869e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1870e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1871e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1872e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1873e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[3], @XMM[11]
1874e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1875e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1876e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1877e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1878e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1879e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1880e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
1881e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1882e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1883e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1884e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1885e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[6], @XMM[11]
1886e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
1887e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1888e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1889e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1890e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1891e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_3:
1892e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
1893e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1894e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[9]
1895e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1896e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1897e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1898e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1899e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1900e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[2], @XMM[10]
1901e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1902e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1903e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1904e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1905e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1906e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
1907e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]}, [r0,:128]!
1908e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1909e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1910e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[4], @XMM[10]
1911e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1912e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]}, [$out]!
1913e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1914e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1915e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1916e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1917e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_2:
1918e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
1919e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1920e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
1921e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
1922e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
1923e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
1924e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
1925e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1926e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[9]
1927e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
1928e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1929e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1930e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_encrypt8
1931e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1932e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
1933e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
1934e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
1935e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
1936e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1937e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
1938e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_enc_done
1939e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
1940e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_1:
1941e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1942e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[8]
1943e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r1, sp
1944e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [sp,:128]
1945e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r2, $key
1946e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r4, $fp				@ preserve fp
1947e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1948e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		AES_encrypt
1949e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1950e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [sp,:128]
1951e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
1952e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [$out]!
1953e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		$fp, r4
1954e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1955e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov		@XMM[8], @XMM[9]		@ next round tweak
1956e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1957e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_done:
1958e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	XTS_CHAIN_TWEAK
1959e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adds		$len, #0x10
1960e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq		.Lxts_enc_ret
1961e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub		r6, $out, #0x10
1962e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1963e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_steal:
1964e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldrb		r0, [$inp], #1
1965e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldrb		r1, [$out, #-0x10]
1966e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	strb		r0, [$out, #-0x10]
1967e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	strb		r1, [$out], #1
1968e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1969e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #1
1970e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bhi		.Lxts_enc_steal
1971e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1972e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [r6]
1973e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
1974e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
1975e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r1, sp
1976e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [sp,:128]
1977e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r2, $key
1978e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r4, $fp			@ preserve fp
1979e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1980e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		AES_encrypt
1981e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1982e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [sp,:128]
1983e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
1984e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [r6]
1985e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		$fp, r4
1986e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1987e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
1988e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_ret:
1989e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bic		r0, $fp, #0xf
1990e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q0, #0
1991e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q1, #0
1992e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
1993e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
1994e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
1995e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_enc_bzero:				@ wipe key schedule [if any]
1996e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		sp!, {q0-q1}
1997e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		sp, r0
1998e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne		.Lxts_enc_bzero
1999e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2000e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		sp, $fp
2001e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
2002e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]}, [r1]
2003e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2004e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_POP
2005e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia		sp!, {r4-r10, pc}	@ return
2006e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2007e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2008e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2009e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.globl	bsaes_xts_decrypt
2010e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.type	bsaes_xts_decrypt,%function
2011e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2012e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelbsaes_xts_decrypt:
2013e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	ip, sp
2014e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	stmdb	sp!, {r4-r10, lr}		@ 0x20
2015e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_PUSH
2016e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r6, sp				@ future $fp
2017e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2018e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$inp, r0
2019e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$out, r1
2020e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$len, r2
2021e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$key, r3
2022e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2023e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r0, sp, #0x10			@ 0x10
2024e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bic	r0, #0xf			@ align at 16 bytes
2025e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r0
2026e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2027e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
2028e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r0, [ip]			@ pointer to input tweak
2029e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2030e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ generate initial tweak
2031e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r0, [ip, #4]			@ iv[]
2032e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r1, sp
2033e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r2, [ip, #0]			@ key2
2034e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	AES_encrypt
2035e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r0, sp				@ pointer to initial tweak
2036e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2037e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2038e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	$rounds, [$key, #240]		@ get # of rounds
2039e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	$fp, r6
2040e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2041e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ allocate the key schedule on the stack
2042e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
2043e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
2044e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub	r12, #`32+16`			@ place for tweak[9]
2045e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2046e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ populate the key schedule
2047e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
2048e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
2049e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	sp, r12
2050e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, #0x90			@ pass key schedule
2051e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
2052e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r4, sp, #0x90
2053e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	r4, {@XMM[6]}
2054e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12,  {@XMM[15]}		@ save last round key
2055e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
2056e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r4, {@XMM[7]}
2057e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2058e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr	r12, [$key, #244]
2059e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	eors	r12, #1
2060e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq	0f
2061e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2062e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	str	r12, [$key, #244]
2063e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r4, $key			@ pass key
2064e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov	r5, $rounds			@ pass # of rounds
2065e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r12, $key, #248			@ pass key schedule
2066e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl	_bsaes_key_convert
2067e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add	r4, $key, #248
2068e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia	r4, {@XMM[6]}
2069e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r12,  {@XMM[15]}		@ save last round key
2070e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
2071e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia	r4, {@XMM[7]}
2072e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2073e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	2
2074e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel0:	sub	sp, #0x90			@ place for tweak[9]
2075e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2076e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
2077e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adr	$magic, .Lxts_magic
2078e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2079e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	tst	$len, #0xf			@ if not multiple of 16
2080e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	it	ne				@ Thumb2 thing, sanity check in ARM
2081e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subne	$len, #0x10			@ subtract another 16 bytes
2082e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs	$len, #0x80
2083e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2084e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	blo	.Lxts_dec_short
2085e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b	.Lxts_dec_loop
2086e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2087e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2088e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_loop:
2089e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$magic, {$twmask}	@ load XTS magic
2090e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[0], @XMM[8], #63
2091e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2092e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[0], @T[0], $twmask
2093e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Generate the tweak schedule for one full 8-block pass of .Lxts_dec_loop.
# On entry the emitted code has the current tweak in @XMM[8], the first
# mask in @T[0] and r0 pointing at sp.  Each pass i (9..15) emits code that:
#   - doubles every 64-bit lane of the previous tweak @XMM[i-1] into
#     @XMM[i] (vadd.u64 of a register with itself) and stores the previous
#     tweak through r0 with post-increment;
#   - swaps the two halves of the mask in @T[0] and XORs it into @XMM[i];
#   - builds the next mask in @T[1]: arithmetic shift right by 63 of each
#     64-bit lane of the doubled value (taken before the XOR), ANDed with
#     $twmask;
#   - from i=10 on, loads the next input block into @XMM[i-10];
#   - from i=11 on, XORs input block @XMM[i-11] with its tweak @XMM[i-3].
# Input blocks 6 and 7 are loaded, and blocks 5..7 are XORed with their
# tweaks, by the code emitted right after this loop.
2094e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelfor($i=9;$i<16;$i++) {
2095e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
2096e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
2097e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[$i-1]}, [r0,:128]!
2098e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2099e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[1], @XMM[$i], #63
2100e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i], @XMM[$i], @T[0]
2101e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[1], @T[1], $twmask
2102e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Exchange the temporaries so the mask just written through @T[1] is read
# as @T[0] on the next pass (assumes @T has exactly two entries - TODO
# confirm against its declaration).
2103e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@T=reverse(@T);
2104e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2105e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=10);
2106e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[$i-10]}, [$inp]!
2107e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
2108e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=11);
2109e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2110e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
2111e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
# Remainder of the .Lxts_dec_loop body, followed by the entry of
# .Lxts_dec_short.  The emitted code:
#   - derives the tweak for the next pass in @XMM[8] from @XMM[15] and
#     stores it through r0 right after the eight tweaks of this pass;
#   - loads the last two input blocks, finishes XORing blocks 5..7 with
#     their tweaks, and calls _bsaes_decrypt8 with r4 = key schedule
#     (sp+0x90, or $key+248 under BSAES_ASM_EXTENDED_KEY) and r5 = rounds;
#   - resets r0 to sp, reloads the eight tweaks, XORs them into the results
#     and writes eight blocks to $out.  As the veor sources show, the result
#     registers are consumed in the order 0,1,6,4,2,7,3,5;
#   - reloads the next-pass tweak into @XMM[8] and repeats while $len stays
#     non-negative after subtracting 0x80.
# .Lxts_dec_short adds 0x70 back to $len and branches to .Lxts_dec_done if
# the result is negative; otherwise it reloads the XTS magic constant and
# seeds @T[0] for the partial pass generated by the loop that follows.
2112e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
2113e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
2114e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[15]}, [r0,:128]!
2115e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2116e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[8], @T[0]
2117e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2118e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2119e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
2120e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
2121e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2122e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2123e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2124e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2125e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2126e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[6], @XMM[14]
2127e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2128e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[7], @XMM[7], @XMM[15]
2129e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2130e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2131e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2132e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2133e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2134e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
2135e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2136e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
2137e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2138e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2139e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2140e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[4], @XMM[11]
2141e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
2142e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[2], @XMM[12]
2143e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
2144e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
2145e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[12], @XMM[3], @XMM[14]
2146e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
2147e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[13], @XMM[5], @XMM[15]
2148e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
2149e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2150e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2151e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2152e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #0x80
2153e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bpl		.Lxts_dec_loop
2154e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2155e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_short:
2156e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adds		$len, #0x70
2157e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bmi		.Lxts_dec_done
2158e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2159e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$magic, {$twmask}	@ load XTS magic
2160e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[0], @XMM[8], #63
2161e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2162e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[0], @T[0], $twmask
2163e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Generate the tweak schedule for the short path (.Lxts_dec_short), which
# handles fewer than eight remaining blocks.  Each pass i (9..15) emits the
# same tweak-doubling / mask fix-up sequence as the full loop, storing the
# previous tweak @XMM[i-1] through r0 with post-increment.  In addition:
#   - from i=10 on, after loading input block @XMM[i-10] the emitted code
#     subtracts 0x10 from $len and, if the result is negative, branches to
#     .Lxts_dec_<i-9> (labels .Lxts_dec_1 .. .Lxts_dec_6);
#   - from i=11 on, XORs input block @XMM[i-11] with its tweak @XMM[i-3]
#     on the fall-through path.
# Falling out of the loop (no branch taken) continues in the code emitted
# right after this loop.
2164e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvelfor($i=9;$i<16;$i++) {
2165e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
2166e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
2167e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[$i-1]}, [r0,:128]!
2168e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2169e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@T[1], @XMM[$i], #63
2170e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i], @XMM[$i], @T[0]
2171e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@T[1], @T[1], $twmask
2172e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
# Exchange the temporaries so the mask just written through @T[1] is read
# as @T[0] on the next pass (assumes @T has exactly two entries - TODO
# confirm against its declaration).
2173e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@T=reverse(@T);
2174e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2175e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=10);
2176e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[$i-10]}, [$inp]!
2177e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #0x10
2178e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bmi		.Lxts_dec_`$i-9`
2179e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
2180e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___ if ($i>=11);
2181e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2182e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
2183e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
2184e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel$code.=<<___;
2185e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	sub		$len, #0x10
2186e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
2187e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2188e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[6]}, [$inp]!
2189e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
2190e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2191e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2192e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2193e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2194e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2195e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[6], @XMM[6], @XMM[14]
2196e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2197e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2198e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2199e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2200e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2201e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2202e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
2203e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2204e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
2205e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2206e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2207e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2208e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[4], @XMM[11]
2209e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[14]}, [r0,:128]!
2210e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[2], @XMM[12]
2211e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
2212e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
2213e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[12], @XMM[3], @XMM[14]
2214e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
2215e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[12]}, [$out]!
2216e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2217e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2218e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2219e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2220e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_6:
2221e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
2222e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2223e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[4], @XMM[12]
2224e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2225e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2226e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2227e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2228e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2229e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[5], @XMM[5], @XMM[13]
2230e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2231e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2232e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2233e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2234e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2235e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2236e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
2237e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2238e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
2239e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2240e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2241e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2242e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[4], @XMM[11]
2243e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[2], @XMM[12]
2244e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
2245e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[11], @XMM[7], @XMM[13]
2246e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
2247e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2248e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2249e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2250e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2251e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_5:
2252e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
2253e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2254e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[3], @XMM[11]
2255e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2256e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2257e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2258e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2259e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2260e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[4], @XMM[4], @XMM[12]
2261e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2262e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2263e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2264e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2265e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2266e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2267e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
2268e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2269e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[12]}, [r0,:128]!
2270e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2271e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2272e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2273e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[4], @XMM[11]
2274e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[10], @XMM[2], @XMM[12]
2275e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
2276e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[10]}, [$out]!
2277e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2278e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2279e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2280e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2281e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_4:
2282e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
2283e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2284e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[2], @XMM[10]
2285e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2286e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2287e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2288e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2289e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2290e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[3], @XMM[3], @XMM[11]
2291e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2292e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2293e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2294e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2295e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2296e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2297e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
2298e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2299e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2300e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2301e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2302e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[4], @XMM[11]
2303e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
2304e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2305e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2306e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2307e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2308e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_3:
2309e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
2310e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2311e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[9]
2312e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2313e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2314e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2315e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2316e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2317e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[2], @XMM[2], @XMM[10]
2318e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2319e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2320e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2321e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2322e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2323e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
2324e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[10]}, [r0,:128]!
2325e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2326e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2327e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[8], @XMM[6], @XMM[10]
2328e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2329e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]}, [$out]!
2330e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2331e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2332e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2333e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2334e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_2:
2335e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
2336e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2337e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
2338e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	BSAES_ASM_EXTENDED_KEY
2339e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, sp, #0x90			@ pass key schedule
2340e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#else
2341e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	add		r4, $key, #248			@ pass key schedule
2342e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2343e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[9]
2344e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $rounds			@ pass rounds
2345e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2346e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2347e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		_bsaes_decrypt8
2348e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2349e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
2350e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[ 8]
2351e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[1], @XMM[1], @XMM[ 9]
2352e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
2353e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2354e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
2355e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	b		.Lxts_dec_done
2356e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.align	4
2357e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_1:
2358e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2359e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[8]
2360e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r1, sp
2361e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [sp,:128]
2362e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r2, $key
2363e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r4, $fp				@ preserve fp
2364e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r5, $magic			@ preserve magic
2365e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2366e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		AES_decrypt
2367e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2368e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [sp,:128]
2369e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
2370e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [$out]!
2371e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		$fp, r4
2372e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		$magic, r5
2373e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2374e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov		@XMM[8], @XMM[9]		@ next round tweak
2375e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2376e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_done:
2377e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifndef	XTS_CHAIN_TWEAK
2378e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	adds		$len, #0x10
2379e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	beq		.Lxts_dec_ret
2380e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2381e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ calculate one round of extra tweak for the stolen ciphertext
2382e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vldmia		$magic, {$twmask}
2383e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vshr.s64	@XMM[6], @XMM[8], #63
2384e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vand		@XMM[6], @XMM[6], $twmask
2385e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
2386e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
2387e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[9], @XMM[9], @XMM[6]
2388e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2389e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	@ perform the final decryption with the last tweak value
2390e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [$inp]!
2391e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2392e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[9]
2393e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r1, sp
2394e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [sp,:128]
2395e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r2, $key
2396e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r4, $fp			@ preserve fp
2397e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2398e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		AES_decrypt
2399e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2400e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [sp,:128]
2401e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[9]
2402e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [$out]
2403e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2404e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r6, $out
2405e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_steal:
2406e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldrb		r1, [$out]
2407e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldrb		r0, [$inp], #1
2408e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	strb		r1, [$out, #0x10]
2409e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	strb		r0, [$out], #1
2410e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2411e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	subs		$len, #1
2412e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bhi		.Lxts_dec_steal
2413e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2414e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [r6]
2415e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r0, sp
2416e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[8]
2417e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r1, sp
2418e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [sp,:128]
2419e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		r2, $key
2420e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2421e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bl		AES_decrypt
2422e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2423e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vld1.8		{@XMM[0]}, [sp,:128]
2424e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	veor		@XMM[0], @XMM[0], @XMM[8]
2425e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[0]}, [r6]
2426e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		$fp, r4
2427e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2428e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2429e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_ret:
2430e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bic		r0, $fp, #0xf
2431e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q0, #0
2432e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vmov.i32	q1, #0
2433e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
2434e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
2435e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2436e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.Lxts_dec_bzero:				@ wipe key schedule [if any]
2437e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vstmia		sp!, {q0-q1}
2438e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	cmp		sp, r0
2439e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	bne		.Lxts_dec_bzero
2440e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2441e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	mov		sp, $fp
2442e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#ifdef	XTS_CHAIN_TWEAK
2443e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	vst1.8		{@XMM[8]}, [r1]
2444e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel#endif
2445e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	VFP_ABI_POP
2446e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel	ldmia		sp!, {r4-r10, pc}	@ return
2447e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel
2448e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2449e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel___
2450e4e7f10bfc4069925e99cc4b428c3434e30b6c3fArd Biesheuvel}
# Append the closing "#endif" to the generated assembly text.
# NOTE(review): presumably pairs with a preprocessor conditional emitted
# earlier in the template (not visible in this section) -- confirm.
$code.=<<___;
#endif
___

# Expand every `...` span in the accumulated template by evaluating its
# contents as Perl (e.g. the &Dhi()/&Dlo() helpers used in the assembly
# text above) and substituting the result in place.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Copy this script's leading comment header into the output, rewriting the
# Perl '#' comment marker to the assembler '@' marker.  The shebang line is
# dropped, blank lines are passed through, and copying stops at the first
# line that is neither a comment nor blank.
#
# Three-argument open with a lexical handle keeps the script path from being
# interpreted as an open mode.  The header is cosmetic, so a failure to
# reopen the script only warns instead of aborting code generation.
if (open my $self_fh, '<', $0) {
	while (my $line = <$self_fh>) {
		next if $line =~ /^#!/;
		my $is_comment = ($line =~ s/^#/@/);
		last unless $is_comment or $line =~ /^$/;
		print $line;
	}
	close $self_fh;
} else {
	warn "$0: cannot reopen script to copy comment header: $!\n";
}

print $code;

# Closing STDOUT flushes the buffered output; a failure here means the
# generated assembly is incomplete, so it must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!";
2468