// GHASH for AArch64 built on the PMULL/PMULL2 carry-less multiply
// instructions (GNU as syntax, run through the C preprocessor).
//
// Exported entry points (register usage as seen in the code below):
//   gcm_init_v8   x0 = table to fill (48 bytes written), x1 = H (16 bytes read)
//   gcm_gmult_v8  x0 = Xi (16 bytes, updated in place), x1 = table (32 bytes read)
//   gcm_ghash_v8  x0 = Xi (16 bytes, updated in place), x1 = table (48 bytes read),
//                 x2 = input pointer, x3 = input length in bytes
//
// NOTE(review): the matching C prototypes are not visible in this file;
// confirm them against the caller-side header.
//
// Table layout produced by gcm_init_v8 and consumed by the other two:
//   +0   twisted H
//   +16  packed Karatsuba terms: low half = H.lo^H.hi, high half = H^2.lo^H^2.hi
//   +32  H^2
//
// Only x0-x3, x12, v0-v7 and v16-v22 are written; no stack is used and no
// callee-saved register (x19-x28, v8-v15) is touched.
#include "arm_arch.h"

.text
#if !defined(__clang__)
.arch   armv8-a+crypto
#endif

//----------------------------------------------------------------------
// gcm_init_v8: derive the multiplication table from the hash key.
// In:    x0 = output table, x1 = input H
// Out:   48 bytes stored at the original x0 (x0 itself is advanced by 16)
// Clobb: x0, v0-v3, v16-v22
//----------------------------------------------------------------------
.globl  gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        ld1     {v17.2d},[x1]                   //load input H
        movi    v19.16b,#0xe1
        shl     v19.2d,v19.2d,#57               //0xc2.0 in each 64-bit lane
        ext     v3.16b,v17.16b,v17.16b,#8       //swap the two 64-bit halves of H
        ushr    v18.2d,v19.2d,#63
        dup     v17.4s,v17.s[1]
        ext     v16.16b,v18.16b,v19.16b,#8      //t0=0xc2....01
        ushr    v18.2d,v3.2d,#63                //bits shifted out of each half
        sshr    v17.4s,v17.4s,#31               //broadcast carry bit
        and     v18.16b,v18.16b,v16.16b
        shl     v3.2d,v3.2d,#1
        ext     v18.16b,v18.16b,v18.16b,#8      //carry crosses into the other half
        and     v16.16b,v16.16b,v17.16b         //keep t0 only if the carry bit was set
        orr     v3.16b,v3.16b,v18.16b           //H<<<=1
        eor     v20.16b,v3.16b,v16.16b          //twisted H
        st1     {v20.2d},[x0],#16               //store Htable[0]

        //calculate H^2
        ext     v16.16b,v20.16b,v20.16b,#8      //Karatsuba pre-processing
        pmull   v0.1q,v20.1d,v20.1d
        eor     v16.16b,v16.16b,v20.16b         //H.lo^H.hi in both halves
        pmull2  v2.1q,v20.2d,v20.2d
        pmull   v1.1q,v16.1d,v16.1d

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v22.16b,v0.16b,v18.16b          //v22 = H^2

        ext     v17.16b,v22.16b,v22.16b,#8      //Karatsuba pre-processing
        eor     v17.16b,v17.16b,v22.16b         //H^2.lo^H^2.hi in both halves
        ext     v21.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v21.2d,v22.2d},[x0]            //store Htable[1..2]

        ret
.size   gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// gcm_gmult_v8: multiply Xi by the twisted H and reduce, in place.
// In:    x0 = Xi, x1 = table written by gcm_init_v8
// Out:   16 bytes at x0 replaced by the product
// Clobb: v0-v3, v17-v21
//----------------------------------------------------------------------
.globl  gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        ld1     {v17.2d},[x0]                   //load Xi
        movi    v19.16b,#0xe1
        ld1     {v20.2d,v21.2d},[x1]            //load twisted H, ...
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
#ifndef __ARMEB__
        rev64   v17.16b,v17.16b                 //byte-swap on little-endian builds
#endif
        ext     v3.16b,v17.16b,v17.16b,#8

        pmull   v0.1q,v20.1d,v3.1d              //H.lo*Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi*Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)*(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
        rev64   v0.16b,v0.16b
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// gcm_ghash_v8: fold a run of 16-byte input blocks into Xi.
// In:    x0 = Xi, x1 = table written by gcm_init_v8,
//        x2 = input pointer, x3 = input length in bytes
// Out:   16 bytes at x0 replaced by the updated hash
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
//
// The main loop consumes two blocks (32 bytes) per iteration using H^2 and
// H; a single trailing block is handled at .Lodd_tail_v8.
// NOTE(review): one block is always loaded and processed, even when x3 is 0,
// so the code assumes x3 is a non-zero multiple of 16 - confirm with callers.
//----------------------------------------------------------------------
.globl  gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
        ld1     {v0.2d},[x0]                    //load [rotated] Xi
                                                //"[rotated]" means that
                                                //loaded value would have
                                                //to be rotated in order to
                                                //make it appear as in
                                                //algorithm specification
        subs    x3,x3,#32                       //see if x3 is 32 or larger
        mov     x12,#16                         //x12 is used as post-
                                                //increment for input pointer;
                                                //as loop is modulo-scheduled
                                                //x12 is zeroed just in time
                                                //to preclude overstepping
                                                //inp[len], which means that
                                                //last block[s] are actually
                                                //loaded twice, but last
                                                //copy is not processed
        ld1     {v20.2d,v21.2d},[x1],#32        //load twisted H, ..., H^2
        movi    v19.16b,#0xe1
        ld1     {v22.2d},[x1]
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        ext     v0.16b,v0.16b,v0.16b,#8         //rotate Xi
        ld1     {v16.2d},[x2],#16               //load [rotated] I[0]
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
#ifndef __ARMEB__
        rev64   v16.16b,v16.16b
        rev64   v0.16b,v0.16b
#endif
        ext     v3.16b,v16.16b,v16.16b,#8       //rotate I[0]
        b.lo    .Lodd_tail_v8                   //x3 was less than 32
        ld1     {v17.2d},[x2],x12               //load [rotated] I[1]
#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
#endif
        ext     v7.16b,v17.16b,v17.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //I[i]^=Xi
        pmull   v4.1q,v20.1d,v7.1d              //H*Ii+1
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        pmull2  v6.1q,v20.2d,v7.2d
        b       .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        ext     v18.16b,v3.16b,v3.16b,#8
        subs    x3,x3,#32                       //is there more data?
        pmull   v0.1q,v22.1d,v3.1d              //H^2.lo*Xi.lo
        csel    x12,xzr,x12,lo                  //is it time to zero x12?

        pmull   v5.1q,v21.1d,v17.1d
        eor     v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi*Xi.hi
        eor     v0.16b,v0.16b,v4.16b            //accumulate
        pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)*(Xi.lo+Xi.hi)
        ld1     {v16.2d},[x2],x12               //load [rotated] I[i+2]

        eor     v2.16b,v2.16b,v6.16b
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        eor     v1.16b,v1.16b,v5.16b

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v17.2d},[x2],x12               //load [rotated] I[i+3]
#ifndef __ARMEB__
        rev64   v16.16b,v16.16b
#endif
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
#endif
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        ext     v7.16b,v17.16b,v17.16b,#8
        ext     v3.16b,v16.16b,v16.16b,#8
        eor     v0.16b,v1.16b,v18.16b
        pmull   v4.1q,v20.1d,v7.1d              //H*Ii+1
        eor     v3.16b,v3.16b,v2.16b            //accumulate v3.16b early

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v3.16b,v3.16b,v18.16b
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        eor     v3.16b,v3.16b,v0.16b
        pmull2  v6.1q,v20.2d,v7.2d
        b.hs    .Loop_mod2x_v8                  //there was at least 32 more bytes

        eor     v2.16b,v2.16b,v18.16b
        ext     v3.16b,v16.16b,v16.16b,#8       //re-construct v3.16b
        adds    x3,x3,#32                       //re-construct x3
        eor     v0.16b,v0.16b,v2.16b            //re-construct v0.16b
        b.eq    .Ldone_v8                       //is x3 zero?
.Lodd_tail_v8:
        ext     v18.16b,v0.16b,v0.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //inp^=Xi
        eor     v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi

        pmull   v0.1q,v20.1d,v3.1d              //H.lo*Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi*Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)*(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
        rev64   v0.16b,v0.16b
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8

// NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2