; AesOpt.asm -- Intel's AES.
; 2009-12-12 : Igor Pavlov : Public domain
;
; Provenance: AesOpt.asm, revision baa3858d3f5d128a5c8466b700098109edcad5f2 (repo sync).
;
; Syntax: MASM (Intel operand order). Uses the AES-NI instructions
; aesenc / aesenclast / aesdec / aesdeclast and SSE2 integer moves.
;
; Everything named r0..r6, x3, x6, REG_SIZE, MY_ASM_START, MY_PROC and MY_ENDP
; is defined in 7zAsm.asm, which is not part of this file.
; NOTE(review): r4 is used here as the stack pointer (push/pop pairs and
; [r4 + ...] save slots); confirm the register aliases against 7zAsm.asm.
;
; Layout of the state block addressed by r1 on entry, as read by this file:
;   [r1 +  0]  16 bytes : chaining value / counter, loaded on entry and
;                         written back before return
;   [r1 + 16]  dword    : count N; the round loops walk N * 32 bytes of keys
;   [r1 + 32]  ...      : 16-byte round keys (accessed with movdqa / pxor,
;                         so the block has to be 16-byte aligned)
; The data pointer rD is also accessed with movdqa, so it has to be
; 16-byte aligned as well. rN is the number of 16-byte blocks to process.

include 7zAsm.asm

MY_ASM_START

ifndef x64
    .xmm
endif

; num = third argument (block count).
; x64:    taken from r8.
; 32-bit: read from the stack after MY_PROLOG has pushed three registers.
; NOTE(review): REG_SIZE * 4 presumably skips those three saved registers plus
; the return address; confirm against the calling convention in 7zAsm.asm.
ifdef x64
    num     equ r8
else
    num     equ [r4 + REG_SIZE * 4]
endif

rD equ r2       ; data pointer, advanced by 16 per processed block
rN equ r0       ; remaining block counter


; MY_PROLOG reg
;   Common entry sequence for the three procedures below.
;   - x64 only: stores xmm6 and xmm7 at [r4 + 8] and [r4 + 8 + 16] before any
;     push, so MY_EPILOG can reload them from the same addresses after its pops.
;     NOTE(review): presumably this is the caller-provided home area of the
;     Microsoft x64 convention and relies on r4 + 8 being 16-byte aligned at
;     entry (movdqa); confirm.
;   - saves r3, r5, r6
;   - rN  = num
;   - x6  = dword [r1 + 16] * 32  (byte length of the key range to walk)
;   - reg = 16 bytes at [r1]
;   - r1 += 32, so r1 now addresses the first round key and the saved
;     chaining value lives at [r1 - 32]
MY_PROLOG macro reg:req
    ifdef x64
    movdqa  [r4 + 8], xmm6
    movdqa  [r4 + 8 + 16], xmm7
    endif

    push    r3
    push    r5
    push    r6

    mov     rN, num
    mov     x6, [r1 + 16]
    shl     x6, 5

    movdqa  reg, [r1]
    add     r1, 32
endm

; MY_EPILOG
;   Mirror of MY_PROLOG: restores r6, r5, r3, then (x64 only) xmm6 and xmm7,
;   then expands MY_ENDP.
;   NOTE(review): MY_ENDP comes from 7zAsm.asm and presumably emits the return
;   and closes the procedure; confirm.
MY_EPILOG macro
    pop     r6
    pop     r5
    pop     r3

    ifdef x64
    movdqa  xmm6, [r4 + 8]
    movdqa  xmm7, [r4 + 8 + 16]
    endif

    MY_ENDP
endm

ways equ 4                  ; number of blocks processed in parallel
ways16 equ (ways * 16)      ; byte size of one parallel batch

; OP_W op, op2
;   Emits "op xmmI, op2" for I = 0 .. ways - 1.
;   op2 is substituted textually, so an expression that mentions i
;   (for example [rD + i * 16]) is evaluated with the current I each time.
;   Leaves the assembly-time variable i equal to ways.
OP_W macro op, op2
    i = 0
    rept ways
    op @CatStr(xmm,%i), op2
    i = i + 1
    endm
endm

; LOAD_OP op, offs
;   Single-block form: applies op to xmm0 with the round key at [r1 + r3 offs].
;   offs carries its own sign (+16, -16, +0, ...).
LOAD_OP macro op:req, offs:req
    op      xmm0, [r1 + r3 offs]
endm

; LOAD_OP_W op, offs
;   Parallel form: loads the round key at [r1 + r3 offs] into xmm7 once and
;   applies op to xmm0 .. xmm(ways-1). Clobbers xmm7.
LOAD_OP_W macro op:req, offs:req
    movdqa  xmm7, [r1 + r3 offs]
    OP_W    op, xmm7
endm


; ---------- AES-CBC Decode ----------

; CBC_DEC_UPDATE reg, offs
;   reg holds a decrypted block, xmm6 the previous ciphertext block.
;   XORs them, reloads xmm6 with the ciphertext still stored at [rD + offs],
;   and only then overwrites that location with the plaintext.
CBC_DEC_UPDATE macro reg, offs
    pxor    reg, xmm6
    movdqa  xmm6, [rD + offs]
    movdqa  [rD + offs], reg
endm

; DECODE op
;   Decryption rounds. op is LOAD_OP or LOAD_OP_W.
;   Walks the keys downward: one round at offset +16, then two rounds per loop
;   iteration while x3 is decreased by 32, and the final round with the key at
;   [r1 + r3] once x3 has reached zero.
;   The loop exits only when x3 becomes exactly zero, so x3 has to be a
;   positive multiple of 32 on entry.
DECODE macro op:req
    op      aesdec, +16
  @@:
    op      aesdec, +0
    op      aesdec, -16
    sub     x3, 32
    jnz     @B
    op      aesdeclast, +0
endm

; AesCbc_Decode_Intel (3 arguments)
;   In:  r1  = state block (see the layout at the top of the file)
;        rD  = data, decrypted in place
;        num = number of 16-byte blocks
;   Out: data decrypted in place; the last ciphertext block is stored back to
;        the first 16 bytes of the state block as the next chaining value.
;   xmm6 carries the previous ciphertext block across iterations.
;   Uses xmm0..xmm3, xmm7 in the parallel loop and xmm0, xmm1 in the tail loop.
MY_PROC AesCbc_Decode_Intel, 3
    MY_PROLOG xmm6

    ; x6 = offset used as the starting value of x3 for every block;
    ; the initial whitening key is then read at [r1 + r3 + 32].
    sub     x6, 32

    jmp     check2

  ; Main loop: ways blocks per iteration.
  align 16
  nextBlocks2:
    mov     x3, x6
    OP_W    movdqa, [rD + i * 16]       ; xmmI = ciphertext block I
    LOAD_OP_W  pxor, +32                ; initial key addition
    DECODE  LOAD_OP_W
    OP_W    CBC_DEC_UPDATE, i * 16      ; chain and store, block by block
    add     rD, ways16
  check2:
    sub     rN, ways
    jnc     nextBlocks2                 ; no borrow: at least ways blocks were left

    ; Fewer than ways blocks remain: undo the last subtraction and
    ; finish one block at a time.
    add     rN, ways
    jmp     check

  nextBlock:
    mov     x3, x6
    movdqa  xmm1, [rD]                  ; keep the ciphertext for chaining
    LOAD_OP movdqa, +32
    pxor    xmm0, xmm1
    DECODE  LOAD_OP
    pxor    xmm0, xmm6
    movdqa  [rD], xmm0
    movdqa  xmm6, xmm1
    add     rD, 16
  check:
    sub     rN, 1
    jnc     nextBlock

    ; r1 was advanced by 32 in MY_PROLOG, so this is the original [r1].
    movdqa  [r1 - 32], xmm6
    MY_EPILOG


; ---------- AES-CBC Encode ----------

; ENCODE op
;   Encryption rounds. op is LOAD_OP or LOAD_OP_W.
;   r3 starts negative and is increased by 32 per loop iteration, so the keys
;   are walked upward; the final round uses the key at [r1 + r3] once r3 has
;   reached zero.
;   The loop exits only when r3 becomes exactly zero.
ENCODE macro op:req
    op      aesenc, -16
  @@:
    op      aesenc, +0
    op      aesenc, +16
    add     r3, 32
    jnz     @B
    op      aesenclast, +0
endm

; AesCbc_Encode_Intel (3 arguments)
;   In:  r1  = state block (see the layout at the top of the file)
;        rD  = data, encrypted in place
;        num = number of 16-byte blocks
;   Out: data encrypted in place; the last ciphertext block is stored back to
;        the first 16 bytes of the state block.
;   Blocks are processed strictly one at a time: xmm0 holds the previous
;   ciphertext block and is XORed with the next plaintext block.
MY_PROC AesCbc_Encode_Intel, 3
    MY_PROLOG xmm0

    ; Re-base r1 to (first key + x6) and make r6 = 32 - x6, so that
    ; [r1 + r6 - 32] is the first round key and r3 counts up to zero.
    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check_e

  align 16
  nextBlock_e:
    mov     r3, r6
    pxor    xmm0, [rD]                  ; chain: previous ciphertext XOR plaintext
    pxor    xmm0, [r1 + r3 - 32]        ; initial key addition
    ENCODE  LOAD_OP
    movdqa  [rD], xmm0
    add     rD, 16
  check_e:
    sub     rN, 1
    jnc     nextBlock_e

    ; r1 + r6 - 64 is the address r1 held on entry (before MY_PROLOG added 32).
    movdqa  [r1 + r6 - 64], xmm0
    MY_EPILOG


; ---------- AES-CTR ----------

; XOR_UPD_1 reg, offs : reg = reg XOR 16 bytes at [rD + offs]
XOR_UPD_1 macro reg, offs
    pxor    reg, [rD + offs]
endm

; XOR_UPD_2 reg, offs : store reg to [rD + offs]
XOR_UPD_2 macro reg, offs
    movdqa  [rD + offs], reg
endm

; AesCtr_Code_Intel (3 arguments)
;   In:  r1  = state block (see the layout at the top of the file)
;        rD  = data, XORed in place with the key stream
;        num = number of 16-byte blocks
;   Out: data updated in place; the advanced counter is stored back to the
;        first 16 bytes of the state block.
;   xmm6 holds the counter. It is incremented BEFORE each use, and paddq adds
;   per 64-bit lane, so only the low 64 bits are incremented (no carry into
;   the high 64 bits).
MY_PROC AesCtr_Code_Intel, 3
    MY_PROLOG xmm6

    ; r5 = (r4 rounded down to a multiple of 16) - 16: an aligned 16-byte
    ; scratch slot used for the increment constant.
    ; NOTE(review): this slot is below the current stack pointer and nothing
    ; here adjusts r4 to reserve it. Confirm that the target ABI tolerates
    ; writes below the stack pointer, or reserve the space explicitly.
    mov     r5, r4
    shr     r5, 4
    dec     r5
    shl     r5, 4

    ; 128-bit increment: 1 in the low dword, 0 elsewhere.
    mov     DWORD PTR [r5], 1
    mov     DWORD PTR [r5 + 4], 0
    mov     DWORD PTR [r5 + 8], 0
    mov     DWORD PTR [r5 + 12], 0

    ; Same re-basing as in AesCbc_Encode_Intel: r3 = r6 counts up to zero.
    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check2_c

  ; Main loop: ways counter blocks per iteration.
  align 16
  nextBlocks2_c:
    movdqa  xmm7, [r5]                  ; xmm7 = increment (clobbered by LOAD_OP_W below)

    i = 0
    rept ways
    paddq   xmm6, xmm7
    movdqa  @CatStr(xmm,%i), xmm6
    i = i + 1
    endm

    mov     r3, r6
    LOAD_OP_W  pxor, -32                ; initial key addition
    ENCODE  LOAD_OP_W
    OP_W    XOR_UPD_1, i * 16           ; key stream XOR data
    OP_W    XOR_UPD_2, i * 16           ; store the results
    add     rD, ways16
  check2_c:
    sub     rN, ways
    jnc     nextBlocks2_c               ; no borrow: at least ways blocks were left

    ; Fewer than ways blocks remain: undo the last subtraction and
    ; finish one block at a time.
    add     rN, ways
    jmp     check_c

  nextBlock_c:
    paddq   xmm6, [r5]
    mov     r3, r6
    movdqa  xmm0, [r1 + r3 - 32]
    pxor    xmm0, xmm6
    ENCODE  LOAD_OP
    XOR_UPD_1 xmm0, 0
    XOR_UPD_2 xmm0, 0
    add     rD, 16
  check_c:
    sub     rN, 1
    jnc     nextBlock_c

    ; r1 + r6 - 64 is the address r1 held on entry (before MY_PROLOG added 32).
    movdqa  [r1 + r6 - 64], xmm6
    MY_EPILOG

end