; AesOpt.asm revision baa3858d3f5d128a5c8466b700098109edcad5f2
; AesOpt.asm -- Intel's AES.
; 2009-12-12 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

; Builds without x64 defined have to switch on XMM instruction support
; explicitly before any movdqa / pxor / aes* instruction is used.
ifndef x64
    .xmm
endif

; num: the block count that MY_PROLOG copies into rN.
;   x64 defined : taken from r8.
;   otherwise   : read from the stack at [r4 + REG_SIZE * 4]. MY_PROLOG reads
;                 it after its three pushes, so the offset presumably skips
;                 those three slots plus the return address -- confirm
;                 against MY_PROC in 7zAsm.asm.
ifdef x64
    num     equ r8
else
    num     equ [r4 + REG_SIZE * 4]
endif

rD equ r2       ; data pointer: all block loads/stores in this file go through rD
rN equ r0       ; number of 16-byte blocks still to process

; MY_PROLOG reg
;   Common entry sequence of the three procedures in this file.
;   reg - XMM register that receives the 16 bytes stored at [r1].
;
;   State after the macro:
;     rN  = num
;     x6  = (dword at [r1 + 16]) * 32
;     reg = 16 bytes loaded from [r1] (movdqa: [r1] must be 16-byte aligned)
;     r1  = original r1 + 32
;   r3, r5 and r6 have been pushed. When x64 is defined, xmm6 and xmm7 have
;   been stored at [r4 + 8] and [r4 + 24]. MY_EPILOG reverses both steps.
MY_PROLOG macro reg:req
    ifdef x64
    ; Stored before the pushes: MY_EPILOG reloads from the same offsets
    ; after its pops, when r4 has the same value again.
    movdqa  [r4 + 8], xmm6
    movdqa  [r4 + 8 + 16], xmm7
    endif

    push    r3
    push    r5
    push    r6

    mov     rN, num             ; must follow the pushes (see the definition of num)
    mov     x6, [r1 + 16]
    shl     x6, 5               ; x6 *= 32

    movdqa  reg, [r1]
    add     r1, 32
endm

; MY_EPILOG
;   Common exit sequence; the counterpart of MY_PROLOG.
;   Pops r6, r5, r3 (reverse push order), reloads xmm6/xmm7 from
;   [r4 + 8] / [r4 + 24] when x64 is defined, then expands MY_ENDP
;   (defined outside this file) to leave the procedure.
;   r4 must have the same value here as right after MY_PROLOG's pushes.
MY_EPILOG macro
    pop     r6
    pop     r5
    pop     r3

    ifdef x64
    movdqa  xmm6, [r4 + 8]
    movdqa  xmm7, [r4 + 8 + 16]
    endif

    MY_ENDP
endm

; Number of blocks handled together by the multi-block loops. OP_W expands
; one instruction per block on xmm0 .. xmm(ways - 1). xmm6 and xmm7 have
; other roles in this file, so ways has to stay small enough not to reach them.
ways equ 4
; Byte size of one group of `ways` 16-byte blocks.
ways16 equ (ways * 16)

; OP_W op, op2
;   Expands `op xmm<i>, op2` once for every i in 0 .. ways - 1.
;   op  - instruction mnemonic or two-argument macro name.
;   op2 - second operand, inserted as written; callers in this file pass
;         expressions that mention the assembly-time symbol i
;         (e.g. [rD + i * 16]) so that each expansion gets its own offset.
;   Leaves the assembly-time symbol i equal to ways.
OP_W macro op, op2
    i = 0
    rept ways
    op @CatStr(xmm,%i), op2
    i = i + 1
    endm
endm

; LOAD_OP op, offs
;   Single-block form: applies `op` to xmm0 with the memory operand
;   [r1 + r3 offs].
;   offs is pasted as written and therefore carries its own sign
;   (+16, +0, -16, ...).
LOAD_OP macro op:req, offs:req
    op      xmm0, [r1 + r3 offs]
endm

; LOAD_OP_W op, offs
;   Multi-block form of LOAD_OP: loads the 16 bytes at [r1 + r3 offs] into
;   xmm7 once, then applies `op xmm<i>, xmm7` to xmm0 .. xmm(ways - 1).
;   Clobbers xmm7.
LOAD_OP_W macro op:req, offs:req
    movdqa  xmm7, [r1 + r3 offs]
    OP_W    op, xmm7
endm


; ---------- AES-CBC Decode ----------

; CBC_DEC_UPDATE reg, offs
;   Finishes one decoded block in place at [rD + offs]:
;     reg          ^= xmm6          (xmm6 = block to chain with)
;     xmm6          = [rD + offs]   (the input block, read before it is replaced)
;     [rD + offs]   = reg
;   The load has to precede the store because both use the same address.
CBC_DEC_UPDATE macro reg, offs
    pxor    reg, xmm6
    movdqa  xmm6, [rD + offs]
    movdqa  [rD + offs], reg
endm

; DECODE op
;   op - LOAD_OP (xmm0 only) or LOAD_OP_W (xmm0 .. xmm(ways - 1)).
;   Applies aesdec with the key operand at [r1 + r3 + 16], then repeats
;       aesdec [r1 + r3], aesdec [r1 + r3 - 16], x3 -= 32
;   until x3 becomes zero, and ends with aesdeclast at [r1 + r3 + 0]
;   (x3 is zero at that point, i.e. the operand is [r1]).
;   x3 is expected to be a non-zero multiple of 32 on entry, otherwise the
;   countdown does not stop at zero. Clobbers x3 and flags.
DECODE macro op:req
    op      aesdec, +16
  @@:
    op      aesdec, +0
    op      aesdec, -16
    sub     x3, 32
    jnz     @B
    op      aesdeclast, +0
endm

; AesCbc_Decode_Intel (declared with 3 parameters)
;   r1  - state pointer: 16-byte chaining value at [r1], a dword at [r1 + 16]
;         (multiplied by 32 in MY_PROLOG), and the aesdec key operands in the
;         area starting at [r1 + 32].
;   rD  - data, decoded in place in 16-byte blocks. Accessed with movdqa,
;         so it must be 16-byte aligned.
;   num - number of 16-byte blocks.
;
;   xmm6 holds the block to chain with: first the value from [r1], afterwards
;   the most recent input block. It is written back to the original [r1]
;   before returning.
MY_PROC AesCbc_Decode_Intel, 3
    MY_PROLOG xmm6

    ; x6 becomes the starting value of the x3 countdown used by DECODE.
    sub     x6, 32

    jmp     check2

  ; --- `ways` blocks per iteration ---
  align 16
  nextBlocks2:
    mov     x3, x6
    OP_W    movdqa, [rD + i * 16]       ; xmm<i> = input block i
    LOAD_OP_W  pxor, +32
    DECODE  LOAD_OP_W
    OP_W    CBC_DEC_UPDATE, i * 16      ; chain and store, in block order
    add     rD, ways16
  check2:
    sub     rN, ways
    jnc     nextBlocks2                 ; no borrow: at least `ways` blocks were left

    add     rN, ways                    ; undo the last subtraction: 0 .. ways-1 blocks remain
    jmp     check

  ; --- one block per iteration ---
  nextBlock:
    mov     x3, x6
    movdqa  xmm1, [rD]                  ; keep the input block for the next chaining step
    LOAD_OP movdqa, +32
    pxor    xmm0, xmm1
    DECODE  LOAD_OP
    pxor    xmm0, xmm6
    movdqa  [rD], xmm0
    movdqa  xmm6, xmm1
    add     rD, 16
  check:
    sub     rN, 1
    jnc     nextBlock

    ; MY_PROLOG advanced r1 by 32, so this is the original [r1].
    movdqa  [r1 - 32], xmm6
    MY_EPILOG


; ---------- AES-CBC Encode ----------

; ENCODE op
;   op - LOAD_OP (xmm0 only) or LOAD_OP_W (xmm0 .. xmm(ways - 1)).
;   Applies aesenc with the key operand at [r1 + r3 - 16], then repeats
;       aesenc [r1 + r3], aesenc [r1 + r3 + 16], r3 += 32
;   until r3 becomes zero, and ends with aesenclast at [r1 + r3 + 0]
;   (r3 is zero at that point, i.e. the operand is [r1]).
;   r3 is expected to be a negative multiple of 32 on entry, so that counting
;   up by 32 reaches zero. Clobbers r3 and flags.
ENCODE macro op:req
    op      aesenc, -16
  @@:
    op      aesenc, +0
    op      aesenc, +16
    add     r3, 32
    jnz     @B
    op      aesenclast, +0
endm

; AesCbc_Encode_Intel (declared with 3 parameters)
;   r1  - state pointer: 16-byte chaining value at [r1], a dword at [r1 + 16]
;         (multiplied by 32 in MY_PROLOG), and the aesenc key operands in the
;         area starting at [r1 + 32].
;   rD  - data, encoded in place in 16-byte blocks. Accessed with movdqa /
;         pxor memory operands, so it must be 16-byte aligned.
;   num - number of 16-byte blocks.
;
;   xmm0 is the running chaining value: it starts as the value from [r1],
;   is the output of every block, and is written back to the original [r1]
;   before returning. Blocks are processed strictly one at a time because
;   each block's input depends on the previous block's output.
MY_PROC AesCbc_Encode_Intel, 3
    MY_PROLOG xmm0

    ; Re-base for ENCODE: r1 moves up by x6 bytes and r6 becomes the negative
    ; start value (32 - x6) of the r3 count-up.
    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check_e

  align 16
  nextBlock_e:
    mov     r3, r6
    pxor    xmm0, [rD]                  ; chain with the input block
    pxor    xmm0, [r1 + r3 - 32]        ; first key operand
    ENCODE  LOAD_OP
    movdqa  [rD], xmm0
    add     rD, 16
  check_e:
    sub     rN, 1
    jnc     nextBlock_e

    ; r1 + r6 = (original r1 + 32 + x6) + (32 - x6), so this is the original [r1].
    movdqa  [r1 + r6 - 64], xmm0
    MY_EPILOG


; ---------- AES-CTR ----------

; XOR_UPD_1 reg, offs
;   First half of the in-place update: reg ^= 16 bytes at [rD + offs].
;   Kept separate from XOR_UPD_2 so that OP_W can issue all the XORs
;   before all the stores.
XOR_UPD_1 macro reg, offs
    pxor    reg, [rD + offs]
endm

; XOR_UPD_2 reg, offs
;   Second half of the in-place update: stores reg to the 16 bytes at
;   [rD + offs] (movdqa, so the address must be 16-byte aligned).
XOR_UPD_2 macro reg, offs
    movdqa  [rD + offs], reg
endm

; Stack bytes reserved for the 16-byte increment constant. The slot is
; 16-byte aligned and starts at most 28 bytes below the value r4 has after
; MY_PROLOG (r4 is only guaranteed to be 4-byte aligned in 32-bit builds),
; so 32 bytes always cover it.
CTR_STACK_RESERVE equ 32

; AesCtr_Code_Intel (declared with 3 parameters)
;   r1  - state pointer: 16-byte counter block at [r1], a dword at [r1 + 16]
;         (multiplied by 32 in MY_PROLOG), and the aesenc key operands in the
;         area starting at [r1 + 32].
;   rD  - data, XORed in place with the encrypted counter values in 16-byte
;         blocks. Accessed with movdqa / pxor memory operands, so it must be
;         16-byte aligned.
;   num - number of 16-byte blocks.
;
;   xmm6 is the counter. paddq adds the constant {1, 0}, i.e. only the low
;   64 bits are incremented (no carry into the high half), and the increment
;   happens before each use. The final counter is written back to the
;   original [r1].
MY_PROC AesCtr_Code_Intel, 3
    MY_PROLOG xmm6

    ; r5 = 16-byte aligned slot just below the current stack pointer ...
    mov     r5, r4
    shr     r5, 4
    dec     r5
    shl     r5, 4
    ; ... which is then reserved by moving r4 past it. Without this the slot
    ; would lie below the stack pointer, where its contents are not
    ; guaranteed to survive (that memory is volatile).
    sub     r4, CTR_STACK_RESERVE

    ; 128-bit constant: low qword = 1, high qword = 0.
    mov     DWORD PTR [r5], 1
    mov     DWORD PTR [r5 + 4], 0
    mov     DWORD PTR [r5 + 8], 0
    mov     DWORD PTR [r5 + 12], 0

    ; Re-base for ENCODE: r1 moves up by x6 bytes and r6 becomes the negative
    ; start value (32 - x6) of the r3 count-up.
    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check2_c

  ; --- `ways` blocks per iteration ---
  align 16
  nextBlocks2_c:
    ; Reloaded every iteration because LOAD_OP_W reuses xmm7 for key operands.
    movdqa  xmm7, [r5]

    ; xmm<i> = counter + (i + 1); xmm6 ends up advanced by `ways`.
    i = 0
    rept ways
    paddq   xmm6, xmm7
    movdqa  @CatStr(xmm,%i), xmm6
    i = i + 1
    endm

    mov     r3, r6
    LOAD_OP_W  pxor, -32
    ENCODE  LOAD_OP_W
    OP_W    XOR_UPD_1, i * 16
    OP_W    XOR_UPD_2, i * 16
    add     rD, ways16
  check2_c:
    sub     rN, ways
    jnc     nextBlocks2_c               ; no borrow: at least `ways` blocks were left

    add     rN, ways                    ; undo the last subtraction: 0 .. ways-1 blocks remain
    jmp     check_c

  ; --- one block per iteration ---
  nextBlock_c:
    paddq   xmm6, [r5]
    mov     r3, r6
    movdqa  xmm0, [r1 + r3 - 32]
    pxor    xmm0, xmm6
    ENCODE  LOAD_OP
    XOR_UPD_1 xmm0, 0
    XOR_UPD_2 xmm0, 0
    add     rD, 16
  check_c:
    sub     rN, 1
    jnc     nextBlock_c

    ; r1 + r6 = (original r1 + 32 + x6) + (32 - x6), so this is the original [r1].
    movdqa  [r1 + r6 - 64], xmm6
    ; Release the constant slot so that MY_EPILOG pops the saved registers.
    add     r4, CTR_STACK_RESERVE
    MY_EPILOG

; End of the assembly module.
end