1; AesOpt.asm -- Intel's AES.
2; 2009-12-12 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
MY_ASM_START

; Where the count that MY_PROLOG copies into rN comes from:
;   x64 build   : register r8
;   other build : stack slot [r4 + REG_SIZE * 4], evaluated by MY_PROLOG
;                 after its three pushes.  This build also needs .xmm so
;                 that the assembler accepts XMM register operands.
ifdef x64
    num     equ r8
else
    .xmm
    num     equ [r4 + REG_SIZE * 4]
endif

; Register roles shared by all procedures in this file.
rN  equ r0      ; remaining count; decremented as data at [rD] is processed
rD  equ r2      ; pointer to the data currently being processed
20
; MY_PROLOG reg
;   Common entry sequence; must be paired with MY_EPILOG.
;   reg : XMM register that receives the 16 bytes stored at [r1].
;
;   State after the macro:
;     rN  = num
;     x6  = (dword at [entry r1 + 16]) * 32
;     reg = 16 bytes from [entry r1]
;     r1  = entry r1 + 32
;     r3, r5, r6 pushed (in that order)
;   x64 builds additionally store xmm6 / xmm7 at [r4 + 8] and [r4 + 24]
;   before the pushes (presumably the caller-provided Win64 home area --
;   confirm against MY_PROC in 7zAsm.asm).  movdqa needs those addresses
;   to be 16-byte aligned.
MY_PROLOG macro reg:req
  ifdef x64
    movdqa  [r4 + 8], xmm6
    movdqa  [r4 + 24], xmm7
  endif

    push    r3
    push    r5
    push    r6

    ; num may be a stack reference relative to r4, so it is read only
    ; after the three pushes above.
    mov     x6, [r1 + 16]
    mov     rN, num
    shl     x6, 5               ; * 32

    movdqa  reg, [r1]
    add     r1, 32
endm
38
; MY_EPILOG
;   Common exit sequence; undoes MY_PROLOG and ends the procedure.
;   Pops r6, r5, r3 (reverse of the push order), then in x64 builds reloads
;   xmm6 / xmm7 from [r4 + 8] and [r4 + 24], and finally invokes MY_ENDP.
;   r4 must hold the same value it had right after MY_PROLOG.
MY_EPILOG macro
    pop     r6
    pop     r5
    pop     r3

  ifdef x64
    movdqa  xmm7, [r4 + 24]
    movdqa  xmm6, [r4 + 8]
  endif

    MY_ENDP
endm
51
; ways   : number of XMM registers (xmm0 .. xmm<ways-1>) processed together
;          by the unrolled loops.
; ways16 : byte size of that group (16 bytes per register).
ways    equ 4
ways16  equ (ways * 16)

; OP_W op, op2
;   Emits `op xmm<i>, op2` once for every i in 0 .. ways-1.
;   i is an assembly-time variable that is live while each line is expanded,
;   so op2 may refer to it (for example [rD + i * 16]).  i equals ways once
;   the macro has finished.
OP_W macro op, op2
    i = 0
    while i lt ways
        op @CatStr(xmm,%i), op2
        i = i + 1
    endm
endm
62
; LOAD_OP op, offs
;   Single-register form: emits `op xmm0, [r1 + r3 offs]`.
;   offs is pasted as text right after r3, so it has to carry its own sign
;   (+32, +0, -16, ...).
LOAD_OP macro op:req, offs:req
    op  xmm0, [r1 + r3 offs]
endm
66
; LOAD_OP_W op, offs
;   Multi-register form of LOAD_OP: loads the 16 bytes at [r1 + r3 offs]
;   into xmm7 once, then applies `op xmm<i>, xmm7` to xmm0 .. xmm<ways-1>
;   through OP_W.  Overwrites xmm7.
;   offs is pasted as text and must carry its own sign.
LOAD_OP_W macro op:req, offs:req
    movdqa  xmm7, [r1 + r3 offs]
    OP_W    op, xmm7
endm
71
72
73; ---------- AES-CBC Decode ----------
74
; CBC_DEC_UPDATE reg, offs
;   Finishes one block of the multi-block CBC decode loop.
;     reg  : XMM register holding the block after the AES rounds
;     offs : byte offset of that block from rD
;   The three steps must stay in this order: xmm6 is consumed before it is
;   reloaded, and the input at [rD + offs] is read before it is overwritten.
CBC_DEC_UPDATE macro reg, offs
    pxor    reg, xmm6               ; xor with the current chaining value
    movdqa  xmm6, [rD + offs]       ; the input block becomes the next chaining value
    movdqa  [rD + offs], reg        ; replace the input block with the result
endm
80
; DECODE op
;   Emits the AES decryption rounds.  `op` is LOAD_OP or LOAD_OP_W and is
;   invoked as `op <instruction>, <signed offset>`, which applies the
;   instruction with the 16-byte operand at [r1 + r3 <offset>].
;
;   Sequence: one aesdec at +16, then a loop of two aesdec (+0, -16) that
;   lowers x3 by 32 per pass until it reaches zero, then aesdeclast at +0.
;   x3 must therefore be a non-zero multiple of 32 on entry; it is zero and
;   the flags are modified on exit.
DECODE macro op:req
    local   round_pair
    op      aesdec, +16
  round_pair:
    op      aesdec, +0
    op      aesdec, -16
    sub     x3, 32
    jnz     round_pair
    op      aesdeclast, +0
endm
90
; AesCbc_Decode_Intel
;   In-place AES-CBC decryption of rN 16-byte blocks starting at rD.
;   On entry r1 points at a 16-byte-aligned area laid out as MY_PROLOG reads
;   it: 16 bytes of chaining value, a dword at +16 that is scaled by 32 to
;   size the round loop, and the data the round macros address from +32 on.
;   (Presumably IV / round count / key schedule -- confirm against the
;   C-side caller.)
;
;   Register use inside the body:
;     xmm6         chaining value; written back to [entry r1] at the end
;     x6           start value for the DECODE round counter
;     x3           DECODE round counter
;     xmm0..xmm3   blocks handled by the `ways`-wide loop, xmm7 scratch
;     xmm0, xmm1   block and saved input in the single-block loop
MY_PROC AesCbc_Decode_Intel, 3
    MY_PROLOG xmm6

    sub     x6, 32
    jmp     dec_wide_test

  ; ---- `ways` blocks per iteration ----
  align 16
  dec_wide_loop:
    OP_W    movdqa, [rD + i * 16]       ; xmm<i> = input block i
    mov     x3, x6
    LOAD_OP_W  pxor, +32                ; first key xor
    DECODE  LOAD_OP_W
    OP_W    CBC_DEC_UPDATE, i * 16      ; chain and store, block by block
    add     rD, ways16
  dec_wide_test:
    sub     rN, ways
    jnc     dec_wide_loop               ; no borrow: at least `ways` blocks were left

    add     rN, ways                    ; undo the failed subtraction
    jmp     dec_tail_test

  ; ---- remaining blocks, one per iteration ----
  dec_tail_loop:
    movdqa  xmm1, [rD]                  ; keep the input: it is the next chaining value
    mov     x3, x6
    LOAD_OP movdqa, +32
    pxor    xmm0, xmm1
    DECODE  LOAD_OP
    pxor    xmm0, xmm6
    movdqa  xmm6, xmm1
    movdqa  [rD], xmm0
    add     rD, 16
  dec_tail_test:
    sub     rN, 1
    jnc     dec_tail_loop

    movdqa  [r1 - 32], xmm6             ; r1 - 32 == entry r1
    MY_EPILOG
129
130
131; ---------- AES-CBC Encode ----------
132
; ENCODE op
;   Emits the AES encryption rounds.  `op` is LOAD_OP or LOAD_OP_W and is
;   invoked as `op <instruction>, <signed offset>`, which applies the
;   instruction with the 16-byte operand at [r1 + r3 <offset>].
;
;   Sequence: one aesenc at -16, then a loop of two aesenc (+0, +16) that
;   raises r3 by 32 per pass until it reaches zero, then aesenclast at +0.
;   r3 must therefore be a negative multiple of 32 on entry; it is zero and
;   the flags are modified on exit.
ENCODE macro op:req
    local   round_pair
    op      aesenc, -16
  round_pair:
    op      aesenc, +0
    op      aesenc, +16
    add     r3, 32
    jnz     round_pair
    op      aesenclast, +0
endm
142
; AesCbc_Encode_Intel
;   In-place AES-CBC encryption of rN 16-byte blocks starting at rD, one
;   block per iteration (each block depends on the previous result).
;   xmm0 carries the chaining value: it is loaded from [entry r1] by
;   MY_PROLOG, holds every encrypted block in turn, and is written back to
;   [entry r1] at the end.
;
;   With K = 32 * (dword at [entry r1 + 16]) the setup leaves
;     r1 = entry r1 + 32 + K
;     r6 = 32 - K
;   so [r1 + r6 - 32] is [entry r1 + 32] and [r1 + r6 - 64] is [entry r1].
MY_PROC AesCbc_Encode_Intel, 3
    MY_PROLOG xmm0

    add     r1, r6
    neg     r6
    add     r6, 32
    jmp     enc_test

  align 16
  enc_loop:
    pxor    xmm0, [rD]                  ; chaining value xor input block
    mov     r3, r6                      ; negative round counter for ENCODE
    pxor    xmm0, [r1 + r3 - 32]        ; first key xor
    ENCODE  LOAD_OP
    movdqa  [rD], xmm0
    add     rD, 16
  enc_test:
    sub     rN, 1
    jnc     enc_loop                    ; no borrow: a block was still left

    movdqa  [r1 + r6 - 64], xmm0
    MY_EPILOG
166
167
168; ---------- AES-CTR ----------
169
; XOR_UPD_1 reg, offs
;   First half of the CTR output step: xors reg with the 16 bytes at
;   [rD + offs].  Kept separate from XOR_UPD_2 so that OP_W can emit all
;   xors of a group before any of the stores.
XOR_UPD_1 macro reg, offs
    pxor    reg, [rD + offs]
endm
173
; XOR_UPD_2 reg, offs
;   Second half of the CTR output step: stores reg to the 16 bytes at
;   [rD + offs] (movdqa, so the address must be 16-byte aligned).
XOR_UPD_2 macro reg, offs
    movdqa  [rD + offs], reg
endm
177
; AesCtr_Code_Intel
;   In-place AES-CTR processing of rN 16-byte blocks starting at rD.
;   xmm6 holds the counter block: it is loaded from [entry r1] by MY_PROLOG,
;   its low 64-bit lane is incremented (paddq with the constant 1:0) before
;   each block is produced, and the final value is written back to
;   [entry r1].  Each counter value is encrypted with the ENCODE rounds and
;   xored into the data at rD.
;
;   The 16-byte increment constant lives in an aligned stack slot.  The slot
;   is reserved by lowering r4, so the constant never sits below the stack
;   pointer, where it could be overwritten asynchronously (Win64 has no red
;   zone).  r5 keeps the stack pointer as it was after MY_PROLOG; r5 itself
;   was pushed by MY_PROLOG and is restored by MY_EPILOG.
;
;   With K = 32 * (dword at [entry r1 + 16]) the setup leaves
;     r1 = entry r1 + 32 + K
;     r6 = 32 - K
;   so [r1 + r6 - 32] is [entry r1 + 32] and [r1 + r6 - 64] is [entry r1].
MY_PROC AesCtr_Code_Intel, 3
    MY_PROLOG xmm6

    mov     r5, r4                      ; remember the stack pointer
    sub     r4, 16
    and     r4, -16                     ; [r4] = aligned 16-byte slot, fully below the old r4

    mov     DWORD PTR [r4], 1           ; increment constant: low qword 1, high qword 0
    mov     DWORD PTR [r4 + 4], 0
    mov     DWORD PTR [r4 + 8], 0
    mov     DWORD PTR [r4 + 12], 0

    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check2_c

  ; ---- `ways` blocks per iteration ----
  align 16
  nextBlocks2_c:
    movdqa  xmm7, [r4]                  ; reload: LOAD_OP_W overwrites xmm7

    ; xmm<i> = next counter value, for i = 0 .. ways-1
    i = 0
    rept ways
    paddq   xmm6, xmm7
    movdqa  @CatStr(xmm,%i), xmm6
    i = i + 1
    endm

    mov     r3, r6                      ; negative round counter for ENCODE
    LOAD_OP_W  pxor, -32                ; first key xor
    ENCODE  LOAD_OP_W
    OP_W    XOR_UPD_1, i * 16           ; xor the results with the data ...
    OP_W    XOR_UPD_2, i * 16           ; ... then store them
    add     rD, ways16
  check2_c:
    sub     rN, ways
    jnc     nextBlocks2_c               ; no borrow: at least `ways` blocks were left

    add     rN, ways                    ; undo the failed subtraction
    jmp     check_c

  ; ---- remaining blocks, one per iteration ----
  nextBlock_c:
    paddq   xmm6, [r4]
    mov     r3, r6
    movdqa  xmm0, [r1 + r3 - 32]
    pxor    xmm0, xmm6
    ENCODE  LOAD_OP
    XOR_UPD_1 xmm0, 0
    XOR_UPD_2 xmm0, 0
    add     rD, 16
  check_c:
    sub     rN, 1
    jnc     nextBlock_c

    movdqa  [r1 + r6 - 64], xmm6        ; write the counter back to [entry r1]
    mov     r4, r5                      ; release the stack slot before the pops
    MY_EPILOG
236
237end
238