; Section selection, keyed on the assembler's output format:
;   obj    -> 32-bit segment "code", 64-byte aligned
;   win32  -> ".text", 64-byte aligned, plus the @feat.00 handling below
;   other  -> plain ".text"
1%ifidn __OUTPUT_FORMAT__,obj
2section	code	use32 class=code align=64
3%elifidn __OUTPUT_FORMAT__,win32
4%ifdef __YASM_VERSION_ID__
5%if __YASM_VERSION_ID__ < 01010000h
6%error yasm version 1.1.0 or later needed.
7%endif
8; Yasm automatically includes @feat.00 and complains about redefining it.
9; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
10%else
; Not Yasm: define the @feat.00 symbol explicitly (see the SafeSEH page
; linked above for what the symbol means to the linker).
11$@feat.00 equ 1
12%endif
13section	.text	code align=64
14%else
15section	.text	code
16%endif
; Constant tables, 16 bytes per `dd` row, 64-byte aligned.
;
; Every public entry point in this file loads ebp with
; (L$_vpaes_consts+0x30 - <its pic_point label>) and the first callee adds
; its return address, so at run time ebp = L$_vpaes_consts+0x30.  A row at
; byte offset N of this table is therefore addressed as [N-48+ebp].
; The displacement notes below list how rows are referenced in this file.
17align	64
18L$_vpaes_consts:
; [ebp-48], [ebp-32], [ebp-16].  The third row is 0x0F in every byte and is
; used as the pand/pandn nibble mask.
19dd	218628480,235210255,168496130,67568393
20dd	252381056,17041926,33884169,51187212
21dd	252645135,252645135,252645135,252645135
; [ebp] .. [112+ebp]
22dd	1512730624,3266504856,1377990664,3401244816
23dd	830229760,1275146365,2969422977,3447763452
24dd	3411033600,2979783055,338359620,2782886510
25dd	4209124096,907596821,221174255,1006095553
26dd	191964160,3799684038,3164090317,1589111125
27dd	182528256,1777043520,2877432650,3265356744
28dd	1874708224,3503451415,3305285752,363511674
29dd	1606117888,3487855781,1093350906,2384367825
; [128+ebp] .. [176+ebp]  (also reached as [ecx+ebx-64] with ebx = 192+ebp)
30dd	197121,67569157,134941193,202313229
31dd	67569157,134941193,202313229,197121
32dd	134941193,202313229,197121,67569157
33dd	202313229,197121,67569157,134941193
; [192+ebp] .. [240+ebp]  (reached as [ecx+ebx] with ebx = 192+ebp)
34dd	33619971,100992007,168364043,235736079
35dd	235736079,33619971,100992007,168364043
36dd	168364043,235736079,33619971,100992007
37dd	100992007,168364043,235736079,33619971
; [256+ecx+ebp], ecx in {0,16,32,48}: four pshufb byte permutations.
; The first row is the bytes 0x00..0x0F in ascending order.
38dd	50462976,117835012,185207048,252579084
39dd	252314880,51251460,117574920,184942860
40dd	184682752,252054788,50987272,118359308
41dd	118099200,185467140,251790600,50727180
; [320+ebp]: copied to the stack by __vpaes_schedule_core and rotated by
; __vpaes_schedule_round.
42dd	2946363062,528716217,1300004225,1881839624
; [336+ebp]: 0x5B in every byte.
43dd	1532713819,1532713819,1532713819,1532713819
; [352+ebp], [368+ebp]: table pair passed to __vpaes_schedule_transform via
; ebx on the edi == 0 path of L$010schedule_mangle_last.
44dd	3602276352,4288629033,3737020424,4153884961
45dd	1354558464,32357713,2958822624,3775749553
; [384+ebp], [400+ebp]: table pair used on the edi != 0 path of the same code.
46dd	1201988352,132424512,1572796698,503232858
47dd	2213177600,1597421020,4103937655,675398315
; [416+ebp] .. [528+ebp]: eight rows read through esi by
; L$014schedule_mangle_dec.
48dd	2749646592,4273543773,1511898873,121693092
49dd	3040248576,1103263732,2871565598,1608280554
50dd	2236667136,2588920351,482954393,64377734
51dd	3069987328,291237287,2117370568,3650299247
52dd	533321216,3573750986,2572112006,1401264716
53dd	1339849704,2721158661,548607111,3445553514
54dd	2128193280,3054596040,2183486460,1257083700
55dd	655635200,1165381986,3923443150,2344132524
; [544+ebp] .. [720+ebp]: rows read by __vpaes_decrypt_core as
; [ebx-64] .. [112+ebx] with ebx = 608+ebp.
56dd	190078720,256924420,290342170,357187870
57dd	1610966272,2263057382,4103205268,309794674
58dd	2592527872,2233205587,1335446729,3402964816
59dd	3973531904,3225098121,3002836325,1918774430
60dd	3870401024,2102906079,2284471353,4117666579
61dd	617007872,1021508343,366931923,691083277
62dd	2528395776,3491914898,2968704004,1613121270
63dd	3445188352,3247741094,844474987,4093578302
64dd	651481088,1190302358,1689581232,574775300
65dd	4289380608,206939853,2555985458,2489840491
66dd	2130264064,327674451,3566485037,3349835193
67dd	2470714624,316102159,3636825756,3393945945
; NUL-terminated ASCII banner:
; "Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"
68db	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
69db	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
70db	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
71db	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
72db	118,101,114,115,105,116,121,41,0
73align	64
; __vpaes_preheat: finish the position-independent address computation and
; preload the two constants the encrypt/decrypt cores keep in registers.
; In:   ebp = L$_vpaes_consts+0x30 minus the address of the label that
;             immediately follows the caller's `call` (its "pic_point")
; Out:  ebp = run-time address of L$_vpaes_consts+0x30
;       xmm7 = 16 bytes at [ebp-48], xmm6 = 16 bytes at [ebp-16]
; Must be reached by `call` so that [esp] holds the return address.
74align	16
75__vpaes_preheat:
; [esp] is the return address, i.e. the caller's pic_point.
76	add	ebp,DWORD [esp]
77	movdqa	xmm7,[ebp-48]
78	movdqa	xmm6,[ebp-16]
79	ret
; __vpaes_encrypt_core: transform the 16-byte block held in xmm0 using the
; key schedule at edx.
; In:   xmm0 = input block
;       edx  = key schedule; 16-byte round keys are read sequentially from
;              [edx], and the loop count is read from [240+edx]
;       ebp  = run-time address of L$_vpaes_consts+0x30
;       xmm6 = [ebp-16], xmm7 = [ebp-48]   (see __vpaes_preheat)
; Out:  xmm0 = result block
; Writes: eax, ebx, ecx, edx, xmm1-xmm5, EFLAGS
;
; The `db 102,15,56,0,modrm` lines are pshufb xmm,xmm emitted as raw bytes;
; each one is decoded in its trailing comment.
80align	16
81__vpaes_encrypt_core:
; ecx steps through 16,32,48,0,... and selects a row in the
; [ecx+ebx-64] / [ecx+ebx] / [64+ecx+ebx] tables.
82	mov	ecx,16
83	mov	eax,DWORD [240+edx]
; Split each input byte into its high nibble (xmm1) and low nibble (xmm0).
84	movdqa	xmm1,xmm6
85	movdqa	xmm2,[ebp]
86	pandn	xmm1,xmm0
87	pand	xmm0,xmm6
88	movdqu	xmm5,[edx]
89db	102,15,56,0,208	; pshufb xmm2,xmm0
90	movdqa	xmm0,[16+ebp]
91	pxor	xmm2,xmm5
92	psrld	xmm1,4
93	add	edx,16
94db	102,15,56,0,193	; pshufb xmm0,xmm1
95	lea	ebx,[192+ebp]
96	pxor	xmm0,xmm2
; Enter the loop at its second half.  The jnz at the bottom of
; L$000enc_entry then sees the flags left by `add edx,16` above (lea and the
; SSE instructions in between do not write EFLAGS).
97	jmp	NEAR L$000enc_entry
98align	16
99L$001enc_loop:
100	movdqa	xmm4,[32+ebp]
101	movdqa	xmm0,[48+ebp]
102db	102,15,56,0,226	; pshufb xmm4,xmm2
103db	102,15,56,0,195	; pshufb xmm0,xmm3
; xmm5 holds the round key loaded at the end of the previous pass.
104	pxor	xmm4,xmm5
105	movdqa	xmm5,[64+ebp]
106	pxor	xmm0,xmm4
107	movdqa	xmm1,[ecx*1+ebx-64]
108db	102,15,56,0,234	; pshufb xmm5,xmm2
109	movdqa	xmm2,[80+ebp]
110	movdqa	xmm4,[ecx*1+ebx]
111db	102,15,56,0,211	; pshufb xmm2,xmm3
112	movdqa	xmm3,xmm0
113	pxor	xmm2,xmm5
114db	102,15,56,0,193	; pshufb xmm0,xmm1
; Advance to the next round key.
115	add	edx,16
116	pxor	xmm0,xmm2
117db	102,15,56,0,220	; pshufb xmm3,xmm4
118	add	ecx,16
119	pxor	xmm3,xmm0
120db	102,15,56,0,193	; pshufb xmm0,xmm1
; ecx = (ecx + 16) mod 64
121	and	ecx,48
; Sets ZF for the jnz at the end of L$000enc_entry; nothing between here and
; that jnz writes EFLAGS.
122	sub	eax,1
123	pxor	xmm0,xmm3
124L$000enc_entry:
; Split xmm0 into nibbles again: xmm1 = high nibbles, xmm0 = low nibbles.
125	movdqa	xmm1,xmm6
126	movdqa	xmm5,[ebp-32]
127	pandn	xmm1,xmm0
128	psrld	xmm1,4
129	pand	xmm0,xmm6
130db	102,15,56,0,232	; pshufb xmm5,xmm0
131	movdqa	xmm3,xmm7
132	pxor	xmm0,xmm1
133db	102,15,56,0,217	; pshufb xmm3,xmm1
134	movdqa	xmm4,xmm7
135	pxor	xmm3,xmm5
136db	102,15,56,0,224	; pshufb xmm4,xmm0
137	movdqa	xmm2,xmm7
138	pxor	xmm4,xmm5
139db	102,15,56,0,211	; pshufb xmm2,xmm3
140	movdqa	xmm3,xmm7
141	pxor	xmm2,xmm0
142db	102,15,56,0,220	; pshufb xmm3,xmm4
; Fetch the next round key (unaligned load: the schedule is caller memory).
143	movdqu	xmm5,[edx]
144	pxor	xmm3,xmm1
145	jnz	NEAR L$001enc_loop
; Final step: uses the table rows at [96+ebp]/[112+ebp] and a last byte
; permutation taken from [64+ecx+ebx] (= [256+ecx+ebp]).
146	movdqa	xmm4,[96+ebp]
147	movdqa	xmm0,[112+ebp]
148db	102,15,56,0,226	; pshufb xmm4,xmm2
149	pxor	xmm4,xmm5
150db	102,15,56,0,195	; pshufb xmm0,xmm3
151	movdqa	xmm1,[64+ecx*1+ebx]
152	pxor	xmm0,xmm4
153db	102,15,56,0,193	; pshufb xmm0,xmm1
154	ret
; __vpaes_decrypt_core: transform the 16-byte block held in xmm0 using the
; key schedule at edx (counterpart of __vpaes_encrypt_core).
; In:   xmm0 = input block
;       edx  = key schedule; 16-byte round keys are read sequentially from
;              [edx], and the loop count is read from [240+edx]
;       ebp  = run-time address of L$_vpaes_consts+0x30
;       xmm6 = [ebp-16], xmm7 = [ebp-48]   (see __vpaes_preheat)
; Out:  xmm0 = result block
; Writes: eax, ebx, ecx, edx, xmm1-xmm5, EFLAGS
;
; `db 102,15,56,0,modrm` is pshufb xmm,xmm and `db 102,15,58,15,modrm,imm`
; is palignr xmm,xmm,imm8, both emitted as raw bytes; see trailing comments.
155align	16
156__vpaes_decrypt_core:
; ebx = base of the table rows addressed below as [ebx-64] .. [112+ebx].
157	lea	ebx,[608+ebp]
158	mov	eax,DWORD [240+edx]
; Split each input byte: xmm1 = high nibbles, xmm0 = low nibbles.
159	movdqa	xmm1,xmm6
160	movdqa	xmm2,[ebx-64]
161	pandn	xmm1,xmm0
162	mov	ecx,eax
163	psrld	xmm1,4
164	movdqu	xmm5,[edx]
; ecx = ((count << 4) xor 48) and 48: picks one of four 16-byte rows for the
; final permutation.
165	shl	ecx,4
166	pand	xmm0,xmm6
167db	102,15,56,0,208	; pshufb xmm2,xmm0
168	movdqa	xmm0,[ebx-48]
169	xor	ecx,48
170db	102,15,56,0,193	; pshufb xmm0,xmm1
171	and	ecx,48
172	pxor	xmm2,xmm5
; xmm5 = permutation row that the loop applies to xmm0 and rotates with
; palignr on every pass.
173	movdqa	xmm5,[176+ebp]
174	pxor	xmm0,xmm2
175	add	edx,16
; ecx becomes an absolute pointer: ebx-352 = 256+ebp, so ecx = 256+ecx+ebp.
; lea does not write EFLAGS, so the first jnz in L$002dec_entry sees the
; flags from `add edx,16` above.
176	lea	ecx,[ecx*1+ebx-352]
177	jmp	NEAR L$002dec_entry
178align	16
179L$003dec_loop:
; Four table-pair steps follow, using the row pairs at [ebx-32]/[ebx-16],
; [ebx]/[16+ebx], [32+ebx]/[48+ebx] and [64+ebx]/[80+ebx].  On entry xmm0
; holds the round key loaded at the end of L$002dec_entry.
180	movdqa	xmm4,[ebx-32]
181	movdqa	xmm1,[ebx-16]
182db	102,15,56,0,226	; pshufb xmm4,xmm2
183db	102,15,56,0,203	; pshufb xmm1,xmm3
184	pxor	xmm0,xmm4
185	movdqa	xmm4,[ebx]
186	pxor	xmm0,xmm1
187	movdqa	xmm1,[16+ebx]
188db	102,15,56,0,226	; pshufb xmm4,xmm2
189db	102,15,56,0,197	; pshufb xmm0,xmm5
190db	102,15,56,0,203	; pshufb xmm1,xmm3
191	pxor	xmm0,xmm4
192	movdqa	xmm4,[32+ebx]
193	pxor	xmm0,xmm1
194	movdqa	xmm1,[48+ebx]
195db	102,15,56,0,226	; pshufb xmm4,xmm2
196db	102,15,56,0,197	; pshufb xmm0,xmm5
197db	102,15,56,0,203	; pshufb xmm1,xmm3
198	pxor	xmm0,xmm4
199	movdqa	xmm4,[64+ebx]
200	pxor	xmm0,xmm1
201	movdqa	xmm1,[80+ebx]
202db	102,15,56,0,226	; pshufb xmm4,xmm2
203db	102,15,56,0,197	; pshufb xmm0,xmm5
204db	102,15,56,0,203	; pshufb xmm1,xmm3
205	pxor	xmm0,xmm4
; Advance to the next round key.
206	add	edx,16
207db	102,15,58,15,237,12	; palignr xmm5,xmm5,12
208	pxor	xmm0,xmm1
; Sets ZF for the jnz at the end of L$002dec_entry; nothing between here and
; that jnz writes EFLAGS.
209	sub	eax,1
210L$002dec_entry:
; Split xmm0 into nibbles again: xmm1 = high nibbles, xmm0 = low nibbles.
211	movdqa	xmm1,xmm6
212	movdqa	xmm2,[ebp-32]
213	pandn	xmm1,xmm0
214	pand	xmm0,xmm6
215	psrld	xmm1,4
216db	102,15,56,0,208	; pshufb xmm2,xmm0
217	movdqa	xmm3,xmm7
218	pxor	xmm0,xmm1
219db	102,15,56,0,217	; pshufb xmm3,xmm1
220	movdqa	xmm4,xmm7
221	pxor	xmm3,xmm2
222db	102,15,56,0,224	; pshufb xmm4,xmm0
223	pxor	xmm4,xmm2
224	movdqa	xmm2,xmm7
225db	102,15,56,0,211	; pshufb xmm2,xmm3
226	movdqa	xmm3,xmm7
227	pxor	xmm2,xmm0
228db	102,15,56,0,220	; pshufb xmm3,xmm4
; Fetch the next round key into xmm0 (its old value has been consumed).
229	movdqu	xmm0,[edx]
230	pxor	xmm3,xmm1
231	jnz	NEAR L$003dec_loop
; Final step: table rows [96+ebx]/[112+ebx], then the byte permutation
; selected by ecx at the top of the function.
232	movdqa	xmm4,[96+ebx]
233db	102,15,56,0,226	; pshufb xmm4,xmm2
234	pxor	xmm4,xmm0
235	movdqa	xmm0,[112+ebx]
236	movdqa	xmm2,[ecx]
237db	102,15,56,0,195	; pshufb xmm0,xmm3
238	pxor	xmm0,xmm4
239db	102,15,56,0,194	; pshufb xmm0,xmm2
240	ret
; __vpaes_schedule_core: expand the user key at esi into a round-key
; schedule written through edx.
; In (as prepared by _vpaes_set_encrypt_key / _vpaes_set_decrypt_key):
;   esi = user key bytes
;   eax = the entry point's second argument; compared with 192 below to
;         choose the 128 / 192 / 256 path
;   edx = address where the first round key is stored
;   edi = 0 on the set_encrypt_key path, 1 on the set_decrypt_key path
;   ecx = byte offset (0/16/32/48) selecting a row at [256+ecx+ebp]
;   ebp = L$_vpaes_consts+0x30 minus the caller's pic_point
;   esp = return address on top of a 16-byte-aligned scratch area, so
;         [4+esp] and [20+esp] here are aligned (movdqa is used on both)
; Out:  round keys stored via edx; xmm0-xmm7 zeroed before returning
; Writes: eax, ebx, ecx, edx, esi (inside __vpaes_schedule_mangle when
;         edi != 0), ebp, EFLAGS
241align	16
242__vpaes_schedule_core:
; Convert ebp into the run-time address of L$_vpaes_consts+0x30.
243	add	ebp,DWORD [esp]
244	movdqu	xmm0,[esi]
; Copy the row at [320+ebp] into the scratch slot that
; __vpaes_schedule_round reads and updates as [8+esp] (one return address
; deeper).
245	movdqa	xmm2,[320+ebp]
; xmm3 keeps the untransformed key for the edi != 0 path below.
246	movdqa	xmm3,xmm0
; Table pair for __vpaes_schedule_transform: [ebp] and [16+ebp].
247	lea	ebx,[ebp]
248	movdqa	[4+esp],xmm2
249	call	__vpaes_schedule_transform
250	movdqa	xmm7,xmm0
251	test	edi,edi
252	jnz	NEAR L$004schedule_am_decrypting
; edi == 0: store the transformed key as the first round key.
253	movdqu	[edx],xmm0
254	jmp	NEAR L$005schedule_go
255L$004schedule_am_decrypting:
; edi != 0: store the original key bytes, permuted by row [256+ecx+ebp].
256	movdqa	xmm1,[256+ecx*1+ebp]
257db	102,15,56,0,217	; pshufb xmm3,xmm1
258	movdqu	[edx],xmm3
259	xor	ecx,48
260L$005schedule_go:
; Unsigned three-way dispatch on eax: above 192, equal to 192, otherwise
; fall through.
261	cmp	eax,192
262	ja	NEAR L$006schedule_256
263	je	NEAR L$007schedule_192
264L$008schedule_128:
; eax is reused as the loop counter from here on.
265	mov	eax,10
266L$009loop_schedule_128:
267	call	__vpaes_schedule_round
268	dec	eax
269	jz	NEAR L$010schedule_mangle_last
270	call	__vpaes_schedule_mangle
271	jmp	NEAR L$009loop_schedule_128
272align	16
273L$007schedule_192:
; Load key bytes 8..23, transform them, and keep the result in xmm6 with
; its low 8 bytes cleared (movhlps copies the zero high half of xmm4 into
; the low half of xmm6).
274	movdqu	xmm0,[8+esi]
275	call	__vpaes_schedule_transform
276	movdqa	xmm6,xmm0
277	pxor	xmm4,xmm4
278	movhlps	xmm6,xmm4
279	mov	eax,4
280L$011loop_schedule_192:
281	call	__vpaes_schedule_round
282db	102,15,58,15,198,8	; palignr xmm0,xmm6,8
283	call	__vpaes_schedule_mangle
284	call	__vpaes_schedule_192_smear
285	call	__vpaes_schedule_mangle
286	call	__vpaes_schedule_round
287	dec	eax
288	jz	NEAR L$010schedule_mangle_last
289	call	__vpaes_schedule_mangle
290	call	__vpaes_schedule_192_smear
291	jmp	NEAR L$011loop_schedule_192
292align	16
293L$006schedule_256:
; Load key bytes 16..31 and transform them.
294	movdqu	xmm0,[16+esi]
295	call	__vpaes_schedule_transform
296	mov	eax,7
297L$012loop_schedule_256:
298	call	__vpaes_schedule_mangle
299	movdqa	xmm6,xmm0
300	call	__vpaes_schedule_round
301	dec	eax
302	jz	NEAR L$010schedule_mangle_last
303	call	__vpaes_schedule_mangle
; Second half-step: broadcast the top dword of xmm0, park xmm7 in the
; scratch slot at [20+esp], and run only the "low round" part on xmm6.
304	pshufd	xmm0,xmm0,255
305	movdqa	[20+esp],xmm7
306	movdqa	xmm7,xmm6
307	call	L$_vpaes_schedule_low_round
308	movdqa	xmm7,[20+esp]
309	jmp	NEAR L$012loop_schedule_256
310align	16
311L$010schedule_mangle_last:
; Write the final round key.  ebx selects the table pair for
; __vpaes_schedule_transform: [384+ebp] when edi != 0, [352+ebp] otherwise.
312	lea	ebx,[384+ebp]
313	test	edi,edi
314	jnz	NEAR L$013schedule_mangle_last_dec
315	movdqa	xmm1,[256+ecx*1+ebp]
316db	102,15,56,0,193	; pshufb xmm0,xmm1
317	lea	ebx,[352+ebp]
; Combined with the `add edx,-16` below: edx += 16 when edi == 0,
; edx -= 16 when edi != 0.
318	add	edx,32
319L$013schedule_mangle_last_dec:
320	add	edx,-16
321	pxor	xmm0,[336+ebp]
322	call	__vpaes_schedule_transform
323	movdqu	[edx],xmm0
; Zero every XMM register before returning.
324	pxor	xmm0,xmm0
325	pxor	xmm1,xmm1
326	pxor	xmm2,xmm2
327	pxor	xmm3,xmm3
328	pxor	xmm4,xmm4
329	pxor	xmm5,xmm5
330	pxor	xmm6,xmm6
331	pxor	xmm7,xmm7
332	ret
; __vpaes_schedule_192_smear: helper used only by the 192 path of
; __vpaes_schedule_core.
; In:   xmm6, xmm7
; Out:  xmm0 = xmm6 xor pshufd(xmm6,0x80) xor pshufd(xmm7,0xFE)
;       xmm6 = the same value with its low 8 bytes cleared
;       xmm1 = 0
333align	16
334__vpaes_schedule_192_smear:
; pshufd selector 128 = 0x80: dwords (0,0,0,2) of xmm6, low to high.
335	pshufd	xmm1,xmm6,128
; pshufd selector 254 = 0xFE: dwords (2,3,3,3) of xmm7, low to high.
336	pshufd	xmm0,xmm7,254
337	pxor	xmm6,xmm1
338	pxor	xmm1,xmm1
339	pxor	xmm6,xmm0
340	movdqa	xmm0,xmm6
; Copy the (zero) high half of xmm1 over the low half of xmm6.
341	movhlps	xmm6,xmm1
342	ret
; __vpaes_schedule_round: one key-schedule round.
; In:   xmm0 = round input, xmm7 = running key state,
;       ebp  = run-time address of L$_vpaes_consts+0x30,
;       [8+esp] = 16-byte row stored by __vpaes_schedule_core at its
;                 [4+esp] (this routine is one return address deeper)
; Out:  xmm0 = xmm7 = new key state; [8+esp] rotated by one byte
; Writes: xmm1-xmm5
;
; L$_vpaes_schedule_low_round is a second entry point that skips the first
; part; the 256 path of __vpaes_schedule_core calls it directly.
;
; `db 102,15,58,15,modrm,imm` is palignr and `db 102,15,56,0,modrm` is
; pshufb, emitted as raw bytes; see trailing comments.
343align	16
344__vpaes_schedule_round:
345	movdqa	xmm2,[8+esp]
346	pxor	xmm1,xmm1
; Move the top byte of xmm2 into the low byte of xmm1, then rotate xmm2
; left by one byte.
347db	102,15,58,15,202,15	; palignr xmm1,xmm2,15
348db	102,15,58,15,210,15	; palignr xmm2,xmm2,15
349	pxor	xmm7,xmm1
; Broadcast the top dword of xmm0, then rotate the register right by one
; byte.
350	pshufd	xmm0,xmm0,255
351db	102,15,58,15,192,1	; palignr xmm0,xmm0,1
; Save the rotated row for the next call.
352	movdqa	[8+esp],xmm2
353L$_vpaes_schedule_low_round:
; xmm7 ^= xmm7 << 32; xmm7 ^= xmm7 << 64; xmm7 ^= [336+ebp]
354	movdqa	xmm1,xmm7
355	pslldq	xmm7,4
356	pxor	xmm7,xmm1
357	movdqa	xmm1,xmm7
358	pslldq	xmm7,8
359	pxor	xmm7,xmm1
360	pxor	xmm7,[336+ebp]
; Same nibble-split / table-lookup sequence as L$000enc_entry, with the
; constants loaded into xmm4 (mask, [ebp-16]) and xmm5 ([ebp-48]) instead of
; xmm6/xmm7.
361	movdqa	xmm4,[ebp-16]
362	movdqa	xmm5,[ebp-48]
363	movdqa	xmm1,xmm4
364	pandn	xmm1,xmm0
365	psrld	xmm1,4
366	pand	xmm0,xmm4
367	movdqa	xmm2,[ebp-32]
368db	102,15,56,0,208	; pshufb xmm2,xmm0
369	pxor	xmm0,xmm1
370	movdqa	xmm3,xmm5
371db	102,15,56,0,217	; pshufb xmm3,xmm1
372	pxor	xmm3,xmm2
373	movdqa	xmm4,xmm5
374db	102,15,56,0,224	; pshufb xmm4,xmm0
375	pxor	xmm4,xmm2
376	movdqa	xmm2,xmm5
377db	102,15,56,0,211	; pshufb xmm2,xmm3
378	pxor	xmm2,xmm0
379	movdqa	xmm3,xmm5
380db	102,15,56,0,220	; pshufb xmm3,xmm4
381	pxor	xmm3,xmm1
382	movdqa	xmm4,[32+ebp]
383db	102,15,56,0,226	; pshufb xmm4,xmm2
384	movdqa	xmm0,[48+ebp]
385db	102,15,56,0,195	; pshufb xmm0,xmm3
386	pxor	xmm0,xmm4
; Fold the result into the running state and return it in both registers.
387	pxor	xmm0,xmm7
388	movdqa	xmm7,xmm0
389	ret
; __vpaes_schedule_transform: apply a two-table nibble lookup to xmm0.
; In:   xmm0 = input
;       ebx  = address of two consecutive 16-byte tables
;       ebp  = run-time address of L$_vpaes_consts+0x30 (mask at [ebp-16])
; Out:  xmm0 = pshufb([ebx], low nibbles of input)
;              xor pshufb([16+ebx], high nibbles of input)
; Writes: xmm1, xmm2
390align	16
391__vpaes_schedule_transform:
; xmm1 = high nibbles (shifted down), xmm0 = low nibbles.
392	movdqa	xmm2,[ebp-16]
393	movdqa	xmm1,xmm2
394	pandn	xmm1,xmm0
395	psrld	xmm1,4
396	pand	xmm0,xmm2
397	movdqa	xmm2,[ebx]
398db	102,15,56,0,208	; pshufb xmm2,xmm0
399	movdqa	xmm0,[16+ebx]
400db	102,15,56,0,193	; pshufb xmm0,xmm1
401	pxor	xmm0,xmm2
402	ret
; __vpaes_schedule_mangle: convert the round key in xmm0 to its stored form
; and write it to the schedule.
; In:   xmm0 = round key (left unmodified)
;       edi  = 0 -> first path, schedule pointer moves up;
;              nonzero -> L$014schedule_mangle_dec, pointer moves down
;       edx  = address of the previously written round key
;       ecx  = byte offset (0/16/32/48) selecting row [256+ecx+ebp]
;       ebp  = run-time address of L$_vpaes_consts+0x30
; Out:  edx += 16 (edi == 0) or edx -= 16 (edi != 0), 16 bytes stored at
;       the new [edx]; ecx = (ecx - 16) and 48
; Writes: xmm1-xmm5, EFLAGS, and esi on the edi != 0 path
;
; `db 102,15,56,0,modrm` is pshufb emitted as raw bytes.
403align	16
404__vpaes_schedule_mangle:
405	movdqa	xmm4,xmm0
406	movdqa	xmm5,[128+ebp]
407	test	edi,edi
408	jnz	NEAR L$014schedule_mangle_dec
409	add	edx,16
; xmm4 ^= row [336+ebp]; then xmm3 = P(x) xor P(P(x)) xor P(P(P(x))), where
; P is the byte permutation held in xmm5.
410	pxor	xmm4,[336+ebp]
411db	102,15,56,0,229	; pshufb xmm4,xmm5
412	movdqa	xmm3,xmm4
413db	102,15,56,0,229	; pshufb xmm4,xmm5
414	pxor	xmm3,xmm4
415db	102,15,56,0,229	; pshufb xmm4,xmm5
416	pxor	xmm3,xmm4
417	jmp	NEAR L$015schedule_mangle_both
418align	16
419L$014schedule_mangle_dec:
; Split xmm4 into low nibbles (xmm4) and high nibbles (xmm1), then run four
; table-pair lookups from the eight rows at esi = 416+ebp.  After each of
; the first three pairs the accumulator is permuted by xmm5.
420	movdqa	xmm2,[ebp-16]
421	lea	esi,[416+ebp]
422	movdqa	xmm1,xmm2
423	pandn	xmm1,xmm4
424	psrld	xmm1,4
425	pand	xmm4,xmm2
426	movdqa	xmm2,[esi]
427db	102,15,56,0,212	; pshufb xmm2,xmm4
428	movdqa	xmm3,[16+esi]
429db	102,15,56,0,217	; pshufb xmm3,xmm1
430	pxor	xmm3,xmm2
431db	102,15,56,0,221	; pshufb xmm3,xmm5
432	movdqa	xmm2,[32+esi]
433db	102,15,56,0,212	; pshufb xmm2,xmm4
434	pxor	xmm2,xmm3
435	movdqa	xmm3,[48+esi]
436db	102,15,56,0,217	; pshufb xmm3,xmm1
437	pxor	xmm3,xmm2
438db	102,15,56,0,221	; pshufb xmm3,xmm5
439	movdqa	xmm2,[64+esi]
440db	102,15,56,0,212	; pshufb xmm2,xmm4
441	pxor	xmm2,xmm3
442	movdqa	xmm3,[80+esi]
443db	102,15,56,0,217	; pshufb xmm3,xmm1
444	pxor	xmm3,xmm2
445db	102,15,56,0,221	; pshufb xmm3,xmm5
446	movdqa	xmm2,[96+esi]
447db	102,15,56,0,212	; pshufb xmm2,xmm4
448	pxor	xmm2,xmm3
449	movdqa	xmm3,[112+esi]
450db	102,15,56,0,217	; pshufb xmm3,xmm1
451	pxor	xmm3,xmm2
452	add	edx,-16
453L$015schedule_mangle_both:
; Apply the row-selected byte permutation, step ecx to the previous row
; (mod 64), and store the result.
454	movdqa	xmm1,[256+ecx*1+ebp]
455db	102,15,56,0,217	; pshufb xmm3,xmm1
456	add	ecx,-16
457	and	ecx,48
458	movdqu	[edx],xmm3
459	ret
; _vpaes_set_encrypt_key: public entry point, three stack arguments.
; After the four pushes below they sit at [20+esp], [24+esp], [28+esp]:
;   arg 1 -> esi  user key bytes
;   arg 2 -> eax  key size (presumably in bits; __vpaes_schedule_core
;                 compares it with 192 -- confirm against the C prototype)
;   arg 3 -> edx  output key schedule; the dword at [240+edx] receives
;                 (arg2 >> 5) + 5
; Returns eax = 0.  ebp, ebx, esi, edi are saved and restored.
460global	_vpaes_set_encrypt_key
461align	16
462_vpaes_set_encrypt_key:
463L$_vpaes_set_encrypt_key_begin:
464	push	ebp
465	push	ebx
466	push	esi
467	push	edi
468	mov	esi,DWORD [20+esp]
; Carve out a 16-byte-aligned scratch frame: new esp = (esp - 56) and -16.
; The original esp is kept at [48+esp] of the new frame.
469	lea	ebx,[esp-56]
470	mov	eax,DWORD [24+esp]
471	and	ebx,-16
472	mov	edx,DWORD [28+esp]
473	xchg	ebx,esp
474	mov	DWORD [48+esp],ebx
; [240+edx] = (arg2 >> 5) + 5; this is the count the encrypt/decrypt cores
; load into eax.
475	mov	ebx,eax
476	shr	ebx,5
477	add	ebx,5
478	mov	DWORD [240+edx],ebx
; ecx = 48: initial row offset for [256+ecx+ebp]; edi = 0 selects the
; non-"decrypting" paths in the schedule code.
479	mov	ecx,48
480	mov	edi,0
; Position-independent addressing: ebp holds a link-time offset that
; __vpaes_schedule_core turns into an address by adding its return address
; (= L$016pic_point).
481	lea	ebp,[(L$_vpaes_consts+0x30-L$016pic_point)]
482	call	__vpaes_schedule_core
483L$016pic_point:
; Restore the caller's stack pointer and return 0.
484	mov	esp,DWORD [48+esp]
485	xor	eax,eax
486	pop	edi
487	pop	esi
488	pop	ebx
489	pop	ebp
490	ret
; _vpaes_set_decrypt_key: public entry point, three stack arguments.
; After the four pushes below they sit at [20+esp], [24+esp], [28+esp]:
;   arg 1 -> esi  user key bytes
;   arg 2 -> eax  key size (presumably in bits; __vpaes_schedule_core
;                 compares it with 192 -- confirm against the C prototype)
;   arg 3 -> edx  output key schedule; the dword at [240+edx] receives
;                 (arg2 >> 5) + 5
; The schedule is written from its last 16-byte slot downwards (edi = 1).
; Returns eax = 0.  ebp, ebx, esi, edi are saved and restored.
491global	_vpaes_set_decrypt_key
492align	16
493_vpaes_set_decrypt_key:
494L$_vpaes_set_decrypt_key_begin:
495	push	ebp
496	push	ebx
497	push	esi
498	push	edi
499	mov	esi,DWORD [20+esp]
; Carve out a 16-byte-aligned scratch frame: new esp = (esp - 56) and -16.
; The original esp is kept at [48+esp] of the new frame.
500	lea	ebx,[esp-56]
501	mov	eax,DWORD [24+esp]
502	and	ebx,-16
503	mov	edx,DWORD [28+esp]
504	xchg	ebx,esp
505	mov	DWORD [48+esp],ebx
; n = (arg2 >> 5) + 5, stored at [240+edx].
506	mov	ebx,eax
507	shr	ebx,5
508	add	ebx,5
509	mov	DWORD [240+edx],ebx
; Point edx at schedule + 16 + 16*n, where the first key is stored; later
; keys are written at decreasing addresses.
510	shl	ebx,4
511	lea	edx,[16+ebx*1+edx]
; edi = 1 selects the "decrypting" paths in the schedule code.
512	mov	edi,1
; ecx = ((arg2 >> 1) and 32) xor 32: initial row offset for [256+ecx+ebp].
513	mov	ecx,eax
514	shr	ecx,1
515	and	ecx,32
516	xor	ecx,32
; Position-independent addressing: ebp holds a link-time offset that
; __vpaes_schedule_core turns into an address by adding its return address
; (= L$017pic_point).
517	lea	ebp,[(L$_vpaes_consts+0x30-L$017pic_point)]
518	call	__vpaes_schedule_core
519L$017pic_point:
; Restore the caller's stack pointer and return 0.
520	mov	esp,DWORD [48+esp]
521	xor	eax,eax
522	pop	edi
523	pop	esi
524	pop	ebx
525	pop	ebp
526	ret
; _vpaes_encrypt: public entry point, three stack arguments.
; After the four pushes below they sit at [20+esp], [24+esp], [28+esp]:
;   arg 1 -> esi  input block  (16 bytes read, unaligned load)
;   arg 2 -> edi  output block (16 bytes written, unaligned store)
;   arg 3 -> edx  key schedule
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and xmm0-xmm7
; are overwritten.
527global	_vpaes_encrypt
528align	16
529_vpaes_encrypt:
530L$_vpaes_encrypt_begin:
531	push	ebp
532	push	ebx
533	push	esi
534	push	edi
; Position-independent addressing: __vpaes_preheat adds its return address
; (= L$018pic_point) to ebp and loads xmm6/xmm7.
535	lea	ebp,[(L$_vpaes_consts+0x30-L$018pic_point)]
536	call	__vpaes_preheat
537L$018pic_point:
538	mov	esi,DWORD [20+esp]
; Carve out a 16-byte-aligned frame: new esp = (esp - 56) and -16, with the
; original esp kept at [48+esp].
539	lea	ebx,[esp-56]
540	mov	edi,DWORD [24+esp]
541	and	ebx,-16
542	mov	edx,DWORD [28+esp]
543	xchg	ebx,esp
544	mov	DWORD [48+esp],ebx
545	movdqu	xmm0,[esi]
546	call	__vpaes_encrypt_core
547	movdqu	[edi],xmm0
; Restore the caller's stack pointer.
548	mov	esp,DWORD [48+esp]
549	pop	edi
550	pop	esi
551	pop	ebx
552	pop	ebp
553	ret
; _vpaes_decrypt: public entry point, three stack arguments.
; After the four pushes below they sit at [20+esp], [24+esp], [28+esp]:
;   arg 1 -> esi  input block  (16 bytes read, unaligned load)
;   arg 2 -> edi  output block (16 bytes written, unaligned store)
;   arg 3 -> edx  key schedule
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and xmm0-xmm7
; are overwritten.
554global	_vpaes_decrypt
555align	16
556_vpaes_decrypt:
557L$_vpaes_decrypt_begin:
558	push	ebp
559	push	ebx
560	push	esi
561	push	edi
; Position-independent addressing: __vpaes_preheat adds its return address
; (= L$019pic_point) to ebp and loads xmm6/xmm7.
562	lea	ebp,[(L$_vpaes_consts+0x30-L$019pic_point)]
563	call	__vpaes_preheat
564L$019pic_point:
565	mov	esi,DWORD [20+esp]
; Carve out a 16-byte-aligned frame: new esp = (esp - 56) and -16, with the
; original esp kept at [48+esp].
566	lea	ebx,[esp-56]
567	mov	edi,DWORD [24+esp]
568	and	ebx,-16
569	mov	edx,DWORD [28+esp]
570	xchg	ebx,esp
571	mov	DWORD [48+esp],ebx
572	movdqu	xmm0,[esi]
573	call	__vpaes_decrypt_core
574	movdqu	[edi],xmm0
; Restore the caller's stack pointer.
575	mov	esp,DWORD [48+esp]
576	pop	edi
577	pop	esi
578	pop	ebx
579	pop	ebp
580	ret
; _vpaes_cbc_encrypt: public entry point, six stack arguments.
; After the four pushes below they sit at [20+esp] .. [40+esp]:
;   arg 1 -> esi  input buffer
;   arg 2 -> edi  output buffer
;   arg 3 -> eax  length in bytes
;   arg 4 -> edx  key schedule
;   arg 5 -> ebp  16-byte chaining value: read on entry, updated on exit
;   arg 6 -> ecx  direction: 0 -> L$022cbc_dec_loop,
;                 nonzero -> L$023cbc_enc_loop
; Processes whole 16-byte blocks only; a trailing partial block is never
; read or written.  If the length is below 16 the function returns without
; touching any buffer or the chaining value.
; ebp, ebx, esi, edi are saved and restored.
;
; Scratch frame (16-byte aligned, original esp at [48+esp]):
;   [esp]    = out - in      [4+esp]  = key schedule
;   [8+esp]  = arg 5 pointer [16+esp] = previous chaining value (dec path)
;   [32+esp] = current input block (dec path)
581global	_vpaes_cbc_encrypt
582align	16
583_vpaes_cbc_encrypt:
584L$_vpaes_cbc_encrypt_begin:
585	push	ebp
586	push	ebx
587	push	esi
588	push	edi
589	mov	esi,DWORD [20+esp]
590	mov	edi,DWORD [24+esp]
591	mov	eax,DWORD [28+esp]
592	mov	edx,DWORD [32+esp]
; eax = length - 16; a borrow means fewer than 16 bytes, so bail out.
593	sub	eax,16
594	jc	NEAR L$020cbc_abort
595	lea	ebx,[esp-56]
596	mov	ebp,DWORD [36+esp]
597	and	ebx,-16
598	mov	ecx,DWORD [40+esp]
599	xchg	ebx,esp
; xmm1 carries the chaining value between blocks.
600	movdqu	xmm1,[ebp]
; Keep only the distance out - in, so one pointer (esi) walks both buffers.
601	sub	edi,esi
602	mov	DWORD [48+esp],ebx
603	mov	DWORD [esp],edi
604	mov	DWORD [4+esp],edx
605	mov	DWORD [8+esp],ebp
; edi = bytes remaining after the current block.
606	mov	edi,eax
; Position-independent addressing: __vpaes_preheat adds its return address
; (= L$021pic_point) to ebp and loads xmm6/xmm7.
607	lea	ebp,[(L$_vpaes_consts+0x30-L$021pic_point)]
608	call	__vpaes_preheat
609L$021pic_point:
; The direction is tested once here; ecx is overwritten by the cores later.
610	cmp	ecx,0
611	je	NEAR L$022cbc_dec_loop
612	jmp	NEAR L$023cbc_enc_loop
613align	16
614L$023cbc_enc_loop:
; out = core(in xor chain); chain = out
615	movdqu	xmm0,[esi]
616	pxor	xmm0,xmm1
617	call	__vpaes_encrypt_core
; The core overwrites ebx and edx; reload out - in and the key schedule.
618	mov	ebx,DWORD [esp]
619	mov	edx,DWORD [4+esp]
620	movdqa	xmm1,xmm0
621	movdqu	[esi*1+ebx],xmm0
622	lea	esi,[16+esi]
; Continue while at least 16 more bytes remain (no borrow).
623	sub	edi,16
624	jnc	NEAR L$023cbc_enc_loop
625	jmp	NEAR L$024cbc_done
626align	16
627L$022cbc_dec_loop:
; out = core(in) xor chain; chain = in.  Both values are parked on the
; stack because the core overwrites xmm1.
628	movdqu	xmm0,[esi]
629	movdqa	[16+esp],xmm1
630	movdqa	[32+esp],xmm0
631	call	__vpaes_decrypt_core
; The core overwrites ebx and edx; reload out - in and the key schedule.
632	mov	ebx,DWORD [esp]
633	mov	edx,DWORD [4+esp]
634	pxor	xmm0,[16+esp]
635	movdqa	xmm1,[32+esp]
636	movdqu	[esi*1+ebx],xmm0
637	lea	esi,[16+esi]
; Continue while at least 16 more bytes remain (no borrow).
638	sub	edi,16
639	jnc	NEAR L$022cbc_dec_loop
640L$024cbc_done:
; Restore the caller's stack pointer and write the final chaining value back
; through the arg 5 pointer.
641	mov	ebx,DWORD [8+esp]
642	mov	esp,DWORD [48+esp]
643	movdqu	[ebx],xmm1
644L$020cbc_abort:
645	pop	edi
646	pop	esi
647	pop	ebx
648	pop	ebp
649	ret
650