; Pick the code section declaration that matches the assembler's output
; format: "obj", "win32", or anything else.
1%ifidn __OUTPUT_FORMAT__,obj
2section	code	use32 class=code align=64
3%elifidn __OUTPUT_FORMAT__,win32
4%ifdef __YASM_VERSION_ID__
5%if __YASM_VERSION_ID__ < 01010000h
6%error yasm version 1.1.0 or later needed.
7%endif
8; Yasm automatically includes @feat.00 and complains about redefining it.
9; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
10%else
; Not Yasm: the @feat.00 symbol has to be defined explicitly here.
11$@feat.00 equ 1
12%endif
13section	.text	code align=64
14%else
15section	.text	code
16%endif
;-----------------------------------------------------------------------
; _gcm_gmult_4bit_x86 -- integer-register, nibble-at-a-time table multiply
; of one 16-byte block, done in place.
;
; Stack arguments (offsets are after the four pushes and the 84-byte frame):
;   [104+esp]  pointer to the 16-byte block; read on entry, overwritten
;              with the result on exit
;   [108+esp]  pointer to a table of 16-byte entries, addressed as
;              base + nibble*16
; NOTE(review): presumably these are Xi and Htable in OpenSSL naming --
; confirm against the C prototype.
;
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx are overwritten.
;
; Frame layout:
;   [esp    .. 15+esp]  copy of the input block
;   [16+esp .. 79+esp]  16 dword constants (the same values as the
;                       non-zero dwords of L$rem_4bit in this file)
;-----------------------------------------------------------------------
17global	_gcm_gmult_4bit_x86
18align	16
19_gcm_gmult_4bit_x86:
20L$_gcm_gmult_4bit_x86_begin:
21	push	ebp
22	push	ebx
23	push	esi
24	push	edi
25	sub	esp,84
26	mov	edi,DWORD [104+esp]
27	mov	esi,DWORD [108+esp]
; Load the 16-byte block into ebp, edx, ecx, ebx (offsets 0, 4, 8, 12).
28	mov	ebp,DWORD [edi]
29	mov	edx,DWORD [4+edi]
30	mov	ecx,DWORD [8+edi]
31	mov	ebx,DWORD [12+edi]
; Build the 16-entry dword constant table on the stack; it is indexed below
; by the 4 bits shifted out of the low end of the accumulator.
32	mov	DWORD [16+esp],0
33	mov	DWORD [20+esp],471859200
34	mov	DWORD [24+esp],943718400
35	mov	DWORD [28+esp],610271232
36	mov	DWORD [32+esp],1887436800
37	mov	DWORD [36+esp],1822425088
38	mov	DWORD [40+esp],1220542464
39	mov	DWORD [44+esp],1423966208
40	mov	DWORD [48+esp],3774873600
41	mov	DWORD [52+esp],4246732800
42	mov	DWORD [56+esp],3644850176
43	mov	DWORD [60+esp],3311403008
44	mov	DWORD [64+esp],2441084928
45	mov	DWORD [68+esp],2376073216
46	mov	DWORD [72+esp],2847932416
47	mov	DWORD [76+esp],3051356160
; Keep a byte-addressable copy of the input block at [esp..15+esp].
48	mov	DWORD [esp],ebp
49	mov	DWORD [4+esp],edx
50	mov	DWORD [8+esp],ecx
51	mov	DWORD [12+esp],ebx
; ebx = (low nibble of input byte 15) * 16, i.e. a byte offset into the table.
52	shr	ebx,20
53	and	ebx,240
; Seed the 128-bit accumulator (ebp = dword 1, edx = dword 0, ecx = dword 3,
; ebx = dword 2 of the selected table entry).
54	mov	ebp,DWORD [4+ebx*1+esi]
55	mov	edx,DWORD [ebx*1+esi]
56	mov	ecx,DWORD [12+ebx*1+esi]
57	mov	ebx,DWORD [8+ebx*1+esi]
; eax is cleared once; only al is written afterwards, so eax always holds a
; zero-extended byte and can be used directly as an index.
58	xor	eax,eax
; edi = index of the input byte currently being consumed (15 down to 0).
59	mov	edi,15
60	jmp	NEAR L$000x86_loop
61align	16
62L$000x86_loop:
; First half: shift the accumulator right by 4 bits, fold in the stack
; constant selected by the 4 bits shifted out, then xor in the table entry
; selected by the HIGH nibble of input byte [edi].
63	mov	al,bl
64	shrd	ebx,ecx,4
65	and	al,15
66	shrd	ecx,edx,4
67	shrd	edx,ebp,4
68	shr	ebp,4
69	xor	ebp,DWORD [16+eax*4+esp]
70	mov	al,BYTE [edi*1+esp]
71	and	al,240
72	xor	ebx,DWORD [8+eax*1+esi]
73	xor	ecx,DWORD [12+eax*1+esi]
74	xor	edx,DWORD [eax*1+esi]
75	xor	ebp,DWORD [4+eax*1+esi]
; Leave once the high nibble of byte 0 has been consumed (edi goes negative).
76	dec	edi
77	js	NEAR L$001x86_break
; Second half: same shift-and-fold step, then xor in the table entry selected
; by the LOW nibble of the next lower input byte (edi was just decremented).
78	mov	al,bl
79	shrd	ebx,ecx,4
80	and	al,15
81	shrd	ecx,edx,4
82	shrd	edx,ebp,4
83	shr	ebp,4
84	xor	ebp,DWORD [16+eax*4+esp]
85	mov	al,BYTE [edi*1+esp]
86	shl	al,4
87	xor	ebx,DWORD [8+eax*1+esi]
88	xor	ecx,DWORD [12+eax*1+esi]
89	xor	edx,DWORD [eax*1+esi]
90	xor	ebp,DWORD [4+eax*1+esi]
91	jmp	NEAR L$000x86_loop
92align	16
93L$001x86_break:
; Byte-swap each dword of the accumulator and write it back over the input
; block (edi is reloaded because it was used as the loop counter).
94	bswap	ebx
95	bswap	ecx
96	bswap	edx
97	bswap	ebp
98	mov	edi,DWORD [104+esp]
99	mov	DWORD [12+edi],ebx
100	mov	DWORD [8+edi],ecx
101	mov	DWORD [4+edi],edx
102	mov	DWORD [edi],ebp
103	add	esp,84
104	pop	edi
105	pop	esi
106	pop	ebx
107	pop	ebp
108	ret
;-----------------------------------------------------------------------
; _gcm_ghash_4bit_x86 -- integer-register, nibble-at-a-time hash update over
; a buffer: for every 16 input bytes, xor them into the state and run the
; same table multiply as _gcm_gmult_4bit_x86.
;
; Stack arguments (offsets are after the four pushes and the 84-byte frame):
;   [104+esp]  pointer to the 16-byte state; read on entry, written on exit
;   [108+esp]  pointer to a table of 16-byte entries (base + nibble*16)
;   [112+esp]  pointer to the input data; this slot is reused to hold the
;              current input position
;   [116+esp]  input length in bytes; this slot is overwritten with the
;              end-of-input pointer (input + length)
; NOTE(review): presumably Xi, Htable, inp, len in OpenSSL naming -- confirm.
;
; The loop body runs once before the end pointer is tested, so a length of 0
; still consumes 16 input bytes.
; NOTE(review): callers presumably pass a non-zero multiple of 16 -- confirm.
;
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx are overwritten.
; Frame layout is the same as in _gcm_gmult_4bit_x86: [esp..15+esp] working
; block, [16+esp..79+esp] dword constant table.
;-----------------------------------------------------------------------
109global	_gcm_ghash_4bit_x86
110align	16
111_gcm_ghash_4bit_x86:
112L$_gcm_ghash_4bit_x86_begin:
113	push	ebp
114	push	ebx
115	push	esi
116	push	edi
117	sub	esp,84
118	mov	ebx,DWORD [104+esp]
119	mov	esi,DWORD [108+esp]
120	mov	edi,DWORD [112+esp]
121	mov	ecx,DWORD [116+esp]
; Replace the length argument slot with the end-of-input pointer.
122	add	ecx,edi
123	mov	DWORD [116+esp],ecx
; Load the current state into ebp, edx, ecx, ebx (offsets 0, 4, 8, 12).
124	mov	ebp,DWORD [ebx]
125	mov	edx,DWORD [4+ebx]
126	mov	ecx,DWORD [8+ebx]
127	mov	ebx,DWORD [12+ebx]
; Build the 16-entry dword constant table on the stack (same values as the
; non-zero dwords of L$rem_4bit).
128	mov	DWORD [16+esp],0
129	mov	DWORD [20+esp],471859200
130	mov	DWORD [24+esp],943718400
131	mov	DWORD [28+esp],610271232
132	mov	DWORD [32+esp],1887436800
133	mov	DWORD [36+esp],1822425088
134	mov	DWORD [40+esp],1220542464
135	mov	DWORD [44+esp],1423966208
136	mov	DWORD [48+esp],3774873600
137	mov	DWORD [52+esp],4246732800
138	mov	DWORD [56+esp],3644850176
139	mov	DWORD [60+esp],3311403008
140	mov	DWORD [64+esp],2441084928
141	mov	DWORD [68+esp],2376073216
142	mov	DWORD [72+esp],2847932416
143	mov	DWORD [76+esp],3051356160
144align	16
145L$002x86_outer_loop:
; Per-block step: state ^= next 16 input bytes (edi = current input pointer),
; then spill the result to [esp..15+esp] so it can be read byte by byte.
146	xor	ebx,DWORD [12+edi]
147	xor	ecx,DWORD [8+edi]
148	xor	edx,DWORD [4+edi]
149	xor	ebp,DWORD [edi]
150	mov	DWORD [12+esp],ebx
151	mov	DWORD [8+esp],ecx
152	mov	DWORD [4+esp],edx
153	mov	DWORD [esp],ebp
; ebx = (low nibble of byte 15) * 16; seed the accumulator from that entry.
154	shr	ebx,20
155	and	ebx,240
156	mov	ebp,DWORD [4+ebx*1+esi]
157	mov	edx,DWORD [ebx*1+esi]
158	mov	ecx,DWORD [12+ebx*1+esi]
159	mov	ebx,DWORD [8+ebx*1+esi]
; eax stays a zero-extended byte; edi becomes the byte index (15 down to 0).
160	xor	eax,eax
161	mov	edi,15
162	jmp	NEAR L$003x86_loop
163align	16
164L$003x86_loop:
; First half: shift accumulator right 4 bits, fold in the stack constant for
; the bits shifted out, xor in the entry for the HIGH nibble of byte [edi].
165	mov	al,bl
166	shrd	ebx,ecx,4
167	and	al,15
168	shrd	ecx,edx,4
169	shrd	edx,ebp,4
170	shr	ebp,4
171	xor	ebp,DWORD [16+eax*4+esp]
172	mov	al,BYTE [edi*1+esp]
173	and	al,240
174	xor	ebx,DWORD [8+eax*1+esi]
175	xor	ecx,DWORD [12+eax*1+esi]
176	xor	edx,DWORD [eax*1+esi]
177	xor	ebp,DWORD [4+eax*1+esi]
178	dec	edi
179	js	NEAR L$004x86_break
; Second half: same step using the LOW nibble of the next lower byte.
180	mov	al,bl
181	shrd	ebx,ecx,4
182	and	al,15
183	shrd	ecx,edx,4
184	shrd	edx,ebp,4
185	shr	ebp,4
186	xor	ebp,DWORD [16+eax*4+esp]
187	mov	al,BYTE [edi*1+esp]
188	shl	al,4
189	xor	ebx,DWORD [8+eax*1+esi]
190	xor	ecx,DWORD [12+eax*1+esi]
191	xor	edx,DWORD [eax*1+esi]
192	xor	ebp,DWORD [4+eax*1+esi]
193	jmp	NEAR L$003x86_loop
194align	16
195L$004x86_break:
; Byte-swap the accumulator; it is now the new state in the same register
; layout the outer loop expects.
196	bswap	ebx
197	bswap	ecx
198	bswap	edx
199	bswap	ebp
; Advance the input pointer by one block and loop while it is below the end
; pointer (unsigned compare). lea is used so the add does not disturb flags
; ordering; mov between cmp and jb leaves the flags intact.
200	mov	edi,DWORD [112+esp]
201	lea	edi,[16+edi]
202	cmp	edi,DWORD [116+esp]
203	mov	DWORD [112+esp],edi
204	jb	NEAR L$002x86_outer_loop
; Write the final state back through the first argument.
205	mov	edi,DWORD [104+esp]
206	mov	DWORD [12+edi],ebx
207	mov	DWORD [8+edi],ecx
208	mov	DWORD [4+edi],edx
209	mov	DWORD [edi],ebp
210	add	esp,84
211	pop	edi
212	pop	esi
213	pop	ebx
214	pop	ebp
215	ret
;-----------------------------------------------------------------------
; _gcm_gmult_4bit_mmx -- MMX variant of the nibble-at-a-time table multiply
; of one 16-byte block, done in place.
;
; Stack arguments (offsets are after the four pushes; no local frame):
;   [20+esp]  pointer to the 16-byte block; read on entry, overwritten with
;             the result on exit
;   [24+esp]  pointer to a table of 16-byte entries (base + nibble*16)
; NOTE(review): presumably Xi and Htable in OpenSSL naming -- confirm.
;
; Register roles inside the loop:
;   eax      address of L$rem_4bit (8-byte entries, indexed by ebx*8)
;   mm0/mm1  128-bit accumulator (mm0 = qword at entry+8, mm1 = qword at entry+0)
;   ebx      low 32 bits of mm0; its low 4 bits select the L$rem_4bit entry
;   ecx/edx  table offsets for the low / high nibble of the current byte
;   ebp      index of the next input byte (14 down to 0)
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and mm0-mm2 are
; overwritten. emms is executed before returning.
;-----------------------------------------------------------------------
216global	_gcm_gmult_4bit_mmx
217align	16
218_gcm_gmult_4bit_mmx:
219L$_gcm_gmult_4bit_mmx_begin:
220	push	ebp
221	push	ebx
222	push	esi
223	push	edi
224	mov	edi,DWORD [20+esp]
225	mov	esi,DWORD [24+esp]
; call/pop obtains the current instruction address so L$rem_4bit can be
; addressed relative to it (no absolute relocation needed).
226	call	L$005pic_point
227L$005pic_point:
228	pop	eax
229	lea	eax,[(L$rem_4bit-L$005pic_point)+eax]
; Start with input byte 15: ecx = low nibble * 16, edx = high nibble * 16.
230	movzx	ebx,BYTE [15+edi]
231	xor	ecx,ecx
232	mov	edx,ebx
233	mov	cl,dl
234	mov	ebp,14
235	shl	cl,4
236	and	edx,240
; Seed the accumulator from the table entry for the low nibble.
237	movq	mm0,[8+ecx*1+esi]
238	movq	mm1,[ecx*1+esi]
239	movd	ebx,mm0
240	jmp	NEAR L$006mmx_loop
241align	16
242L$006mmx_loop:
; High-nibble step for the current byte: shift the accumulator right 4 bits
; (mm2 carries the 4 bits crossing from mm1 into mm0), fold in the
; L$rem_4bit entry for the bits shifted out, xor in the table entry at edx.
; The next input byte is fetched into cl along the way.
243	psrlq	mm0,4
244	and	ebx,15
245	movq	mm2,mm1
246	psrlq	mm1,4
247	pxor	mm0,[8+edx*1+esi]
248	mov	cl,BYTE [ebp*1+edi]
249	psllq	mm2,60
250	pxor	mm1,[ebx*8+eax]
251	dec	ebp
252	movd	ebx,mm0
253	pxor	mm1,[edx*1+esi]
254	mov	edx,ecx
255	pxor	mm0,mm2
; The sign flag tested here comes from "dec ebp" above; the MMX instructions
; and mov in between do not modify flags.
256	js	NEAR L$007mmx_break
; Low-nibble step for the byte just fetched (ecx = low nibble * 16,
; edx = high nibble * 16 for the next loop pass).
257	shl	cl,4
258	and	ebx,15
259	psrlq	mm0,4
260	and	edx,240
261	movq	mm2,mm1
262	psrlq	mm1,4
263	pxor	mm0,[8+ecx*1+esi]
264	psllq	mm2,60
265	pxor	mm1,[ebx*8+eax]
266	movd	ebx,mm0
267	pxor	mm1,[ecx*1+esi]
268	pxor	mm0,mm2
269	jmp	NEAR L$006mmx_loop
270align	16
271L$007mmx_break:
; Tail: the last fetched byte (byte 0) still needs its low-nibble and
; high-nibble steps; they are done here without fetching further input.
272	shl	cl,4
273	and	ebx,15
274	psrlq	mm0,4
275	and	edx,240
276	movq	mm2,mm1
277	psrlq	mm1,4
278	pxor	mm0,[8+ecx*1+esi]
279	psllq	mm2,60
280	pxor	mm1,[ebx*8+eax]
281	movd	ebx,mm0
282	pxor	mm1,[ecx*1+esi]
283	pxor	mm0,mm2
284	psrlq	mm0,4
285	and	ebx,15
286	movq	mm2,mm1
287	psrlq	mm1,4
288	pxor	mm0,[8+edx*1+esi]
289	psllq	mm2,60
290	pxor	mm1,[ebx*8+eax]
291	movd	ebx,mm0
292	pxor	mm1,[edx*1+esi]
293	pxor	mm0,mm2
; Split the accumulator into four dwords (ebx = mm0 low, ecx = mm0 high,
; edx = mm1 low, ebp = mm1 high), byte-swap each, and store the result.
294	psrlq	mm0,32
295	movd	edx,mm1
296	psrlq	mm1,32
297	movd	ecx,mm0
298	movd	ebp,mm1
299	bswap	ebx
300	bswap	edx
301	bswap	ecx
302	bswap	ebp
; Leave MMX state before returning to the caller.
303	emms
304	mov	DWORD [12+edi],ebx
305	mov	DWORD [4+edi],edx
306	mov	DWORD [8+edi],ecx
307	mov	DWORD [edi],ebp
308	pop	edi
309	pop	esi
310	pop	ebx
311	pop	ebp
312	ret
;-----------------------------------------------------------------------
; _gcm_ghash_4bit_mmx -- MMX hash update over a buffer. Each 16-byte input
; block is xored into the state and multiplied using stack copies of the
; caller's table, consuming one state byte (two nibbles) per step.
;
; Stack arguments (offsets are after the four pushes):
;   [20+esp]  pointer to the 16-byte state; read on entry, written on exit
;   [24+esp]  pointer to a table of 16 entries of 16 bytes each
;   [28+esp]  pointer to the input data
;   [32+esp]  input length in bytes
; NOTE(review): presumably Xi, Htable, inp, len in OpenSSL naming -- confirm.
;
; The outer loop runs once before testing for the end and exits only when
; the input pointer EQUALS the end pointer (jne).
; NOTE(review): callers presumably pass a non-zero multiple of 16 -- confirm.
;
; Frame (esp is re-aligned; the caller's esp is kept at [556+esp]):
;   [esp     .. 15+esp ]  per entry i: (low byte of qword at entry+8) << 4
;   [16+esp  .. 143+esp]  per entry i: qword at entry+8
;   [144+esp .. 271+esp]  per entry i: qword at entry+0
;   [272+esp .. 399+esp]  per entry i: 128-bit entry >> 4, low qword
;   [400+esp .. 527+esp]  per entry i: 128-bit entry >> 4, high part
;                         (qword at entry+0 shifted right by 4)
;   [528+esp .. 535+esp]  state bytes 0..7 xor input (current block)
;   [536+esp]             state bytes 8..11 xor input (current block)
;   [544+esp]             saved state pointer
;   [548+esp]             pointer to the next input block
;   [552+esp]             end-of-input pointer (input + length)
;   [556+esp]             caller's esp
;
; esi holds the address of L$rem_8bit (16-bit entries, indexed by byte*2).
; ebp, ebx, esi, edi are saved and restored; eax, ecx, edx and mm0-mm7 are
; overwritten. emms is executed before returning.
;-----------------------------------------------------------------------
313global	_gcm_ghash_4bit_mmx
314align	16
315_gcm_ghash_4bit_mmx:
316L$_gcm_ghash_4bit_mmx_begin:
317	push	ebp
318	push	ebx
319	push	esi
320	push	edi
321	mov	eax,DWORD [20+esp]
322	mov	ebx,DWORD [24+esp]
323	mov	ecx,DWORD [28+esp]
324	mov	edx,DWORD [32+esp]
; Remember the caller's esp; it is restored from [556+esp] at the end.
325	mov	ebp,esp
; call/pop obtains the current instruction address so L$rem_8bit can be
; addressed relative to it.
326	call	L$008pic_point
327L$008pic_point:
328	pop	esi
329	lea	esi,[(L$rem_8bit-L$008pic_point)+esi]
; Carve out the frame: 544 bytes, rounded down to a 64-byte boundary, plus
; 16 more bytes.
330	sub	esp,544
331	and	esp,-64
332	sub	esp,16
; edx = end-of-input pointer.
333	add	edx,ecx
334	mov	DWORD [544+esp],eax
335	mov	DWORD [552+esp],edx
336	mov	DWORD [556+esp],ebp
; Bias the table pointer by 128 so all 256 table bytes are reachable with
; displacements in the range -128..+120.
337	add	ebx,128
; edi / ebp are biased bases for the unshifted and 4-bit-shifted table
; copies (edi-128 = 16+esp, edi = 144+esp, ebp-128 = 272+esp, ebp = 400+esp).
338	lea	edi,[144+esp]
339	lea	ebp,[400+esp]
; Table preparation, software-pipelined over the 16 entries. For each entry:
;   - store (low byte of qword at entry+8) << 4 into the byte array at [i+esp]
;   - copy both qwords to the unshifted tables at edi-128 / edi
;   - store the entry shifted right by 4 bits to ebp-128 / ebp, with the low
;     4 bits of the entry+0 qword carried into the top of the other qword
;     (psllq 60 + por)
340	mov	edx,DWORD [ebx-120]
341	movq	mm0,[ebx-120]
342	movq	mm3,[ebx-128]
343	shl	edx,4
344	mov	BYTE [esp],dl
345	mov	edx,DWORD [ebx-104]
346	movq	mm2,[ebx-104]
347	movq	mm5,[ebx-112]
348	movq	[edi-128],mm0
349	psrlq	mm0,4
350	movq	[edi],mm3
351	movq	mm7,mm3
352	psrlq	mm3,4
353	shl	edx,4
354	mov	BYTE [1+esp],dl
355	mov	edx,DWORD [ebx-88]
356	movq	mm1,[ebx-88]
357	psllq	mm7,60
358	movq	mm4,[ebx-96]
359	por	mm0,mm7
360	movq	[edi-120],mm2
361	psrlq	mm2,4
362	movq	[8+edi],mm5
363	movq	mm6,mm5
364	movq	[ebp-128],mm0
365	psrlq	mm5,4
366	movq	[ebp],mm3
367	shl	edx,4
368	mov	BYTE [2+esp],dl
369	mov	edx,DWORD [ebx-72]
370	movq	mm0,[ebx-72]
371	psllq	mm6,60
372	movq	mm3,[ebx-80]
373	por	mm2,mm6
374	movq	[edi-112],mm1
375	psrlq	mm1,4
376	movq	[16+edi],mm4
377	movq	mm7,mm4
378	movq	[ebp-120],mm2
379	psrlq	mm4,4
380	movq	[8+ebp],mm5
381	shl	edx,4
382	mov	BYTE [3+esp],dl
383	mov	edx,DWORD [ebx-56]
384	movq	mm2,[ebx-56]
385	psllq	mm7,60
386	movq	mm5,[ebx-64]
387	por	mm1,mm7
388	movq	[edi-104],mm0
389	psrlq	mm0,4
390	movq	[24+edi],mm3
391	movq	mm6,mm3
392	movq	[ebp-112],mm1
393	psrlq	mm3,4
394	movq	[16+ebp],mm4
395	shl	edx,4
396	mov	BYTE [4+esp],dl
397	mov	edx,DWORD [ebx-40]
398	movq	mm1,[ebx-40]
399	psllq	mm6,60
400	movq	mm4,[ebx-48]
401	por	mm0,mm6
402	movq	[edi-96],mm2
403	psrlq	mm2,4
404	movq	[32+edi],mm5
405	movq	mm7,mm5
406	movq	[ebp-104],mm0
407	psrlq	mm5,4
408	movq	[24+ebp],mm3
409	shl	edx,4
410	mov	BYTE [5+esp],dl
411	mov	edx,DWORD [ebx-24]
412	movq	mm0,[ebx-24]
413	psllq	mm7,60
414	movq	mm3,[ebx-32]
415	por	mm2,mm7
416	movq	[edi-88],mm1
417	psrlq	mm1,4
418	movq	[40+edi],mm4
419	movq	mm6,mm4
420	movq	[ebp-96],mm2
421	psrlq	mm4,4
422	movq	[32+ebp],mm5
423	shl	edx,4
424	mov	BYTE [6+esp],dl
425	mov	edx,DWORD [ebx-8]
426	movq	mm2,[ebx-8]
427	psllq	mm6,60
428	movq	mm5,[ebx-16]
429	por	mm1,mm6
430	movq	[edi-80],mm0
431	psrlq	mm0,4
432	movq	[48+edi],mm3
433	movq	mm7,mm3
434	movq	[ebp-88],mm1
435	psrlq	mm3,4
436	movq	[40+ebp],mm4
437	shl	edx,4
438	mov	BYTE [7+esp],dl
439	mov	edx,DWORD [8+ebx]
440	movq	mm1,[8+ebx]
441	psllq	mm7,60
442	movq	mm4,[ebx]
443	por	mm0,mm7
444	movq	[edi-72],mm2
445	psrlq	mm2,4
446	movq	[56+edi],mm5
447	movq	mm6,mm5
448	movq	[ebp-80],mm0
449	psrlq	mm5,4
450	movq	[48+ebp],mm3
451	shl	edx,4
452	mov	BYTE [8+esp],dl
453	mov	edx,DWORD [24+ebx]
454	movq	mm0,[24+ebx]
455	psllq	mm6,60
456	movq	mm3,[16+ebx]
457	por	mm2,mm6
458	movq	[edi-64],mm1
459	psrlq	mm1,4
460	movq	[64+edi],mm4
461	movq	mm7,mm4
462	movq	[ebp-72],mm2
463	psrlq	mm4,4
464	movq	[56+ebp],mm5
465	shl	edx,4
466	mov	BYTE [9+esp],dl
467	mov	edx,DWORD [40+ebx]
468	movq	mm2,[40+ebx]
469	psllq	mm7,60
470	movq	mm5,[32+ebx]
471	por	mm1,mm7
472	movq	[edi-56],mm0
473	psrlq	mm0,4
474	movq	[72+edi],mm3
475	movq	mm6,mm3
476	movq	[ebp-64],mm1
477	psrlq	mm3,4
478	movq	[64+ebp],mm4
479	shl	edx,4
480	mov	BYTE [10+esp],dl
481	mov	edx,DWORD [56+ebx]
482	movq	mm1,[56+ebx]
483	psllq	mm6,60
484	movq	mm4,[48+ebx]
485	por	mm0,mm6
486	movq	[edi-48],mm2
487	psrlq	mm2,4
488	movq	[80+edi],mm5
489	movq	mm7,mm5
490	movq	[ebp-56],mm0
491	psrlq	mm5,4
492	movq	[72+ebp],mm3
493	shl	edx,4
494	mov	BYTE [11+esp],dl
495	mov	edx,DWORD [72+ebx]
496	movq	mm0,[72+ebx]
497	psllq	mm7,60
498	movq	mm3,[64+ebx]
499	por	mm2,mm7
500	movq	[edi-40],mm1
501	psrlq	mm1,4
502	movq	[88+edi],mm4
503	movq	mm6,mm4
504	movq	[ebp-48],mm2
505	psrlq	mm4,4
506	movq	[80+ebp],mm5
507	shl	edx,4
508	mov	BYTE [12+esp],dl
509	mov	edx,DWORD [88+ebx]
510	movq	mm2,[88+ebx]
511	psllq	mm6,60
512	movq	mm5,[80+ebx]
513	por	mm1,mm6
514	movq	[edi-32],mm0
515	psrlq	mm0,4
516	movq	[96+edi],mm3
517	movq	mm7,mm3
518	movq	[ebp-40],mm1
519	psrlq	mm3,4
520	movq	[88+ebp],mm4
521	shl	edx,4
522	mov	BYTE [13+esp],dl
523	mov	edx,DWORD [104+ebx]
524	movq	mm1,[104+ebx]
525	psllq	mm7,60
526	movq	mm4,[96+ebx]
527	por	mm0,mm7
528	movq	[edi-24],mm2
529	psrlq	mm2,4
530	movq	[104+edi],mm5
531	movq	mm6,mm5
532	movq	[ebp-32],mm0
533	psrlq	mm5,4
534	movq	[96+ebp],mm3
535	shl	edx,4
536	mov	BYTE [14+esp],dl
537	mov	edx,DWORD [120+ebx]
538	movq	mm0,[120+ebx]
539	psllq	mm6,60
540	movq	mm3,[112+ebx]
541	por	mm2,mm6
542	movq	[edi-16],mm1
543	psrlq	mm1,4
544	movq	[112+edi],mm4
545	movq	mm7,mm4
546	movq	[ebp-24],mm2
547	psrlq	mm4,4
548	movq	[104+ebp],mm5
549	shl	edx,4
550	mov	BYTE [15+esp],dl
551	psllq	mm7,60
552	por	mm1,mm7
553	movq	[edi-8],mm0
554	psrlq	mm0,4
555	movq	[120+edi],mm3
556	movq	mm6,mm3
557	movq	[ebp-16],mm1
558	psrlq	mm3,4
559	movq	[112+ebp],mm4
560	psllq	mm6,60
561	por	mm0,mm6
562	movq	[ebp-8],mm0
563	movq	[120+ebp],mm3
; Load the current state: mm6 = bytes 0..7, ebx = bytes 8..11,
; edx = bytes 12..15.
564	movq	mm6,[eax]
565	mov	ebx,DWORD [8+eax]
566	mov	edx,DWORD [12+eax]
567align	16
568L$009outer:
; state ^= next input block; advance ecx and park everything except edx on
; the stack so the state bytes can be reloaded four at a time.
569	xor	edx,DWORD [12+ecx]
570	xor	ebx,DWORD [8+ecx]
571	pxor	mm6,[ecx]
572	lea	ecx,[16+ecx]
573	mov	DWORD [536+esp],ebx
574	movq	[528+esp],mm6
575	mov	DWORD [548+esp],ecx
; Byte-at-a-time multiply. Each "rol edx,8 / mov al,dl" brings the next state
; byte into al; its low nibble (and al,15) indexes the unshifted tables at
; 16+esp / 144+esp, its high nibble (shr by 4, held in ebp or edi
; alternately) indexes the 4-bit-shifted tables at 272+esp / 400+esp and the
; byte array at [esp]. mm7:mm6 is the accumulator, shifted right 8 bits per
; step (mm3 carries the byte crossing from mm6 into mm7). The byte shifted
; out, xored with the [esp] array byte, selects a 16-bit L$rem_8bit entry
; that is inserted into mm0/mm1/mm2 in rotation and folded into mm6 one
; step later.
576	xor	eax,eax
577	rol	edx,8
578	mov	al,dl
579	mov	ebp,eax
580	and	al,15
581	shr	ebp,4
582	pxor	mm0,mm0
583	rol	edx,8
584	pxor	mm1,mm1
585	pxor	mm2,mm2
586	movq	mm7,[16+eax*8+esp]
587	movq	mm6,[144+eax*8+esp]
588	mov	al,dl
589	movd	ebx,mm7
590	psrlq	mm7,8
591	movq	mm3,mm6
592	mov	edi,eax
593	psrlq	mm6,8
594	pxor	mm7,[272+ebp*8+esp]
595	and	al,15
596	psllq	mm3,56
597	shr	edi,4
598	pxor	mm7,[16+eax*8+esp]
599	rol	edx,8
600	pxor	mm6,[144+eax*8+esp]
601	pxor	mm7,mm3
602	pxor	mm6,[400+ebp*8+esp]
603	xor	bl,BYTE [ebp*1+esp]
604	mov	al,dl
605	movd	ecx,mm7
606	movzx	ebx,bl
607	psrlq	mm7,8
608	movq	mm3,mm6
609	mov	ebp,eax
610	psrlq	mm6,8
611	pxor	mm7,[272+edi*8+esp]
612	and	al,15
613	psllq	mm3,56
614	shr	ebp,4
615	pinsrw	mm2,WORD [ebx*2+esi],2
616	pxor	mm7,[16+eax*8+esp]
617	rol	edx,8
618	pxor	mm6,[144+eax*8+esp]
619	pxor	mm7,mm3
620	pxor	mm6,[400+edi*8+esp]
621	xor	cl,BYTE [edi*1+esp]
622	mov	al,dl
; All four bytes of edx have been fetched; reload edx with state bytes 8..11.
623	mov	edx,DWORD [536+esp]
624	movd	ebx,mm7
625	movzx	ecx,cl
626	psrlq	mm7,8
627	movq	mm3,mm6
628	mov	edi,eax
629	psrlq	mm6,8
630	pxor	mm7,[272+ebp*8+esp]
631	and	al,15
632	psllq	mm3,56
633	pxor	mm6,mm2
634	shr	edi,4
635	pinsrw	mm1,WORD [ecx*2+esi],2
636	pxor	mm7,[16+eax*8+esp]
637	rol	edx,8
638	pxor	mm6,[144+eax*8+esp]
639	pxor	mm7,mm3
640	pxor	mm6,[400+ebp*8+esp]
641	xor	bl,BYTE [ebp*1+esp]
642	mov	al,dl
643	movd	ecx,mm7
644	movzx	ebx,bl
645	psrlq	mm7,8
646	movq	mm3,mm6
647	mov	ebp,eax
648	psrlq	mm6,8
649	pxor	mm7,[272+edi*8+esp]
650	and	al,15
651	psllq	mm3,56
652	pxor	mm6,mm1
653	shr	ebp,4
654	pinsrw	mm0,WORD [ebx*2+esi],2
655	pxor	mm7,[16+eax*8+esp]
656	rol	edx,8
657	pxor	mm6,[144+eax*8+esp]
658	pxor	mm7,mm3
659	pxor	mm6,[400+edi*8+esp]
660	xor	cl,BYTE [edi*1+esp]
661	mov	al,dl
662	movd	ebx,mm7
663	movzx	ecx,cl
664	psrlq	mm7,8
665	movq	mm3,mm6
666	mov	edi,eax
667	psrlq	mm6,8
668	pxor	mm7,[272+ebp*8+esp]
669	and	al,15
670	psllq	mm3,56
671	pxor	mm6,mm0
672	shr	edi,4
673	pinsrw	mm2,WORD [ecx*2+esi],2
674	pxor	mm7,[16+eax*8+esp]
675	rol	edx,8
676	pxor	mm6,[144+eax*8+esp]
677	pxor	mm7,mm3
678	pxor	mm6,[400+ebp*8+esp]
679	xor	bl,BYTE [ebp*1+esp]
680	mov	al,dl
681	movd	ecx,mm7
682	movzx	ebx,bl
683	psrlq	mm7,8
684	movq	mm3,mm6
685	mov	ebp,eax
686	psrlq	mm6,8
687	pxor	mm7,[272+edi*8+esp]
688	and	al,15
689	psllq	mm3,56
690	pxor	mm6,mm2
691	shr	ebp,4
692	pinsrw	mm1,WORD [ebx*2+esi],2
693	pxor	mm7,[16+eax*8+esp]
694	rol	edx,8
695	pxor	mm6,[144+eax*8+esp]
696	pxor	mm7,mm3
697	pxor	mm6,[400+edi*8+esp]
698	xor	cl,BYTE [edi*1+esp]
699	mov	al,dl
; Reload edx with state bytes 4..7.
700	mov	edx,DWORD [532+esp]
701	movd	ebx,mm7
702	movzx	ecx,cl
703	psrlq	mm7,8
704	movq	mm3,mm6
705	mov	edi,eax
706	psrlq	mm6,8
707	pxor	mm7,[272+ebp*8+esp]
708	and	al,15
709	psllq	mm3,56
710	pxor	mm6,mm1
711	shr	edi,4
712	pinsrw	mm0,WORD [ecx*2+esi],2
713	pxor	mm7,[16+eax*8+esp]
714	rol	edx,8
715	pxor	mm6,[144+eax*8+esp]
716	pxor	mm7,mm3
717	pxor	mm6,[400+ebp*8+esp]
718	xor	bl,BYTE [ebp*1+esp]
719	mov	al,dl
720	movd	ecx,mm7
721	movzx	ebx,bl
722	psrlq	mm7,8
723	movq	mm3,mm6
724	mov	ebp,eax
725	psrlq	mm6,8
726	pxor	mm7,[272+edi*8+esp]
727	and	al,15
728	psllq	mm3,56
729	pxor	mm6,mm0
730	shr	ebp,4
731	pinsrw	mm2,WORD [ebx*2+esi],2
732	pxor	mm7,[16+eax*8+esp]
733	rol	edx,8
734	pxor	mm6,[144+eax*8+esp]
735	pxor	mm7,mm3
736	pxor	mm6,[400+edi*8+esp]
737	xor	cl,BYTE [edi*1+esp]
738	mov	al,dl
739	movd	ebx,mm7
740	movzx	ecx,cl
741	psrlq	mm7,8
742	movq	mm3,mm6
743	mov	edi,eax
744	psrlq	mm6,8
745	pxor	mm7,[272+ebp*8+esp]
746	and	al,15
747	psllq	mm3,56
748	pxor	mm6,mm2
749	shr	edi,4
750	pinsrw	mm1,WORD [ecx*2+esi],2
751	pxor	mm7,[16+eax*8+esp]
752	rol	edx,8
753	pxor	mm6,[144+eax*8+esp]
754	pxor	mm7,mm3
755	pxor	mm6,[400+ebp*8+esp]
756	xor	bl,BYTE [ebp*1+esp]
757	mov	al,dl
758	movd	ecx,mm7
759	movzx	ebx,bl
760	psrlq	mm7,8
761	movq	mm3,mm6
762	mov	ebp,eax
763	psrlq	mm6,8
764	pxor	mm7,[272+edi*8+esp]
765	and	al,15
766	psllq	mm3,56
767	pxor	mm6,mm1
768	shr	ebp,4
769	pinsrw	mm0,WORD [ebx*2+esi],2
770	pxor	mm7,[16+eax*8+esp]
771	rol	edx,8
772	pxor	mm6,[144+eax*8+esp]
773	pxor	mm7,mm3
774	pxor	mm6,[400+edi*8+esp]
775	xor	cl,BYTE [edi*1+esp]
776	mov	al,dl
; Reload edx with state bytes 0..3.
777	mov	edx,DWORD [528+esp]
778	movd	ebx,mm7
779	movzx	ecx,cl
780	psrlq	mm7,8
781	movq	mm3,mm6
782	mov	edi,eax
783	psrlq	mm6,8
784	pxor	mm7,[272+ebp*8+esp]
785	and	al,15
786	psllq	mm3,56
787	pxor	mm6,mm0
788	shr	edi,4
789	pinsrw	mm2,WORD [ecx*2+esi],2
790	pxor	mm7,[16+eax*8+esp]
791	rol	edx,8
792	pxor	mm6,[144+eax*8+esp]
793	pxor	mm7,mm3
794	pxor	mm6,[400+ebp*8+esp]
795	xor	bl,BYTE [ebp*1+esp]
796	mov	al,dl
797	movd	ecx,mm7
798	movzx	ebx,bl
799	psrlq	mm7,8
800	movq	mm3,mm6
801	mov	ebp,eax
802	psrlq	mm6,8
803	pxor	mm7,[272+edi*8+esp]
804	and	al,15
805	psllq	mm3,56
806	pxor	mm6,mm2
807	shr	ebp,4
808	pinsrw	mm1,WORD [ebx*2+esi],2
809	pxor	mm7,[16+eax*8+esp]
810	rol	edx,8
811	pxor	mm6,[144+eax*8+esp]
812	pxor	mm7,mm3
813	pxor	mm6,[400+edi*8+esp]
814	xor	cl,BYTE [edi*1+esp]
815	mov	al,dl
816	movd	ebx,mm7
817	movzx	ecx,cl
818	psrlq	mm7,8
819	movq	mm3,mm6
820	mov	edi,eax
821	psrlq	mm6,8
822	pxor	mm7,[272+ebp*8+esp]
823	and	al,15
824	psllq	mm3,56
825	pxor	mm6,mm1
826	shr	edi,4
827	pinsrw	mm0,WORD [ecx*2+esi],2
828	pxor	mm7,[16+eax*8+esp]
829	rol	edx,8
830	pxor	mm6,[144+eax*8+esp]
831	pxor	mm7,mm3
832	pxor	mm6,[400+ebp*8+esp]
833	xor	bl,BYTE [ebp*1+esp]
834	mov	al,dl
835	movd	ecx,mm7
836	movzx	ebx,bl
837	psrlq	mm7,8
838	movq	mm3,mm6
839	mov	ebp,eax
840	psrlq	mm6,8
841	pxor	mm7,[272+edi*8+esp]
842	and	al,15
843	psllq	mm3,56
844	pxor	mm6,mm0
845	shr	ebp,4
846	pinsrw	mm2,WORD [ebx*2+esi],2
847	pxor	mm7,[16+eax*8+esp]
848	rol	edx,8
849	pxor	mm6,[144+eax*8+esp]
850	pxor	mm7,mm3
851	pxor	mm6,[400+edi*8+esp]
852	xor	cl,BYTE [edi*1+esp]
853	mov	al,dl
; The value loaded into edx here is never used: edx is not read again before
; "movd edx,mm7" below overwrites it.
854	mov	edx,DWORD [524+esp]
855	movd	ebx,mm7
856	movzx	ecx,cl
857	psrlq	mm7,8
858	movq	mm3,mm6
859	mov	edi,eax
860	psrlq	mm6,8
861	pxor	mm7,[272+ebp*8+esp]
862	and	al,15
863	psllq	mm3,56
864	pxor	mm6,mm2
865	shr	edi,4
866	pinsrw	mm1,WORD [ecx*2+esi],2
867	pxor	mm7,[16+eax*8+esp]
868	pxor	mm6,[144+eax*8+esp]
869	xor	bl,BYTE [ebp*1+esp]
870	pxor	mm7,mm3
871	pxor	mm6,[400+ebp*8+esp]
872	movzx	ebx,bl
; Final step of the block: only 4 bits remain, so the accumulator is shifted
; by 4 (not 8) and the last high nibble (edi) indexes the unshifted tables.
; The pending L$rem_8bit words are shifted into position (psllq 4 / 12,
; pinsrw into word 3) and folded into mm6.
873	pxor	mm2,mm2
874	psllq	mm1,4
875	movd	ecx,mm7
876	psrlq	mm7,4
877	movq	mm3,mm6
878	psrlq	mm6,4
879	shl	ecx,4
880	pxor	mm7,[16+edi*8+esp]
881	psllq	mm3,60
882	movzx	ecx,cl
883	pxor	mm7,mm3
884	pxor	mm6,[144+edi*8+esp]
885	pinsrw	mm0,WORD [ebx*2+esi],2
886	pxor	mm6,mm1
887	movd	edx,mm7
888	pinsrw	mm2,WORD [ecx*2+esi],3
889	psllq	mm0,12
890	pxor	mm6,mm0
891	psrlq	mm7,32
892	pxor	mm6,mm2
; Reload the input pointer and convert the accumulator to the stored state
; layout: edx / ebx are byte-swapped dwords of mm7; mm6 has the bytes within
; each 16-bit word swapped (psllw/psrlw/por) and then its four words
; reversed (pshufw 27), which together reverse all 8 bytes.
893	mov	ecx,DWORD [548+esp]
894	movd	ebx,mm7
895	movq	mm3,mm6
896	psllw	mm6,8
897	psrlw	mm3,8
898	por	mm6,mm3
899	bswap	edx
900	pshufw	mm6,mm6,27
901	bswap	ebx
; Continue until the input pointer equals the end pointer.
902	cmp	ecx,DWORD [552+esp]
903	jne	NEAR L$009outer
; Store the final state, restore the caller's esp, and leave MMX state.
904	mov	eax,DWORD [544+esp]
905	mov	DWORD [12+eax],edx
906	mov	DWORD [8+eax],ebx
907	movq	[eax],mm6
908	mov	esp,DWORD [556+esp]
909	emms
910	pop	edi
911	pop	esi
912	pop	ebx
913	pop	ebp
914	ret
;-----------------------------------------------------------------------
; _gcm_init_clmul -- derive the 48-byte table used by the carry-less-multiply
; routines in this file from a 16-byte input value.
;
; Stack arguments (no registers are pushed):
;   [4+esp]  pointer to the 48-byte output table
;   [8+esp]  pointer to the 16-byte input value
; NOTE(review): presumably Htable and the hash key H in OpenSSL naming --
; confirm against the C prototype.
;
; Output layout:
;   [edx]     input with its qwords swapped, shifted left by one bit, and
;             conditionally xored with the constant at L$bswap+16
;   [16+edx]  that value multiplied by itself and reduced
;   [32+edx]  low qword = hi^lo of [edx], high qword = hi^lo of [16+edx]
;
; Overwrites eax, ecx, edx and xmm0-xmm5. The db lines are hand-encoded
; instructions; their decoding is given next to each.
;-----------------------------------------------------------------------
915global	_gcm_init_clmul
916align	16
917_gcm_init_clmul:
918L$_gcm_init_clmul_begin:
919	mov	edx,DWORD [4+esp]
920	mov	eax,DWORD [8+esp]
; call/pop obtains the current instruction address so L$bswap can be
; addressed relative to it.
921	call	L$010pic
922L$010pic:
923	pop	ecx
924	lea	ecx,[(L$bswap-L$010pic)+ecx]
; Load the input and swap its two 64-bit halves (pshufd 78).
925	movdqu	xmm2,[eax]
926	pshufd	xmm2,xmm2,78
; xmm4 = top dword broadcast; xmm5 = all-ones if that dword is negative
; (0 > x), i.e. if the top bit of the 128-bit value is set.
927	pshufd	xmm4,xmm2,255
; 128-bit left shift by 1: shift each qword, then carry bit 63 of the low
; qword into bit 0 of the high qword.
928	movdqa	xmm3,xmm2
929	psllq	xmm2,1
930	pxor	xmm5,xmm5
931	psrlq	xmm3,63
932	pcmpgtd	xmm5,xmm4
933	pslldq	xmm3,8
934	por	xmm2,xmm3
; If the bit shifted out was set, xor in the constant stored at L$bswap+16.
935	pand	xmm5,[16+ecx]
936	pxor	xmm2,xmm5
; Multiply xmm2 by itself: three 64x64 carry-less multiplies (low halves,
; high halves, xor-folded halves), then correct the middle product.
937	movdqa	xmm0,xmm2
938	movdqa	xmm1,xmm0
939	pshufd	xmm3,xmm0,78
940	pshufd	xmm4,xmm2,78
941	pxor	xmm3,xmm0
942	pxor	xmm4,xmm2
; pclmulqdq xmm0,xmm2,0x00
943db	102,15,58,68,194,0
; pclmulqdq xmm1,xmm2,0x11
944db	102,15,58,68,202,17
; pclmulqdq xmm3,xmm4,0x00
945db	102,15,58,68,220,0
946	xorps	xmm3,xmm0
947	xorps	xmm3,xmm1
; Split the middle product across the 256-bit result xmm1:xmm0.
948	movdqa	xmm4,xmm3
949	psrldq	xmm3,8
950	pslldq	xmm4,8
951	pxor	xmm1,xmm3
952	pxor	xmm0,xmm4
; Reduce xmm1:xmm0 to 128 bits in xmm0 (two phases of shift-and-xor).
; NOTE(review): presumably reduction modulo the GHASH polynomial -- confirm.
953	movdqa	xmm4,xmm0
954	movdqa	xmm3,xmm0
955	psllq	xmm0,5
956	pxor	xmm3,xmm0
957	psllq	xmm0,1
958	pxor	xmm0,xmm3
959	psllq	xmm0,57
960	movdqa	xmm3,xmm0
961	pslldq	xmm0,8
962	psrldq	xmm3,8
963	pxor	xmm0,xmm4
964	pxor	xmm1,xmm3
965	movdqa	xmm4,xmm0
966	psrlq	xmm0,1
967	pxor	xmm1,xmm4
968	pxor	xmm4,xmm0
969	psrlq	xmm0,5
970	pxor	xmm0,xmm4
971	psrlq	xmm0,1
972	pxor	xmm0,xmm1
; Store both values and their xor-folded halves (used as the third multiply
; operand by the other clmul routines).
973	pshufd	xmm3,xmm2,78
974	pshufd	xmm4,xmm0,78
975	pxor	xmm3,xmm2
976	movdqu	[edx],xmm2
977	pxor	xmm4,xmm0
978	movdqu	[16+edx],xmm0
; palignr xmm4,xmm3,8
979db	102,15,58,15,227,8
980	movdqu	[32+edx],xmm4
981	ret
;-----------------------------------------------------------------------
; _gcm_gmult_clmul -- multiply one 16-byte block in place using carry-less
; multiplication and the table written by _gcm_init_clmul.
;
; Stack arguments (no registers are pushed):
;   [4+esp]  pointer to the 16-byte block; read on entry, overwritten with
;            the result on exit
;   [8+esp]  pointer to the table; the 16 bytes at offset 0 and at offset 32
;            are read
; NOTE(review): presumably Xi and Htable in OpenSSL naming -- confirm.
;
; Overwrites eax, ecx, edx and xmm0-xmm5. The db lines are hand-encoded
; instructions; their decoding is given next to each.
;-----------------------------------------------------------------------
982global	_gcm_gmult_clmul
983align	16
984_gcm_gmult_clmul:
985L$_gcm_gmult_clmul_begin:
986	mov	eax,DWORD [4+esp]
987	mov	edx,DWORD [8+esp]
; call/pop obtains the current instruction address so L$bswap can be
; addressed relative to it.
988	call	L$011pic
989L$011pic:
990	pop	ecx
991	lea	ecx,[(L$bswap-L$011pic)+ecx]
; xmm0 = block, xmm5 = byte-reversal mask (aligned load; L$bswap is 64-byte
; aligned), xmm2 = table[0].
992	movdqu	xmm0,[eax]
993	movdqa	xmm5,[ecx]
994	movups	xmm2,[edx]
; pshufb xmm0,xmm5  -- reverse the byte order of the block
995db	102,15,56,0,197
; xmm4 = table[32..47]; its low qword is the xor-folded half of table[0].
996	movups	xmm4,[32+edx]
; Three 64x64 carry-less multiplies (low halves, high halves, xor-folded
; halves), then correct the middle product.
997	movdqa	xmm1,xmm0
998	pshufd	xmm3,xmm0,78
999	pxor	xmm3,xmm0
; pclmulqdq xmm0,xmm2,0x00
1000db	102,15,58,68,194,0
; pclmulqdq xmm1,xmm2,0x11
1001db	102,15,58,68,202,17
; pclmulqdq xmm3,xmm4,0x00
1002db	102,15,58,68,220,0
1003	xorps	xmm3,xmm0
1004	xorps	xmm3,xmm1
; Split the middle product across the 256-bit result xmm1:xmm0.
1005	movdqa	xmm4,xmm3
1006	psrldq	xmm3,8
1007	pslldq	xmm4,8
1008	pxor	xmm1,xmm3
1009	pxor	xmm0,xmm4
; Reduce xmm1:xmm0 to 128 bits in xmm0 (same sequence as in _gcm_init_clmul).
1010	movdqa	xmm4,xmm0
1011	movdqa	xmm3,xmm0
1012	psllq	xmm0,5
1013	pxor	xmm3,xmm0
1014	psllq	xmm0,1
1015	pxor	xmm0,xmm3
1016	psllq	xmm0,57
1017	movdqa	xmm3,xmm0
1018	pslldq	xmm0,8
1019	psrldq	xmm3,8
1020	pxor	xmm0,xmm4
1021	pxor	xmm1,xmm3
1022	movdqa	xmm4,xmm0
1023	psrlq	xmm0,1
1024	pxor	xmm1,xmm4
1025	pxor	xmm4,xmm0
1026	psrlq	xmm0,5
1027	pxor	xmm0,xmm4
1028	psrlq	xmm0,1
1029	pxor	xmm0,xmm1
; pshufb xmm0,xmm5  -- restore the original byte order before storing
1030db	102,15,56,0,197
1031	movdqu	[eax],xmm0
1032	ret
;-----------------------------------------------------------------------
; _gcm_ghash_clmul -- hash update over a buffer using carry-less
; multiplication, processing two 16-byte blocks per loop iteration.
;
; Stack arguments (offsets are after the four pushes):
;   [20+esp]  pointer to the 16-byte state; read on entry, written on exit
;   [24+esp]  pointer to the 48-byte table written by _gcm_init_clmul
;   [28+esp]  pointer to the input data
;   [32+esp]  input length in bytes
; NOTE(review): presumably Xi, Htable, inp, len in OpenSSL naming; the
; control flow only handles lengths that are non-zero multiples of 16 --
; confirm callers guarantee that.
;
; Control flow:
;   len == 16           -> L$013odd_tail (single block)
;   otherwise           -> load two blocks, then L$015mod_loop while more
;                          than 32 bytes remain, then L$014even_tail
;   after even_tail     -> L$013odd_tail if exactly one block is left,
;                          else L$016done
;
; ebp, ebx, esi, edi are saved and restored (ebp and edi are not otherwise
; used in this routine); eax, ecx, edx and xmm0-xmm7 are overwritten.
; The db lines are hand-encoded instructions; decodings are given inline.
;-----------------------------------------------------------------------
1033global	_gcm_ghash_clmul
1034align	16
1035_gcm_ghash_clmul:
1036L$_gcm_ghash_clmul_begin:
1037	push	ebp
1038	push	ebx
1039	push	esi
1040	push	edi
1041	mov	eax,DWORD [20+esp]
1042	mov	edx,DWORD [24+esp]
1043	mov	esi,DWORD [28+esp]
1044	mov	ebx,DWORD [32+esp]
; call/pop obtains the current instruction address so L$bswap can be
; addressed relative to it.
1045	call	L$012pic
1046L$012pic:
1047	pop	ecx
1048	lea	ecx,[(L$bswap-L$012pic)+ecx]
; xmm0 = state, xmm5 = byte-reversal mask, xmm2 = table[0].
1049	movdqu	xmm0,[eax]
1050	movdqa	xmm5,[ecx]
1051	movdqu	xmm2,[edx]
; pshufb xmm0,xmm5
1052db	102,15,56,0,197
; Exactly one block: go straight to the single-block path.
1053	sub	ebx,16
1054	jz	NEAR L$013odd_tail
; Load and byte-reverse the first two blocks (xmm3 = block 0, xmm6 = block 1).
1055	movdqu	xmm3,[esi]
1056	movdqu	xmm6,[16+esi]
; pshufb xmm3,xmm5
1057db	102,15,56,0,221
; pshufb xmm6,xmm5
1058db	102,15,56,0,245
; xmm5 is reused for table[32..47] (the xor-folded halves); the mask is
; reloaded from [ecx] when needed again.
1059	movdqu	xmm5,[32+edx]
; state ^= block 0; start multiplying block 1 by table[0].
1060	pxor	xmm0,xmm3
1061	pshufd	xmm3,xmm6,78
1062	movdqa	xmm7,xmm6
1063	pxor	xmm3,xmm6
1064	lea	esi,[32+esi]
; pclmulqdq xmm6,xmm2,0x00
1065db	102,15,58,68,242,0
; pclmulqdq xmm7,xmm2,0x11
1066db	102,15,58,68,250,17
; pclmulqdq xmm3,xmm5,0x00
1067db	102,15,58,68,221,0
; xmm2 = table[16..31], the multiplier for the (state ^ block 0) term.
1068	movups	xmm2,[16+edx]
1069	nop
; If 32 bytes or fewer remain after this pair, finish in even_tail.
1070	sub	ebx,32
1071	jbe	NEAR L$014even_tail
1072	jmp	NEAR L$015mod_loop
1073align	32
1074L$015mod_loop:
; Main loop: finish the pending pair (multiply xmm0 by table[16..31], add the
; block-1 products held in xmm6/xmm7/xmm3, reduce) while loading the next two
; blocks and starting their multiplies, interleaved for scheduling.
1075	pshufd	xmm4,xmm0,78
1076	movdqa	xmm1,xmm0
1077	pxor	xmm4,xmm0
1078	nop
; pclmulqdq xmm0,xmm2,0x00
1079db	102,15,58,68,194,0
; pclmulqdq xmm1,xmm2,0x11
1080db	102,15,58,68,202,17
; pclmulqdq xmm4,xmm5,0x10
1081db	102,15,58,68,229,16
1082	movups	xmm2,[edx]
1083	xorps	xmm0,xmm6
1084	movdqa	xmm5,[ecx]
1085	xorps	xmm1,xmm7
1086	movdqu	xmm7,[esi]
1087	pxor	xmm3,xmm0
1088	movdqu	xmm6,[16+esi]
1089	pxor	xmm3,xmm1
; pshufb xmm7,xmm5
1090db	102,15,56,0,253
1091	pxor	xmm4,xmm3
1092	movdqa	xmm3,xmm4
1093	psrldq	xmm4,8
1094	pslldq	xmm3,8
1095	pxor	xmm1,xmm4
1096	pxor	xmm0,xmm3
; pshufb xmm6,xmm5
1097db	102,15,56,0,245
; The first new block (xmm7) is xored into the high half of the unreduced
; product.
1098	pxor	xmm1,xmm7
1099	movdqa	xmm7,xmm6
1100	movdqa	xmm4,xmm0
1101	movdqa	xmm3,xmm0
1102	psllq	xmm0,5
1103	pxor	xmm3,xmm0
1104	psllq	xmm0,1
1105	pxor	xmm0,xmm3
; pclmulqdq xmm6,xmm2,0x00
1106db	102,15,58,68,242,0
1107	movups	xmm5,[32+edx]
1108	psllq	xmm0,57
1109	movdqa	xmm3,xmm0
1110	pslldq	xmm0,8
1111	psrldq	xmm3,8
1112	pxor	xmm0,xmm4
1113	pxor	xmm1,xmm3
1114	pshufd	xmm3,xmm7,78
1115	movdqa	xmm4,xmm0
1116	psrlq	xmm0,1
1117	pxor	xmm3,xmm7
1118	pxor	xmm1,xmm4
; pclmulqdq xmm7,xmm2,0x11
1119db	102,15,58,68,250,17
1120	movups	xmm2,[16+edx]
1121	pxor	xmm4,xmm0
1122	psrlq	xmm0,5
1123	pxor	xmm0,xmm4
1124	psrlq	xmm0,1
1125	pxor	xmm0,xmm1
; pclmulqdq xmm3,xmm5,0x00
1126db	102,15,58,68,221,0
1127	lea	esi,[32+esi]
; Keep looping while more than 32 bytes remain (unsigned).
1128	sub	ebx,32
1129	ja	NEAR L$015mod_loop
1130L$014even_tail:
; Finish the pending pair without loading further input: multiply xmm0 by
; table[16..31], add the block products in xmm6/xmm7/xmm3, and reduce.
1131	pshufd	xmm4,xmm0,78
1132	movdqa	xmm1,xmm0
1133	pxor	xmm4,xmm0
; pclmulqdq xmm0,xmm2,0x00
1134db	102,15,58,68,194,0
; pclmulqdq xmm1,xmm2,0x11
1135db	102,15,58,68,202,17
; pclmulqdq xmm4,xmm5,0x10
1136db	102,15,58,68,229,16
; Reload the byte-reversal mask into xmm5 for the final store / odd tail.
1137	movdqa	xmm5,[ecx]
1138	xorps	xmm0,xmm6
1139	xorps	xmm1,xmm7
1140	pxor	xmm3,xmm0
1141	pxor	xmm3,xmm1
1142	pxor	xmm4,xmm3
1143	movdqa	xmm3,xmm4
1144	psrldq	xmm4,8
1145	pslldq	xmm3,8
1146	pxor	xmm1,xmm4
1147	pxor	xmm0,xmm3
1148	movdqa	xmm4,xmm0
1149	movdqa	xmm3,xmm0
1150	psllq	xmm0,5
1151	pxor	xmm3,xmm0
1152	psllq	xmm0,1
1153	pxor	xmm0,xmm3
1154	psllq	xmm0,57
1155	movdqa	xmm3,xmm0
1156	pslldq	xmm0,8
1157	psrldq	xmm3,8
1158	pxor	xmm0,xmm4
1159	pxor	xmm1,xmm3
1160	movdqa	xmm4,xmm0
1161	psrlq	xmm0,1
1162	pxor	xmm1,xmm4
1163	pxor	xmm4,xmm0
1164	psrlq	xmm0,5
1165	pxor	xmm0,xmm4
1166	psrlq	xmm0,1
1167	pxor	xmm0,xmm1
; ebx == 0 here means exactly one more block is left; any other value
; means the input is exhausted.
1168	test	ebx,ebx
1169	jnz	NEAR L$016done
1170	movups	xmm2,[edx]
1171L$013odd_tail:
; Single-block path: state ^= block, multiply by table[0] (xmm2), reduce.
; The xor-folded half of xmm2 is recomputed here instead of being read from
; the table.
1172	movdqu	xmm3,[esi]
; pshufb xmm3,xmm5
1173db	102,15,56,0,221
1174	pxor	xmm0,xmm3
1175	movdqa	xmm1,xmm0
1176	pshufd	xmm3,xmm0,78
1177	pshufd	xmm4,xmm2,78
1178	pxor	xmm3,xmm0
1179	pxor	xmm4,xmm2
; pclmulqdq xmm0,xmm2,0x00
1180db	102,15,58,68,194,0
; pclmulqdq xmm1,xmm2,0x11
1181db	102,15,58,68,202,17
; pclmulqdq xmm3,xmm4,0x00
1182db	102,15,58,68,220,0
1183	xorps	xmm3,xmm0
1184	xorps	xmm3,xmm1
1185	movdqa	xmm4,xmm3
1186	psrldq	xmm3,8
1187	pslldq	xmm4,8
1188	pxor	xmm1,xmm3
1189	pxor	xmm0,xmm4
1190	movdqa	xmm4,xmm0
1191	movdqa	xmm3,xmm0
1192	psllq	xmm0,5
1193	pxor	xmm3,xmm0
1194	psllq	xmm0,1
1195	pxor	xmm0,xmm3
1196	psllq	xmm0,57
1197	movdqa	xmm3,xmm0
1198	pslldq	xmm0,8
1199	psrldq	xmm3,8
1200	pxor	xmm0,xmm4
1201	pxor	xmm1,xmm3
1202	movdqa	xmm4,xmm0
1203	psrlq	xmm0,1
1204	pxor	xmm1,xmm4
1205	pxor	xmm4,xmm0
1206	psrlq	xmm0,5
1207	pxor	xmm0,xmm4
1208	psrlq	xmm0,1
1209	pxor	xmm0,xmm1
1210L$016done:
; pshufb xmm0,xmm5  -- restore the original byte order and store the state
1211db	102,15,56,0,197
1212	movdqu	[eax],xmm0
1213	pop	edi
1214	pop	esi
1215	pop	ebx
1216	pop	ebp
1217	ret
;-----------------------------------------------------------------------
; Read-only constants, addressed position-independently by the routines
; above (each computes the label address from a call/pop pair).
;-----------------------------------------------------------------------
1218align	64
; L$bswap: 16-byte mask 15..0, loaded into xmm5 and used with pshufb to
; reverse byte order in the clmul routines. The 16 bytes that follow
; (L$bswap+16) are the constant that _gcm_init_clmul conditionally xors
; into the shifted input value.
1219L$bswap:
1220db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1221db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
1222align	64
; L$rem_8bit: 256 16-bit entries, indexed by a byte value (index*2) from
; _gcm_ghash_4bit_mmx via pinsrw.
1223L$rem_8bit:
1224dw	0,450,900,582,1800,1738,1164,1358
1225dw	3600,4050,3476,3158,2328,2266,2716,2910
1226dw	7200,7650,8100,7782,6952,6890,6316,6510
1227dw	4656,5106,4532,4214,5432,5370,5820,6014
1228dw	14400,14722,15300,14854,16200,16010,15564,15630
1229dw	13904,14226,13780,13334,12632,12442,13020,13086
1230dw	9312,9634,10212,9766,9064,8874,8428,8494
1231dw	10864,11186,10740,10294,11640,11450,12028,12094
1232dw	28800,28994,29444,29382,30600,30282,29708,30158
1233dw	32400,32594,32020,31958,31128,30810,31260,31710
1234dw	27808,28002,28452,28390,27560,27242,26668,27118
1235dw	25264,25458,24884,24822,26040,25722,26172,26622
1236dw	18624,18690,19268,19078,20424,19978,19532,19854
1237dw	18128,18194,17748,17558,16856,16410,16988,17310
1238dw	21728,21794,22372,22182,21480,21034,20588,20910
1239dw	23280,23346,22900,22710,24056,23610,24188,24510
1240dw	57600,57538,57988,58182,58888,59338,58764,58446
1241dw	61200,61138,60564,60758,59416,59866,60316,59998
1242dw	64800,64738,65188,65382,64040,64490,63916,63598
1243dw	62256,62194,61620,61814,62520,62970,63420,63102
1244dw	55616,55426,56004,56070,56904,57226,56780,56334
1245dw	55120,54930,54484,54550,53336,53658,54236,53790
1246dw	50528,50338,50916,50982,49768,50090,49644,49198
1247dw	52080,51890,51444,51510,52344,52666,53244,52798
1248dw	37248,36930,37380,37830,38536,38730,38156,38094
1249dw	40848,40530,39956,40406,39064,39258,39708,39646
1250dw	36256,35938,36388,36838,35496,35690,35116,35054
1251dw	33712,33394,32820,33270,33976,34170,34620,34558
1252dw	43456,43010,43588,43910,44744,44810,44364,44174
1253dw	42960,42514,42068,42390,41176,41242,41820,41630
1254dw	46560,46114,46692,47014,45800,45866,45420,45230
1255dw	48112,47666,47220,47542,48376,48442,49020,48830
1256align	64
; L$rem_4bit: 16 eight-byte entries (low dword 0, high dword = constant),
; indexed by a 4-bit value (index*8) from _gcm_gmult_4bit_mmx. The high
; dwords are the same constants the *_4bit_x86 routines write to their
; stack tables.
1257L$rem_4bit:
1258dd	0,0,0,471859200,0,943718400,0,610271232
1259dd	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1260dd	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1261dd	0,2441084928,0,2376073216,0,2847932416,0,3051356160
; NUL-terminated ASCII identification string:
; "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>"
1262db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
1263db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
1264db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
1265db	0
1266