; ---------------------------------------------------------------------
; Output-section selection.
; Exactly one SECTION directive is emitted, chosen by the object format
; the assembler was invoked with (obj, win32, or anything else).
; ---------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
; Not Yasm: define the @feat.00 symbol by hand, since only Yasm supplies
; it automatically (see the note above).
$@feat.00 equ 1
%endif
section	.text	code align=64
%else
section	.text	code
%endif
; ---------------------------------------------------------------------
; _gcm_gmult_4bit_mmx
;
; GHASH multiplication step, done in place on one 16-byte block, four
; bits at a time, with a 16-entry lookup table and MMX registers.
;
; ABI:   32-bit, arguments on the stack (read at [esp+4] and [esp+8]
;        on entry).
; In:    arg0 -> 16-byte block; read, then overwritten with the result
;        arg1 -> table of 16 entries x 16 bytes, indexed by nibble*16
;        (presumably Xi and Htable in the C prototype - not visible here)
; Out:   block at arg0 replaced; each result dword is BSWAPped before
;        being stored.
; Saves: ebp, ebx, esi, edi.
; Clobb: eax, ecx, edx, flags, mm0-mm2.  EMMS is executed before return.
;
; Register roles inside the loop:
;   edi = block, esi = table, eax = L$rem_4bit
;   mm0 = lower 64 bits of the accumulator (pairs with entry qword +8)
;   mm1 = upper 64 bits of the accumulator (pairs with entry qword +0)
;   ebx = the 4 bits most recently shifted out of mm0
;   ecx / edx = table byte offsets for the low / high nibble
;   ebp = index of the next block byte to fetch (14 down to 0)
; ---------------------------------------------------------------------
global	_gcm_gmult_4bit_mmx
align	16
_gcm_gmult_4bit_mmx:
L$_gcm_gmult_4bit_mmx_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	edi,DWORD [20+esp]	; arg0 (4 pushes + return address = 20)
	mov	esi,DWORD [24+esp]	; arg1
	; Position-independent address of L$rem_4bit: CALL pushes the address
	; of the following instruction and POP retrieves it.
	call	L$000pic_point
L$000pic_point:
	pop	eax
	lea	eax,[(L$rem_4bit-L$000pic_point)+eax]
	movzx	ebx,BYTE [15+edi]	; start with the last byte of the block
	xor	ecx,ecx
	mov	edx,ebx
	mov	cl,dl
	mov	ebp,14
	shl	cl,4	; ecx = low nibble * 16
	and	edx,240	; edx = high nibble * 16
	movq	mm0,[8+ecx*1+esi]	; accumulator = table entry for the low nibble
	movq	mm1,[ecx*1+esi]
	movd	ebx,mm0
	jmp	NEAR L$001mmx_loop
align	16
L$001mmx_loop:
	; --- high nibble of the current byte (offset in edx) ---
	psrlq	mm0,4
	and	ebx,15
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+edx*1+esi]
	mov	cl,BYTE [ebp*1+edi]	; fetch the next block byte
	psllq	mm2,60	; low 4 bits of mm1 move into the top of mm0
	pxor	mm1,[ebx*8+eax]	; fold the shifted-out bits back in via L$rem_4bit
	dec	ebp
	movd	ebx,mm0
	pxor	mm1,[edx*1+esi]
	mov	edx,ecx
	pxor	mm0,mm2
	js	NEAR L$002mmx_break	; index dropped below 0: cl holds byte 0
	; --- low nibble of the byte just fetched (offset in ecx) ---
	shl	cl,4
	and	ebx,15
	psrlq	mm0,4
	and	edx,240
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+ecx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[ecx*1+esi]
	pxor	mm0,mm2
	jmp	NEAR L$001mmx_loop
align	16
L$002mmx_break:
	; Final byte (offset 0): low nibble, then high nibble, with no
	; further fetch from the block.
	shl	cl,4
	and	ebx,15
	psrlq	mm0,4
	and	edx,240
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+ecx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[ecx*1+esi]
	pxor	mm0,mm2
	psrlq	mm0,4
	and	ebx,15
	movq	mm2,mm1
	psrlq	mm1,4
	pxor	mm0,[8+edx*1+esi]
	psllq	mm2,60
	pxor	mm1,[ebx*8+eax]
	movd	ebx,mm0
	pxor	mm1,[edx*1+esi]
	pxor	mm0,mm2
	; Split the 128-bit result into four dwords:
	;   ebx = mm0[31:0], ecx = mm0[63:32], edx = mm1[31:0], ebp = mm1[63:32]
	psrlq	mm0,32
	movd	edx,mm1
	psrlq	mm1,32
	movd	ecx,mm0
	movd	ebp,mm1
	bswap	ebx
	bswap	edx
	bswap	ecx
	bswap	ebp
	emms
	mov	DWORD [12+edi],ebx
	mov	DWORD [4+edi],edx
	mov	DWORD [8+edi],ecx
	mov	DWORD [edi],ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; ---------------------------------------------------------------------
; _gcm_ghash_4bit_mmx
;
; GHASH over a run of 16-byte input blocks: each block is XORed into
; the 16-byte state and the state is then multiplied via table lookups,
; one input byte (two nibbles) per step, using MMX registers.
;
; ABI:   32-bit, arguments on the stack (read at [esp+4]..[esp+16] on
;        entry).
; In:    arg0 -> 16-byte state; read at entry, written at exit
;        arg1 -> table of 16 entries x 16 bytes (256 bytes are read)
;        arg2 -> input data
;        arg3 =  input length in bytes
;        (presumably Xi, Htable, inp, len in the C prototype - not
;        visible here)
; Out:   state at arg0 updated.
; Saves: ebp, ebx, esi, edi.
; Clobb: eax, ecx, edx, flags, mm0-mm7.  EMMS is executed before return.
;
; NOTE:  the block loop is bottom-tested with an equality compare
;        against arg2+arg3, after advancing by 16.  A length that is
;        zero or not a multiple of 16 therefore does not stop at the
;        intended point; callers must pass a non-zero multiple of 16.
; NOTE:  PINSRW / PSHUFW on MMX registers are not part of baseline MMX;
;        they need the SSE (or AMD MMX-extension) instruction set.
;
; Stack frame (esp+16 is 64-byte aligned):
;   [esp+0   .. 15 ]  16 bytes: low byte of each entry's +8 qword, << 4
;   [esp+16  .. 143]  16 qwords: entry[i] qword at offset +8
;   [esp+144 .. 271]  16 qwords: entry[i] qword at offset +0
;   [esp+272 .. 399]  16 qwords: lower qword of (128-bit entry[i] >> 4)
;   [esp+400 .. 527]  16 qwords: upper qword of (128-bit entry[i] >> 4)
;   [esp+528 .. 539]  bytes 0..11 of (state XOR current input block)
;   [esp+544]         arg0
;   [esp+548]         pointer to the next input block
;   [esp+552]         input end pointer (arg2 + arg3)
;   [esp+556]         esp as it was before the frame was carved out
; ---------------------------------------------------------------------
global	_gcm_ghash_4bit_mmx
align	16
_gcm_ghash_4bit_mmx:
L$_gcm_ghash_4bit_mmx_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	eax,DWORD [20+esp]	; arg0
	mov	ebx,DWORD [24+esp]	; arg1
	mov	ecx,DWORD [28+esp]	; arg2
	mov	edx,DWORD [32+esp]	; arg3
	mov	ebp,esp	; remember esp so the epilogue can restore it
	; Position-independent address of L$rem_8bit.
	call	L$003pic_point
L$003pic_point:
	pop	esi
	lea	esi,[(L$rem_8bit-L$003pic_point)+esi]
	sub	esp,544
	and	esp,-64
	sub	esp,16
	add	edx,ecx	; edx = end of input
	mov	DWORD [544+esp],eax
	mov	DWORD [552+esp],edx
	mov	DWORD [556+esp],ebp
	add	ebx,128	; bias the table pointer so all 256 bytes fit disp8
	lea	edi,[144+esp]
	lea	ebp,[400+esp]
	; ---- Build the four 16-entry qword tables and the 16-byte nibble
	; ---- table described in the frame layout.  The work for
	; ---- consecutive entries is interleaved, so each store below
	; ---- belongs to an entry loaded several instructions earlier.
	mov	edx,DWORD [ebx-120]
	movq	mm0,[ebx-120]
	movq	mm3,[ebx-128]
	shl	edx,4
	mov	BYTE [esp],dl
	mov	edx,DWORD [ebx-104]
	movq	mm2,[ebx-104]
	movq	mm5,[ebx-112]
	movq	[edi-128],mm0
	psrlq	mm0,4
	movq	[edi],mm3
	movq	mm7,mm3
	psrlq	mm3,4
	shl	edx,4
	mov	BYTE [1+esp],dl
	mov	edx,DWORD [ebx-88]
	movq	mm1,[ebx-88]
	psllq	mm7,60
	movq	mm4,[ebx-96]
	por	mm0,mm7
	movq	[edi-120],mm2
	psrlq	mm2,4
	movq	[8+edi],mm5
	movq	mm6,mm5
	movq	[ebp-128],mm0
	psrlq	mm5,4
	movq	[ebp],mm3
	shl	edx,4
	mov	BYTE [2+esp],dl
	mov	edx,DWORD [ebx-72]
	movq	mm0,[ebx-72]
	psllq	mm6,60
	movq	mm3,[ebx-80]
	por	mm2,mm6
	movq	[edi-112],mm1
	psrlq	mm1,4
	movq	[16+edi],mm4
	movq	mm7,mm4
	movq	[ebp-120],mm2
	psrlq	mm4,4
	movq	[8+ebp],mm5
	shl	edx,4
	mov	BYTE [3+esp],dl
	mov	edx,DWORD [ebx-56]
	movq	mm2,[ebx-56]
	psllq	mm7,60
	movq	mm5,[ebx-64]
	por	mm1,mm7
	movq	[edi-104],mm0
	psrlq	mm0,4
	movq	[24+edi],mm3
	movq	mm6,mm3
	movq	[ebp-112],mm1
	psrlq	mm3,4
	movq	[16+ebp],mm4
	shl	edx,4
	mov	BYTE [4+esp],dl
	mov	edx,DWORD [ebx-40]
	movq	mm1,[ebx-40]
	psllq	mm6,60
	movq	mm4,[ebx-48]
	por	mm0,mm6
	movq	[edi-96],mm2
	psrlq	mm2,4
	movq	[32+edi],mm5
	movq	mm7,mm5
	movq	[ebp-104],mm0
	psrlq	mm5,4
	movq	[24+ebp],mm3
	shl	edx,4
	mov	BYTE [5+esp],dl
	mov	edx,DWORD [ebx-24]
	movq	mm0,[ebx-24]
	psllq	mm7,60
	movq	mm3,[ebx-32]
	por	mm2,mm7
	movq	[edi-88],mm1
	psrlq	mm1,4
	movq	[40+edi],mm4
	movq	mm6,mm4
	movq	[ebp-96],mm2
	psrlq	mm4,4
	movq	[32+ebp],mm5
	shl	edx,4
	mov	BYTE [6+esp],dl
	mov	edx,DWORD [ebx-8]
	movq	mm2,[ebx-8]
	psllq	mm6,60
	movq	mm5,[ebx-16]
	por	mm1,mm6
	movq	[edi-80],mm0
	psrlq	mm0,4
	movq	[48+edi],mm3
	movq	mm7,mm3
	movq	[ebp-88],mm1
	psrlq	mm3,4
	movq	[40+ebp],mm4
	shl	edx,4
	mov	BYTE [7+esp],dl
	mov	edx,DWORD [8+ebx]
	movq	mm1,[8+ebx]
	psllq	mm7,60
	movq	mm4,[ebx]
	por	mm0,mm7
	movq	[edi-72],mm2
	psrlq	mm2,4
	movq	[56+edi],mm5
	movq	mm6,mm5
	movq	[ebp-80],mm0
	psrlq	mm5,4
	movq	[48+ebp],mm3
	shl	edx,4
	mov	BYTE [8+esp],dl
	mov	edx,DWORD [24+ebx]
	movq	mm0,[24+ebx]
	psllq	mm6,60
	movq	mm3,[16+ebx]
	por	mm2,mm6
	movq	[edi-64],mm1
	psrlq	mm1,4
	movq	[64+edi],mm4
	movq	mm7,mm4
	movq	[ebp-72],mm2
	psrlq	mm4,4
	movq	[56+ebp],mm5
	shl	edx,4
	mov	BYTE [9+esp],dl
	mov	edx,DWORD [40+ebx]
	movq	mm2,[40+ebx]
	psllq	mm7,60
	movq	mm5,[32+ebx]
	por	mm1,mm7
	movq	[edi-56],mm0
	psrlq	mm0,4
	movq	[72+edi],mm3
	movq	mm6,mm3
	movq	[ebp-64],mm1
	psrlq	mm3,4
	movq	[64+ebp],mm4
	shl	edx,4
	mov	BYTE [10+esp],dl
	mov	edx,DWORD [56+ebx]
	movq	mm1,[56+ebx]
	psllq	mm6,60
	movq	mm4,[48+ebx]
	por	mm0,mm6
	movq	[edi-48],mm2
	psrlq	mm2,4
	movq	[80+edi],mm5
	movq	mm7,mm5
	movq	[ebp-56],mm0
	psrlq	mm5,4
	movq	[72+ebp],mm3
	shl	edx,4
	mov	BYTE [11+esp],dl
	mov	edx,DWORD [72+ebx]
	movq	mm0,[72+ebx]
	psllq	mm7,60
	movq	mm3,[64+ebx]
	por	mm2,mm7
	movq	[edi-40],mm1
	psrlq	mm1,4
	movq	[88+edi],mm4
	movq	mm6,mm4
	movq	[ebp-48],mm2
	psrlq	mm4,4
	movq	[80+ebp],mm5
	shl	edx,4
	mov	BYTE [12+esp],dl
	mov	edx,DWORD [88+ebx]
	movq	mm2,[88+ebx]
	psllq	mm6,60
	movq	mm5,[80+ebx]
	por	mm1,mm6
	movq	[edi-32],mm0
	psrlq	mm0,4
	movq	[96+edi],mm3
	movq	mm7,mm3
	movq	[ebp-40],mm1
	psrlq	mm3,4
	movq	[88+ebp],mm4
	shl	edx,4
	mov	BYTE [13+esp],dl
	mov	edx,DWORD [104+ebx]
	movq	mm1,[104+ebx]
	psllq	mm7,60
	movq	mm4,[96+ebx]
	por	mm0,mm7
	movq	[edi-24],mm2
	psrlq	mm2,4
	movq	[104+edi],mm5
	movq	mm6,mm5
	movq	[ebp-32],mm0
	psrlq	mm5,4
	movq	[96+ebp],mm3
	shl	edx,4
	mov	BYTE [14+esp],dl
	mov	edx,DWORD [120+ebx]
	movq	mm0,[120+ebx]
	psllq	mm6,60
	movq	mm3,[112+ebx]
	por	mm2,mm6
	movq	[edi-16],mm1
	psrlq	mm1,4
	movq	[112+edi],mm4
	movq	mm7,mm4
	movq	[ebp-24],mm2
	psrlq	mm4,4
	movq	[104+ebp],mm5
	shl	edx,4
	mov	BYTE [15+esp],dl
	psllq	mm7,60
	por	mm1,mm7
	movq	[edi-8],mm0
	psrlq	mm0,4
	movq	[120+edi],mm3
	movq	mm6,mm3
	movq	[ebp-16],mm1
	psrlq	mm3,4
	movq	[112+ebp],mm4
	psllq	mm6,60
	por	mm0,mm6
	movq	[ebp-8],mm0
	movq	[120+ebp],mm3
	; ---- Load the state: mm6 = bytes 0..7, ebx = bytes 8..11,
	; ---- edx = bytes 12..15.
	movq	mm6,[eax]
	mov	ebx,DWORD [8+eax]
	mov	edx,DWORD [12+eax]
align	16
L$004outer:
	; XOR the next input block into the state and park bytes 0..11 in
	; the frame; bytes 12..15 stay in edx and are consumed first.
	xor	edx,DWORD [12+ecx]
	xor	ebx,DWORD [8+ecx]
	pxor	mm6,[ecx]
	lea	ecx,[16+ecx]
	mov	DWORD [536+esp],ebx
	movq	[528+esp],mm6
	mov	DWORD [548+esp],ecx
	; Each step below takes one byte from the top of edx (ROL 8, then
	; AL), splits it into low nibble (eax) and high nibble (ebp or edi,
	; alternating), shifts the accumulator mm7:mm6 right by 8 and XORs
	; in the matching table entries.  The byte shifted out, combined
	; with a nibble-table byte, indexes L$rem_8bit; that word is placed
	; in mm0/mm1/mm2 (rotating) and XORed into mm6 two steps later.
	xor	eax,eax
	rol	edx,8
	mov	al,dl	; first byte handled is block byte 15
	mov	ebp,eax
	and	al,15
	shr	ebp,4
	pxor	mm0,mm0
	rol	edx,8
	pxor	mm1,mm1
	pxor	mm2,mm2
	movq	mm7,[16+eax*8+esp]
	movq	mm6,[144+eax*8+esp]
	mov	al,dl
	movd	ebx,mm7
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	shr	edi,4
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [536+esp]	; next dword: block bytes 8..11
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	ebp,4
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	edi,4
	pinsrw	mm2,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	ebp,4
	pinsrw	mm1,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [532+esp]	; next dword: block bytes 4..7
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	edi,4
	pinsrw	mm0,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	ebp,4
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	mov	edx,DWORD [528+esp]	; next dword: block bytes 0..3
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	edi,4
	pinsrw	mm2,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	ebp,4
	pinsrw	mm1,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm1
	shr	edi,4
	pinsrw	mm0,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	mov	al,dl
	movd	ecx,mm7
	movzx	ebx,bl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	ebp,eax
	psrlq	mm6,8
	pxor	mm7,[272+edi*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm0
	shr	ebp,4
	pinsrw	mm2,WORD [ebx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	rol	edx,8
	pxor	mm6,[144+eax*8+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+edi*8+esp]
	xor	cl,BYTE [edi*1+esp]
	mov	al,dl	; last byte handled is block byte 0
	; The value loaded here is never used: edx is overwritten by the
	; MOVD below before it is read again.  Kept as-is to leave the
	; generated instruction stream untouched.
	mov	edx,DWORD [524+esp]
	movd	ebx,mm7
	movzx	ecx,cl
	psrlq	mm7,8
	movq	mm3,mm6
	mov	edi,eax
	psrlq	mm6,8
	pxor	mm7,[272+ebp*8+esp]
	and	al,15
	psllq	mm3,56
	pxor	mm6,mm2
	shr	edi,4
	pinsrw	mm1,WORD [ecx*2+esi],2
	pxor	mm7,[16+eax*8+esp]
	pxor	mm6,[144+eax*8+esp]
	xor	bl,BYTE [ebp*1+esp]
	pxor	mm7,mm3
	pxor	mm6,[400+ebp*8+esp]
	movzx	ebx,bl
	; ---- Tail: the final nibble uses a 4-bit shift, and the
	; ---- outstanding L$rem_8bit words are shifted into position
	; ---- before being folded into mm6.
	pxor	mm2,mm2
	psllq	mm1,4
	movd	ecx,mm7
	psrlq	mm7,4
	movq	mm3,mm6
	psrlq	mm6,4
	shl	ecx,4
	pxor	mm7,[16+edi*8+esp]
	psllq	mm3,60
	movzx	ecx,cl
	pxor	mm7,mm3
	pxor	mm6,[144+edi*8+esp]
	pinsrw	mm0,WORD [ebx*2+esi],2
	pxor	mm6,mm1
	movd	edx,mm7
	pinsrw	mm2,WORD [ecx*2+esi],3
	psllq	mm0,12
	pxor	mm6,mm0
	psrlq	mm7,32
	pxor	mm6,mm2
	mov	ecx,DWORD [548+esp]	; reload the input pointer
	movd	ebx,mm7
	; Reverse the 8 bytes of mm6: swap the bytes inside each word, then
	; reverse the word order (PSHUFW immediate 27 = 00 01 10 11b).
	movq	mm3,mm6
	psllw	mm6,8
	psrlw	mm3,8
	por	mm6,mm3
	bswap	edx
	pshufw	mm6,mm6,27
	bswap	ebx
	cmp	ecx,DWORD [552+esp]	; reached the end pointer?
	jne	NEAR L$004outer
	; ---- Write the state back and tear down the frame.
	mov	eax,DWORD [544+esp]
	mov	DWORD [12+eax],edx
	mov	DWORD [8+eax],ebx
	movq	[eax],mm6
	mov	esp,DWORD [556+esp]
	emms
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; ---------------------------------------------------------------------
; _gcm_init_clmul
;
; Builds the 48-byte key table consumed by _gcm_gmult_clmul and
; _gcm_ghash_clmul from a 16-byte input value, using carry-less
; multiplication.
;
; ABI:   32-bit, arguments on the stack; no callee-saved register is
;        touched.
; In:    arg0 [esp+4] -> output table, 48 bytes written
;        arg1 [esp+8] -> 16-byte input value
;        (presumably Htable and the hash key H - names not visible here)
; Out:   [arg0+0]   adjusted key: qwords swapped, shifted left by one
;                   bit, conditionally XORed with the L$bswap+16 constant
;        [arg0+16]  the adjusted key multiplied by itself and reduced
;        [arg0+32]  (hi^lo) folds of both values, packed by PALIGNR
; Clobb: eax, ecx, edx, xmm0-xmm5, flags.
;
; PCLMULQDQ and PALIGNR are emitted as raw bytes (db); the decoded
; instruction is given next to each one.
; ---------------------------------------------------------------------
global	_gcm_init_clmul
align	16
_gcm_init_clmul:
L$_gcm_init_clmul_begin:
	mov	edx,DWORD [4+esp]	; arg0, output table
	mov	eax,DWORD [8+esp]	; arg1, input value
	; Position-independent address of L$bswap.
	call	L$005pic
L$005pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$005pic)+ecx]
	movdqu	xmm2,[eax]
	pshufd	xmm2,xmm2,78	; swap the two qwords
	; 128-bit shift left by one; if the bit shifted out of the top was
	; set, XOR in the constant stored at L$bswap+16.
	pshufd	xmm4,xmm2,255	; broadcast the top dword
	movdqa	xmm3,xmm2
	psllq	xmm2,1
	pxor	xmm5,xmm5
	psrlq	xmm3,63
	pcmpgtd	xmm5,xmm4	; all-ones mask when the top dword is negative
	pslldq	xmm3,8	; carry from the low qword into the high qword
	por	xmm2,xmm3
	pand	xmm5,[16+ecx]
	pxor	xmm2,xmm5
	; Multiply the adjusted key by itself (three PCLMULQDQs, Karatsuba).
	movdqa	xmm0,xmm2
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pshufd	xmm4,xmm2,78
	pxor	xmm3,xmm0
	pxor	xmm4,xmm2
db	102,15,58,68,194,0	; pclmulqdq xmm0,xmm2,0x00  (lo * lo)
db	102,15,58,68,202,17	; pclmulqdq xmm1,xmm2,0x11  (hi * hi)
db	102,15,58,68,220,0	; pclmulqdq xmm3,xmm4,0x00  ((hi^lo) * (hi^lo))
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	; Reduce the 256-bit product xmm1:xmm0 to 128 bits in xmm0.
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
	; Store both values plus their packed (hi^lo) folds.
	pshufd	xmm3,xmm2,78
	pshufd	xmm4,xmm0,78
	pxor	xmm3,xmm2
	movdqu	[edx],xmm2
	pxor	xmm4,xmm0
	movdqu	[16+edx],xmm0
db	102,15,58,15,227,8	; palignr xmm4,xmm3,8
	movdqu	[32+edx],xmm4
	ret
; ---------------------------------------------------------------------
; _gcm_gmult_clmul
;
; GHASH multiplication step, done in place on one 16-byte block, using
; carry-less multiplication and the table written by _gcm_init_clmul.
;
; ABI:   32-bit, arguments on the stack; no callee-saved register is
;        touched.
; In:    arg0 [esp+4] -> 16-byte block; read, then overwritten
;        arg1 [esp+8] -> key table; [arg1+0] and [arg1+32] are read
; Out:   block at arg0 replaced by the reduced product.
; Clobb: eax, ecx, edx, xmm0-xmm5, flags.
;
; PSHUFB and PCLMULQDQ are emitted as raw bytes (db); the decoded
; instruction is given next to each one.
; ---------------------------------------------------------------------
global	_gcm_gmult_clmul
align	16
_gcm_gmult_clmul:
L$_gcm_gmult_clmul_begin:
	mov	eax,DWORD [4+esp]	; arg0, block
	mov	edx,DWORD [8+esp]	; arg1, key table
	; Position-independent address of L$bswap.
	call	L$006pic
L$006pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$006pic)+ecx]
	movdqu	xmm0,[eax]
	movdqa	xmm5,[ecx]	; byte-reversal shuffle mask
	movups	xmm2,[edx]
db	102,15,56,0,197	; pshufb xmm0,xmm5  (reverse the block's byte order)
	movups	xmm4,[32+edx]	; packed (hi^lo) folds; low qword is used below
	; Multiply xmm0 by the key in xmm2 (three PCLMULQDQs, Karatsuba).
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pxor	xmm3,xmm0
db	102,15,58,68,194,0	; pclmulqdq xmm0,xmm2,0x00  (lo * lo)
db	102,15,58,68,202,17	; pclmulqdq xmm1,xmm2,0x11  (hi * hi)
db	102,15,58,68,220,0	; pclmulqdq xmm3,xmm4,0x00  (fold * fold)
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	; Reduce the 256-bit product xmm1:xmm0 to 128 bits in xmm0.
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
db	102,15,56,0,197	; pshufb xmm0,xmm5  (restore the byte order)
	movdqu	[eax],xmm0
	ret
; ---------------------------------------------------------------------
; _gcm_ghash_clmul
;
; GHASH over a run of 16-byte input blocks using carry-less
; multiplication and the table written by _gcm_init_clmul.  Blocks are
; consumed two at a time where possible, with a single-block tail.
;
; ABI:   32-bit, arguments on the stack (read at [esp+4]..[esp+16] on
;        entry).
; In:    arg0 -> 16-byte state; read at entry, written at exit
;        arg1 -> key table; offsets +0, +16 and +32 are read
;        arg2 -> input data
;        arg3 =  input length in bytes
; Out:   state at arg0 updated.
; Saves: ebp, ebx, esi, edi.
; Clobb: eax, ecx, edx, xmm0-xmm7, flags.
;
; Control flow on the length (ebx):
;   len == 16            -> L$008odd_tail only
;   otherwise            -> load two blocks, then loop in L$010mod_loop
;                           while more than 32 bytes remain
;   L$009even_tail       -> finish the pending pair; if exactly one
;                           block is left (ebx == 0) fall into
;                           L$008odd_tail, else go to L$011done
; NOTE(review): the counting assumes a non-zero multiple of 16 - confirm
; against callers.
;
; PSHUFB and PCLMULQDQ are emitted as raw bytes (db); the decoded
; instruction is given next to each one.
; ---------------------------------------------------------------------
global	_gcm_ghash_clmul
align	16
_gcm_ghash_clmul:
L$_gcm_ghash_clmul_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	eax,DWORD [20+esp]	; arg0, state
	mov	edx,DWORD [24+esp]	; arg1, key table
	mov	esi,DWORD [28+esp]	; arg2, input
	mov	ebx,DWORD [32+esp]	; arg3, length
	; Position-independent address of L$bswap.
	call	L$007pic
L$007pic:
	pop	ecx
	lea	ecx,[(L$bswap-L$007pic)+ecx]
	movdqu	xmm0,[eax]
	movdqa	xmm5,[ecx]	; byte-reversal shuffle mask
	movdqu	xmm2,[edx]
db	102,15,56,0,197	; pshufb xmm0,xmm5
	sub	ebx,16
	jz	NEAR L$008odd_tail	; exactly one block
	; ---- Two or more blocks: start on the first pair.
	movdqu	xmm3,[esi]
	movdqu	xmm6,[16+esi]
db	102,15,56,0,221	; pshufb xmm3,xmm5
db	102,15,56,0,245	; pshufb xmm6,xmm5
	movdqu	xmm5,[32+edx]	; packed (hi^lo) folds from the table
	pxor	xmm0,xmm3	; state ^= first block
	pshufd	xmm3,xmm6,78
	movdqa	xmm7,xmm6
	pxor	xmm3,xmm6
	lea	esi,[32+esi]
db	102,15,58,68,242,0	; pclmulqdq xmm6,xmm2,0x00
db	102,15,58,68,250,17	; pclmulqdq xmm7,xmm2,0x11
db	102,15,58,68,221,0	; pclmulqdq xmm3,xmm5,0x00
	movups	xmm2,[16+edx]	; second table entry, used for the state
	nop
	sub	ebx,32
	jbe	NEAR L$009even_tail
	jmp	NEAR L$010mod_loop
align	32
L$010mod_loop:
	; Multiply the state by [edx+16], combine with the pending second
	; block's product (xmm6/xmm7/xmm3), reduce, and meanwhile start the
	; products for the next pair of input blocks.
	pshufd	xmm4,xmm0,78
	movdqa	xmm1,xmm0
	pxor	xmm4,xmm0
	nop
db	102,15,58,68,194,0	; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17	; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,229,16	; pclmulqdq xmm4,xmm5,0x10
	movups	xmm2,[edx]
	xorps	xmm0,xmm6
	movdqa	xmm5,[ecx]	; shuffle mask again (xmm5 held the folds)
	xorps	xmm1,xmm7
	movdqu	xmm7,[esi]
	pxor	xmm3,xmm0
	movdqu	xmm6,[16+esi]
	pxor	xmm3,xmm1
db	102,15,56,0,253	; pshufb xmm7,xmm5
	pxor	xmm4,xmm3
	movdqa	xmm3,xmm4
	psrldq	xmm4,8
	pslldq	xmm3,8
	pxor	xmm1,xmm4
	pxor	xmm0,xmm3
db	102,15,56,0,245	; pshufb xmm6,xmm5
	pxor	xmm1,xmm7	; fold the next first block into the high half
	movdqa	xmm7,xmm6
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
db	102,15,58,68,242,0	; pclmulqdq xmm6,xmm2,0x00
	movups	xmm5,[32+edx]
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	pshufd	xmm3,xmm7,78
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm3,xmm7
	pxor	xmm1,xmm4
db	102,15,58,68,250,17	; pclmulqdq xmm7,xmm2,0x11
	movups	xmm2,[16+edx]
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
db	102,15,58,68,221,0	; pclmulqdq xmm3,xmm5,0x00
	lea	esi,[32+esi]
	sub	ebx,32
	ja	NEAR L$010mod_loop
L$009even_tail:
	; Finish the pending pair without loading further input.
	pshufd	xmm4,xmm0,78
	movdqa	xmm1,xmm0
	pxor	xmm4,xmm0
db	102,15,58,68,194,0	; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17	; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,229,16	; pclmulqdq xmm4,xmm5,0x10
	movdqa	xmm5,[ecx]
	xorps	xmm0,xmm6
	xorps	xmm1,xmm7
	pxor	xmm3,xmm0
	pxor	xmm3,xmm1
	pxor	xmm4,xmm3
	movdqa	xmm3,xmm4
	psrldq	xmm4,8
	pslldq	xmm3,8
	pxor	xmm1,xmm4
	pxor	xmm0,xmm3
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
	test	ebx,ebx
	jnz	NEAR L$011done	; non-zero here means no block is left
	movups	xmm2,[edx]	; one block left: reload the first table entry
L$008odd_tail:
	; Single block: state = (state ^ block) * [edx], reduced.
	movdqu	xmm3,[esi]
db	102,15,56,0,221	; pshufb xmm3,xmm5
	pxor	xmm0,xmm3
	movdqa	xmm1,xmm0
	pshufd	xmm3,xmm0,78
	pshufd	xmm4,xmm2,78
	pxor	xmm3,xmm0
	pxor	xmm4,xmm2
db	102,15,58,68,194,0	; pclmulqdq xmm0,xmm2,0x00
db	102,15,58,68,202,17	; pclmulqdq xmm1,xmm2,0x11
db	102,15,58,68,220,0	; pclmulqdq xmm3,xmm4,0x00
	xorps	xmm3,xmm0
	xorps	xmm3,xmm1
	movdqa	xmm4,xmm3
	psrldq	xmm3,8
	pslldq	xmm4,8
	pxor	xmm1,xmm3
	pxor	xmm0,xmm4
	movdqa	xmm4,xmm0
	movdqa	xmm3,xmm0
	psllq	xmm0,5
	pxor	xmm3,xmm0
	psllq	xmm0,1
	pxor	xmm0,xmm3
	psllq	xmm0,57
	movdqa	xmm3,xmm0
	pslldq	xmm0,8
	psrldq	xmm3,8
	pxor	xmm0,xmm4
	pxor	xmm1,xmm3
	movdqa	xmm4,xmm0
	psrlq	xmm0,1
	pxor	xmm1,xmm4
	pxor	xmm4,xmm0
	psrlq	xmm0,5
	pxor	xmm0,xmm4
	psrlq	xmm0,1
	pxor	xmm0,xmm1
L$011done:
db	102,15,56,0,197	; pshufb xmm0,xmm5  (restore the byte order)
	movdqu	[eax],xmm0
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
; ---------------------------------------------------------------------
; Read-only constants.  They live in the code section so the functions
; above can reach them position-independently (call/pop plus a label
; difference).
; ---------------------------------------------------------------------
align	64
; L$bswap+0 : PSHUFB mask that reverses the 16 bytes of an XMM register.
; L$bswap+16: constant ANDed with the carry mask in _gcm_init_clmul
;             (low byte 1, top byte 194 = 0xC2).
L$bswap:
db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
align	64
; L$rem_8bit: 256 words, indexed by a byte value (index*2) from
; _gcm_ghash_4bit_mmx.
L$rem_8bit:
dw	0,450,900,582,1800,1738,1164,1358
dw	3600,4050,3476,3158,2328,2266,2716,2910
dw	7200,7650,8100,7782,6952,6890,6316,6510
dw	4656,5106,4532,4214,5432,5370,5820,6014
dw	14400,14722,15300,14854,16200,16010,15564,15630
dw	13904,14226,13780,13334,12632,12442,13020,13086
dw	9312,9634,10212,9766,9064,8874,8428,8494
dw	10864,11186,10740,10294,11640,11450,12028,12094
dw	28800,28994,29444,29382,30600,30282,29708,30158
dw	32400,32594,32020,31958,31128,30810,31260,31710
dw	27808,28002,28452,28390,27560,27242,26668,27118
dw	25264,25458,24884,24822,26040,25722,26172,26622
dw	18624,18690,19268,19078,20424,19978,19532,19854
dw	18128,18194,17748,17558,16856,16410,16988,17310
dw	21728,21794,22372,22182,21480,21034,20588,20910
dw	23280,23346,22900,22710,24056,23610,24188,24510
dw	57600,57538,57988,58182,58888,59338,58764,58446
dw	61200,61138,60564,60758,59416,59866,60316,59998
dw	64800,64738,65188,65382,64040,64490,63916,63598
dw	62256,62194,61620,61814,62520,62970,63420,63102
dw	55616,55426,56004,56070,56904,57226,56780,56334
dw	55120,54930,54484,54550,53336,53658,54236,53790
dw	50528,50338,50916,50982,49768,50090,49644,49198
dw	52080,51890,51444,51510,52344,52666,53244,52798
dw	37248,36930,37380,37830,38536,38730,38156,38094
dw	40848,40530,39956,40406,39064,39258,39708,39646
dw	36256,35938,36388,36838,35496,35690,35116,35054
dw	33712,33394,32820,33270,33976,34170,34620,34558
dw	43456,43010,43588,43910,44744,44810,44364,44174
dw	42960,42514,42068,42390,41176,41242,41820,41630
dw	46560,46114,46692,47014,45800,45866,45420,45230
dw	48112,47666,47220,47542,48376,48442,49020,48830
align	64
; L$rem_4bit: 16 qwords (each written as two dwords, low dword zero),
; indexed by a 4-bit value (index*8) from _gcm_gmult_4bit_mmx.
L$rem_4bit:
dd	0,0,0,471859200,0,943718400,0,610271232
dd	0,1887436800,0,1822425088,0,1220542464,0,1423966208
dd	0,3774873600,0,4246732800,0,3644850176,0,3311403008
dd	0,2441084928,0,2376073216,0,2847932416,0,3051356160
; NUL-terminated ASCII identification string:
; "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>"
db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
db	0