1#if defined(__i386__)
2.file	"src/crypto/bn/asm/x86-mont.S"
3.text
4.globl	_bn_mul_mont
5.private_extern	_bn_mul_mont
6.align	4
7_bn_mul_mont:
8L_bn_mul_mont_begin:
9	pushl	%ebp
10	pushl	%ebx
11	pushl	%esi
12	pushl	%edi
13	xorl	%eax,%eax
14	movl	40(%esp),%edi
15	cmpl	$4,%edi
16	jl	L000just_leave
17	leal	20(%esp),%esi
18	leal	24(%esp),%edx
19	movl	%esp,%ebp
20	addl	$2,%edi
21	negl	%edi
22	leal	-32(%esp,%edi,4),%esp
23	negl	%edi
24	movl	%esp,%eax
25	subl	%edx,%eax
26	andl	$2047,%eax
27	subl	%eax,%esp
28	xorl	%esp,%edx
29	andl	$2048,%edx
30	xorl	$2048,%edx
31	subl	%edx,%esp
32	andl	$-64,%esp
33	movl	(%esi),%eax
34	movl	4(%esi),%ebx
35	movl	8(%esi),%ecx
36	movl	12(%esi),%edx
37	movl	16(%esi),%esi
38	movl	(%esi),%esi
39	movl	%eax,4(%esp)
40	movl	%ebx,8(%esp)
41	movl	%ecx,12(%esp)
42	movl	%edx,16(%esp)
43	movl	%esi,20(%esp)
44	leal	-3(%edi),%ebx
45	movl	%ebp,24(%esp)
46	call	L001PIC_me_up
47L001PIC_me_up:
48	popl	%eax
49	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax
50	btl	$26,(%eax)
51	jnc	L002non_sse2
52	movl	$-1,%eax
53	movd	%eax,%mm7
54	movl	8(%esp),%esi
55	movl	12(%esp),%edi
56	movl	16(%esp),%ebp
57	xorl	%edx,%edx
58	xorl	%ecx,%ecx
59	movd	(%edi),%mm4
60	movd	(%esi),%mm5
61	movd	(%ebp),%mm3
62	pmuludq	%mm4,%mm5
63	movq	%mm5,%mm2
64	movq	%mm5,%mm0
65	pand	%mm7,%mm0
66	pmuludq	20(%esp),%mm5
67	pmuludq	%mm5,%mm3
68	paddq	%mm0,%mm3
69	movd	4(%ebp),%mm1
70	movd	4(%esi),%mm0
71	psrlq	$32,%mm2
72	psrlq	$32,%mm3
73	incl	%ecx
74.align	4,0x90
75L0031st:
76	pmuludq	%mm4,%mm0
77	pmuludq	%mm5,%mm1
78	paddq	%mm0,%mm2
79	paddq	%mm1,%mm3
80	movq	%mm2,%mm0
81	pand	%mm7,%mm0
82	movd	4(%ebp,%ecx,4),%mm1
83	paddq	%mm0,%mm3
84	movd	4(%esi,%ecx,4),%mm0
85	psrlq	$32,%mm2
86	movd	%mm3,28(%esp,%ecx,4)
87	psrlq	$32,%mm3
88	leal	1(%ecx),%ecx
89	cmpl	%ebx,%ecx
90	jl	L0031st
91	pmuludq	%mm4,%mm0
92	pmuludq	%mm5,%mm1
93	paddq	%mm0,%mm2
94	paddq	%mm1,%mm3
95	movq	%mm2,%mm0
96	pand	%mm7,%mm0
97	paddq	%mm0,%mm3
98	movd	%mm3,28(%esp,%ecx,4)
99	psrlq	$32,%mm2
100	psrlq	$32,%mm3
101	paddq	%mm2,%mm3
102	movq	%mm3,32(%esp,%ebx,4)
103	incl	%edx
104L004outer:
105	xorl	%ecx,%ecx
106	movd	(%edi,%edx,4),%mm4
107	movd	(%esi),%mm5
108	movd	32(%esp),%mm6
109	movd	(%ebp),%mm3
110	pmuludq	%mm4,%mm5
111	paddq	%mm6,%mm5
112	movq	%mm5,%mm0
113	movq	%mm5,%mm2
114	pand	%mm7,%mm0
115	pmuludq	20(%esp),%mm5
116	pmuludq	%mm5,%mm3
117	paddq	%mm0,%mm3
118	movd	36(%esp),%mm6
119	movd	4(%ebp),%mm1
120	movd	4(%esi),%mm0
121	psrlq	$32,%mm2
122	psrlq	$32,%mm3
123	paddq	%mm6,%mm2
124	incl	%ecx
125	decl	%ebx
126L005inner:
127	pmuludq	%mm4,%mm0
128	pmuludq	%mm5,%mm1
129	paddq	%mm0,%mm2
130	paddq	%mm1,%mm3
131	movq	%mm2,%mm0
132	movd	36(%esp,%ecx,4),%mm6
133	pand	%mm7,%mm0
134	movd	4(%ebp,%ecx,4),%mm1
135	paddq	%mm0,%mm3
136	movd	4(%esi,%ecx,4),%mm0
137	psrlq	$32,%mm2
138	movd	%mm3,28(%esp,%ecx,4)
139	psrlq	$32,%mm3
140	paddq	%mm6,%mm2
141	decl	%ebx
142	leal	1(%ecx),%ecx
143	jnz	L005inner
144	movl	%ecx,%ebx
145	pmuludq	%mm4,%mm0
146	pmuludq	%mm5,%mm1
147	paddq	%mm0,%mm2
148	paddq	%mm1,%mm3
149	movq	%mm2,%mm0
150	pand	%mm7,%mm0
151	paddq	%mm0,%mm3
152	movd	%mm3,28(%esp,%ecx,4)
153	psrlq	$32,%mm2
154	psrlq	$32,%mm3
155	movd	36(%esp,%ebx,4),%mm6
156	paddq	%mm2,%mm3
157	paddq	%mm6,%mm3
158	movq	%mm3,32(%esp,%ebx,4)
159	leal	1(%edx),%edx
160	cmpl	%ebx,%edx
161	jle	L004outer
162	emms
163	jmp	L006common_tail
164.align	4,0x90
165L002non_sse2:
166	movl	8(%esp),%esi
167	leal	1(%ebx),%ebp
168	movl	12(%esp),%edi
169	xorl	%ecx,%ecx
170	movl	%esi,%edx
171	andl	$1,%ebp
172	subl	%edi,%edx
173	leal	4(%edi,%ebx,4),%eax
174	orl	%edx,%ebp
175	movl	(%edi),%edi
176	jz	L007bn_sqr_mont
177	movl	%eax,28(%esp)
178	movl	(%esi),%eax
179	xorl	%edx,%edx
180.align	4,0x90
181L008mull:
182	movl	%edx,%ebp
183	mull	%edi
184	addl	%eax,%ebp
185	leal	1(%ecx),%ecx
186	adcl	$0,%edx
187	movl	(%esi,%ecx,4),%eax
188	cmpl	%ebx,%ecx
189	movl	%ebp,28(%esp,%ecx,4)
190	jl	L008mull
191	movl	%edx,%ebp
192	mull	%edi
193	movl	20(%esp),%edi
194	addl	%ebp,%eax
195	movl	16(%esp),%esi
196	adcl	$0,%edx
197	imull	32(%esp),%edi
198	movl	%eax,32(%esp,%ebx,4)
199	xorl	%ecx,%ecx
200	movl	%edx,36(%esp,%ebx,4)
201	movl	%ecx,40(%esp,%ebx,4)
202	movl	(%esi),%eax
203	mull	%edi
204	addl	32(%esp),%eax
205	movl	4(%esi),%eax
206	adcl	$0,%edx
207	incl	%ecx
208	jmp	L0092ndmadd
209.align	4,0x90
210L0101stmadd:
211	movl	%edx,%ebp
212	mull	%edi
213	addl	32(%esp,%ecx,4),%ebp
214	leal	1(%ecx),%ecx
215	adcl	$0,%edx
216	addl	%eax,%ebp
217	movl	(%esi,%ecx,4),%eax
218	adcl	$0,%edx
219	cmpl	%ebx,%ecx
220	movl	%ebp,28(%esp,%ecx,4)
221	jl	L0101stmadd
222	movl	%edx,%ebp
223	mull	%edi
224	addl	32(%esp,%ebx,4),%eax
225	movl	20(%esp),%edi
226	adcl	$0,%edx
227	movl	16(%esp),%esi
228	addl	%eax,%ebp
229	adcl	$0,%edx
230	imull	32(%esp),%edi
231	xorl	%ecx,%ecx
232	addl	36(%esp,%ebx,4),%edx
233	movl	%ebp,32(%esp,%ebx,4)
234	adcl	$0,%ecx
235	movl	(%esi),%eax
236	movl	%edx,36(%esp,%ebx,4)
237	movl	%ecx,40(%esp,%ebx,4)
238	mull	%edi
239	addl	32(%esp),%eax
240	movl	4(%esi),%eax
241	adcl	$0,%edx
242	movl	$1,%ecx
243.align	4,0x90
244L0092ndmadd:
245	movl	%edx,%ebp
246	mull	%edi
247	addl	32(%esp,%ecx,4),%ebp
248	leal	1(%ecx),%ecx
249	adcl	$0,%edx
250	addl	%eax,%ebp
251	movl	(%esi,%ecx,4),%eax
252	adcl	$0,%edx
253	cmpl	%ebx,%ecx
254	movl	%ebp,24(%esp,%ecx,4)
255	jl	L0092ndmadd
256	movl	%edx,%ebp
257	mull	%edi
258	addl	32(%esp,%ebx,4),%ebp
259	adcl	$0,%edx
260	addl	%eax,%ebp
261	adcl	$0,%edx
262	movl	%ebp,28(%esp,%ebx,4)
263	xorl	%eax,%eax
264	movl	12(%esp),%ecx
265	addl	36(%esp,%ebx,4),%edx
266	adcl	40(%esp,%ebx,4),%eax
267	leal	4(%ecx),%ecx
268	movl	%edx,32(%esp,%ebx,4)
269	cmpl	28(%esp),%ecx
270	movl	%eax,36(%esp,%ebx,4)
271	je	L006common_tail
272	movl	(%ecx),%edi
273	movl	8(%esp),%esi
274	movl	%ecx,12(%esp)
275	xorl	%ecx,%ecx
276	xorl	%edx,%edx
277	movl	(%esi),%eax
278	jmp	L0101stmadd
279.align	4,0x90
280L007bn_sqr_mont:
281	movl	%ebx,(%esp)
282	movl	%ecx,12(%esp)
283	movl	%edi,%eax
284	mull	%edi
285	movl	%eax,32(%esp)
286	movl	%edx,%ebx
287	shrl	$1,%edx
288	andl	$1,%ebx
289	incl	%ecx
290.align	4,0x90
291L011sqr:
292	movl	(%esi,%ecx,4),%eax
293	movl	%edx,%ebp
294	mull	%edi
295	addl	%ebp,%eax
296	leal	1(%ecx),%ecx
297	adcl	$0,%edx
298	leal	(%ebx,%eax,2),%ebp
299	shrl	$31,%eax
300	cmpl	(%esp),%ecx
301	movl	%eax,%ebx
302	movl	%ebp,28(%esp,%ecx,4)
303	jl	L011sqr
304	movl	(%esi,%ecx,4),%eax
305	movl	%edx,%ebp
306	mull	%edi
307	addl	%ebp,%eax
308	movl	20(%esp),%edi
309	adcl	$0,%edx
310	movl	16(%esp),%esi
311	leal	(%ebx,%eax,2),%ebp
312	imull	32(%esp),%edi
313	shrl	$31,%eax
314	movl	%ebp,32(%esp,%ecx,4)
315	leal	(%eax,%edx,2),%ebp
316	movl	(%esi),%eax
317	shrl	$31,%edx
318	movl	%ebp,36(%esp,%ecx,4)
319	movl	%edx,40(%esp,%ecx,4)
320	mull	%edi
321	addl	32(%esp),%eax
322	movl	%ecx,%ebx
323	adcl	$0,%edx
324	movl	4(%esi),%eax
325	movl	$1,%ecx
326.align	4,0x90
327L0123rdmadd:
328	movl	%edx,%ebp
329	mull	%edi
330	addl	32(%esp,%ecx,4),%ebp
331	adcl	$0,%edx
332	addl	%eax,%ebp
333	movl	4(%esi,%ecx,4),%eax
334	adcl	$0,%edx
335	movl	%ebp,28(%esp,%ecx,4)
336	movl	%edx,%ebp
337	mull	%edi
338	addl	36(%esp,%ecx,4),%ebp
339	leal	2(%ecx),%ecx
340	adcl	$0,%edx
341	addl	%eax,%ebp
342	movl	(%esi,%ecx,4),%eax
343	adcl	$0,%edx
344	cmpl	%ebx,%ecx
345	movl	%ebp,24(%esp,%ecx,4)
346	jl	L0123rdmadd
347	movl	%edx,%ebp
348	mull	%edi
349	addl	32(%esp,%ebx,4),%ebp
350	adcl	$0,%edx
351	addl	%eax,%ebp
352	adcl	$0,%edx
353	movl	%ebp,28(%esp,%ebx,4)
354	movl	12(%esp),%ecx
355	xorl	%eax,%eax
356	movl	8(%esp),%esi
357	addl	36(%esp,%ebx,4),%edx
358	adcl	40(%esp,%ebx,4),%eax
359	movl	%edx,32(%esp,%ebx,4)
360	cmpl	%ebx,%ecx
361	movl	%eax,36(%esp,%ebx,4)
362	je	L006common_tail
363	movl	4(%esi,%ecx,4),%edi
364	leal	1(%ecx),%ecx
365	movl	%edi,%eax
366	movl	%ecx,12(%esp)
367	mull	%edi
368	addl	32(%esp,%ecx,4),%eax
369	adcl	$0,%edx
370	movl	%eax,32(%esp,%ecx,4)
371	xorl	%ebp,%ebp
372	cmpl	%ebx,%ecx
373	leal	1(%ecx),%ecx
374	je	L013sqrlast
375	movl	%edx,%ebx
376	shrl	$1,%edx
377	andl	$1,%ebx
378.align	4,0x90
379L014sqradd:
380	movl	(%esi,%ecx,4),%eax
381	movl	%edx,%ebp
382	mull	%edi
383	addl	%ebp,%eax
384	leal	(%eax,%eax,1),%ebp
385	adcl	$0,%edx
386	shrl	$31,%eax
387	addl	32(%esp,%ecx,4),%ebp
388	leal	1(%ecx),%ecx
389	adcl	$0,%eax
390	addl	%ebx,%ebp
391	adcl	$0,%eax
392	cmpl	(%esp),%ecx
393	movl	%ebp,28(%esp,%ecx,4)
394	movl	%eax,%ebx
395	jle	L014sqradd
396	movl	%edx,%ebp
397	addl	%edx,%edx
398	shrl	$31,%ebp
399	addl	%ebx,%edx
400	adcl	$0,%ebp
401L013sqrlast:
402	movl	20(%esp),%edi
403	movl	16(%esp),%esi
404	imull	32(%esp),%edi
405	addl	32(%esp,%ecx,4),%edx
406	movl	(%esi),%eax
407	adcl	$0,%ebp
408	movl	%edx,32(%esp,%ecx,4)
409	movl	%ebp,36(%esp,%ecx,4)
410	mull	%edi
411	addl	32(%esp),%eax
412	leal	-1(%ecx),%ebx
413	adcl	$0,%edx
414	movl	$1,%ecx
415	movl	4(%esi),%eax
416	jmp	L0123rdmadd
417.align	4,0x90
418L006common_tail:
419	movl	16(%esp),%ebp
420	movl	4(%esp),%edi
421	leal	32(%esp),%esi
422	movl	(%esi),%eax
423	movl	%ebx,%ecx
424	xorl	%edx,%edx
425.align	4,0x90
426L015sub:
427	sbbl	(%ebp,%edx,4),%eax
428	movl	%eax,(%edi,%edx,4)
429	decl	%ecx
430	movl	4(%esi,%edx,4),%eax
431	leal	1(%edx),%edx
432	jge	L015sub
433	sbbl	$0,%eax
434.align	4,0x90
435L016copy:
436	movl	(%esi,%ebx,4),%edx
437	movl	(%edi,%ebx,4),%ebp
438	xorl	%ebp,%edx
439	andl	%eax,%edx
440	xorl	%ebp,%edx
441	movl	%ecx,(%esi,%ebx,4)
442	movl	%edx,(%edi,%ebx,4)
443	decl	%ebx
444	jge	L016copy
445	movl	24(%esp),%esp
446	movl	$1,%eax
447L000just_leave:
448	popl	%edi
449	popl	%esi
450	popl	%ebx
451	popl	%ebp
452	ret
453.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
454.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
455.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
456.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
457.byte	111,114,103,62,0
458.section __IMPORT,__pointers,non_lazy_symbol_pointers
459L_OPENSSL_ia32cap_P$non_lazy_ptr:
460.indirect_symbol	_OPENSSL_ia32cap_P
461.long	0
462#endif
463