1#if defined(__i386__)
2.file	"src/crypto/bn/asm/x86-mont.S"
3.text
/*
 * bn_mul_mont -- multi-word multiply with a word-by-word reduction
 * interleaved, on 32-bit words.  The .byte string that follows this
 * function spells "Montgomery Multiplication for x86".
 *
 * Six arguments are read from the stack.  The short names are labels used
 * in these comments only; the C prototype is not visible in this file
 * (TODO confirm against the declaring header):
 *   arg1  rp   pointer; num words are written through it in the tail
 *   arg2  ap   pointer to the first multiplicand
 *   arg3  bp   pointer to the second multiplicand
 *   arg4  np   pointer to the words used for reduction and for the final
 *              subtraction
 *   arg5       pointer; only the single word it points at ("n0") is used
 *   arg6  num  word count, compared as a signed value
 *
 * Result in %eax: 0 if num < 4 (nothing is computed), otherwise 1.
 * %ebp/%ebx/%esi/%edi are pushed on entry and popped on exit.  %ecx, %edx
 * and the flags are overwritten; the MMX path also overwrites %mm0-%mm7 and
 * executes emms before leaving.
 *
 * Three code paths share one tail (.L006common_tail):
 *   - MMX-register path built on pmuludq, taken when bit 26 of the first
 *     dword of OPENSSL_ia32cap_P is set;
 *   - integer mull path;
 *   - integer squaring path, taken when ap == bp and num is even.
 *
 * Frame after setup, offsets from the new %esp:
 *    0   scratch (the squaring path keeps num-1 here)
 *    4   rp
 *    8   ap
 *   12   bp (integer path: current pointer into bp; squaring path: index i)
 *   16   np
 *   20   n0
 *   24   value of %esp right after the four pushes
 *   28   integer multiply path only: bp + 4*num, the end pointer
 *   32   tp[0 .. num+1], the temporary vector
 */
4.globl	bn_mul_mont
5.hidden	bn_mul_mont
6.type	bn_mul_mont,@function
7.align	16
8bn_mul_mont:
9.L_bn_mul_mont_begin:
/* Save four registers; the arguments are then read starting at 20(%esp). */
10	pushl	%ebp
11	pushl	%ebx
12	pushl	%esi
13	pushl	%edi
/* Preload return value 0 and leave early when num (arg6) < 4, signed. */
14	xorl	%eax,%eax
15	movl	40(%esp),%edi
16	cmpl	$4,%edi
17	jl	.L000just_leave
/*
 * %esi = address of the arg1 slot, %edx = address of the arg2 slot,
 * %ebp = %esp before the frame is carved out.
 */
18	leal	20(%esp),%esi
19	leal	24(%esp),%edx
20	movl	%esp,%ebp
/*
 * Reserve 32 + 4*(num+2) bytes.  %edi becomes num+2, is negated so the lea
 * can subtract it, then negated back.
 */
21	addl	$2,%edi
22	negl	%edi
23	leal	-32(%esp,%edi,4),%esp
24	negl	%edi
/* Lower %esp by up to 2047 bytes so that it equals %edx modulo 2048 ... */
25	movl	%esp,%eax
26	subl	%edx,%eax
27	andl	$2047,%eax
28	subl	%eax,%esp
/*
 * ... then subtract a further 2048 if bit 11 of %esp and %edx agree, so the
 * two addresses end up differing in bit 11.
 * NOTE(review): presumably keeps the frame from aliasing the argument block
 * in the cache -- the rationale is not stated in this file.
 */
29	xorl	%esp,%edx
30	andl	$2048,%edx
31	xorl	$2048,%edx
32	subl	%edx,%esp
/* Round %esp down to a multiple of 64. */
33	andl	$-64,%esp
/*
 * Copy the arguments into the new frame: rp, ap, bp, np, and the word that
 * arg5 points to (n0).
 */
34	movl	(%esi),%eax
35	movl	4(%esi),%ebx
36	movl	8(%esi),%ecx
37	movl	12(%esi),%edx
38	movl	16(%esi),%esi
39	movl	(%esi),%esi
40	movl	%eax,4(%esp)
41	movl	%ebx,8(%esp)
42	movl	%ecx,12(%esp)
43	movl	%edx,16(%esp)
44	movl	%esi,20(%esp)
/* %ebx = num-1 (%edi still holds num+2); stash the pre-frame %esp. */
45	leal	-3(%edi),%ebx
46	movl	%ebp,24(%esp)
/*
 * Position-independent address of OPENSSL_ia32cap_P: call/pop yields the
 * address of .L001PIC_me_up, and the lea adds the displacement from that
 * label to the symbol.
 */
47	call	.L001PIC_me_up
48.L001PIC_me_up:
49	popl	%eax
50	leal	OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax
/*
 * Test bit 26 of its first dword; if clear, use the integer code.  Judging
 * by the branch label this is the SSE2 flag (pmuludq on %mm registers needs
 * SSE2); the layout of OPENSSL_ia32cap_P is defined outside this file.
 */
51	btl	$26,(%eax)
52	jnc	.L002non_sse2
/*
 * ---- MMX / pmuludq path ----
 * %mm7 = 0x00000000ffffffff (low-dword mask), %esi = ap, %edi = bp,
 * %ebp = np, %edx = i = 0, %ecx = j = 0.
 */
53	movl	$-1,%eax
54	movd	%eax,%mm7
55	movl	8(%esp),%esi
56	movl	12(%esp),%edi
57	movl	16(%esp),%ebp
58	xorl	%edx,%edx
59	xorl	%ecx,%ecx
/*
 * First pass (i = 0), word j = 0:
 *   mm4 = bp[0];  mm5 = ap[0]*bp[0];  mm2 = copy, shifted down below to
 *   become the multiply carry;  mm0 = low dword of the product.
 *   mm5 = low32(mm5) * n0; only its low dword ("m") is used by the
 *   following pmuludq instructions.
 *   mm3 = np[0]*m + low32(ap[0]*bp[0]), shifted down to the reduction carry.
 *   mm1/mm0 are reloaded with np[1]/ap[1] for the loop.
 */
60	movd	(%edi),%mm4
61	movd	(%esi),%mm5
62	movd	(%ebp),%mm3
63	pmuludq	%mm4,%mm5
64	movq	%mm5,%mm2
65	movq	%mm5,%mm0
66	pand	%mm7,%mm0
67	pmuludq	20(%esp),%mm5
68	pmuludq	%mm5,%mm3
69	paddq	%mm0,%mm3
70	movd	4(%ebp),%mm1
71	movd	4(%esi),%mm0
72	psrlq	$32,%mm2
73	psrlq	$32,%mm3
74	incl	%ecx
/*
 * j = 1 .. num-2:
 *   mm2 += ap[j]*bp[0];  mm3 += np[j]*m + low32(mm2);
 *   tp[j-1] = low32(mm3)  (28(%esp,%ecx,4) is tp[j-1]);
 *   both accumulators are then shifted down to their carries.
 * The loads of np[j+1]/ap[j+1] for the next round are interleaved.
 */
75.align	16
76.L0031st:
77	pmuludq	%mm4,%mm0
78	pmuludq	%mm5,%mm1
79	paddq	%mm0,%mm2
80	paddq	%mm1,%mm3
81	movq	%mm2,%mm0
82	pand	%mm7,%mm0
83	movd	4(%ebp,%ecx,4),%mm1
84	paddq	%mm0,%mm3
85	movd	4(%esi,%ecx,4),%mm0
86	psrlq	$32,%mm2
87	movd	%mm3,28(%esp,%ecx,4)
88	psrlq	$32,%mm3
89	leal	1(%ecx),%ecx
90	cmpl	%ebx,%ecx
91	jl	.L0031st
/*
 * Last word (j = num-1): same step, storing tp[num-2].  The sum of the two
 * remaining carries is written as one 64-bit value over tp[num-1], tp[num].
 * Then i = 1.
 */
92	pmuludq	%mm4,%mm0
93	pmuludq	%mm5,%mm1
94	paddq	%mm0,%mm2
95	paddq	%mm1,%mm3
96	movq	%mm2,%mm0
97	pand	%mm7,%mm0
98	paddq	%mm0,%mm3
99	movd	%mm3,28(%esp,%ecx,4)
100	psrlq	$32,%mm2
101	psrlq	$32,%mm3
102	paddq	%mm2,%mm3
103	movq	%mm3,32(%esp,%ebx,4)
104	incl	%edx
/*
 * Outer loop, i = 1 .. num-1.  Same scheme as the first pass with the
 * previous tp folded in through mm6:
 *   mm4 = bp[i];  mm5 = ap[0]*bp[i] + tp[0];  m = low32(low32(mm5) * n0);
 *   mm3 = np[0]*m + low32(mm5), shifted down to the reduction carry;
 *   mm2 = multiply carry + tp[1].
 * %ecx = j = 1; %ebx is decremented to num-2 and used as the inner
 * loop's down-counter.
 */
105.L004outer:
106	xorl	%ecx,%ecx
107	movd	(%edi,%edx,4),%mm4
108	movd	(%esi),%mm5
109	movd	32(%esp),%mm6
110	movd	(%ebp),%mm3
111	pmuludq	%mm4,%mm5
112	paddq	%mm6,%mm5
113	movq	%mm5,%mm0
114	movq	%mm5,%mm2
115	pand	%mm7,%mm0
116	pmuludq	20(%esp),%mm5
117	pmuludq	%mm5,%mm3
118	paddq	%mm0,%mm3
119	movd	36(%esp),%mm6
120	movd	4(%ebp),%mm1
121	movd	4(%esi),%mm0
122	psrlq	$32,%mm2
123	psrlq	$32,%mm3
124	paddq	%mm6,%mm2
125	incl	%ecx
126	decl	%ebx
/*
 * j = 1 .. num-2:
 *   mm2 (already holding tp[j]) += ap[j]*bp[i];
 *   mm3 += np[j]*m + low32(mm2);  tp[j-1] = low32(mm3);
 *   mm2 = its carry + tp[j+1]  (mm6 is loaded from 36(%esp,%ecx,4)).
 * The jnz tests the flags left by decl %ebx; the leal in between does not
 * modify flags.
 */
127.L005inner:
128	pmuludq	%mm4,%mm0
129	pmuludq	%mm5,%mm1
130	paddq	%mm0,%mm2
131	paddq	%mm1,%mm3
132	movq	%mm2,%mm0
133	movd	36(%esp,%ecx,4),%mm6
134	pand	%mm7,%mm0
135	movd	4(%ebp,%ecx,4),%mm1
136	paddq	%mm0,%mm3
137	movd	4(%esi,%ecx,4),%mm0
138	psrlq	$32,%mm2
139	movd	%mm3,28(%esp,%ecx,4)
140	psrlq	$32,%mm3
141	paddq	%mm6,%mm2
142	decl	%ebx
143	leal	1(%ecx),%ecx
144	jnz	.L005inner
/*
 * %ecx is num-1 here; copy it back into %ebx.  Last word (j = num-1):
 * store tp[num-2], then write (both carries + tp[num]) as a 64-bit value
 * over tp[num-1], tp[num].  Advance i and repeat while i <= num-1.
 */
145	movl	%ecx,%ebx
146	pmuludq	%mm4,%mm0
147	pmuludq	%mm5,%mm1
148	paddq	%mm0,%mm2
149	paddq	%mm1,%mm3
150	movq	%mm2,%mm0
151	pand	%mm7,%mm0
152	paddq	%mm0,%mm3
153	movd	%mm3,28(%esp,%ecx,4)
154	psrlq	$32,%mm2
155	psrlq	$32,%mm3
156	movd	36(%esp,%ebx,4),%mm6
157	paddq	%mm2,%mm3
158	paddq	%mm6,%mm3
159	movq	%mm3,32(%esp,%ebx,4)
160	leal	1(%edx),%edx
161	cmpl	%ebx,%edx
162	jle	.L004outer
/* Leave MMX state and join the shared tail with %ebx = num-1. */
163	emms
164	jmp	.L006common_tail
/*
 * ---- Integer path ----
 * %esi = ap, %edi = bp, %ecx = 0, %eax = bp + 4*num (end of bp).
 * %ebp = (num & 1) | (ap - bp), so ZF from the orl is set only when
 * ap == bp and num is even; the movl that loads bp[0] into %edi leaves the
 * flags alone, and the jz then selects the squaring code.
 */
165.align	16
166.L002non_sse2:
167	movl	8(%esp),%esi
168	leal	1(%ebx),%ebp
169	movl	12(%esp),%edi
170	xorl	%ecx,%ecx
171	movl	%esi,%edx
172	andl	$1,%ebp
173	subl	%edi,%edx
174	leal	4(%edi,%ebx,4),%eax
175	orl	%edx,%ebp
176	movl	(%edi),%edi
177	jz	.L007bn_sqr_mont
/* General multiply: save the bp end pointer; %eax = ap[0], carry = 0. */
178	movl	%eax,28(%esp)
179	movl	(%esi),%eax
180	xorl	%edx,%edx
/*
 * j = 0 .. num-2: tp[j] = low32(ap[j]*bp[0] + carry), carry kept in %edx.
 * After the increment, 28(%esp,%ecx,4) addresses tp[j].
 */
181.align	16
182.L008mull:
183	movl	%edx,%ebp
184	mull	%edi
185	addl	%eax,%ebp
186	leal	1(%ecx),%ecx
187	adcl	$0,%edx
188	movl	(%esi,%ecx,4),%eax
189	cmpl	%ebx,%ecx
190	movl	%ebp,28(%esp,%ecx,4)
191	jl	.L008mull
/*
 * Last word: tp[num-1] and tp[num] receive ap[num-1]*bp[0] + carry, and
 * tp[num+1] = 0.  Interleaved: %edi = m = n0 * tp[0] (low 32 bits) and
 * %esi = np.
 */
192	movl	%edx,%ebp
193	mull	%edi
194	movl	20(%esp),%edi
195	addl	%ebp,%eax
196	movl	16(%esp),%esi
197	adcl	$0,%edx
198	imull	32(%esp),%edi
199	movl	%eax,32(%esp,%ebx,4)
200	xorl	%ecx,%ecx
201	movl	%edx,36(%esp,%ebx,4)
202	movl	%ecx,40(%esp,%ebx,4)
/*
 * np[0]*m + tp[0]: the low word of the sum is discarded (%eax is
 * overwritten with np[1]); only the carry into %edx is kept.  j = 1.
 */
203	movl	(%esi),%eax
204	mull	%edi
205	addl	32(%esp),%eax
206	movl	4(%esi),%eax
207	adcl	$0,%edx
208	incl	%ecx
209	jmp	.L0092ndmadd
/*
 * Multiply-accumulate pass for bp[i], i >= 1.  Entered with %edi = bp[i],
 * %esi = ap, %ecx = 0, %edx = 0, %eax = ap[0].
 * j = 0 .. num-2: tp[j] = low32(tp[j] + ap[j]*bp[i] + carry).
 */
210.align	16
211.L0101stmadd:
212	movl	%edx,%ebp
213	mull	%edi
214	addl	32(%esp,%ecx,4),%ebp
215	leal	1(%ecx),%ecx
216	adcl	$0,%edx
217	addl	%eax,%ebp
218	movl	(%esi,%ecx,4),%eax
219	adcl	$0,%edx
220	cmpl	%ebx,%ecx
221	movl	%ebp,28(%esp,%ecx,4)
222	jl	.L0101stmadd
/*
 * Last word (j = num-1): tp[num-1] is updated, the carry is added to
 * tp[num], and the carry out of that add becomes tp[num+1] (0 or 1).
 * Interleaved: %edi = m = n0 * tp[0], %esi = np.
 */
223	movl	%edx,%ebp
224	mull	%edi
225	addl	32(%esp,%ebx,4),%eax
226	movl	20(%esp),%edi
227	adcl	$0,%edx
228	movl	16(%esp),%esi
229	addl	%eax,%ebp
230	adcl	$0,%edx
231	imull	32(%esp),%edi
232	xorl	%ecx,%ecx
233	addl	36(%esp,%ebx,4),%edx
234	movl	%ebp,32(%esp,%ebx,4)
235	adcl	$0,%ecx
236	movl	(%esi),%eax
237	movl	%edx,36(%esp,%ebx,4)
238	movl	%ecx,40(%esp,%ebx,4)
/* np[0]*m + tp[0]: keep only the carry in %edx; %eax = np[1]; j = 1. */
239	mull	%edi
240	addl	32(%esp),%eax
241	movl	4(%esi),%eax
242	adcl	$0,%edx
243	movl	$1,%ecx
/*
 * Reduction pass.  j = 1 .. num-2:
 *   tp[j-1] = low32(tp[j] + np[j]*m + carry)
 * After the increment, 24(%esp,%ecx,4) addresses tp[j-1], so the vector
 * moves down by one word per pass.
 */
244.align	16
245.L0092ndmadd:
246	movl	%edx,%ebp
247	mull	%edi
248	addl	32(%esp,%ecx,4),%ebp
249	leal	1(%ecx),%ecx
250	adcl	$0,%edx
251	addl	%eax,%ebp
252	movl	(%esi,%ecx,4),%eax
253	adcl	$0,%edx
254	cmpl	%ebx,%ecx
255	movl	%ebp,24(%esp,%ecx,4)
256	jl	.L0092ndmadd
/*
 * Last word (j = num-1) goes to tp[num-2].  Then
 *   tp[num-1] = carry + tp[num],  tp[num] = tp[num+1] + carry-out.
 * Advance the saved bp pointer by one word; if it equals the end pointer at
 * 28(%esp) every word of bp has been processed.
 */
257	movl	%edx,%ebp
258	mull	%edi
259	addl	32(%esp,%ebx,4),%ebp
260	adcl	$0,%edx
261	addl	%eax,%ebp
262	adcl	$0,%edx
263	movl	%ebp,28(%esp,%ebx,4)
264	xorl	%eax,%eax
265	movl	12(%esp),%ecx
266	addl	36(%esp,%ebx,4),%edx
267	adcl	40(%esp,%ebx,4),%eax
268	leal	4(%ecx),%ecx
269	movl	%edx,32(%esp,%ebx,4)
270	cmpl	28(%esp),%ecx
271	movl	%eax,36(%esp,%ebx,4)
272	je	.L006common_tail
/* Next bp word: %edi = bp[i], %esi = ap, j = 0, carry = 0, %eax = ap[0]. */
273	movl	(%ecx),%edi
274	movl	8(%esp),%esi
275	movl	%ecx,12(%esp)
276	xorl	%ecx,%ecx
277	xorl	%edx,%edx
278	movl	(%esi),%eax
279	jmp	.L0101stmadd
/*
 * ---- Integer squaring path (ap == bp, num even) ----
 * 0(%esp) = num-1; 12(%esp) is reused as the row index i, starting at 0
 * (%ecx is 0 here).  %edi = ap[0].
 * tp[0] = low32(ap[0]^2).  The high word is halved into %edx with its bit 0
 * parked in %ebx; the doubling applied to the following words puts it back.
 */
280.align	16
281.L007bn_sqr_mont:
282	movl	%ebx,(%esp)
283	movl	%ecx,12(%esp)
284	movl	%edi,%eax
285	mull	%edi
286	movl	%eax,32(%esp)
287	movl	%edx,%ebx
288	shrl	$1,%edx
289	andl	$1,%ebx
290	incl	%ecx
/*
 * j = 1 .. num-2: %eax = low32(ap[j]*ap[0] + carry); the stored word is
 * tp[j] = 2*%eax + %ebx (one lea), and %ebx then takes the bit that the
 * doubling pushed out of the word (%eax >> 31).
 */
291.align	16
292.L011sqr:
293	movl	(%esi,%ecx,4),%eax
294	movl	%edx,%ebp
295	mull	%edi
296	addl	%ebp,%eax
297	leal	1(%ecx),%ecx
298	adcl	$0,%edx
299	leal	(%ebx,%eax,2),%ebp
300	shrl	$31,%eax
301	cmpl	(%esp),%ecx
302	movl	%eax,%ebx
303	movl	%ebp,28(%esp,%ecx,4)
304	jl	.L011sqr
/*
 * Last word (j = num-1): tp[num-1] as above, tp[num] = 2*carry + pushed-out
 * bit, tp[num+1] = top bit of the carry.  Interleaved: %edi = m =
 * n0 * tp[0], %esi = np, then the carry of np[0]*m + tp[0] is formed in
 * %edx.  %ebx = num-1 again, %eax = np[1], j = 1.
 */
305	movl	(%esi,%ecx,4),%eax
306	movl	%edx,%ebp
307	mull	%edi
308	addl	%ebp,%eax
309	movl	20(%esp),%edi
310	adcl	$0,%edx
311	movl	16(%esp),%esi
312	leal	(%ebx,%eax,2),%ebp
313	imull	32(%esp),%edi
314	shrl	$31,%eax
315	movl	%ebp,32(%esp,%ecx,4)
316	leal	(%eax,%edx,2),%ebp
317	movl	(%esi),%eax
318	shrl	$31,%edx
319	movl	%ebp,36(%esp,%ecx,4)
320	movl	%edx,40(%esp,%ecx,4)
321	mull	%edi
322	addl	32(%esp),%eax
323	movl	%ecx,%ebx
324	adcl	$0,%edx
325	movl	4(%esi),%eax
326	movl	$1,%ecx
/*
 * Reduction pass of the squaring path, two words per round:
 *   tp[j-1] = low32(tp[j]   + np[j]*m   + carry)
 *   tp[j]   = low32(tp[j+1] + np[j+1]*m + carry)
 * %ecx advances 1, 3, 5, ... and the loop ends at num-1, which is odd
 * because num is even on this path.
 */
327.align	16
328.L0123rdmadd:
329	movl	%edx,%ebp
330	mull	%edi
331	addl	32(%esp,%ecx,4),%ebp
332	adcl	$0,%edx
333	addl	%eax,%ebp
334	movl	4(%esi,%ecx,4),%eax
335	adcl	$0,%edx
336	movl	%ebp,28(%esp,%ecx,4)
337	movl	%edx,%ebp
338	mull	%edi
339	addl	36(%esp,%ecx,4),%ebp
340	leal	2(%ecx),%ecx
341	adcl	$0,%edx
342	addl	%eax,%ebp
343	movl	(%esi,%ecx,4),%eax
344	adcl	$0,%edx
345	cmpl	%ebx,%ecx
346	movl	%ebp,24(%esp,%ecx,4)
347	jl	.L0123rdmadd
/*
 * Last word (j = num-1) goes to tp[num-2]; then
 *   tp[num-1] = carry + tp[num],  tp[num] = tp[num+1] + carry-out.
 * %ecx = i, %esi = ap.  When i == num-1 all rows are done.
 */
348	movl	%edx,%ebp
349	mull	%edi
350	addl	32(%esp,%ebx,4),%ebp
351	adcl	$0,%edx
352	addl	%eax,%ebp
353	adcl	$0,%edx
354	movl	%ebp,28(%esp,%ebx,4)
355	movl	12(%esp),%ecx
356	xorl	%eax,%eax
357	movl	8(%esp),%esi
358	addl	36(%esp,%ebx,4),%edx
359	adcl	40(%esp,%ebx,4),%eax
360	movl	%edx,32(%esp,%ebx,4)
361	cmpl	%ebx,%ecx
362	movl	%eax,36(%esp,%ebx,4)
363	je	.L006common_tail
/*
 * Next row: i is incremented and saved, %edi = ap[i].
 * tp[i] += low32(ap[i]^2); the high word plus carry stays in %edx and
 * %ebp = 0.  The cmpl compares i with num-1; the leal that sets
 * %ecx = j = i+1 does not modify flags, so the je still sees that compare:
 * on the last row there are no cross products left and the code skips to
 * .L013sqrlast.
 */
364	movl	4(%esi,%ecx,4),%edi
365	leal	1(%ecx),%ecx
366	movl	%edi,%eax
367	movl	%ecx,12(%esp)
368	mull	%edi
369	addl	32(%esp,%ecx,4),%eax
370	adcl	$0,%edx
371	movl	%eax,32(%esp,%ecx,4)
372	xorl	%ebp,%ebp
373	cmpl	%ebx,%ecx
374	leal	1(%ecx),%ecx
375	je	.L013sqrlast
/* Halve the carry, parking its bit 0 in %ebx (this overwrites num-1). */
376	movl	%edx,%ebx
377	shrl	$1,%edx
378	andl	$1,%ebx
/*
 * j = i+1 .. num-1 (bound num-1 is read from 0(%esp)):
 *   %eax = low32(ap[j]*ap[i] + carry)
 *   tp[j] = low32(2*%eax + tp[j] + %ebx)
 *   %ebx  = (%eax >> 31) + the carries out of the two adds
 */
379.align	16
380.L014sqradd:
381	movl	(%esi,%ecx,4),%eax
382	movl	%edx,%ebp
383	mull	%edi
384	addl	%ebp,%eax
385	leal	(%eax,%eax,1),%ebp
386	adcl	$0,%edx
387	shrl	$31,%eax
388	addl	32(%esp,%ecx,4),%ebp
389	leal	1(%ecx),%ecx
390	adcl	$0,%eax
391	addl	%ebx,%ebp
392	adcl	$0,%eax
393	cmpl	(%esp),%ecx
394	movl	%ebp,28(%esp,%ecx,4)
395	movl	%eax,%ebx
396	jle	.L014sqradd
/* %ebp:%edx = 2*carry + %ebx, as a two-word value. */
397	movl	%edx,%ebp
398	addl	%edx,%edx
399	shrl	$31,%ebp
400	addl	%ebx,%edx
401	adcl	$0,%ebp
/*
 * Row finished; %ecx = num on both ways in.  %edi = m = n0 * tp[0],
 * %esi = np.  tp[num] += %edx, tp[num+1] = %ebp + carry-out.  Form the
 * carry of np[0]*m + tp[0] in %edx, restore %ebx = num-1, set j = 1 and
 * %eax = np[1], then run the reduction pass again.
 */
402.L013sqrlast:
403	movl	20(%esp),%edi
404	movl	16(%esp),%esi
405	imull	32(%esp),%edi
406	addl	32(%esp,%ecx,4),%edx
407	movl	(%esi),%eax
408	adcl	$0,%ebp
409	movl	%edx,32(%esp,%ecx,4)
410	movl	%ebp,36(%esp,%ecx,4)
411	mull	%edi
412	addl	32(%esp),%eax
413	leal	-1(%ecx),%ebx
414	adcl	$0,%edx
415	movl	$1,%ecx
416	movl	4(%esi),%eax
417	jmp	.L0123rdmadd
/*
 * ---- Shared tail ---- (entered with %ebx = num-1)
 * %ebp = np, %edi = rp, %esi = &tp[0], %eax = tp[0], %ecx = num-1.
 * The xorl sets index %edx = 0 and also clears CF for the first sbbl.
 */
418.align	16
419.L006common_tail:
420	movl	16(%esp),%ebp
421	movl	4(%esp),%edi
422	leal	32(%esp),%esi
423	movl	(%esi),%eax
424	movl	%ebx,%ecx
425	xorl	%edx,%edx
/*
 * rp[i] = tp[i] - np[i] - borrow for i = 0 .. num-1.  The borrow lives in
 * CF across iterations: decl, movl and leal do not modify CF.  The jge
 * tests the flags left by decl %ecx.
 */
426.align	16
427.L015sub:
428	sbbl	(%ebp,%edx,4),%eax
429	movl	%eax,(%edi,%edx,4)
430	decl	%ecx
431	movl	4(%esi,%edx,4),%eax
432	leal	1(%edx),%edx
433	jge	.L015sub
/*
 * %eax = tp[num] - final borrow.  It is used below as an AND mask:
 * all-ones selects tp[i], zero keeps the difference already in rp[i].
 * NOTE(review): this relies on the value being 0 or -1 here -- confirm
 * against the caller's input contract.
 */
434	sbbl	$0,%eax
/*
 * i = num-1 .. 0: rp[i] = ((tp[i] ^ rp[i]) & mask) ^ rp[i], a branch-free
 * select.  tp[i] is overwritten with %ecx, which is -1 after the
 * subtraction loop.
 */
435.align	16
436.L016copy:
437	movl	(%esi,%ebx,4),%edx
438	movl	(%edi,%ebx,4),%ebp
439	xorl	%ebp,%edx
440	andl	%eax,%edx
441	xorl	%ebp,%edx
442	movl	%ecx,(%esi,%ebx,4)
443	movl	%edx,(%edi,%ebx,4)
444	decl	%ebx
445	jge	.L016copy
/* Drop the frame by reloading the saved %esp, and return 1. */
446	movl	24(%esp),%esp
447	movl	$1,%eax
/* Common exit; the early-out path arrives here with %eax = 0. */
448.L000just_leave:
449	popl	%edi
450	popl	%esi
451	popl	%ebx
452	popl	%ebp
453	ret
454.size	bn_mul_mont,.-.L_bn_mul_mont_begin
/*
 * NUL-terminated ASCII banner, emitted into the current section right after
 * the function and not referenced by any label in this file:
 *   "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
 */
455.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
456.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
457.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
458.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
459.byte	111,114,103,62,0
460#endif
461