1#if defined(__x86_64__)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
/*
 * bn_mul_mont_gather5 - Montgomery multiplication with constant-time
 * ("gather5") selection of the multiplicand from a scattered table.
 *
 * SysV AMD64 arguments:
 *   %rdi = rp     result vector (num 64-bit words)
 *   %rsi = ap     first multiplicand
 *   %rdx = bp     scattered table of candidate multiplicands
 *   %rcx = np     modulus
 *   %r8  = &n0    pointer to -np^-1 mod 2^64
 *   %r9d = num    number of 64-bit words
 *   8(%rsp) = power, the table index to select
 *             (NOTE(review): inferred from the movd of the 7th stack
 *             argument below - confirm against the C prototype)
 *
 * The selected entry is never addressed directly: all 32 candidate
 * slots per 256-byte stride are read and combined with masks, so the
 * index does not leak through data-cache timing.
 */
.globl	bn_mul_mont_gather5
.hidden bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	movl	%r9d,%r9d		/* zero-extend num to 64 bits */
	movq	%rsp,%rax		/* %rax = entry %rsp, kept for epilogue */
.cfi_def_cfa_register	%rax
	testl	$7,%r9d
	jnz	.Lmul_enter		/* num not a multiple of 8: scalar path */
	jmp	.Lmul4x_enter		/* multiple of 8: 4x-unrolled path */

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5		/* xmm5 = power (7th argument, on stack) */
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* Reserve num*8 bytes of temporaries plus mask scratch, 1K-aligned. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Probe the newly reserved region one page at a time so a large
	 * allocation cannot jump over the stack guard page.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		/* touch current page */
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10	/* lane-increment constants for mask build */
	movq	%rax,8(%rsp,%r9,8)	/* save entry %rsp above the temp vector */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

	leaq	128(%rdx),%r12		/* %r12 = bp + 128, center of first stride */
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10	/* %r10 = 16-byte aligned mask scratch */
	andq	$-16,%r10

	/*
	 * Build 16 x 128-bit select masks at 112..352(%r10): step a lane
	 * counter (xmm0..xmm3, incremented via .Linc) and compare each
	 * lane against the broadcast `power` in xmm5; exactly one 64-bit
	 * lane across all masks becomes all-ones.
	 */
	pshufd	$0,%xmm5,%xmm5		/* broadcast power to all lanes */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67				/* addr-size prefix: alignment padding */
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67				/* alignment padding */
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/*
	 * Constant-time gather of b[0]: AND every 16-byte table line of
	 * the 256-byte stride with its mask and OR everything together;
	 * only the selected lane survives.
	 */
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword onto low qword */
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12		/* advance to the next table stride */
.byte	102,72,15,126,195		/* movq %xmm0,%rbx: %rbx = gathered b[0] */

	movq	(%r8),%r8		/* %r8 = n0 value */
	movq	(%rsi),%rax		/* %rax = a[0] */

	xorq	%r14,%r14		/* %r14 = outer index i = 0 */
	xorq	%r15,%r15		/* %r15 = inner index j = 0 */

	/* First outer iteration: t = a*b[0]; m = t[0]*n0 mod 2^64. */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		/* %rbp = m */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word cancels: a[0]*b0 + m*n[0] */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	/* tp[j-1] = a[j]*b0 + m*n[j] + carries */
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	/* Flush the last partial products and the top carry word. */
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = carry-out */

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/*
	 * Outer iterations i = 1..num-1: gather b[i] with the same
	 * mask-and-combine select (masks live in the aligned scratch
	 * recomputed into %rdx), then tp = (tp + a*b[i] + m*n) / 2^64.
	 */
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0	/* fold the two qword halves */
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		/* movq %xmm0,%rbx: %rbx = b[i] */

	movq	(%rsi),%rax
.byte	102,72,15,126,195		/* movq %xmm0,%rbx (kept as emitted) */

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		/* tp[0] from previous pass */

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		/* m = tp[0]*n0 mod 2^64 */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word cancels */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		/* + previous tp[j] */
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10	/* previous pass carry word */
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	/* new carry-out */

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Final reduction: rp = tp - np with borrow tracking; %r14 = 0
	 * also clears CF for the first sbb.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	/* rp[i] = tp[i] - np[i] - borrow */
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * Constant-time select of the result: %rax becomes an all-ones
	 * mask when the subtraction borrowed (tp < np), so %rsi is set
	 * to tp in that case and to rp (holding tp-np) otherwise.
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* Copy the selected vector into rp while wiping the stack temps. */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	/* overwrite secret temporaries */
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Epilogue: recover entry %rsp and restore callee-saved registers. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 (success) */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 - frame-setup wrapper for the 4x-unrolled
 * Montgomery multiply.  Same arguments as bn_mul_mont_gather5; also
 * entered at .Lmul4x_enter from bn_mul_mont_gather5 with %rax already
 * holding the caller's entry %rsp.  Allocates the scratch frame (with
 * page-walk probing) and tail-calls mul4x_internal.
 */
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte	0x67				/* alignment padding */
	movq	%rsp,%rax		/* %rax = entry %rsp */
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67				/* alignment padding */
	shll	$3,%r9d			/* %r9 = num*8 (byte length) */
	leaq	(%r9,%r9,2),%r10	/* %r10 = num*24 */
	negq	%r9

	/*
	 * Place the scratch area so that its 4K page offset differs from
	 * rp's (%rdi), avoiding cache-bank/page aliasing between the
	 * output and the temporaries.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	/* Not enough headroom: clamp the bias into the available slack. */
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp		/* 64-byte align the frame */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe pages down to the frame */
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9			/* %r9 = +num*8 for mul4x_internal */

	movq	%rax,40(%rsp)		/* save entry %rsp for the epilogue */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi		/* recover entry %rsp */
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 (success) */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
547
/*
 * mul4x_internal - 4x-unrolled Montgomery multiplication core.
 *
 * Private helper entered with the stack frame already prepared by
 * bn_mul4x_mont_gather5 / bn_power5:
 *   %rdi = rp, %rsi = ap, %rdx = bp table, %rcx = np, %r8 = &n0,
 *   %r9  = num*8 (bytes, positive),
 *   %rax = caller's entry %rsp, so 8(%rax) is the `power` stack
 *          argument (NOTE(review): inferred from the movd below).
 *
 * Uses the same constant-time mask-and-combine gather as the scalar
 * path, processes four words per inner iteration, and ends by jumping
 * to .Lsqr4x_sub_entry (defined in the sqr8x code outside this
 * function) which performs the final conditional subtraction into rp.
 */
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9			/* %r9 = num*256 (table byte span) */
	movd	8(%rax),%xmm5		/* xmm5 = power */
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13	/* %r13 = end-of-table sentinel */
	shrq	$5,%r9			/* restore %r9 = num*8 */
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10	/* mask scratch (16-byte aligned use) */
	leaq	128(%rdx),%r12		/* %r12 = bp + 128 */

	/*
	 * Build the 16 select masks exactly as in bn_mul_mont_gather5:
	 * lane counters compared against the broadcast `power`.
	 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67			/* alignment padding */
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67				/* alignment padding */
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67				/* alignment padding */
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* Constant-time gather of b[0] across the 256-byte stride. */
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword onto low */
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		/* movq %xmm0,%rbx: %rbx = b[0] */

	movq	%r13,16+8(%rsp)		/* save table-end sentinel */
	movq	%rdi,56+8(%rsp)		/* save rp for the subtraction tail */

	movq	(%r8),%r8		/* %r8 = n0 value */
	movq	(%rsi),%rax		/* %rax = a[0] */
	leaq	(%rsi,%r9,1),%rsi	/* %rsi = &a[num]; index from -num*8 */
	negq	%r9

	/* First pass: t = a*b[0] + m*np, m = (a[0]*b0)*n0 mod 2^64. */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		/* %rbp = m */
	leaq	64+8(%rsp),%r14		/* %r14 = tp write pointer */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		/* low word cancels */
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi		/* %rdi doubles as a carry register here */

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		/* inner index, counts up to 0 */
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	/* Four interleaved word-steps of a[j]*b0 + m*n[j] per iteration. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	/* Tail of the first pass (last four words + carry word). */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	/* reload a[0] for the next pass */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	/* rewind np pointer */

	xorq	%rdi,%rdi		/* %rdi = carry into next pass */
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	/*
	 * Outer iterations: gather the next b[i] (masks addressed via
	 * 16+128(%r14), which aliases the scratch area), then
	 * tp = (tp + a*b[i] + m*np) / 2^64 in 4-word steps.
	 */
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		/* movq %xmm0,%rbx: %rbx = b[i] */

	movq	(%r14,%r9,1),%r10	/* tp[0] */
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		/* m = tp[0]*n0 mod 2^64 */
	movq	%rdx,%r11
	movq	%rdi,(%r14)		/* store carry word of previous pass */

	leaq	(%r14,%r9,1),%r14	/* rewind tp pointer */

	mulq	%rbp
	addq	%rax,%r10		/* low word cancels */
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11		/* + previous tp[1] */
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10		/* + previous tp word */
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)		/* deferred store of the prior word */
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	/* Tail of the outer pass. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		/* swap m into %rax ... */
	movq	-8(%rcx),%rbp		/* ... and n[num-1] into %rbp */
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			/* m * n[num-1] via swapped operands */
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	/* rewind np */

	xorq	%rdi,%rdi		/* carry into the next pass */
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		/* reached end-of-table sentinel? */
	jb	.Louter4x
	/*
	 * All outer passes done.  Fold the final top words/carry into the
	 * mask %rax and set up registers for the shared subtraction tail:
	 * %rbx = &tp, %rbp = np, %rcx = negative chunk count, %rdi = rp.
	 * NOTE(review): register contract of .Lsqr4x_sub_entry is defined
	 * in the sqr8x code outside this view - confirm there.
	 */
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx		/* %rcx = num*8 >> 5 (4-word chunks) */
	movq	56+8(%rsp),%rdi		/* %rdi = rp */
	decq	%r12			/* pre-decrement n[0] for the sbb chain */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 - one 5-bit fixed-window step of Montgomery
 * exponentiation: squares the input five times (a^32) and then
 * multiplies by the constant-time gathered table entry table[power].
 *
 * SysV AMD64 arguments (same layout as bn_mul_mont_gather5):
 *   %rdi = rp, %rsi = ap, %rdx = table, %rcx = np, %r8 = &n0,
 *   %r9d = num, 8(%rsp at entry) = power (fetched later via the
 *   saved entry %rsp - see the mul4x_internal call below).
 */
.globl	bn_power5
.hidden bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax		/* %rax = entry %rsp */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d			/* %r9 = num*8 bytes */
	leal	(%r9,%r9,2),%r10d	/* %r10 = num*24 */
	negq	%r9
	movq	(%r8),%r8		/* %r8 = n0 value */

	/*
	 * Frame placement: bias the scratch area so its 4K page offset
	 * differs from rp's, as in bn_mul4x_mont_gather5.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe pages down to the frame */
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9			/* %r9 = +num*8 */

	movq	%r8,32(%rsp)		/* stash n0 for the callees */
	movq	%rax,40(%rsp)		/* stash entry %rsp */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
	/* Park rp/np/num*24/table in xmm regs across the squarings. */
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 (rp) */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 (np) */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4 (table) */

	/* Five Montgomery squarings: ap -> ap^32 (mod np). */
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209		/* movq %xmm2,%rcx: restore np */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx: restore table */
	movq	%rsi,%rdi		/* multiply result overwrites ap's buffer */
	movq	40(%rsp),%rax		/* entry %rsp, so mul4x can read `power` */
	leaq	32(%rsp),%r8		/* &n0 copy */

	call	mul4x_internal		/* ap = ap^32 * table[power] (Montgomery) */

	movq	40(%rsp),%rsi		/* epilogue: recover entry %rsp */
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 (success) */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_power5,.-bn_power5
1204
1205.globl	bn_sqr8x_internal
1206.hidden bn_sqr8x_internal
1207.hidden	bn_sqr8x_internal
1208.type	bn_sqr8x_internal,@function
1209.align	32
1210bn_sqr8x_internal:
1211__bn_sqr8x_internal:
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285	leaq	32(%r10),%rbp
1286	leaq	(%rsi,%r9,1),%rsi
1287
1288	movq	%r9,%rcx
1289
1290
1291	movq	-32(%rsi,%rbp,1),%r14
1292	leaq	48+8(%rsp,%r9,2),%rdi
1293	movq	-24(%rsi,%rbp,1),%rax
1294	leaq	-32(%rdi,%rbp,1),%rdi
1295	movq	-16(%rsi,%rbp,1),%rbx
1296	movq	%rax,%r15
1297
1298	mulq	%r14
1299	movq	%rax,%r10
1300	movq	%rbx,%rax
1301	movq	%rdx,%r11
1302	movq	%r10,-24(%rdi,%rbp,1)
1303
1304	mulq	%r14
1305	addq	%rax,%r11
1306	movq	%rbx,%rax
1307	adcq	$0,%rdx
1308	movq	%r11,-16(%rdi,%rbp,1)
1309	movq	%rdx,%r10
1310
1311
1312	movq	-8(%rsi,%rbp,1),%rbx
1313	mulq	%r15
1314	movq	%rax,%r12
1315	movq	%rbx,%rax
1316	movq	%rdx,%r13
1317
1318	leaq	(%rbp),%rcx
1319	mulq	%r14
1320	addq	%rax,%r10
1321	movq	%rbx,%rax
1322	movq	%rdx,%r11
1323	adcq	$0,%r11
1324	addq	%r12,%r10
1325	adcq	$0,%r11
1326	movq	%r10,-8(%rdi,%rcx,1)
1327	jmp	.Lsqr4x_1st
1328
1329.align	32
1330.Lsqr4x_1st:
1331	movq	(%rsi,%rcx,1),%rbx
1332	mulq	%r15
1333	addq	%rax,%r13
1334	movq	%rbx,%rax
1335	movq	%rdx,%r12
1336	adcq	$0,%r12
1337
1338	mulq	%r14
1339	addq	%rax,%r11
1340	movq	%rbx,%rax
1341	movq	8(%rsi,%rcx,1),%rbx
1342	movq	%rdx,%r10
1343	adcq	$0,%r10
1344	addq	%r13,%r11
1345	adcq	$0,%r10
1346
1347
1348	mulq	%r15
1349	addq	%rax,%r12
1350	movq	%rbx,%rax
1351	movq	%r11,(%rdi,%rcx,1)
1352	movq	%rdx,%r13
1353	adcq	$0,%r13
1354
1355	mulq	%r14
1356	addq	%rax,%r10
1357	movq	%rbx,%rax
1358	movq	16(%rsi,%rcx,1),%rbx
1359	movq	%rdx,%r11
1360	adcq	$0,%r11
1361	addq	%r12,%r10
1362	adcq	$0,%r11
1363
1364	mulq	%r15
1365	addq	%rax,%r13
1366	movq	%rbx,%rax
1367	movq	%r10,8(%rdi,%rcx,1)
1368	movq	%rdx,%r12
1369	adcq	$0,%r12
1370
1371	mulq	%r14
1372	addq	%rax,%r11
1373	movq	%rbx,%rax
1374	movq	24(%rsi,%rcx,1),%rbx
1375	movq	%rdx,%r10
1376	adcq	$0,%r10
1377	addq	%r13,%r11
1378	adcq	$0,%r10
1379
1380
1381	mulq	%r15
1382	addq	%rax,%r12
1383	movq	%rbx,%rax
1384	movq	%r11,16(%rdi,%rcx,1)
1385	movq	%rdx,%r13
1386	adcq	$0,%r13
1387	leaq	32(%rcx),%rcx
1388
1389	mulq	%r14
1390	addq	%rax,%r10
1391	movq	%rbx,%rax
1392	movq	%rdx,%r11
1393	adcq	$0,%r11
1394	addq	%r12,%r10
1395	adcq	$0,%r11
1396	movq	%r10,-8(%rdi,%rcx,1)
1397
1398	cmpq	$0,%rcx
1399	jne	.Lsqr4x_1st
1400
1401	mulq	%r15
1402	addq	%rax,%r13
1403	leaq	16(%rbp),%rbp
1404	adcq	$0,%rdx
1405	addq	%r11,%r13
1406	adcq	$0,%rdx
1407
1408	movq	%r13,(%rdi)
1409	movq	%rdx,%r12
1410	movq	%rdx,8(%rdi)
1411	jmp	.Lsqr4x_outer
1412
1413.align	32
1414.Lsqr4x_outer:
1415	movq	-32(%rsi,%rbp,1),%r14
1416	leaq	48+8(%rsp,%r9,2),%rdi
1417	movq	-24(%rsi,%rbp,1),%rax
1418	leaq	-32(%rdi,%rbp,1),%rdi
1419	movq	-16(%rsi,%rbp,1),%rbx
1420	movq	%rax,%r15
1421
1422	mulq	%r14
1423	movq	-24(%rdi,%rbp,1),%r10
1424	addq	%rax,%r10
1425	movq	%rbx,%rax
1426	adcq	$0,%rdx
1427	movq	%r10,-24(%rdi,%rbp,1)
1428	movq	%rdx,%r11
1429
1430	mulq	%r14
1431	addq	%rax,%r11
1432	movq	%rbx,%rax
1433	adcq	$0,%rdx
1434	addq	-16(%rdi,%rbp,1),%r11
1435	movq	%rdx,%r10
1436	adcq	$0,%r10
1437	movq	%r11,-16(%rdi,%rbp,1)
1438
1439	xorq	%r12,%r12
1440
1441	movq	-8(%rsi,%rbp,1),%rbx
1442	mulq	%r15
1443	addq	%rax,%r12
1444	movq	%rbx,%rax
1445	adcq	$0,%rdx
1446	addq	-8(%rdi,%rbp,1),%r12
1447	movq	%rdx,%r13
1448	adcq	$0,%r13
1449
1450	mulq	%r14
1451	addq	%rax,%r10
1452	movq	%rbx,%rax
1453	adcq	$0,%rdx
1454	addq	%r12,%r10
1455	movq	%rdx,%r11
1456	adcq	$0,%r11
1457	movq	%r10,-8(%rdi,%rbp,1)
1458
1459	leaq	(%rbp),%rcx
1460	jmp	.Lsqr4x_inner
1461
1462.align	32
1463.Lsqr4x_inner:
1464	movq	(%rsi,%rcx,1),%rbx
1465	mulq	%r15
1466	addq	%rax,%r13
1467	movq	%rbx,%rax
1468	movq	%rdx,%r12
1469	adcq	$0,%r12
1470	addq	(%rdi,%rcx,1),%r13
1471	adcq	$0,%r12
1472
1473.byte	0x67
1474	mulq	%r14
1475	addq	%rax,%r11
1476	movq	%rbx,%rax
1477	movq	8(%rsi,%rcx,1),%rbx
1478	movq	%rdx,%r10
1479	adcq	$0,%r10
1480	addq	%r13,%r11
1481	adcq	$0,%r10
1482
1483	mulq	%r15
1484	addq	%rax,%r12
1485	movq	%r11,(%rdi,%rcx,1)
1486	movq	%rbx,%rax
1487	movq	%rdx,%r13
1488	adcq	$0,%r13
1489	addq	8(%rdi,%rcx,1),%r12
1490	leaq	16(%rcx),%rcx
1491	adcq	$0,%r13
1492
1493	mulq	%r14
1494	addq	%rax,%r10
1495	movq	%rbx,%rax
1496	adcq	$0,%rdx
1497	addq	%r12,%r10
1498	movq	%rdx,%r11
1499	adcq	$0,%r11
1500	movq	%r10,-8(%rdi,%rcx,1)
1501
1502	cmpq	$0,%rcx
1503	jne	.Lsqr4x_inner
1504
1505.byte	0x67
1506	mulq	%r15
1507	addq	%rax,%r13
1508	adcq	$0,%rdx
1509	addq	%r11,%r13
1510	adcq	$0,%rdx
1511
1512	movq	%r13,(%rdi)
1513	movq	%rdx,%r12
1514	movq	%rdx,8(%rdi)
1515
1516	addq	$16,%rbp
1517	jnz	.Lsqr4x_outer
1518
1519
1520	movq	-32(%rsi),%r14
1521	leaq	48+8(%rsp,%r9,2),%rdi
1522	movq	-24(%rsi),%rax
1523	leaq	-32(%rdi,%rbp,1),%rdi
1524	movq	-16(%rsi),%rbx
1525	movq	%rax,%r15
1526
1527	mulq	%r14
1528	addq	%rax,%r10
1529	movq	%rbx,%rax
1530	movq	%rdx,%r11
1531	adcq	$0,%r11
1532
1533	mulq	%r14
1534	addq	%rax,%r11
1535	movq	%rbx,%rax
1536	movq	%r10,-24(%rdi)
1537	movq	%rdx,%r10
1538	adcq	$0,%r10
1539	addq	%r13,%r11
1540	movq	-8(%rsi),%rbx
1541	adcq	$0,%r10
1542
1543	mulq	%r15
1544	addq	%rax,%r12
1545	movq	%rbx,%rax
1546	movq	%r11,-16(%rdi)
1547	movq	%rdx,%r13
1548	adcq	$0,%r13
1549
1550	mulq	%r14
1551	addq	%rax,%r10
1552	movq	%rbx,%rax
1553	movq	%rdx,%r11
1554	adcq	$0,%r11
1555	addq	%r12,%r10
1556	adcq	$0,%r11
1557	movq	%r10,-8(%rdi)
1558
1559	mulq	%r15
1560	addq	%rax,%r13
1561	movq	-16(%rsi),%rax
1562	adcq	$0,%rdx
1563	addq	%r11,%r13
1564	adcq	$0,%rdx
1565
1566	movq	%r13,(%rdi)
1567	movq	%rdx,%r12
1568	movq	%rdx,8(%rdi)
1569
1570	mulq	%rbx
1571	addq	$16,%rbp
1572	xorq	%r14,%r14
1573	subq	%r9,%rbp
1574	xorq	%r15,%r15
1575
1576	addq	%r12,%rax
1577	adcq	$0,%rdx
1578	movq	%rax,8(%rdi)
1579	movq	%rdx,16(%rdi)
1580	movq	%r15,24(%rdi)
1581
1582	movq	-16(%rsi,%rbp,1),%rax
1583	leaq	48+8(%rsp),%rdi
1584	xorq	%r10,%r10
1585	movq	8(%rdi),%r11
1586
1587	leaq	(%r14,%r10,2),%r12
1588	shrq	$63,%r10
1589	leaq	(%rcx,%r11,2),%r13
1590	shrq	$63,%r11
1591	orq	%r10,%r13
1592	movq	16(%rdi),%r10
1593	movq	%r11,%r14
1594	mulq	%rax
1595	negq	%r15
1596	movq	24(%rdi),%r11
1597	adcq	%rax,%r12
1598	movq	-8(%rsi,%rbp,1),%rax
1599	movq	%r12,(%rdi)
1600	adcq	%rdx,%r13
1601
1602	leaq	(%r14,%r10,2),%rbx
1603	movq	%r13,8(%rdi)
1604	sbbq	%r15,%r15
1605	shrq	$63,%r10
1606	leaq	(%rcx,%r11,2),%r8
1607	shrq	$63,%r11
1608	orq	%r10,%r8
1609	movq	32(%rdi),%r10
1610	movq	%r11,%r14
/*
 * NOTE(review): this chunk begins mid-way through bn_sqr8x_internal
 * (the function's entry is above this view).  What follows is the
 * "shift-and-add" phase of the squaring: the previously accumulated
 * cross products are doubled two limbs at a time via
 * lea (reg,reg2,2) + shr $63 carry propagation, and the squares of
 * the diagonal limbs (mulq %rax) are added in.
 * Visible register roles (TODO confirm against the function head):
 *   %rsi        input number, indexed negatively through %rbp
 *   %rdi        result/tmp vector, rewritten in place
 *   %rax:%rdx   a[i]^2 produced by mulq %rax
 *   %r15        inter-iteration carry kept as 0/-1 (sbb/neg idiom)
 *   %r10,%r11   next pair of limbs to double; %rcx,%r14 their
 *               shifted-out top bits from the previous pair
 */
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
/* Main loop: each pass doubles 8 limbs (four lea/shr pairs) and folds
 * in four diagonal squares; %rbp counts up toward zero. */
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

/* Peeled-off final iteration: handles the two topmost limb pairs,
 * where no further input limb needs to be fetched at the end. */
	leaq	(%r14,%r10,2),%r12
/* 0x67: address-size prefix used as a harmless padding byte (decoder
 * alignment tweak in the original generator). */
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
/* Hand-encoded: movq %xmm2,%rbp — presumably reloads the modulus
 * pointer (np) stashed in %xmm2 by the caller before falling through
 * into __bn_sqr8x_reduction below.  TODO confirm against the
 * function prologue (outside this view). */
.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction
 *
 * Montgomery reduction of the double-width vector produced by the
 * squaring above, processed eight limbs per outer iteration.
 * Inputs (as visible here):
 *   %rbp        modulus n[], advanced 64 bytes per inner pass
 *   %r9         byte size of the number (negated below)
 *   %rdi        48+8(%rsp)-based vector to reduce
 *   32+8(%rsp)  n0 = -1/n[0] mod 2^64 (Montgomery constant)
 * Clobbers %rax,%rbx,%rcx,%rdx,%rsi,%r8-%r15.  Reached by falling
 * through from the squaring code and also via call (bn_from_mont8x).
 * All adds/propagation are carry-chained; order is significant.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
/* Outer loop: load the next 8 limbs to reduce, then generate 8
 * reduction multipliers in .L8x_reduce. */
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
/* 0x66: operand-size prefix used purely as padding for alignment. */
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
/* Inner loop (8 passes): %rbx is the current Montgomery multiplier
 * (x * n0 mod 2^64); accumulate %rbx * n[0..7] into %r8-%r15 and
 * save the multiplier at 48-8+8(%rsp,%rcx,8) for the tail passes.
 * The next multiplier is computed on the fly via imulq %r8,%rsi. */
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

/* Tail: fold the 8 saved multipliers across the remaining limbs of
 * the wide vector, 8 at a time; %rsi remembers the top carry. */
.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
/* One pass per saved multiplier: multiply it against the next 8
 * modulus limbs and push the running window one limb to the right. */
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

/* More modulus limbs remain: re-seed the window with the saved top
 * carry (%rsi) and the next 8 limbs, then loop the tail again. */
	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

/* Hand-encoded: movq %xmm2,%rbp — presumably restores np from %xmm2.
 * TODO confirm against the prologue outside this view. */
.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
/* Hand-encoded: movq %xmm3,%r9 — presumably restores the (negated)
 * limb count from %xmm3.  TODO confirm. */
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
/* ret, hand-encoded (rep-ret style is emitted elsewhere; plain C3
 * here preceded by F3 = "rep ret" retpoline-era idiom). */
	.byte	0xf3,0xc3
/* Note: this .size closes bn_sqr8x_internal, whose entry label is
 * above this view; __bn_sqr8x_reduction is an interior entry point. */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal
 *
 * Constant-time conditional final subtraction after Montgomery
 * reduction: computes rp[] = tp[] - (n[] & mask), 4 limbs per pass,
 * where mask = -%rax (0 or all-ones borrow flag from the reduction).
 * The subtraction is realized as ~n & mask added with carry, so the
 * same instruction stream executes regardless of whether the
 * subtraction is needed — no secret-dependent branches.
 * In: %rbp = n[], %r9 = byte size, %rax = carry/borrow selector,
 *     %rbx = top of tp[] (derived below), result pointer pulled from
 *     %xmm1.  Out: %r9 re-negated for the caller; clobbers
 *     %rax,%rcx,%rsi(?),%r10,%r12-%r15,%rbx,%rdi.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
/* Hand-encoded: movq %xmm1,%rdi — presumably rp (result pointer)
 * stashed in %xmm1 by the caller.  TODO confirm. */
.byte	102,72,15,126,207
	negq	%rax
/* Hand-encoded: movq %xmm1,%rsi — same value into %rsi. */
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

/* negq %r10 reloads CF from the previous pass (0/-1 idiom). */
	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery — public dispatcher.
 *
 * If the limb count (%r9d) is a multiple of 8, tail-jumps to the
 * optimized bn_from_mont8x path; otherwise returns 0 (%eax) so the
 * caller falls back to a generic implementation.
 */
.globl	bn_from_montgomery
.hidden bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3
.size	bn_from_montgomery,.-bn_from_montgomery
2051
/*
 * bn_from_mont8x
 *
 * Converts a number out of Montgomery form (multiply by 1 and
 * Montgomery-reduce) for limb counts divisible by 8.
 * SysV args: %rdi=rp, %rsi=ap, %rdx=bp(unused here), %rcx=np,
 * %r8=n0 ptr, %r9=num.  Saves all callee-saved GPRs, builds a
 * scratch frame below %rsp, runs __bn_sqr8x_reduction +
 * __bn_post4x_internal, then wipes the scratch frame before return.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

/* Frame placement: choose a stack position for the 2*num scratch
 * area that avoids cache-bank aliasing with rp (%rdi); the two
 * branches below pick whichever of the aliasing/non-aliasing
 * placements is safe, then align to 64. */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

/* Touch each page of the new frame top-down so the OS guard page
 * mechanism (stack probing) is not skipped over. */
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
/* DWARF expression: CFA = *(rsp+40)+8 (saved %rax = original rsp). */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
/* "Multiply by 1": copy the input into the low half of the scratch
 * vector while zeroing the high half, 64 bytes per pass. */
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
/* Hand-encoded: leaq 64(%rsi),%rsi (fixed 7-byte encoding). */
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

/* Stash pointers in xmm registers for the callees (hand-encoded):
 * movq %rdi,%xmm1 / movq %rcx,%xmm2 / movq %r10,%xmm3. */
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
/* Wipe the scratch frame (it held secret intermediate values)
 * before releasing the stack. */
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_scatter5(inp=%rdi, num=%esi, tbl=%rdx, idx=%rcx)
 *
 * Writes num 64-bit limbs from inp into column idx of the power
 * table: destination starts at tbl + idx*8 and advances by 256
 * bytes per limb (32 interleaved entries per row).  No-op when
 * num == 0.
 */
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3
.size	bn_scatter5,.-bn_scatter5
2222
/*
 * bn_gather5(out=%rdi, num=%esi, tbl=%rdx, idx=%ecx)
 *
 * Constant-time gather: reads one column (selected by idx) out of
 * the interleaved power table written by bn_scatter5, WITHOUT any
 * secret-dependent addressing.  It first expands idx into sixteen
 * 128-bit equality masks on the stack (pcmpeqd against the counters
 * seeded from .Linc), then for every limb reads ALL 256 bytes of a
 * table row, ANDs with the masks and ORs the lanes together, so the
 * memory access pattern is independent of idx.
 */
.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

/* Hand-encoded prologue with a pinned byte length (referenced by the
 * Win64 SEH tables in the sibling build):
 *   lea (%rsp),%r10  /  sub $0x108,%rsp */
.byte	0x4c,0x8d,0x14,0x24
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11
	leaq	128(%rsp),%rax

/* Build the 16 selection masks: %xmm5 = broadcast idx; %xmm0/%xmm1
 * count 0,1,2,... in steps of 2 (%xmm4); each pcmpeqd leaves
 * all-ones only in the lane equal to idx. */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
/* Per-limb loop: scan a full 256-byte row (%r11), mask with the 16
 * stored masks (%rax), OR everything into %xmm4, then fold the two
 * 64-bit halves and emit one limb. */
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

/* Restore the caller's stack pointer saved in %r10 by the
 * hand-encoded prologue. */
	leaq	(%r10),%rsp
	.byte	0xf3,0xc3
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
/* .Linc: seed counters {0,1} and per-step increment {2,2} used by
 * the mask-generation code in bn_mul_mont_gather5 and bn_gather5. */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
/* ASCII ident string: "Montgomery Multiplication with scatter/gather
 * for x86_64, CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated). */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2392#endif
2393