1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
13
14
15.globl	_GFp_bn_mul_mont_gather5
16.private_extern _GFp_bn_mul_mont_gather5
17
18.p2align	6
19_GFp_bn_mul_mont_gather5:
20
21	movl	%r9d,%r9d
22	movq	%rsp,%rax
23
24	testl	$7,%r9d
25	jnz	L$mul_enter
26	leaq	_GFp_ia32cap_P(%rip),%r11
27	movl	8(%r11),%r11d
28	jmp	L$mul4x_enter
29
30.p2align	4
31L$mul_enter:
32	movd	8(%rsp),%xmm5
33	pushq	%rbx
34
35	pushq	%rbp
36
37	pushq	%r12
38
39	pushq	%r13
40
41	pushq	%r14
42
43	pushq	%r15
44
45
46	negq	%r9
47	movq	%rsp,%r11
48	leaq	-280(%rsp,%r9,8),%r10
49	negq	%r9
50	andq	$-1024,%r10
51
52
53
54
55
56
57
58
59
60	subq	%r10,%r11
61	andq	$-4096,%r11
62	leaq	(%r10,%r11,1),%rsp
63	movq	(%rsp),%r11
64	cmpq	%r10,%rsp
65	ja	L$mul_page_walk
66	jmp	L$mul_page_walk_done
67
68L$mul_page_walk:
69	leaq	-4096(%rsp),%rsp
70	movq	(%rsp),%r11
71	cmpq	%r10,%rsp
72	ja	L$mul_page_walk
73L$mul_page_walk_done:
74
75	leaq	L$inc(%rip),%r10
76	movq	%rax,8(%rsp,%r9,8)
77
78L$mul_body:
79
80	leaq	128(%rdx),%r12
81	movdqa	0(%r10),%xmm0
82	movdqa	16(%r10),%xmm1
83	leaq	24-112(%rsp,%r9,8),%r10
84	andq	$-16,%r10
85
86	pshufd	$0,%xmm5,%xmm5
87	movdqa	%xmm1,%xmm4
88	movdqa	%xmm1,%xmm2
89	paddd	%xmm0,%xmm1
90	pcmpeqd	%xmm5,%xmm0
91.byte	0x67
92	movdqa	%xmm4,%xmm3
93	paddd	%xmm1,%xmm2
94	pcmpeqd	%xmm5,%xmm1
95	movdqa	%xmm0,112(%r10)
96	movdqa	%xmm4,%xmm0
97
98	paddd	%xmm2,%xmm3
99	pcmpeqd	%xmm5,%xmm2
100	movdqa	%xmm1,128(%r10)
101	movdqa	%xmm4,%xmm1
102
103	paddd	%xmm3,%xmm0
104	pcmpeqd	%xmm5,%xmm3
105	movdqa	%xmm2,144(%r10)
106	movdqa	%xmm4,%xmm2
107
108	paddd	%xmm0,%xmm1
109	pcmpeqd	%xmm5,%xmm0
110	movdqa	%xmm3,160(%r10)
111	movdqa	%xmm4,%xmm3
112	paddd	%xmm1,%xmm2
113	pcmpeqd	%xmm5,%xmm1
114	movdqa	%xmm0,176(%r10)
115	movdqa	%xmm4,%xmm0
116
117	paddd	%xmm2,%xmm3
118	pcmpeqd	%xmm5,%xmm2
119	movdqa	%xmm1,192(%r10)
120	movdqa	%xmm4,%xmm1
121
122	paddd	%xmm3,%xmm0
123	pcmpeqd	%xmm5,%xmm3
124	movdqa	%xmm2,208(%r10)
125	movdqa	%xmm4,%xmm2
126
127	paddd	%xmm0,%xmm1
128	pcmpeqd	%xmm5,%xmm0
129	movdqa	%xmm3,224(%r10)
130	movdqa	%xmm4,%xmm3
131	paddd	%xmm1,%xmm2
132	pcmpeqd	%xmm5,%xmm1
133	movdqa	%xmm0,240(%r10)
134	movdqa	%xmm4,%xmm0
135
136	paddd	%xmm2,%xmm3
137	pcmpeqd	%xmm5,%xmm2
138	movdqa	%xmm1,256(%r10)
139	movdqa	%xmm4,%xmm1
140
141	paddd	%xmm3,%xmm0
142	pcmpeqd	%xmm5,%xmm3
143	movdqa	%xmm2,272(%r10)
144	movdqa	%xmm4,%xmm2
145
146	paddd	%xmm0,%xmm1
147	pcmpeqd	%xmm5,%xmm0
148	movdqa	%xmm3,288(%r10)
149	movdqa	%xmm4,%xmm3
150	paddd	%xmm1,%xmm2
151	pcmpeqd	%xmm5,%xmm1
152	movdqa	%xmm0,304(%r10)
153
154	paddd	%xmm2,%xmm3
155.byte	0x67
156	pcmpeqd	%xmm5,%xmm2
157	movdqa	%xmm1,320(%r10)
158
159	pcmpeqd	%xmm5,%xmm3
160	movdqa	%xmm2,336(%r10)
161	pand	64(%r12),%xmm0
162
163	pand	80(%r12),%xmm1
164	pand	96(%r12),%xmm2
165	movdqa	%xmm3,352(%r10)
166	pand	112(%r12),%xmm3
167	por	%xmm2,%xmm0
168	por	%xmm3,%xmm1
169	movdqa	-128(%r12),%xmm4
170	movdqa	-112(%r12),%xmm5
171	movdqa	-96(%r12),%xmm2
172	pand	112(%r10),%xmm4
173	movdqa	-80(%r12),%xmm3
174	pand	128(%r10),%xmm5
175	por	%xmm4,%xmm0
176	pand	144(%r10),%xmm2
177	por	%xmm5,%xmm1
178	pand	160(%r10),%xmm3
179	por	%xmm2,%xmm0
180	por	%xmm3,%xmm1
181	movdqa	-64(%r12),%xmm4
182	movdqa	-48(%r12),%xmm5
183	movdqa	-32(%r12),%xmm2
184	pand	176(%r10),%xmm4
185	movdqa	-16(%r12),%xmm3
186	pand	192(%r10),%xmm5
187	por	%xmm4,%xmm0
188	pand	208(%r10),%xmm2
189	por	%xmm5,%xmm1
190	pand	224(%r10),%xmm3
191	por	%xmm2,%xmm0
192	por	%xmm3,%xmm1
193	movdqa	0(%r12),%xmm4
194	movdqa	16(%r12),%xmm5
195	movdqa	32(%r12),%xmm2
196	pand	240(%r10),%xmm4
197	movdqa	48(%r12),%xmm3
198	pand	256(%r10),%xmm5
199	por	%xmm4,%xmm0
200	pand	272(%r10),%xmm2
201	por	%xmm5,%xmm1
202	pand	288(%r10),%xmm3
203	por	%xmm2,%xmm0
204	por	%xmm3,%xmm1
205	por	%xmm1,%xmm0
206	pshufd	$0x4e,%xmm0,%xmm1
207	por	%xmm1,%xmm0
208	leaq	256(%r12),%r12
209.byte	102,72,15,126,195
210
211	movq	(%r8),%r8
212	movq	(%rsi),%rax
213
214	xorq	%r14,%r14
215	xorq	%r15,%r15
216
217	movq	%r8,%rbp
218	mulq	%rbx
219	movq	%rax,%r10
220	movq	(%rcx),%rax
221
222	imulq	%r10,%rbp
223	movq	%rdx,%r11
224
225	mulq	%rbp
226	addq	%rax,%r10
227	movq	8(%rsi),%rax
228	adcq	$0,%rdx
229	movq	%rdx,%r13
230
231	leaq	1(%r15),%r15
232	jmp	L$1st_enter
233
234.p2align	4
235L$1st:
236	addq	%rax,%r13
237	movq	(%rsi,%r15,8),%rax
238	adcq	$0,%rdx
239	addq	%r11,%r13
240	movq	%r10,%r11
241	adcq	$0,%rdx
242	movq	%r13,-16(%rsp,%r15,8)
243	movq	%rdx,%r13
244
245L$1st_enter:
246	mulq	%rbx
247	addq	%rax,%r11
248	movq	(%rcx,%r15,8),%rax
249	adcq	$0,%rdx
250	leaq	1(%r15),%r15
251	movq	%rdx,%r10
252
253	mulq	%rbp
254	cmpq	%r9,%r15
255	jne	L$1st
256
257
258	addq	%rax,%r13
259	adcq	$0,%rdx
260	addq	%r11,%r13
261	adcq	$0,%rdx
262	movq	%r13,-16(%rsp,%r9,8)
263	movq	%rdx,%r13
264	movq	%r10,%r11
265
266	xorq	%rdx,%rdx
267	addq	%r11,%r13
268	adcq	$0,%rdx
269	movq	%r13,-8(%rsp,%r9,8)
270	movq	%rdx,(%rsp,%r9,8)
271
272	leaq	1(%r14),%r14
273	jmp	L$outer
274.p2align	4
275L$outer:
276	leaq	24+128(%rsp,%r9,8),%rdx
277	andq	$-16,%rdx
278	pxor	%xmm4,%xmm4
279	pxor	%xmm5,%xmm5
280	movdqa	-128(%r12),%xmm0
281	movdqa	-112(%r12),%xmm1
282	movdqa	-96(%r12),%xmm2
283	movdqa	-80(%r12),%xmm3
284	pand	-128(%rdx),%xmm0
285	pand	-112(%rdx),%xmm1
286	por	%xmm0,%xmm4
287	pand	-96(%rdx),%xmm2
288	por	%xmm1,%xmm5
289	pand	-80(%rdx),%xmm3
290	por	%xmm2,%xmm4
291	por	%xmm3,%xmm5
292	movdqa	-64(%r12),%xmm0
293	movdqa	-48(%r12),%xmm1
294	movdqa	-32(%r12),%xmm2
295	movdqa	-16(%r12),%xmm3
296	pand	-64(%rdx),%xmm0
297	pand	-48(%rdx),%xmm1
298	por	%xmm0,%xmm4
299	pand	-32(%rdx),%xmm2
300	por	%xmm1,%xmm5
301	pand	-16(%rdx),%xmm3
302	por	%xmm2,%xmm4
303	por	%xmm3,%xmm5
304	movdqa	0(%r12),%xmm0
305	movdqa	16(%r12),%xmm1
306	movdqa	32(%r12),%xmm2
307	movdqa	48(%r12),%xmm3
308	pand	0(%rdx),%xmm0
309	pand	16(%rdx),%xmm1
310	por	%xmm0,%xmm4
311	pand	32(%rdx),%xmm2
312	por	%xmm1,%xmm5
313	pand	48(%rdx),%xmm3
314	por	%xmm2,%xmm4
315	por	%xmm3,%xmm5
316	movdqa	64(%r12),%xmm0
317	movdqa	80(%r12),%xmm1
318	movdqa	96(%r12),%xmm2
319	movdqa	112(%r12),%xmm3
320	pand	64(%rdx),%xmm0
321	pand	80(%rdx),%xmm1
322	por	%xmm0,%xmm4
323	pand	96(%rdx),%xmm2
324	por	%xmm1,%xmm5
325	pand	112(%rdx),%xmm3
326	por	%xmm2,%xmm4
327	por	%xmm3,%xmm5
328	por	%xmm5,%xmm4
329	pshufd	$0x4e,%xmm4,%xmm0
330	por	%xmm4,%xmm0
331	leaq	256(%r12),%r12
332
333	movq	(%rsi),%rax
334.byte	102,72,15,126,195
335
336	xorq	%r15,%r15
337	movq	%r8,%rbp
338	movq	(%rsp),%r10
339
340	mulq	%rbx
341	addq	%rax,%r10
342	movq	(%rcx),%rax
343	adcq	$0,%rdx
344
345	imulq	%r10,%rbp
346	movq	%rdx,%r11
347
348	mulq	%rbp
349	addq	%rax,%r10
350	movq	8(%rsi),%rax
351	adcq	$0,%rdx
352	movq	8(%rsp),%r10
353	movq	%rdx,%r13
354
355	leaq	1(%r15),%r15
356	jmp	L$inner_enter
357
358.p2align	4
359L$inner:
360	addq	%rax,%r13
361	movq	(%rsi,%r15,8),%rax
362	adcq	$0,%rdx
363	addq	%r10,%r13
364	movq	(%rsp,%r15,8),%r10
365	adcq	$0,%rdx
366	movq	%r13,-16(%rsp,%r15,8)
367	movq	%rdx,%r13
368
369L$inner_enter:
370	mulq	%rbx
371	addq	%rax,%r11
372	movq	(%rcx,%r15,8),%rax
373	adcq	$0,%rdx
374	addq	%r11,%r10
375	movq	%rdx,%r11
376	adcq	$0,%r11
377	leaq	1(%r15),%r15
378
379	mulq	%rbp
380	cmpq	%r9,%r15
381	jne	L$inner
382
383	addq	%rax,%r13
384	adcq	$0,%rdx
385	addq	%r10,%r13
386	movq	(%rsp,%r9,8),%r10
387	adcq	$0,%rdx
388	movq	%r13,-16(%rsp,%r9,8)
389	movq	%rdx,%r13
390
391	xorq	%rdx,%rdx
392	addq	%r11,%r13
393	adcq	$0,%rdx
394	addq	%r10,%r13
395	adcq	$0,%rdx
396	movq	%r13,-8(%rsp,%r9,8)
397	movq	%rdx,(%rsp,%r9,8)
398
399	leaq	1(%r14),%r14
400	cmpq	%r9,%r14
401	jb	L$outer
402
403	xorq	%r14,%r14
404	movq	(%rsp),%rax
405	leaq	(%rsp),%rsi
406	movq	%r9,%r15
407	jmp	L$sub
408.p2align	4
409L$sub:	sbbq	(%rcx,%r14,8),%rax
410	movq	%rax,(%rdi,%r14,8)
411	movq	8(%rsi,%r14,8),%rax
412	leaq	1(%r14),%r14
413	decq	%r15
414	jnz	L$sub
415
416	sbbq	$0,%rax
417	movq	$-1,%rbx
418	xorq	%rax,%rbx
419	xorq	%r14,%r14
420	movq	%r9,%r15
421
422L$copy:
423	movq	(%rdi,%r14,8),%rcx
424	movq	(%rsp,%r14,8),%rdx
425	andq	%rbx,%rcx
426	andq	%rax,%rdx
427	movq	%r14,(%rsp,%r14,8)
428	orq	%rcx,%rdx
429	movq	%rdx,(%rdi,%r14,8)
430	leaq	1(%r14),%r14
431	subq	$1,%r15
432	jnz	L$copy
433
434	movq	8(%rsp,%r9,8),%rsi
435
436	movq	$1,%rax
437
438	movq	-48(%rsi),%r15
439
440	movq	-40(%rsi),%r14
441
442	movq	-32(%rsi),%r13
443
444	movq	-24(%rsi),%r12
445
446	movq	-16(%rsi),%rbp
447
448	movq	-8(%rsi),%rbx
449
450	leaq	(%rsi),%rsp
451
452L$mul_epilogue:
453	.byte	0xf3,0xc3
454
455
456
457.p2align	5
458bn_mul4x_mont_gather5:
459
460.byte	0x67
461	movq	%rsp,%rax
462
463L$mul4x_enter:
464	andl	$0x80108,%r11d
465	cmpl	$0x80108,%r11d
466	je	L$mulx4x_enter
467	pushq	%rbx
468
469	pushq	%rbp
470
471	pushq	%r12
472
473	pushq	%r13
474
475	pushq	%r14
476
477	pushq	%r15
478
479L$mul4x_prologue:
480
481.byte	0x67
482	shll	$3,%r9d
483	leaq	(%r9,%r9,2),%r10
484	negq	%r9
485
486
487
488
489
490
491
492
493
494
495	leaq	-320(%rsp,%r9,2),%r11
496	movq	%rsp,%rbp
497	subq	%rdi,%r11
498	andq	$4095,%r11
499	cmpq	%r11,%r10
500	jb	L$mul4xsp_alt
501	subq	%r11,%rbp
502	leaq	-320(%rbp,%r9,2),%rbp
503	jmp	L$mul4xsp_done
504
505.p2align	5
506L$mul4xsp_alt:
507	leaq	4096-320(,%r9,2),%r10
508	leaq	-320(%rbp,%r9,2),%rbp
509	subq	%r10,%r11
510	movq	$0,%r10
511	cmovcq	%r10,%r11
512	subq	%r11,%rbp
513L$mul4xsp_done:
514	andq	$-64,%rbp
515	movq	%rsp,%r11
516	subq	%rbp,%r11
517	andq	$-4096,%r11
518	leaq	(%r11,%rbp,1),%rsp
519	movq	(%rsp),%r10
520	cmpq	%rbp,%rsp
521	ja	L$mul4x_page_walk
522	jmp	L$mul4x_page_walk_done
523
524L$mul4x_page_walk:
525	leaq	-4096(%rsp),%rsp
526	movq	(%rsp),%r10
527	cmpq	%rbp,%rsp
528	ja	L$mul4x_page_walk
529L$mul4x_page_walk_done:
530
531	negq	%r9
532
533	movq	%rax,40(%rsp)
534
535L$mul4x_body:
536
537	call	mul4x_internal
538
539	movq	40(%rsp),%rsi
540
541	movq	$1,%rax
542
543	movq	-48(%rsi),%r15
544
545	movq	-40(%rsi),%r14
546
547	movq	-32(%rsi),%r13
548
549	movq	-24(%rsi),%r12
550
551	movq	-16(%rsi),%rbp
552
553	movq	-8(%rsi),%rbx
554
555	leaq	(%rsi),%rsp
556
557L$mul4x_epilogue:
558	.byte	0xf3,0xc3
559
560
561
562
563.p2align	5
564mul4x_internal:
565
566	shlq	$5,%r9
567	movd	8(%rax),%xmm5
568	leaq	L$inc(%rip),%rax
569	leaq	128(%rdx,%r9,1),%r13
570	shrq	$5,%r9
571	movdqa	0(%rax),%xmm0
572	movdqa	16(%rax),%xmm1
573	leaq	88-112(%rsp,%r9,1),%r10
574	leaq	128(%rdx),%r12
575
576	pshufd	$0,%xmm5,%xmm5
577	movdqa	%xmm1,%xmm4
578.byte	0x67,0x67
579	movdqa	%xmm1,%xmm2
580	paddd	%xmm0,%xmm1
581	pcmpeqd	%xmm5,%xmm0
582.byte	0x67
583	movdqa	%xmm4,%xmm3
584	paddd	%xmm1,%xmm2
585	pcmpeqd	%xmm5,%xmm1
586	movdqa	%xmm0,112(%r10)
587	movdqa	%xmm4,%xmm0
588
589	paddd	%xmm2,%xmm3
590	pcmpeqd	%xmm5,%xmm2
591	movdqa	%xmm1,128(%r10)
592	movdqa	%xmm4,%xmm1
593
594	paddd	%xmm3,%xmm0
595	pcmpeqd	%xmm5,%xmm3
596	movdqa	%xmm2,144(%r10)
597	movdqa	%xmm4,%xmm2
598
599	paddd	%xmm0,%xmm1
600	pcmpeqd	%xmm5,%xmm0
601	movdqa	%xmm3,160(%r10)
602	movdqa	%xmm4,%xmm3
603	paddd	%xmm1,%xmm2
604	pcmpeqd	%xmm5,%xmm1
605	movdqa	%xmm0,176(%r10)
606	movdqa	%xmm4,%xmm0
607
608	paddd	%xmm2,%xmm3
609	pcmpeqd	%xmm5,%xmm2
610	movdqa	%xmm1,192(%r10)
611	movdqa	%xmm4,%xmm1
612
613	paddd	%xmm3,%xmm0
614	pcmpeqd	%xmm5,%xmm3
615	movdqa	%xmm2,208(%r10)
616	movdqa	%xmm4,%xmm2
617
618	paddd	%xmm0,%xmm1
619	pcmpeqd	%xmm5,%xmm0
620	movdqa	%xmm3,224(%r10)
621	movdqa	%xmm4,%xmm3
622	paddd	%xmm1,%xmm2
623	pcmpeqd	%xmm5,%xmm1
624	movdqa	%xmm0,240(%r10)
625	movdqa	%xmm4,%xmm0
626
627	paddd	%xmm2,%xmm3
628	pcmpeqd	%xmm5,%xmm2
629	movdqa	%xmm1,256(%r10)
630	movdqa	%xmm4,%xmm1
631
632	paddd	%xmm3,%xmm0
633	pcmpeqd	%xmm5,%xmm3
634	movdqa	%xmm2,272(%r10)
635	movdqa	%xmm4,%xmm2
636
637	paddd	%xmm0,%xmm1
638	pcmpeqd	%xmm5,%xmm0
639	movdqa	%xmm3,288(%r10)
640	movdqa	%xmm4,%xmm3
641	paddd	%xmm1,%xmm2
642	pcmpeqd	%xmm5,%xmm1
643	movdqa	%xmm0,304(%r10)
644
645	paddd	%xmm2,%xmm3
646.byte	0x67
647	pcmpeqd	%xmm5,%xmm2
648	movdqa	%xmm1,320(%r10)
649
650	pcmpeqd	%xmm5,%xmm3
651	movdqa	%xmm2,336(%r10)
652	pand	64(%r12),%xmm0
653
654	pand	80(%r12),%xmm1
655	pand	96(%r12),%xmm2
656	movdqa	%xmm3,352(%r10)
657	pand	112(%r12),%xmm3
658	por	%xmm2,%xmm0
659	por	%xmm3,%xmm1
660	movdqa	-128(%r12),%xmm4
661	movdqa	-112(%r12),%xmm5
662	movdqa	-96(%r12),%xmm2
663	pand	112(%r10),%xmm4
664	movdqa	-80(%r12),%xmm3
665	pand	128(%r10),%xmm5
666	por	%xmm4,%xmm0
667	pand	144(%r10),%xmm2
668	por	%xmm5,%xmm1
669	pand	160(%r10),%xmm3
670	por	%xmm2,%xmm0
671	por	%xmm3,%xmm1
672	movdqa	-64(%r12),%xmm4
673	movdqa	-48(%r12),%xmm5
674	movdqa	-32(%r12),%xmm2
675	pand	176(%r10),%xmm4
676	movdqa	-16(%r12),%xmm3
677	pand	192(%r10),%xmm5
678	por	%xmm4,%xmm0
679	pand	208(%r10),%xmm2
680	por	%xmm5,%xmm1
681	pand	224(%r10),%xmm3
682	por	%xmm2,%xmm0
683	por	%xmm3,%xmm1
684	movdqa	0(%r12),%xmm4
685	movdqa	16(%r12),%xmm5
686	movdqa	32(%r12),%xmm2
687	pand	240(%r10),%xmm4
688	movdqa	48(%r12),%xmm3
689	pand	256(%r10),%xmm5
690	por	%xmm4,%xmm0
691	pand	272(%r10),%xmm2
692	por	%xmm5,%xmm1
693	pand	288(%r10),%xmm3
694	por	%xmm2,%xmm0
695	por	%xmm3,%xmm1
696	por	%xmm1,%xmm0
697	pshufd	$0x4e,%xmm0,%xmm1
698	por	%xmm1,%xmm0
699	leaq	256(%r12),%r12
700.byte	102,72,15,126,195
701
702	movq	%r13,16+8(%rsp)
703	movq	%rdi,56+8(%rsp)
704
705	movq	(%r8),%r8
706	movq	(%rsi),%rax
707	leaq	(%rsi,%r9,1),%rsi
708	negq	%r9
709
710	movq	%r8,%rbp
711	mulq	%rbx
712	movq	%rax,%r10
713	movq	(%rcx),%rax
714
715	imulq	%r10,%rbp
716	leaq	64+8(%rsp),%r14
717	movq	%rdx,%r11
718
719	mulq	%rbp
720	addq	%rax,%r10
721	movq	8(%rsi,%r9,1),%rax
722	adcq	$0,%rdx
723	movq	%rdx,%rdi
724
725	mulq	%rbx
726	addq	%rax,%r11
727	movq	8(%rcx),%rax
728	adcq	$0,%rdx
729	movq	%rdx,%r10
730
731	mulq	%rbp
732	addq	%rax,%rdi
733	movq	16(%rsi,%r9,1),%rax
734	adcq	$0,%rdx
735	addq	%r11,%rdi
736	leaq	32(%r9),%r15
737	leaq	32(%rcx),%rcx
738	adcq	$0,%rdx
739	movq	%rdi,(%r14)
740	movq	%rdx,%r13
741	jmp	L$1st4x
742
743.p2align	5
744L$1st4x:
745	mulq	%rbx
746	addq	%rax,%r10
747	movq	-16(%rcx),%rax
748	leaq	32(%r14),%r14
749	adcq	$0,%rdx
750	movq	%rdx,%r11
751
752	mulq	%rbp
753	addq	%rax,%r13
754	movq	-8(%rsi,%r15,1),%rax
755	adcq	$0,%rdx
756	addq	%r10,%r13
757	adcq	$0,%rdx
758	movq	%r13,-24(%r14)
759	movq	%rdx,%rdi
760
761	mulq	%rbx
762	addq	%rax,%r11
763	movq	-8(%rcx),%rax
764	adcq	$0,%rdx
765	movq	%rdx,%r10
766
767	mulq	%rbp
768	addq	%rax,%rdi
769	movq	(%rsi,%r15,1),%rax
770	adcq	$0,%rdx
771	addq	%r11,%rdi
772	adcq	$0,%rdx
773	movq	%rdi,-16(%r14)
774	movq	%rdx,%r13
775
776	mulq	%rbx
777	addq	%rax,%r10
778	movq	0(%rcx),%rax
779	adcq	$0,%rdx
780	movq	%rdx,%r11
781
782	mulq	%rbp
783	addq	%rax,%r13
784	movq	8(%rsi,%r15,1),%rax
785	adcq	$0,%rdx
786	addq	%r10,%r13
787	adcq	$0,%rdx
788	movq	%r13,-8(%r14)
789	movq	%rdx,%rdi
790
791	mulq	%rbx
792	addq	%rax,%r11
793	movq	8(%rcx),%rax
794	adcq	$0,%rdx
795	movq	%rdx,%r10
796
797	mulq	%rbp
798	addq	%rax,%rdi
799	movq	16(%rsi,%r15,1),%rax
800	adcq	$0,%rdx
801	addq	%r11,%rdi
802	leaq	32(%rcx),%rcx
803	adcq	$0,%rdx
804	movq	%rdi,(%r14)
805	movq	%rdx,%r13
806
807	addq	$32,%r15
808	jnz	L$1st4x
809
810	mulq	%rbx
811	addq	%rax,%r10
812	movq	-16(%rcx),%rax
813	leaq	32(%r14),%r14
814	adcq	$0,%rdx
815	movq	%rdx,%r11
816
817	mulq	%rbp
818	addq	%rax,%r13
819	movq	-8(%rsi),%rax
820	adcq	$0,%rdx
821	addq	%r10,%r13
822	adcq	$0,%rdx
823	movq	%r13,-24(%r14)
824	movq	%rdx,%rdi
825
826	mulq	%rbx
827	addq	%rax,%r11
828	movq	-8(%rcx),%rax
829	adcq	$0,%rdx
830	movq	%rdx,%r10
831
832	mulq	%rbp
833	addq	%rax,%rdi
834	movq	(%rsi,%r9,1),%rax
835	adcq	$0,%rdx
836	addq	%r11,%rdi
837	adcq	$0,%rdx
838	movq	%rdi,-16(%r14)
839	movq	%rdx,%r13
840
841	leaq	(%rcx,%r9,1),%rcx
842
843	xorq	%rdi,%rdi
844	addq	%r10,%r13
845	adcq	$0,%rdi
846	movq	%r13,-8(%r14)
847
848	jmp	L$outer4x
849
850.p2align	5
851L$outer4x:
852	leaq	16+128(%r14),%rdx
853	pxor	%xmm4,%xmm4
854	pxor	%xmm5,%xmm5
855	movdqa	-128(%r12),%xmm0
856	movdqa	-112(%r12),%xmm1
857	movdqa	-96(%r12),%xmm2
858	movdqa	-80(%r12),%xmm3
859	pand	-128(%rdx),%xmm0
860	pand	-112(%rdx),%xmm1
861	por	%xmm0,%xmm4
862	pand	-96(%rdx),%xmm2
863	por	%xmm1,%xmm5
864	pand	-80(%rdx),%xmm3
865	por	%xmm2,%xmm4
866	por	%xmm3,%xmm5
867	movdqa	-64(%r12),%xmm0
868	movdqa	-48(%r12),%xmm1
869	movdqa	-32(%r12),%xmm2
870	movdqa	-16(%r12),%xmm3
871	pand	-64(%rdx),%xmm0
872	pand	-48(%rdx),%xmm1
873	por	%xmm0,%xmm4
874	pand	-32(%rdx),%xmm2
875	por	%xmm1,%xmm5
876	pand	-16(%rdx),%xmm3
877	por	%xmm2,%xmm4
878	por	%xmm3,%xmm5
879	movdqa	0(%r12),%xmm0
880	movdqa	16(%r12),%xmm1
881	movdqa	32(%r12),%xmm2
882	movdqa	48(%r12),%xmm3
883	pand	0(%rdx),%xmm0
884	pand	16(%rdx),%xmm1
885	por	%xmm0,%xmm4
886	pand	32(%rdx),%xmm2
887	por	%xmm1,%xmm5
888	pand	48(%rdx),%xmm3
889	por	%xmm2,%xmm4
890	por	%xmm3,%xmm5
891	movdqa	64(%r12),%xmm0
892	movdqa	80(%r12),%xmm1
893	movdqa	96(%r12),%xmm2
894	movdqa	112(%r12),%xmm3
895	pand	64(%rdx),%xmm0
896	pand	80(%rdx),%xmm1
897	por	%xmm0,%xmm4
898	pand	96(%rdx),%xmm2
899	por	%xmm1,%xmm5
900	pand	112(%rdx),%xmm3
901	por	%xmm2,%xmm4
902	por	%xmm3,%xmm5
903	por	%xmm5,%xmm4
904	pshufd	$0x4e,%xmm4,%xmm0
905	por	%xmm4,%xmm0
906	leaq	256(%r12),%r12
907.byte	102,72,15,126,195
908
909	movq	(%r14,%r9,1),%r10
910	movq	%r8,%rbp
911	mulq	%rbx
912	addq	%rax,%r10
913	movq	(%rcx),%rax
914	adcq	$0,%rdx
915
916	imulq	%r10,%rbp
917	movq	%rdx,%r11
918	movq	%rdi,(%r14)
919
920	leaq	(%r14,%r9,1),%r14
921
922	mulq	%rbp
923	addq	%rax,%r10
924	movq	8(%rsi,%r9,1),%rax
925	adcq	$0,%rdx
926	movq	%rdx,%rdi
927
928	mulq	%rbx
929	addq	%rax,%r11
930	movq	8(%rcx),%rax
931	adcq	$0,%rdx
932	addq	8(%r14),%r11
933	adcq	$0,%rdx
934	movq	%rdx,%r10
935
936	mulq	%rbp
937	addq	%rax,%rdi
938	movq	16(%rsi,%r9,1),%rax
939	adcq	$0,%rdx
940	addq	%r11,%rdi
941	leaq	32(%r9),%r15
942	leaq	32(%rcx),%rcx
943	adcq	$0,%rdx
944	movq	%rdx,%r13
945	jmp	L$inner4x
946
947.p2align	5
948L$inner4x:
949	mulq	%rbx
950	addq	%rax,%r10
951	movq	-16(%rcx),%rax
952	adcq	$0,%rdx
953	addq	16(%r14),%r10
954	leaq	32(%r14),%r14
955	adcq	$0,%rdx
956	movq	%rdx,%r11
957
958	mulq	%rbp
959	addq	%rax,%r13
960	movq	-8(%rsi,%r15,1),%rax
961	adcq	$0,%rdx
962	addq	%r10,%r13
963	adcq	$0,%rdx
964	movq	%rdi,-32(%r14)
965	movq	%rdx,%rdi
966
967	mulq	%rbx
968	addq	%rax,%r11
969	movq	-8(%rcx),%rax
970	adcq	$0,%rdx
971	addq	-8(%r14),%r11
972	adcq	$0,%rdx
973	movq	%rdx,%r10
974
975	mulq	%rbp
976	addq	%rax,%rdi
977	movq	(%rsi,%r15,1),%rax
978	adcq	$0,%rdx
979	addq	%r11,%rdi
980	adcq	$0,%rdx
981	movq	%r13,-24(%r14)
982	movq	%rdx,%r13
983
984	mulq	%rbx
985	addq	%rax,%r10
986	movq	0(%rcx),%rax
987	adcq	$0,%rdx
988	addq	(%r14),%r10
989	adcq	$0,%rdx
990	movq	%rdx,%r11
991
992	mulq	%rbp
993	addq	%rax,%r13
994	movq	8(%rsi,%r15,1),%rax
995	adcq	$0,%rdx
996	addq	%r10,%r13
997	adcq	$0,%rdx
998	movq	%rdi,-16(%r14)
999	movq	%rdx,%rdi
1000
1001	mulq	%rbx
1002	addq	%rax,%r11
1003	movq	8(%rcx),%rax
1004	adcq	$0,%rdx
1005	addq	8(%r14),%r11
1006	adcq	$0,%rdx
1007	movq	%rdx,%r10
1008
1009	mulq	%rbp
1010	addq	%rax,%rdi
1011	movq	16(%rsi,%r15,1),%rax
1012	adcq	$0,%rdx
1013	addq	%r11,%rdi
1014	leaq	32(%rcx),%rcx
1015	adcq	$0,%rdx
1016	movq	%r13,-8(%r14)
1017	movq	%rdx,%r13
1018
1019	addq	$32,%r15
1020	jnz	L$inner4x
1021
1022	mulq	%rbx
1023	addq	%rax,%r10
1024	movq	-16(%rcx),%rax
1025	adcq	$0,%rdx
1026	addq	16(%r14),%r10
1027	leaq	32(%r14),%r14
1028	adcq	$0,%rdx
1029	movq	%rdx,%r11
1030
1031	mulq	%rbp
1032	addq	%rax,%r13
1033	movq	-8(%rsi),%rax
1034	adcq	$0,%rdx
1035	addq	%r10,%r13
1036	adcq	$0,%rdx
1037	movq	%rdi,-32(%r14)
1038	movq	%rdx,%rdi
1039
1040	mulq	%rbx
1041	addq	%rax,%r11
1042	movq	%rbp,%rax
1043	movq	-8(%rcx),%rbp
1044	adcq	$0,%rdx
1045	addq	-8(%r14),%r11
1046	adcq	$0,%rdx
1047	movq	%rdx,%r10
1048
1049	mulq	%rbp
1050	addq	%rax,%rdi
1051	movq	(%rsi,%r9,1),%rax
1052	adcq	$0,%rdx
1053	addq	%r11,%rdi
1054	adcq	$0,%rdx
1055	movq	%r13,-24(%r14)
1056	movq	%rdx,%r13
1057
1058	movq	%rdi,-16(%r14)
1059	leaq	(%rcx,%r9,1),%rcx
1060
1061	xorq	%rdi,%rdi
1062	addq	%r10,%r13
1063	adcq	$0,%rdi
1064	addq	(%r14),%r13
1065	adcq	$0,%rdi
1066	movq	%r13,-8(%r14)
1067
1068	cmpq	16+8(%rsp),%r12
1069	jb	L$outer4x
1070	xorq	%rax,%rax
1071	subq	%r13,%rbp
1072	adcq	%r15,%r15
1073	orq	%r15,%rdi
1074	subq	%rdi,%rax
1075	leaq	(%r14,%r9,1),%rbx
1076	movq	(%rcx),%r12
1077	leaq	(%rcx),%rbp
1078	movq	%r9,%rcx
1079	sarq	$3+2,%rcx
1080	movq	56+8(%rsp),%rdi
1081	decq	%r12
1082	xorq	%r10,%r10
1083	movq	8(%rbp),%r13
1084	movq	16(%rbp),%r14
1085	movq	24(%rbp),%r15
1086	jmp	L$sqr4x_sub_entry
1087
1088
1089.globl	_GFp_bn_power5
1090.private_extern _GFp_bn_power5
1091
1092.p2align	5
1093_GFp_bn_power5:
1094
1095	movq	%rsp,%rax
1096
1097	leaq	_GFp_ia32cap_P(%rip),%r11
1098	movl	8(%r11),%r11d
1099	andl	$0x80108,%r11d
1100	cmpl	$0x80108,%r11d
1101	je	L$powerx5_enter
1102	pushq	%rbx
1103
1104	pushq	%rbp
1105
1106	pushq	%r12
1107
1108	pushq	%r13
1109
1110	pushq	%r14
1111
1112	pushq	%r15
1113
1114L$power5_prologue:
1115
1116	shll	$3,%r9d
1117	leal	(%r9,%r9,2),%r10d
1118	negq	%r9
1119	movq	(%r8),%r8
1120
1121
1122
1123
1124
1125
1126
1127
1128	leaq	-320(%rsp,%r9,2),%r11
1129	movq	%rsp,%rbp
1130	subq	%rdi,%r11
1131	andq	$4095,%r11
1132	cmpq	%r11,%r10
1133	jb	L$pwr_sp_alt
1134	subq	%r11,%rbp
1135	leaq	-320(%rbp,%r9,2),%rbp
1136	jmp	L$pwr_sp_done
1137
1138.p2align	5
1139L$pwr_sp_alt:
1140	leaq	4096-320(,%r9,2),%r10
1141	leaq	-320(%rbp,%r9,2),%rbp
1142	subq	%r10,%r11
1143	movq	$0,%r10
1144	cmovcq	%r10,%r11
1145	subq	%r11,%rbp
1146L$pwr_sp_done:
1147	andq	$-64,%rbp
1148	movq	%rsp,%r11
1149	subq	%rbp,%r11
1150	andq	$-4096,%r11
1151	leaq	(%r11,%rbp,1),%rsp
1152	movq	(%rsp),%r10
1153	cmpq	%rbp,%rsp
1154	ja	L$pwr_page_walk
1155	jmp	L$pwr_page_walk_done
1156
1157L$pwr_page_walk:
1158	leaq	-4096(%rsp),%rsp
1159	movq	(%rsp),%r10
1160	cmpq	%rbp,%rsp
1161	ja	L$pwr_page_walk
1162L$pwr_page_walk_done:
1163
1164	movq	%r9,%r10
1165	negq	%r9
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176	movq	%r8,32(%rsp)
1177	movq	%rax,40(%rsp)
1178
1179L$power5_body:
1180.byte	102,72,15,110,207
1181.byte	102,72,15,110,209
1182.byte	102,73,15,110,218
1183.byte	102,72,15,110,226
1184
1185	call	__bn_sqr8x_internal
1186	call	__bn_post4x_internal
1187	call	__bn_sqr8x_internal
1188	call	__bn_post4x_internal
1189	call	__bn_sqr8x_internal
1190	call	__bn_post4x_internal
1191	call	__bn_sqr8x_internal
1192	call	__bn_post4x_internal
1193	call	__bn_sqr8x_internal
1194	call	__bn_post4x_internal
1195
1196.byte	102,72,15,126,209
1197.byte	102,72,15,126,226
1198	movq	%rsi,%rdi
1199	movq	40(%rsp),%rax
1200	leaq	32(%rsp),%r8
1201
1202	call	mul4x_internal
1203
1204	movq	40(%rsp),%rsi
1205
1206	movq	$1,%rax
1207	movq	-48(%rsi),%r15
1208
1209	movq	-40(%rsi),%r14
1210
1211	movq	-32(%rsi),%r13
1212
1213	movq	-24(%rsi),%r12
1214
1215	movq	-16(%rsi),%rbp
1216
1217	movq	-8(%rsi),%rbx
1218
1219	leaq	(%rsi),%rsp
1220
1221L$power5_epilogue:
1222	.byte	0xf3,0xc3
1223
1224
1225
1226.globl	_GFp_bn_sqr8x_internal
1227.private_extern _GFp_bn_sqr8x_internal
1228.private_extern	_GFp_bn_sqr8x_internal
1229
1230.p2align	5
1231_GFp_bn_sqr8x_internal:
1232__bn_sqr8x_internal:
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307	leaq	32(%r10),%rbp
1308	leaq	(%rsi,%r9,1),%rsi
1309
1310	movq	%r9,%rcx
1311
1312
1313	movq	-32(%rsi,%rbp,1),%r14
1314	leaq	48+8(%rsp,%r9,2),%rdi
1315	movq	-24(%rsi,%rbp,1),%rax
1316	leaq	-32(%rdi,%rbp,1),%rdi
1317	movq	-16(%rsi,%rbp,1),%rbx
1318	movq	%rax,%r15
1319
1320	mulq	%r14
1321	movq	%rax,%r10
1322	movq	%rbx,%rax
1323	movq	%rdx,%r11
1324	movq	%r10,-24(%rdi,%rbp,1)
1325
1326	mulq	%r14
1327	addq	%rax,%r11
1328	movq	%rbx,%rax
1329	adcq	$0,%rdx
1330	movq	%r11,-16(%rdi,%rbp,1)
1331	movq	%rdx,%r10
1332
1333
1334	movq	-8(%rsi,%rbp,1),%rbx
1335	mulq	%r15
1336	movq	%rax,%r12
1337	movq	%rbx,%rax
1338	movq	%rdx,%r13
1339
1340	leaq	(%rbp),%rcx
1341	mulq	%r14
1342	addq	%rax,%r10
1343	movq	%rbx,%rax
1344	movq	%rdx,%r11
1345	adcq	$0,%r11
1346	addq	%r12,%r10
1347	adcq	$0,%r11
1348	movq	%r10,-8(%rdi,%rcx,1)
1349	jmp	L$sqr4x_1st
1350
1351.p2align	5
1352L$sqr4x_1st:
1353	movq	(%rsi,%rcx,1),%rbx
1354	mulq	%r15
1355	addq	%rax,%r13
1356	movq	%rbx,%rax
1357	movq	%rdx,%r12
1358	adcq	$0,%r12
1359
1360	mulq	%r14
1361	addq	%rax,%r11
1362	movq	%rbx,%rax
1363	movq	8(%rsi,%rcx,1),%rbx
1364	movq	%rdx,%r10
1365	adcq	$0,%r10
1366	addq	%r13,%r11
1367	adcq	$0,%r10
1368
1369
1370	mulq	%r15
1371	addq	%rax,%r12
1372	movq	%rbx,%rax
1373	movq	%r11,(%rdi,%rcx,1)
1374	movq	%rdx,%r13
1375	adcq	$0,%r13
1376
1377	mulq	%r14
1378	addq	%rax,%r10
1379	movq	%rbx,%rax
1380	movq	16(%rsi,%rcx,1),%rbx
1381	movq	%rdx,%r11
1382	adcq	$0,%r11
1383	addq	%r12,%r10
1384	adcq	$0,%r11
1385
1386	mulq	%r15
1387	addq	%rax,%r13
1388	movq	%rbx,%rax
1389	movq	%r10,8(%rdi,%rcx,1)
1390	movq	%rdx,%r12
1391	adcq	$0,%r12
1392
1393	mulq	%r14
1394	addq	%rax,%r11
1395	movq	%rbx,%rax
1396	movq	24(%rsi,%rcx,1),%rbx
1397	movq	%rdx,%r10
1398	adcq	$0,%r10
1399	addq	%r13,%r11
1400	adcq	$0,%r10
1401
1402
1403	mulq	%r15
1404	addq	%rax,%r12
1405	movq	%rbx,%rax
1406	movq	%r11,16(%rdi,%rcx,1)
1407	movq	%rdx,%r13
1408	adcq	$0,%r13
1409	leaq	32(%rcx),%rcx
1410
1411	mulq	%r14
1412	addq	%rax,%r10
1413	movq	%rbx,%rax
1414	movq	%rdx,%r11
1415	adcq	$0,%r11
1416	addq	%r12,%r10
1417	adcq	$0,%r11
1418	movq	%r10,-8(%rdi,%rcx,1)
1419
1420	cmpq	$0,%rcx
1421	jne	L$sqr4x_1st
1422
1423	mulq	%r15
1424	addq	%rax,%r13
1425	leaq	16(%rbp),%rbp
1426	adcq	$0,%rdx
1427	addq	%r11,%r13
1428	adcq	$0,%rdx
1429
1430	movq	%r13,(%rdi)
1431	movq	%rdx,%r12
1432	movq	%rdx,8(%rdi)
1433	jmp	L$sqr4x_outer
1434
1435.p2align	5
1436L$sqr4x_outer:
1437	movq	-32(%rsi,%rbp,1),%r14
1438	leaq	48+8(%rsp,%r9,2),%rdi
1439	movq	-24(%rsi,%rbp,1),%rax
1440	leaq	-32(%rdi,%rbp,1),%rdi
1441	movq	-16(%rsi,%rbp,1),%rbx
1442	movq	%rax,%r15
1443
1444	mulq	%r14
1445	movq	-24(%rdi,%rbp,1),%r10
1446	addq	%rax,%r10
1447	movq	%rbx,%rax
1448	adcq	$0,%rdx
1449	movq	%r10,-24(%rdi,%rbp,1)
1450	movq	%rdx,%r11
1451
1452	mulq	%r14
1453	addq	%rax,%r11
1454	movq	%rbx,%rax
1455	adcq	$0,%rdx
1456	addq	-16(%rdi,%rbp,1),%r11
1457	movq	%rdx,%r10
1458	adcq	$0,%r10
1459	movq	%r11,-16(%rdi,%rbp,1)
1460
1461	xorq	%r12,%r12
1462
1463	movq	-8(%rsi,%rbp,1),%rbx
1464	mulq	%r15
1465	addq	%rax,%r12
1466	movq	%rbx,%rax
1467	adcq	$0,%rdx
1468	addq	-8(%rdi,%rbp,1),%r12
1469	movq	%rdx,%r13
1470	adcq	$0,%r13
1471
1472	mulq	%r14
1473	addq	%rax,%r10
1474	movq	%rbx,%rax
1475	adcq	$0,%rdx
1476	addq	%r12,%r10
1477	movq	%rdx,%r11
1478	adcq	$0,%r11
1479	movq	%r10,-8(%rdi,%rbp,1)
1480
1481	leaq	(%rbp),%rcx
1482	jmp	L$sqr4x_inner
1483
1484.p2align	5
1485L$sqr4x_inner:
1486	movq	(%rsi,%rcx,1),%rbx
1487	mulq	%r15
1488	addq	%rax,%r13
1489	movq	%rbx,%rax
1490	movq	%rdx,%r12
1491	adcq	$0,%r12
1492	addq	(%rdi,%rcx,1),%r13
1493	adcq	$0,%r12
1494
1495.byte	0x67
1496	mulq	%r14
1497	addq	%rax,%r11
1498	movq	%rbx,%rax
1499	movq	8(%rsi,%rcx,1),%rbx
1500	movq	%rdx,%r10
1501	adcq	$0,%r10
1502	addq	%r13,%r11
1503	adcq	$0,%r10
1504
1505	mulq	%r15
1506	addq	%rax,%r12
1507	movq	%r11,(%rdi,%rcx,1)
1508	movq	%rbx,%rax
1509	movq	%rdx,%r13
1510	adcq	$0,%r13
1511	addq	8(%rdi,%rcx,1),%r12
1512	leaq	16(%rcx),%rcx
1513	adcq	$0,%r13
1514
1515	mulq	%r14
1516	addq	%rax,%r10
1517	movq	%rbx,%rax
1518	adcq	$0,%rdx
1519	addq	%r12,%r10
1520	movq	%rdx,%r11
1521	adcq	$0,%r11
1522	movq	%r10,-8(%rdi,%rcx,1)
1523
1524	cmpq	$0,%rcx
1525	jne	L$sqr4x_inner
1526
1527.byte	0x67
1528	mulq	%r15
1529	addq	%rax,%r13
1530	adcq	$0,%rdx
1531	addq	%r11,%r13
1532	adcq	$0,%rdx
1533
1534	movq	%r13,(%rdi)
1535	movq	%rdx,%r12
1536	movq	%rdx,8(%rdi)
1537
1538	addq	$16,%rbp
1539	jnz	L$sqr4x_outer
1540
1541
1542	movq	-32(%rsi),%r14
1543	leaq	48+8(%rsp,%r9,2),%rdi
1544	movq	-24(%rsi),%rax
1545	leaq	-32(%rdi,%rbp,1),%rdi
1546	movq	-16(%rsi),%rbx
1547	movq	%rax,%r15
1548
1549	mulq	%r14
1550	addq	%rax,%r10
1551	movq	%rbx,%rax
1552	movq	%rdx,%r11
1553	adcq	$0,%r11
1554
1555	mulq	%r14
1556	addq	%rax,%r11
1557	movq	%rbx,%rax
1558	movq	%r10,-24(%rdi)
1559	movq	%rdx,%r10
1560	adcq	$0,%r10
1561	addq	%r13,%r11
1562	movq	-8(%rsi),%rbx
1563	adcq	$0,%r10
1564
1565	mulq	%r15
1566	addq	%rax,%r12
1567	movq	%rbx,%rax
1568	movq	%r11,-16(%rdi)
1569	movq	%rdx,%r13
1570	adcq	$0,%r13
1571
1572	mulq	%r14
1573	addq	%rax,%r10
1574	movq	%rbx,%rax
1575	movq	%rdx,%r11
1576	adcq	$0,%r11
1577	addq	%r12,%r10
1578	adcq	$0,%r11
1579	movq	%r10,-8(%rdi)
1580
1581	mulq	%r15
1582	addq	%rax,%r13
1583	movq	-16(%rsi),%rax
1584	adcq	$0,%rdx
1585	addq	%r11,%r13
1586	adcq	$0,%rdx
1587
1588	movq	%r13,(%rdi)
1589	movq	%rdx,%r12
1590	movq	%rdx,8(%rdi)
1591
1592	mulq	%rbx
1593	addq	$16,%rbp
1594	xorq	%r14,%r14
1595	subq	%r9,%rbp
1596	xorq	%r15,%r15
1597
1598	addq	%r12,%rax
1599	adcq	$0,%rdx
1600	movq	%rax,8(%rdi)
1601	movq	%rdx,16(%rdi)
1602	movq	%r15,24(%rdi)
1603
1604	movq	-16(%rsi,%rbp,1),%rax
1605	leaq	48+8(%rsp),%rdi
1606	xorq	%r10,%r10
1607	movq	8(%rdi),%r11
1608
1609	leaq	(%r14,%r10,2),%r12
1610	shrq	$63,%r10
1611	leaq	(%rcx,%r11,2),%r13
1612	shrq	$63,%r11
1613	orq	%r10,%r13
1614	movq	16(%rdi),%r10
1615	movq	%r11,%r14
1616	mulq	%rax
1617	negq	%r15
1618	movq	24(%rdi),%r11
1619	adcq	%rax,%r12
1620	movq	-8(%rsi,%rbp,1),%rax
1621	movq	%r12,(%rdi)
1622	adcq	%rdx,%r13
1623
1624	leaq	(%r14,%r10,2),%rbx
1625	movq	%r13,8(%rdi)
1626	sbbq	%r15,%r15
1627	shrq	$63,%r10
1628	leaq	(%rcx,%r11,2),%r8
1629	shrq	$63,%r11
1630	orq	%r10,%r8
1631	movq	32(%rdi),%r10
1632	movq	%r11,%r14
1633	mulq	%rax
1634	negq	%r15
1635	movq	40(%rdi),%r11
1636	adcq	%rax,%rbx
1637	movq	0(%rsi,%rbp,1),%rax
1638	movq	%rbx,16(%rdi)
1639	adcq	%rdx,%r8
1640	leaq	16(%rbp),%rbp
1641	movq	%r8,24(%rdi)
1642	sbbq	%r15,%r15
1643	leaq	64(%rdi),%rdi
1644	jmp	L$sqr4x_shift_n_add
1645
1646.p2align	5
1647L$sqr4x_shift_n_add:
1648	leaq	(%r14,%r10,2),%r12
1649	shrq	$63,%r10
1650	leaq	(%rcx,%r11,2),%r13
1651	shrq	$63,%r11
1652	orq	%r10,%r13
1653	movq	-16(%rdi),%r10
1654	movq	%r11,%r14
1655	mulq	%rax
1656	negq	%r15
1657	movq	-8(%rdi),%r11
1658	adcq	%rax,%r12
1659	movq	-8(%rsi,%rbp,1),%rax
1660	movq	%r12,-32(%rdi)
1661	adcq	%rdx,%r13
1662
1663	leaq	(%r14,%r10,2),%rbx
1664	movq	%r13,-24(%rdi)
1665	sbbq	%r15,%r15
1666	shrq	$63,%r10
1667	leaq	(%rcx,%r11,2),%r8
1668	shrq	$63,%r11
1669	orq	%r10,%r8
1670	movq	0(%rdi),%r10
1671	movq	%r11,%r14
1672	mulq	%rax
1673	negq	%r15
1674	movq	8(%rdi),%r11
1675	adcq	%rax,%rbx
1676	movq	0(%rsi,%rbp,1),%rax
1677	movq	%rbx,-16(%rdi)
1678	adcq	%rdx,%r8
1679
1680	leaq	(%r14,%r10,2),%r12
1681	movq	%r8,-8(%rdi)
1682	sbbq	%r15,%r15
1683	shrq	$63,%r10
1684	leaq	(%rcx,%r11,2),%r13
1685	shrq	$63,%r11
1686	orq	%r10,%r13
1687	movq	16(%rdi),%r10
1688	movq	%r11,%r14
1689	mulq	%rax
1690	negq	%r15
1691	movq	24(%rdi),%r11
1692	adcq	%rax,%r12
1693	movq	8(%rsi,%rbp,1),%rax
1694	movq	%r12,0(%rdi)
1695	adcq	%rdx,%r13
1696
1697	leaq	(%r14,%r10,2),%rbx
1698	movq	%r13,8(%rdi)
1699	sbbq	%r15,%r15
1700	shrq	$63,%r10
1701	leaq	(%rcx,%r11,2),%r8
1702	shrq	$63,%r11
1703	orq	%r10,%r8
1704	movq	32(%rdi),%r10
1705	movq	%r11,%r14
1706	mulq	%rax
1707	negq	%r15
1708	movq	40(%rdi),%r11
1709	adcq	%rax,%rbx
1710	movq	16(%rsi,%rbp,1),%rax
1711	movq	%rbx,16(%rdi)
1712	adcq	%rdx,%r8
1713	movq	%r8,24(%rdi)
1714	sbbq	%r15,%r15
1715	leaq	64(%rdi),%rdi
1716	addq	$32,%rbp
1717	jnz	L$sqr4x_shift_n_add
1718
1719	leaq	(%r14,%r10,2),%r12
1720.byte	0x67
1721	shrq	$63,%r10
1722	leaq	(%rcx,%r11,2),%r13
1723	shrq	$63,%r11
1724	orq	%r10,%r13
1725	movq	-16(%rdi),%r10
1726	movq	%r11,%r14
1727	mulq	%rax
1728	negq	%r15
1729	movq	-8(%rdi),%r11
1730	adcq	%rax,%r12
1731	movq	-8(%rsi),%rax
1732	movq	%r12,-32(%rdi)
1733	adcq	%rdx,%r13
1734
1735	leaq	(%r14,%r10,2),%rbx
1736	movq	%r13,-24(%rdi)
1737	sbbq	%r15,%r15
1738	shrq	$63,%r10
1739	leaq	(%rcx,%r11,2),%r8
1740	shrq	$63,%r11
1741	orq	%r10,%r8
1742	mulq	%rax
1743	negq	%r15
1744	adcq	%rax,%rbx
1745	adcq	%rdx,%r8
1746	movq	%rbx,-16(%rdi)
1747	movq	%r8,-8(%rdi)
1748.byte	102,72,15,126,213
1749__bn_sqr8x_reduction:
1750	xorq	%rax,%rax
1751	leaq	(%r9,%rbp,1),%rcx
1752	leaq	48+8(%rsp,%r9,2),%rdx
1753	movq	%rcx,0+8(%rsp)
1754	leaq	48+8(%rsp,%r9,1),%rdi
1755	movq	%rdx,8+8(%rsp)
1756	negq	%r9
1757	jmp	L$8x_reduction_loop
1758
1759.p2align	5
1760L$8x_reduction_loop:
1761	leaq	(%rdi,%r9,1),%rdi
1762.byte	0x66
1763	movq	0(%rdi),%rbx
1764	movq	8(%rdi),%r9
1765	movq	16(%rdi),%r10
1766	movq	24(%rdi),%r11
1767	movq	32(%rdi),%r12
1768	movq	40(%rdi),%r13
1769	movq	48(%rdi),%r14
1770	movq	56(%rdi),%r15
1771	movq	%rax,(%rdx)
1772	leaq	64(%rdi),%rdi
1773
1774.byte	0x67
1775	movq	%rbx,%r8
1776	imulq	32+8(%rsp),%rbx
1777	movq	0(%rbp),%rax
1778	movl	$8,%ecx
1779	jmp	L$8x_reduce
1780
1781.p2align	5
1782L$8x_reduce:
1783	mulq	%rbx
1784	movq	8(%rbp),%rax
1785	negq	%r8
1786	movq	%rdx,%r8
1787	adcq	$0,%r8
1788
1789	mulq	%rbx
1790	addq	%rax,%r9
1791	movq	16(%rbp),%rax
1792	adcq	$0,%rdx
1793	addq	%r9,%r8
1794	movq	%rbx,48-8+8(%rsp,%rcx,8)
1795	movq	%rdx,%r9
1796	adcq	$0,%r9
1797
1798	mulq	%rbx
1799	addq	%rax,%r10
1800	movq	24(%rbp),%rax
1801	adcq	$0,%rdx
1802	addq	%r10,%r9
1803	movq	32+8(%rsp),%rsi
1804	movq	%rdx,%r10
1805	adcq	$0,%r10
1806
1807	mulq	%rbx
1808	addq	%rax,%r11
1809	movq	32(%rbp),%rax
1810	adcq	$0,%rdx
1811	imulq	%r8,%rsi
1812	addq	%r11,%r10
1813	movq	%rdx,%r11
1814	adcq	$0,%r11
1815
1816	mulq	%rbx
1817	addq	%rax,%r12
1818	movq	40(%rbp),%rax
1819	adcq	$0,%rdx
1820	addq	%r12,%r11
1821	movq	%rdx,%r12
1822	adcq	$0,%r12
1823
1824	mulq	%rbx
1825	addq	%rax,%r13
1826	movq	48(%rbp),%rax
1827	adcq	$0,%rdx
1828	addq	%r13,%r12
1829	movq	%rdx,%r13
1830	adcq	$0,%r13
1831
1832	mulq	%rbx
1833	addq	%rax,%r14
1834	movq	56(%rbp),%rax
1835	adcq	$0,%rdx
1836	addq	%r14,%r13
1837	movq	%rdx,%r14
1838	adcq	$0,%r14
1839
1840	mulq	%rbx
1841	movq	%rsi,%rbx
1842	addq	%rax,%r15
1843	movq	0(%rbp),%rax
1844	adcq	$0,%rdx
1845	addq	%r15,%r14
1846	movq	%rdx,%r15
1847	adcq	$0,%r15
1848
1849	decl	%ecx
1850	jnz	L$8x_reduce
1851
1852	leaq	64(%rbp),%rbp
1853	xorq	%rax,%rax
1854	movq	8+8(%rsp),%rdx
1855	cmpq	0+8(%rsp),%rbp
1856	jae	L$8x_no_tail
1857
1858.byte	0x66
1859	addq	0(%rdi),%r8
1860	adcq	8(%rdi),%r9
1861	adcq	16(%rdi),%r10
1862	adcq	24(%rdi),%r11
1863	adcq	32(%rdi),%r12
1864	adcq	40(%rdi),%r13
1865	adcq	48(%rdi),%r14
1866	adcq	56(%rdi),%r15
1867	sbbq	%rsi,%rsi
1868
1869	movq	48+56+8(%rsp),%rbx
1870	movl	$8,%ecx
1871	movq	0(%rbp),%rax
1872	jmp	L$8x_tail
1873
1874.p2align	5
1875L$8x_tail:
1876	mulq	%rbx
1877	addq	%rax,%r8
1878	movq	8(%rbp),%rax
1879	movq	%r8,(%rdi)
1880	movq	%rdx,%r8
1881	adcq	$0,%r8
1882
1883	mulq	%rbx
1884	addq	%rax,%r9
1885	movq	16(%rbp),%rax
1886	adcq	$0,%rdx
1887	addq	%r9,%r8
1888	leaq	8(%rdi),%rdi
1889	movq	%rdx,%r9
1890	adcq	$0,%r9
1891
1892	mulq	%rbx
1893	addq	%rax,%r10
1894	movq	24(%rbp),%rax
1895	adcq	$0,%rdx
1896	addq	%r10,%r9
1897	movq	%rdx,%r10
1898	adcq	$0,%r10
1899
1900	mulq	%rbx
1901	addq	%rax,%r11
1902	movq	32(%rbp),%rax
1903	adcq	$0,%rdx
1904	addq	%r11,%r10
1905	movq	%rdx,%r11
1906	adcq	$0,%r11
1907
1908	mulq	%rbx
1909	addq	%rax,%r12
1910	movq	40(%rbp),%rax
1911	adcq	$0,%rdx
1912	addq	%r12,%r11
1913	movq	%rdx,%r12
1914	adcq	$0,%r12
1915
1916	mulq	%rbx
1917	addq	%rax,%r13
1918	movq	48(%rbp),%rax
1919	adcq	$0,%rdx
1920	addq	%r13,%r12
1921	movq	%rdx,%r13
1922	adcq	$0,%r13
1923
1924	mulq	%rbx
1925	addq	%rax,%r14
1926	movq	56(%rbp),%rax
1927	adcq	$0,%rdx
1928	addq	%r14,%r13
1929	movq	%rdx,%r14
1930	adcq	$0,%r14
1931
1932	mulq	%rbx
1933	movq	48-16+8(%rsp,%rcx,8),%rbx
1934	addq	%rax,%r15
1935	adcq	$0,%rdx
1936	addq	%r15,%r14
1937	movq	0(%rbp),%rax
1938	movq	%rdx,%r15
1939	adcq	$0,%r15
1940
1941	decl	%ecx
1942	jnz	L$8x_tail
1943
1944	leaq	64(%rbp),%rbp
1945	movq	8+8(%rsp),%rdx
1946	cmpq	0+8(%rsp),%rbp
1947	jae	L$8x_tail_done
1948
1949	movq	48+56+8(%rsp),%rbx
1950	negq	%rsi
1951	movq	0(%rbp),%rax
1952	adcq	0(%rdi),%r8
1953	adcq	8(%rdi),%r9
1954	adcq	16(%rdi),%r10
1955	adcq	24(%rdi),%r11
1956	adcq	32(%rdi),%r12
1957	adcq	40(%rdi),%r13
1958	adcq	48(%rdi),%r14
1959	adcq	56(%rdi),%r15
1960	sbbq	%rsi,%rsi
1961
1962	movl	$8,%ecx
1963	jmp	L$8x_tail
1964
1965.p2align	5
1966L$8x_tail_done:
1967	xorq	%rax,%rax
1968	addq	(%rdx),%r8
1969	adcq	$0,%r9
1970	adcq	$0,%r10
1971	adcq	$0,%r11
1972	adcq	$0,%r12
1973	adcq	$0,%r13
1974	adcq	$0,%r14
1975	adcq	$0,%r15
1976	adcq	$0,%rax
1977
1978	negq	%rsi
1979L$8x_no_tail:
1980	adcq	0(%rdi),%r8
1981	adcq	8(%rdi),%r9
1982	adcq	16(%rdi),%r10
1983	adcq	24(%rdi),%r11
1984	adcq	32(%rdi),%r12
1985	adcq	40(%rdi),%r13
1986	adcq	48(%rdi),%r14
1987	adcq	56(%rdi),%r15
1988	adcq	$0,%rax
1989	movq	-8(%rbp),%rcx
1990	xorq	%rsi,%rsi
1991
1992.byte	102,72,15,126,213
1993
1994	movq	%r8,0(%rdi)
1995	movq	%r9,8(%rdi)
1996.byte	102,73,15,126,217
1997	movq	%r10,16(%rdi)
1998	movq	%r11,24(%rdi)
1999	movq	%r12,32(%rdi)
2000	movq	%r13,40(%rdi)
2001	movq	%r14,48(%rdi)
2002	movq	%r15,56(%rdi)
2003	leaq	64(%rdi),%rdi
2004
2005	cmpq	%rdx,%rdi
2006	jb	L$8x_reduction_loop
2007	.byte	0xf3,0xc3
2008
2009
2010
2011.p2align	5
2012__bn_post4x_internal:
2013
2014	movq	0(%rbp),%r12
2015	leaq	(%rdi,%r9,1),%rbx
2016	movq	%r9,%rcx
2017.byte	102,72,15,126,207
2018	negq	%rax
2019.byte	102,72,15,126,206
2020	sarq	$3+2,%rcx
2021	decq	%r12
2022	xorq	%r10,%r10
2023	movq	8(%rbp),%r13
2024	movq	16(%rbp),%r14
2025	movq	24(%rbp),%r15
2026	jmp	L$sqr4x_sub_entry
2027
2028.p2align	4
2029L$sqr4x_sub:
2030	movq	0(%rbp),%r12
2031	movq	8(%rbp),%r13
2032	movq	16(%rbp),%r14
2033	movq	24(%rbp),%r15
2034L$sqr4x_sub_entry:
2035	leaq	32(%rbp),%rbp
2036	notq	%r12
2037	notq	%r13
2038	notq	%r14
2039	notq	%r15
2040	andq	%rax,%r12
2041	andq	%rax,%r13
2042	andq	%rax,%r14
2043	andq	%rax,%r15
2044
2045	negq	%r10
2046	adcq	0(%rbx),%r12
2047	adcq	8(%rbx),%r13
2048	adcq	16(%rbx),%r14
2049	adcq	24(%rbx),%r15
2050	movq	%r12,0(%rdi)
2051	leaq	32(%rbx),%rbx
2052	movq	%r13,8(%rdi)
2053	sbbq	%r10,%r10
2054	movq	%r14,16(%rdi)
2055	movq	%r15,24(%rdi)
2056	leaq	32(%rdi),%rdi
2057
2058	incq	%rcx
2059	jnz	L$sqr4x_sub
2060
2061	movq	%r9,%r10
2062	negq	%r9
2063	.byte	0xf3,0xc3
2064
2065
2066.globl	_GFp_bn_from_montgomery
2067.private_extern _GFp_bn_from_montgomery
2068
2069.p2align	5
2070_GFp_bn_from_montgomery:
2071
2072	testl	$7,%r9d
2073	jz	bn_from_mont8x
2074	xorl	%eax,%eax
2075	.byte	0xf3,0xc3
2076
2077
2078
2079
2080.p2align	5
2081bn_from_mont8x:
2082
2083.byte	0x67
2084	movq	%rsp,%rax
2085
2086	pushq	%rbx
2087
2088	pushq	%rbp
2089
2090	pushq	%r12
2091
2092	pushq	%r13
2093
2094	pushq	%r14
2095
2096	pushq	%r15
2097
2098L$from_prologue:
2099
2100	shll	$3,%r9d
2101	leaq	(%r9,%r9,2),%r10
2102	negq	%r9
2103	movq	(%r8),%r8
2104
2105
2106
2107
2108
2109
2110
2111
2112	leaq	-320(%rsp,%r9,2),%r11
2113	movq	%rsp,%rbp
2114	subq	%rdi,%r11
2115	andq	$4095,%r11
2116	cmpq	%r11,%r10
2117	jb	L$from_sp_alt
2118	subq	%r11,%rbp
2119	leaq	-320(%rbp,%r9,2),%rbp
2120	jmp	L$from_sp_done
2121
2122.p2align	5
2123L$from_sp_alt:
2124	leaq	4096-320(,%r9,2),%r10
2125	leaq	-320(%rbp,%r9,2),%rbp
2126	subq	%r10,%r11
2127	movq	$0,%r10
2128	cmovcq	%r10,%r11
2129	subq	%r11,%rbp
2130L$from_sp_done:
2131	andq	$-64,%rbp
2132	movq	%rsp,%r11
2133	subq	%rbp,%r11
2134	andq	$-4096,%r11
2135	leaq	(%r11,%rbp,1),%rsp
2136	movq	(%rsp),%r10
2137	cmpq	%rbp,%rsp
2138	ja	L$from_page_walk
2139	jmp	L$from_page_walk_done
2140
2141L$from_page_walk:
2142	leaq	-4096(%rsp),%rsp
2143	movq	(%rsp),%r10
2144	cmpq	%rbp,%rsp
2145	ja	L$from_page_walk
2146L$from_page_walk_done:
2147
2148	movq	%r9,%r10
2149	negq	%r9
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160	movq	%r8,32(%rsp)
2161	movq	%rax,40(%rsp)
2162
2163L$from_body:
2164	movq	%r9,%r11
2165	leaq	48(%rsp),%rax
2166	pxor	%xmm0,%xmm0
2167	jmp	L$mul_by_1
2168
2169.p2align	5
2170L$mul_by_1:
2171	movdqu	(%rsi),%xmm1
2172	movdqu	16(%rsi),%xmm2
2173	movdqu	32(%rsi),%xmm3
2174	movdqa	%xmm0,(%rax,%r9,1)
2175	movdqu	48(%rsi),%xmm4
2176	movdqa	%xmm0,16(%rax,%r9,1)
2177.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2178	movdqa	%xmm1,(%rax)
2179	movdqa	%xmm0,32(%rax,%r9,1)
2180	movdqa	%xmm2,16(%rax)
2181	movdqa	%xmm0,48(%rax,%r9,1)
2182	movdqa	%xmm3,32(%rax)
2183	movdqa	%xmm4,48(%rax)
2184	leaq	64(%rax),%rax
2185	subq	$64,%r11
2186	jnz	L$mul_by_1
2187
2188.byte	102,72,15,110,207
2189.byte	102,72,15,110,209
2190.byte	0x67
2191	movq	%rcx,%rbp
2192.byte	102,73,15,110,218
2193	leaq	_GFp_ia32cap_P(%rip),%r11
2194	movl	8(%r11),%r11d
2195	andl	$0x80108,%r11d
2196	cmpl	$0x80108,%r11d
2197	jne	L$from_mont_nox
2198
2199	leaq	(%rax,%r9,1),%rdi
2200	call	__bn_sqrx8x_reduction
2201	call	__bn_postx4x_internal
2202
2203	pxor	%xmm0,%xmm0
2204	leaq	48(%rsp),%rax
2205	jmp	L$from_mont_zero
2206
2207.p2align	5
2208L$from_mont_nox:
2209	call	__bn_sqr8x_reduction
2210	call	__bn_post4x_internal
2211
2212	pxor	%xmm0,%xmm0
2213	leaq	48(%rsp),%rax
2214	jmp	L$from_mont_zero
2215
2216.p2align	5
2217L$from_mont_zero:
2218	movq	40(%rsp),%rsi
2219
2220	movdqa	%xmm0,0(%rax)
2221	movdqa	%xmm0,16(%rax)
2222	movdqa	%xmm0,32(%rax)
2223	movdqa	%xmm0,48(%rax)
2224	leaq	64(%rax),%rax
2225	subq	$32,%r9
2226	jnz	L$from_mont_zero
2227
2228	movq	$1,%rax
2229	movq	-48(%rsi),%r15
2230
2231	movq	-40(%rsi),%r14
2232
2233	movq	-32(%rsi),%r13
2234
2235	movq	-24(%rsi),%r12
2236
2237	movq	-16(%rsi),%rbp
2238
2239	movq	-8(%rsi),%rbx
2240
2241	leaq	(%rsi),%rsp
2242
2243L$from_epilogue:
2244	.byte	0xf3,0xc3
2245
2246
2247
2248.p2align	5
2249bn_mulx4x_mont_gather5:
2250
2251	movq	%rsp,%rax
2252
2253L$mulx4x_enter:
2254	pushq	%rbx
2255
2256	pushq	%rbp
2257
2258	pushq	%r12
2259
2260	pushq	%r13
2261
2262	pushq	%r14
2263
2264	pushq	%r15
2265
2266L$mulx4x_prologue:
2267
2268	shll	$3,%r9d
2269	leaq	(%r9,%r9,2),%r10
2270	negq	%r9
2271	movq	(%r8),%r8
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282	leaq	-320(%rsp,%r9,2),%r11
2283	movq	%rsp,%rbp
2284	subq	%rdi,%r11
2285	andq	$4095,%r11
2286	cmpq	%r11,%r10
2287	jb	L$mulx4xsp_alt
2288	subq	%r11,%rbp
2289	leaq	-320(%rbp,%r9,2),%rbp
2290	jmp	L$mulx4xsp_done
2291
2292L$mulx4xsp_alt:
2293	leaq	4096-320(,%r9,2),%r10
2294	leaq	-320(%rbp,%r9,2),%rbp
2295	subq	%r10,%r11
2296	movq	$0,%r10
2297	cmovcq	%r10,%r11
2298	subq	%r11,%rbp
2299L$mulx4xsp_done:
2300	andq	$-64,%rbp
2301	movq	%rsp,%r11
2302	subq	%rbp,%r11
2303	andq	$-4096,%r11
2304	leaq	(%r11,%rbp,1),%rsp
2305	movq	(%rsp),%r10
2306	cmpq	%rbp,%rsp
2307	ja	L$mulx4x_page_walk
2308	jmp	L$mulx4x_page_walk_done
2309
2310L$mulx4x_page_walk:
2311	leaq	-4096(%rsp),%rsp
2312	movq	(%rsp),%r10
2313	cmpq	%rbp,%rsp
2314	ja	L$mulx4x_page_walk
2315L$mulx4x_page_walk_done:
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329	movq	%r8,32(%rsp)
2330	movq	%rax,40(%rsp)
2331
2332L$mulx4x_body:
2333	call	mulx4x_internal
2334
2335	movq	40(%rsp),%rsi
2336
2337	movq	$1,%rax
2338
2339	movq	-48(%rsi),%r15
2340
2341	movq	-40(%rsi),%r14
2342
2343	movq	-32(%rsi),%r13
2344
2345	movq	-24(%rsi),%r12
2346
2347	movq	-16(%rsi),%rbp
2348
2349	movq	-8(%rsi),%rbx
2350
2351	leaq	(%rsi),%rsp
2352
2353L$mulx4x_epilogue:
2354	.byte	0xf3,0xc3
2355
2356
2357
2358
2359.p2align	5
2360mulx4x_internal:
2361
2362	movq	%r9,8(%rsp)
2363	movq	%r9,%r10
2364	negq	%r9
2365	shlq	$5,%r9
2366	negq	%r10
2367	leaq	128(%rdx,%r9,1),%r13
2368	shrq	$5+5,%r9
2369	movd	8(%rax),%xmm5
2370	subq	$1,%r9
2371	leaq	L$inc(%rip),%rax
2372	movq	%r13,16+8(%rsp)
2373	movq	%r9,24+8(%rsp)
2374	movq	%rdi,56+8(%rsp)
2375	movdqa	0(%rax),%xmm0
2376	movdqa	16(%rax),%xmm1
2377	leaq	88-112(%rsp,%r10,1),%r10
2378	leaq	128(%rdx),%rdi
2379
2380	pshufd	$0,%xmm5,%xmm5
2381	movdqa	%xmm1,%xmm4
2382.byte	0x67
2383	movdqa	%xmm1,%xmm2
2384.byte	0x67
2385	paddd	%xmm0,%xmm1
2386	pcmpeqd	%xmm5,%xmm0
2387	movdqa	%xmm4,%xmm3
2388	paddd	%xmm1,%xmm2
2389	pcmpeqd	%xmm5,%xmm1
2390	movdqa	%xmm0,112(%r10)
2391	movdqa	%xmm4,%xmm0
2392
2393	paddd	%xmm2,%xmm3
2394	pcmpeqd	%xmm5,%xmm2
2395	movdqa	%xmm1,128(%r10)
2396	movdqa	%xmm4,%xmm1
2397
2398	paddd	%xmm3,%xmm0
2399	pcmpeqd	%xmm5,%xmm3
2400	movdqa	%xmm2,144(%r10)
2401	movdqa	%xmm4,%xmm2
2402
2403	paddd	%xmm0,%xmm1
2404	pcmpeqd	%xmm5,%xmm0
2405	movdqa	%xmm3,160(%r10)
2406	movdqa	%xmm4,%xmm3
2407	paddd	%xmm1,%xmm2
2408	pcmpeqd	%xmm5,%xmm1
2409	movdqa	%xmm0,176(%r10)
2410	movdqa	%xmm4,%xmm0
2411
2412	paddd	%xmm2,%xmm3
2413	pcmpeqd	%xmm5,%xmm2
2414	movdqa	%xmm1,192(%r10)
2415	movdqa	%xmm4,%xmm1
2416
2417	paddd	%xmm3,%xmm0
2418	pcmpeqd	%xmm5,%xmm3
2419	movdqa	%xmm2,208(%r10)
2420	movdqa	%xmm4,%xmm2
2421
2422	paddd	%xmm0,%xmm1
2423	pcmpeqd	%xmm5,%xmm0
2424	movdqa	%xmm3,224(%r10)
2425	movdqa	%xmm4,%xmm3
2426	paddd	%xmm1,%xmm2
2427	pcmpeqd	%xmm5,%xmm1
2428	movdqa	%xmm0,240(%r10)
2429	movdqa	%xmm4,%xmm0
2430
2431	paddd	%xmm2,%xmm3
2432	pcmpeqd	%xmm5,%xmm2
2433	movdqa	%xmm1,256(%r10)
2434	movdqa	%xmm4,%xmm1
2435
2436	paddd	%xmm3,%xmm0
2437	pcmpeqd	%xmm5,%xmm3
2438	movdqa	%xmm2,272(%r10)
2439	movdqa	%xmm4,%xmm2
2440
2441	paddd	%xmm0,%xmm1
2442	pcmpeqd	%xmm5,%xmm0
2443	movdqa	%xmm3,288(%r10)
2444	movdqa	%xmm4,%xmm3
2445.byte	0x67
2446	paddd	%xmm1,%xmm2
2447	pcmpeqd	%xmm5,%xmm1
2448	movdqa	%xmm0,304(%r10)
2449
2450	paddd	%xmm2,%xmm3
2451	pcmpeqd	%xmm5,%xmm2
2452	movdqa	%xmm1,320(%r10)
2453
2454	pcmpeqd	%xmm5,%xmm3
2455	movdqa	%xmm2,336(%r10)
2456
2457	pand	64(%rdi),%xmm0
2458	pand	80(%rdi),%xmm1
2459	pand	96(%rdi),%xmm2
2460	movdqa	%xmm3,352(%r10)
2461	pand	112(%rdi),%xmm3
2462	por	%xmm2,%xmm0
2463	por	%xmm3,%xmm1
2464	movdqa	-128(%rdi),%xmm4
2465	movdqa	-112(%rdi),%xmm5
2466	movdqa	-96(%rdi),%xmm2
2467	pand	112(%r10),%xmm4
2468	movdqa	-80(%rdi),%xmm3
2469	pand	128(%r10),%xmm5
2470	por	%xmm4,%xmm0
2471	pand	144(%r10),%xmm2
2472	por	%xmm5,%xmm1
2473	pand	160(%r10),%xmm3
2474	por	%xmm2,%xmm0
2475	por	%xmm3,%xmm1
2476	movdqa	-64(%rdi),%xmm4
2477	movdqa	-48(%rdi),%xmm5
2478	movdqa	-32(%rdi),%xmm2
2479	pand	176(%r10),%xmm4
2480	movdqa	-16(%rdi),%xmm3
2481	pand	192(%r10),%xmm5
2482	por	%xmm4,%xmm0
2483	pand	208(%r10),%xmm2
2484	por	%xmm5,%xmm1
2485	pand	224(%r10),%xmm3
2486	por	%xmm2,%xmm0
2487	por	%xmm3,%xmm1
2488	movdqa	0(%rdi),%xmm4
2489	movdqa	16(%rdi),%xmm5
2490	movdqa	32(%rdi),%xmm2
2491	pand	240(%r10),%xmm4
2492	movdqa	48(%rdi),%xmm3
2493	pand	256(%r10),%xmm5
2494	por	%xmm4,%xmm0
2495	pand	272(%r10),%xmm2
2496	por	%xmm5,%xmm1
2497	pand	288(%r10),%xmm3
2498	por	%xmm2,%xmm0
2499	por	%xmm3,%xmm1
2500	pxor	%xmm1,%xmm0
2501	pshufd	$0x4e,%xmm0,%xmm1
2502	por	%xmm1,%xmm0
2503	leaq	256(%rdi),%rdi
2504.byte	102,72,15,126,194
2505	leaq	64+32+8(%rsp),%rbx
2506
2507	movq	%rdx,%r9
2508	mulxq	0(%rsi),%r8,%rax
2509	mulxq	8(%rsi),%r11,%r12
2510	addq	%rax,%r11
2511	mulxq	16(%rsi),%rax,%r13
2512	adcq	%rax,%r12
2513	adcq	$0,%r13
2514	mulxq	24(%rsi),%rax,%r14
2515
2516	movq	%r8,%r15
2517	imulq	32+8(%rsp),%r8
2518	xorq	%rbp,%rbp
2519	movq	%r8,%rdx
2520
2521	movq	%rdi,8+8(%rsp)
2522
2523	leaq	32(%rsi),%rsi
2524	adcxq	%rax,%r13
2525	adcxq	%rbp,%r14
2526
2527	mulxq	0(%rcx),%rax,%r10
2528	adcxq	%rax,%r15
2529	adoxq	%r11,%r10
2530	mulxq	8(%rcx),%rax,%r11
2531	adcxq	%rax,%r10
2532	adoxq	%r12,%r11
2533	mulxq	16(%rcx),%rax,%r12
2534	movq	24+8(%rsp),%rdi
2535	movq	%r10,-32(%rbx)
2536	adcxq	%rax,%r11
2537	adoxq	%r13,%r12
2538	mulxq	24(%rcx),%rax,%r15
2539	movq	%r9,%rdx
2540	movq	%r11,-24(%rbx)
2541	adcxq	%rax,%r12
2542	adoxq	%rbp,%r15
2543	leaq	32(%rcx),%rcx
2544	movq	%r12,-16(%rbx)
2545	jmp	L$mulx4x_1st
2546
2547.p2align	5
2548L$mulx4x_1st:
2549	adcxq	%rbp,%r15
2550	mulxq	0(%rsi),%r10,%rax
2551	adcxq	%r14,%r10
2552	mulxq	8(%rsi),%r11,%r14
2553	adcxq	%rax,%r11
2554	mulxq	16(%rsi),%r12,%rax
2555	adcxq	%r14,%r12
2556	mulxq	24(%rsi),%r13,%r14
2557.byte	0x67,0x67
2558	movq	%r8,%rdx
2559	adcxq	%rax,%r13
2560	adcxq	%rbp,%r14
2561	leaq	32(%rsi),%rsi
2562	leaq	32(%rbx),%rbx
2563
2564	adoxq	%r15,%r10
2565	mulxq	0(%rcx),%rax,%r15
2566	adcxq	%rax,%r10
2567	adoxq	%r15,%r11
2568	mulxq	8(%rcx),%rax,%r15
2569	adcxq	%rax,%r11
2570	adoxq	%r15,%r12
2571	mulxq	16(%rcx),%rax,%r15
2572	movq	%r10,-40(%rbx)
2573	adcxq	%rax,%r12
2574	movq	%r11,-32(%rbx)
2575	adoxq	%r15,%r13
2576	mulxq	24(%rcx),%rax,%r15
2577	movq	%r9,%rdx
2578	movq	%r12,-24(%rbx)
2579	adcxq	%rax,%r13
2580	adoxq	%rbp,%r15
2581	leaq	32(%rcx),%rcx
2582	movq	%r13,-16(%rbx)
2583
2584	decq	%rdi
2585	jnz	L$mulx4x_1st
2586
2587	movq	8(%rsp),%rax
2588	adcq	%rbp,%r15
2589	leaq	(%rsi,%rax,1),%rsi
2590	addq	%r15,%r14
2591	movq	8+8(%rsp),%rdi
2592	adcq	%rbp,%rbp
2593	movq	%r14,-8(%rbx)
2594	jmp	L$mulx4x_outer
2595
2596.p2align	5
2597L$mulx4x_outer:
2598	leaq	16-256(%rbx),%r10
2599	pxor	%xmm4,%xmm4
2600.byte	0x67,0x67
2601	pxor	%xmm5,%xmm5
2602	movdqa	-128(%rdi),%xmm0
2603	movdqa	-112(%rdi),%xmm1
2604	movdqa	-96(%rdi),%xmm2
2605	pand	256(%r10),%xmm0
2606	movdqa	-80(%rdi),%xmm3
2607	pand	272(%r10),%xmm1
2608	por	%xmm0,%xmm4
2609	pand	288(%r10),%xmm2
2610	por	%xmm1,%xmm5
2611	pand	304(%r10),%xmm3
2612	por	%xmm2,%xmm4
2613	por	%xmm3,%xmm5
2614	movdqa	-64(%rdi),%xmm0
2615	movdqa	-48(%rdi),%xmm1
2616	movdqa	-32(%rdi),%xmm2
2617	pand	320(%r10),%xmm0
2618	movdqa	-16(%rdi),%xmm3
2619	pand	336(%r10),%xmm1
2620	por	%xmm0,%xmm4
2621	pand	352(%r10),%xmm2
2622	por	%xmm1,%xmm5
2623	pand	368(%r10),%xmm3
2624	por	%xmm2,%xmm4
2625	por	%xmm3,%xmm5
2626	movdqa	0(%rdi),%xmm0
2627	movdqa	16(%rdi),%xmm1
2628	movdqa	32(%rdi),%xmm2
2629	pand	384(%r10),%xmm0
2630	movdqa	48(%rdi),%xmm3
2631	pand	400(%r10),%xmm1
2632	por	%xmm0,%xmm4
2633	pand	416(%r10),%xmm2
2634	por	%xmm1,%xmm5
2635	pand	432(%r10),%xmm3
2636	por	%xmm2,%xmm4
2637	por	%xmm3,%xmm5
2638	movdqa	64(%rdi),%xmm0
2639	movdqa	80(%rdi),%xmm1
2640	movdqa	96(%rdi),%xmm2
2641	pand	448(%r10),%xmm0
2642	movdqa	112(%rdi),%xmm3
2643	pand	464(%r10),%xmm1
2644	por	%xmm0,%xmm4
2645	pand	480(%r10),%xmm2
2646	por	%xmm1,%xmm5
2647	pand	496(%r10),%xmm3
2648	por	%xmm2,%xmm4
2649	por	%xmm3,%xmm5
2650	por	%xmm5,%xmm4
2651	pshufd	$0x4e,%xmm4,%xmm0
2652	por	%xmm4,%xmm0
2653	leaq	256(%rdi),%rdi
2654.byte	102,72,15,126,194
2655
2656	movq	%rbp,(%rbx)
2657	leaq	32(%rbx,%rax,1),%rbx
2658	mulxq	0(%rsi),%r8,%r11
2659	xorq	%rbp,%rbp
2660	movq	%rdx,%r9
2661	mulxq	8(%rsi),%r14,%r12
2662	adoxq	-32(%rbx),%r8
2663	adcxq	%r14,%r11
2664	mulxq	16(%rsi),%r15,%r13
2665	adoxq	-24(%rbx),%r11
2666	adcxq	%r15,%r12
2667	mulxq	24(%rsi),%rdx,%r14
2668	adoxq	-16(%rbx),%r12
2669	adcxq	%rdx,%r13
2670	leaq	(%rcx,%rax,1),%rcx
2671	leaq	32(%rsi),%rsi
2672	adoxq	-8(%rbx),%r13
2673	adcxq	%rbp,%r14
2674	adoxq	%rbp,%r14
2675
2676	movq	%r8,%r15
2677	imulq	32+8(%rsp),%r8
2678
2679	movq	%r8,%rdx
2680	xorq	%rbp,%rbp
2681	movq	%rdi,8+8(%rsp)
2682
2683	mulxq	0(%rcx),%rax,%r10
2684	adcxq	%rax,%r15
2685	adoxq	%r11,%r10
2686	mulxq	8(%rcx),%rax,%r11
2687	adcxq	%rax,%r10
2688	adoxq	%r12,%r11
2689	mulxq	16(%rcx),%rax,%r12
2690	adcxq	%rax,%r11
2691	adoxq	%r13,%r12
2692	mulxq	24(%rcx),%rax,%r15
2693	movq	%r9,%rdx
2694	movq	24+8(%rsp),%rdi
2695	movq	%r10,-32(%rbx)
2696	adcxq	%rax,%r12
2697	movq	%r11,-24(%rbx)
2698	adoxq	%rbp,%r15
2699	movq	%r12,-16(%rbx)
2700	leaq	32(%rcx),%rcx
2701	jmp	L$mulx4x_inner
2702
2703.p2align	5
2704L$mulx4x_inner:
2705	mulxq	0(%rsi),%r10,%rax
2706	adcxq	%rbp,%r15
2707	adoxq	%r14,%r10
2708	mulxq	8(%rsi),%r11,%r14
2709	adcxq	0(%rbx),%r10
2710	adoxq	%rax,%r11
2711	mulxq	16(%rsi),%r12,%rax
2712	adcxq	8(%rbx),%r11
2713	adoxq	%r14,%r12
2714	mulxq	24(%rsi),%r13,%r14
2715	movq	%r8,%rdx
2716	adcxq	16(%rbx),%r12
2717	adoxq	%rax,%r13
2718	adcxq	24(%rbx),%r13
2719	adoxq	%rbp,%r14
2720	leaq	32(%rsi),%rsi
2721	leaq	32(%rbx),%rbx
2722	adcxq	%rbp,%r14
2723
2724	adoxq	%r15,%r10
2725	mulxq	0(%rcx),%rax,%r15
2726	adcxq	%rax,%r10
2727	adoxq	%r15,%r11
2728	mulxq	8(%rcx),%rax,%r15
2729	adcxq	%rax,%r11
2730	adoxq	%r15,%r12
2731	mulxq	16(%rcx),%rax,%r15
2732	movq	%r10,-40(%rbx)
2733	adcxq	%rax,%r12
2734	adoxq	%r15,%r13
2735	movq	%r11,-32(%rbx)
2736	mulxq	24(%rcx),%rax,%r15
2737	movq	%r9,%rdx
2738	leaq	32(%rcx),%rcx
2739	movq	%r12,-24(%rbx)
2740	adcxq	%rax,%r13
2741	adoxq	%rbp,%r15
2742	movq	%r13,-16(%rbx)
2743
2744	decq	%rdi
2745	jnz	L$mulx4x_inner
2746
2747	movq	0+8(%rsp),%rax
2748	adcq	%rbp,%r15
2749	subq	0(%rbx),%rdi
2750	movq	8+8(%rsp),%rdi
2751	movq	16+8(%rsp),%r10
2752	adcq	%r15,%r14
2753	leaq	(%rsi,%rax,1),%rsi
2754	adcq	%rbp,%rbp
2755	movq	%r14,-8(%rbx)
2756
2757	cmpq	%r10,%rdi
2758	jb	L$mulx4x_outer
2759
2760	movq	-8(%rcx),%r10
2761	movq	%rbp,%r8
2762	movq	(%rcx,%rax,1),%r12
2763	leaq	(%rcx,%rax,1),%rbp
2764	movq	%rax,%rcx
2765	leaq	(%rbx,%rax,1),%rdi
2766	xorl	%eax,%eax
2767	xorq	%r15,%r15
2768	subq	%r14,%r10
2769	adcq	%r15,%r15
2770	orq	%r15,%r8
2771	sarq	$3+2,%rcx
2772	subq	%r8,%rax
2773	movq	56+8(%rsp),%rdx
2774	decq	%r12
2775	movq	8(%rbp),%r13
2776	xorq	%r8,%r8
2777	movq	16(%rbp),%r14
2778	movq	24(%rbp),%r15
2779	jmp	L$sqrx4x_sub_entry
2780
2781
2782
2783.p2align	5
2784bn_powerx5:
2785
2786	movq	%rsp,%rax
2787
2788L$powerx5_enter:
2789	pushq	%rbx
2790
2791	pushq	%rbp
2792
2793	pushq	%r12
2794
2795	pushq	%r13
2796
2797	pushq	%r14
2798
2799	pushq	%r15
2800
2801L$powerx5_prologue:
2802
2803	shll	$3,%r9d
2804	leaq	(%r9,%r9,2),%r10
2805	negq	%r9
2806	movq	(%r8),%r8
2807
2808
2809
2810
2811
2812
2813
2814
2815	leaq	-320(%rsp,%r9,2),%r11
2816	movq	%rsp,%rbp
2817	subq	%rdi,%r11
2818	andq	$4095,%r11
2819	cmpq	%r11,%r10
2820	jb	L$pwrx_sp_alt
2821	subq	%r11,%rbp
2822	leaq	-320(%rbp,%r9,2),%rbp
2823	jmp	L$pwrx_sp_done
2824
2825.p2align	5
2826L$pwrx_sp_alt:
2827	leaq	4096-320(,%r9,2),%r10
2828	leaq	-320(%rbp,%r9,2),%rbp
2829	subq	%r10,%r11
2830	movq	$0,%r10
2831	cmovcq	%r10,%r11
2832	subq	%r11,%rbp
2833L$pwrx_sp_done:
2834	andq	$-64,%rbp
2835	movq	%rsp,%r11
2836	subq	%rbp,%r11
2837	andq	$-4096,%r11
2838	leaq	(%r11,%rbp,1),%rsp
2839	movq	(%rsp),%r10
2840	cmpq	%rbp,%rsp
2841	ja	L$pwrx_page_walk
2842	jmp	L$pwrx_page_walk_done
2843
2844L$pwrx_page_walk:
2845	leaq	-4096(%rsp),%rsp
2846	movq	(%rsp),%r10
2847	cmpq	%rbp,%rsp
2848	ja	L$pwrx_page_walk
2849L$pwrx_page_walk_done:
2850
2851	movq	%r9,%r10
2852	negq	%r9
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865	pxor	%xmm0,%xmm0
2866.byte	102,72,15,110,207
2867.byte	102,72,15,110,209
2868.byte	102,73,15,110,218
2869.byte	102,72,15,110,226
2870	movq	%r8,32(%rsp)
2871	movq	%rax,40(%rsp)
2872
2873L$powerx5_body:
2874
2875	call	__bn_sqrx8x_internal
2876	call	__bn_postx4x_internal
2877	call	__bn_sqrx8x_internal
2878	call	__bn_postx4x_internal
2879	call	__bn_sqrx8x_internal
2880	call	__bn_postx4x_internal
2881	call	__bn_sqrx8x_internal
2882	call	__bn_postx4x_internal
2883	call	__bn_sqrx8x_internal
2884	call	__bn_postx4x_internal
2885
2886	movq	%r10,%r9
2887	movq	%rsi,%rdi
2888.byte	102,72,15,126,209
2889.byte	102,72,15,126,226
2890	movq	40(%rsp),%rax
2891
2892	call	mulx4x_internal
2893
2894	movq	40(%rsp),%rsi
2895
2896	movq	$1,%rax
2897
2898	movq	-48(%rsi),%r15
2899
2900	movq	-40(%rsi),%r14
2901
2902	movq	-32(%rsi),%r13
2903
2904	movq	-24(%rsi),%r12
2905
2906	movq	-16(%rsi),%rbp
2907
2908	movq	-8(%rsi),%rbx
2909
2910	leaq	(%rsi),%rsp
2911
2912L$powerx5_epilogue:
2913	.byte	0xf3,0xc3
2914
2915
2916
2917.globl	_GFp_bn_sqrx8x_internal
2918.private_extern _GFp_bn_sqrx8x_internal
2919
2920.p2align	5
2921_GFp_bn_sqrx8x_internal:
2922__bn_sqrx8x_internal:
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964	leaq	48+8(%rsp),%rdi
2965	leaq	(%rsi,%r9,1),%rbp
2966	movq	%r9,0+8(%rsp)
2967	movq	%rbp,8+8(%rsp)
2968	jmp	L$sqr8x_zero_start
2969
2970.p2align	5
2971.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2972L$sqrx8x_zero:
2973.byte	0x3e
2974	movdqa	%xmm0,0(%rdi)
2975	movdqa	%xmm0,16(%rdi)
2976	movdqa	%xmm0,32(%rdi)
2977	movdqa	%xmm0,48(%rdi)
2978L$sqr8x_zero_start:
2979	movdqa	%xmm0,64(%rdi)
2980	movdqa	%xmm0,80(%rdi)
2981	movdqa	%xmm0,96(%rdi)
2982	movdqa	%xmm0,112(%rdi)
2983	leaq	128(%rdi),%rdi
2984	subq	$64,%r9
2985	jnz	L$sqrx8x_zero
2986
2987	movq	0(%rsi),%rdx
2988
2989	xorq	%r10,%r10
2990	xorq	%r11,%r11
2991	xorq	%r12,%r12
2992	xorq	%r13,%r13
2993	xorq	%r14,%r14
2994	xorq	%r15,%r15
2995	leaq	48+8(%rsp),%rdi
2996	xorq	%rbp,%rbp
2997	jmp	L$sqrx8x_outer_loop
2998
2999.p2align	5
3000L$sqrx8x_outer_loop:
3001	mulxq	8(%rsi),%r8,%rax
3002	adcxq	%r9,%r8
3003	adoxq	%rax,%r10
3004	mulxq	16(%rsi),%r9,%rax
3005	adcxq	%r10,%r9
3006	adoxq	%rax,%r11
3007.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
3008	adcxq	%r11,%r10
3009	adoxq	%rax,%r12
3010.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
3011	adcxq	%r12,%r11
3012	adoxq	%rax,%r13
3013	mulxq	40(%rsi),%r12,%rax
3014	adcxq	%r13,%r12
3015	adoxq	%rax,%r14
3016	mulxq	48(%rsi),%r13,%rax
3017	adcxq	%r14,%r13
3018	adoxq	%r15,%rax
3019	mulxq	56(%rsi),%r14,%r15
3020	movq	8(%rsi),%rdx
3021	adcxq	%rax,%r14
3022	adoxq	%rbp,%r15
3023	adcq	64(%rdi),%r15
3024	movq	%r8,8(%rdi)
3025	movq	%r9,16(%rdi)
3026	sbbq	%rcx,%rcx
3027	xorq	%rbp,%rbp
3028
3029
3030	mulxq	16(%rsi),%r8,%rbx
3031	mulxq	24(%rsi),%r9,%rax
3032	adcxq	%r10,%r8
3033	adoxq	%rbx,%r9
3034	mulxq	32(%rsi),%r10,%rbx
3035	adcxq	%r11,%r9
3036	adoxq	%rax,%r10
3037.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3038	adcxq	%r12,%r10
3039	adoxq	%rbx,%r11
3040.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3041	adcxq	%r13,%r11
3042	adoxq	%r14,%r12
3043.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3044	movq	16(%rsi),%rdx
3045	adcxq	%rax,%r12
3046	adoxq	%rbx,%r13
3047	adcxq	%r15,%r13
3048	adoxq	%rbp,%r14
3049	adcxq	%rbp,%r14
3050
3051	movq	%r8,24(%rdi)
3052	movq	%r9,32(%rdi)
3053
3054	mulxq	24(%rsi),%r8,%rbx
3055	mulxq	32(%rsi),%r9,%rax
3056	adcxq	%r10,%r8
3057	adoxq	%rbx,%r9
3058	mulxq	40(%rsi),%r10,%rbx
3059	adcxq	%r11,%r9
3060	adoxq	%rax,%r10
3061.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3062	adcxq	%r12,%r10
3063	adoxq	%r13,%r11
3064.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3065.byte	0x3e
3066	movq	24(%rsi),%rdx
3067	adcxq	%rbx,%r11
3068	adoxq	%rax,%r12
3069	adcxq	%r14,%r12
3070	movq	%r8,40(%rdi)
3071	movq	%r9,48(%rdi)
3072	mulxq	32(%rsi),%r8,%rax
3073	adoxq	%rbp,%r13
3074	adcxq	%rbp,%r13
3075
3076	mulxq	40(%rsi),%r9,%rbx
3077	adcxq	%r10,%r8
3078	adoxq	%rax,%r9
3079	mulxq	48(%rsi),%r10,%rax
3080	adcxq	%r11,%r9
3081	adoxq	%r12,%r10
3082	mulxq	56(%rsi),%r11,%r12
3083	movq	32(%rsi),%rdx
3084	movq	40(%rsi),%r14
3085	adcxq	%rbx,%r10
3086	adoxq	%rax,%r11
3087	movq	48(%rsi),%r15
3088	adcxq	%r13,%r11
3089	adoxq	%rbp,%r12
3090	adcxq	%rbp,%r12
3091
3092	movq	%r8,56(%rdi)
3093	movq	%r9,64(%rdi)
3094
3095	mulxq	%r14,%r9,%rax
3096	movq	56(%rsi),%r8
3097	adcxq	%r10,%r9
3098	mulxq	%r15,%r10,%rbx
3099	adoxq	%rax,%r10
3100	adcxq	%r11,%r10
3101	mulxq	%r8,%r11,%rax
3102	movq	%r14,%rdx
3103	adoxq	%rbx,%r11
3104	adcxq	%r12,%r11
3105
3106	adcxq	%rbp,%rax
3107
3108	mulxq	%r15,%r14,%rbx
3109	mulxq	%r8,%r12,%r13
3110	movq	%r15,%rdx
3111	leaq	64(%rsi),%rsi
3112	adcxq	%r14,%r11
3113	adoxq	%rbx,%r12
3114	adcxq	%rax,%r12
3115	adoxq	%rbp,%r13
3116
3117.byte	0x67,0x67
3118	mulxq	%r8,%r8,%r14
3119	adcxq	%r8,%r13
3120	adcxq	%rbp,%r14
3121
3122	cmpq	8+8(%rsp),%rsi
3123	je	L$sqrx8x_outer_break
3124
3125	negq	%rcx
3126	movq	$-8,%rcx
3127	movq	%rbp,%r15
3128	movq	64(%rdi),%r8
3129	adcxq	72(%rdi),%r9
3130	adcxq	80(%rdi),%r10
3131	adcxq	88(%rdi),%r11
3132	adcq	96(%rdi),%r12
3133	adcq	104(%rdi),%r13
3134	adcq	112(%rdi),%r14
3135	adcq	120(%rdi),%r15
3136	leaq	(%rsi),%rbp
3137	leaq	128(%rdi),%rdi
3138	sbbq	%rax,%rax
3139
3140	movq	-64(%rsi),%rdx
3141	movq	%rax,16+8(%rsp)
3142	movq	%rdi,24+8(%rsp)
3143
3144
3145	xorl	%eax,%eax
3146	jmp	L$sqrx8x_loop
3147
3148.p2align	5
3149L$sqrx8x_loop:
3150	movq	%r8,%rbx
3151	mulxq	0(%rbp),%rax,%r8
3152	adcxq	%rax,%rbx
3153	adoxq	%r9,%r8
3154
3155	mulxq	8(%rbp),%rax,%r9
3156	adcxq	%rax,%r8
3157	adoxq	%r10,%r9
3158
3159	mulxq	16(%rbp),%rax,%r10
3160	adcxq	%rax,%r9
3161	adoxq	%r11,%r10
3162
3163	mulxq	24(%rbp),%rax,%r11
3164	adcxq	%rax,%r10
3165	adoxq	%r12,%r11
3166
3167.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3168	adcxq	%rax,%r11
3169	adoxq	%r13,%r12
3170
3171	mulxq	40(%rbp),%rax,%r13
3172	adcxq	%rax,%r12
3173	adoxq	%r14,%r13
3174
3175	mulxq	48(%rbp),%rax,%r14
3176	movq	%rbx,(%rdi,%rcx,8)
3177	movl	$0,%ebx
3178	adcxq	%rax,%r13
3179	adoxq	%r15,%r14
3180
3181.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3182	movq	8(%rsi,%rcx,8),%rdx
3183	adcxq	%rax,%r14
3184	adoxq	%rbx,%r15
3185	adcxq	%rbx,%r15
3186
3187.byte	0x67
3188	incq	%rcx
3189	jnz	L$sqrx8x_loop
3190
3191	leaq	64(%rbp),%rbp
3192	movq	$-8,%rcx
3193	cmpq	8+8(%rsp),%rbp
3194	je	L$sqrx8x_break
3195
3196	subq	16+8(%rsp),%rbx
3197.byte	0x66
3198	movq	-64(%rsi),%rdx
3199	adcxq	0(%rdi),%r8
3200	adcxq	8(%rdi),%r9
3201	adcq	16(%rdi),%r10
3202	adcq	24(%rdi),%r11
3203	adcq	32(%rdi),%r12
3204	adcq	40(%rdi),%r13
3205	adcq	48(%rdi),%r14
3206	adcq	56(%rdi),%r15
3207	leaq	64(%rdi),%rdi
3208.byte	0x67
3209	sbbq	%rax,%rax
3210	xorl	%ebx,%ebx
3211	movq	%rax,16+8(%rsp)
3212	jmp	L$sqrx8x_loop
3213
3214.p2align	5
3215L$sqrx8x_break:
3216	xorq	%rbp,%rbp
3217	subq	16+8(%rsp),%rbx
3218	adcxq	%rbp,%r8
3219	movq	24+8(%rsp),%rcx
3220	adcxq	%rbp,%r9
3221	movq	0(%rsi),%rdx
3222	adcq	$0,%r10
3223	movq	%r8,0(%rdi)
3224	adcq	$0,%r11
3225	adcq	$0,%r12
3226	adcq	$0,%r13
3227	adcq	$0,%r14
3228	adcq	$0,%r15
3229	cmpq	%rcx,%rdi
3230	je	L$sqrx8x_outer_loop
3231
3232	movq	%r9,8(%rdi)
3233	movq	8(%rcx),%r9
3234	movq	%r10,16(%rdi)
3235	movq	16(%rcx),%r10
3236	movq	%r11,24(%rdi)
3237	movq	24(%rcx),%r11
3238	movq	%r12,32(%rdi)
3239	movq	32(%rcx),%r12
3240	movq	%r13,40(%rdi)
3241	movq	40(%rcx),%r13
3242	movq	%r14,48(%rdi)
3243	movq	48(%rcx),%r14
3244	movq	%r15,56(%rdi)
3245	movq	56(%rcx),%r15
3246	movq	%rcx,%rdi
3247	jmp	L$sqrx8x_outer_loop
3248
3249.p2align	5
3250L$sqrx8x_outer_break:
3251	movq	%r9,72(%rdi)
3252.byte	102,72,15,126,217
3253	movq	%r10,80(%rdi)
3254	movq	%r11,88(%rdi)
3255	movq	%r12,96(%rdi)
3256	movq	%r13,104(%rdi)
3257	movq	%r14,112(%rdi)
3258	leaq	48+8(%rsp),%rdi
3259	movq	(%rsi,%rcx,1),%rdx
3260
3261	movq	8(%rdi),%r11
3262	xorq	%r10,%r10
3263	movq	0+8(%rsp),%r9
3264	adoxq	%r11,%r11
3265	movq	16(%rdi),%r12
3266	movq	24(%rdi),%r13
3267
3268
3269.p2align	5
3270L$sqrx4x_shift_n_add:
3271	mulxq	%rdx,%rax,%rbx
3272	adoxq	%r12,%r12
3273	adcxq	%r10,%rax
3274.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3275.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3276	adoxq	%r13,%r13
3277	adcxq	%r11,%rbx
3278	movq	40(%rdi),%r11
3279	movq	%rax,0(%rdi)
3280	movq	%rbx,8(%rdi)
3281
3282	mulxq	%rdx,%rax,%rbx
3283	adoxq	%r10,%r10
3284	adcxq	%r12,%rax
3285	movq	16(%rsi,%rcx,1),%rdx
3286	movq	48(%rdi),%r12
3287	adoxq	%r11,%r11
3288	adcxq	%r13,%rbx
3289	movq	56(%rdi),%r13
3290	movq	%rax,16(%rdi)
3291	movq	%rbx,24(%rdi)
3292
3293	mulxq	%rdx,%rax,%rbx
3294	adoxq	%r12,%r12
3295	adcxq	%r10,%rax
3296	movq	24(%rsi,%rcx,1),%rdx
3297	leaq	32(%rcx),%rcx
3298	movq	64(%rdi),%r10
3299	adoxq	%r13,%r13
3300	adcxq	%r11,%rbx
3301	movq	72(%rdi),%r11
3302	movq	%rax,32(%rdi)
3303	movq	%rbx,40(%rdi)
3304
3305	mulxq	%rdx,%rax,%rbx
3306	adoxq	%r10,%r10
3307	adcxq	%r12,%rax
3308	jrcxz	L$sqrx4x_shift_n_add_break
3309.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3310	adoxq	%r11,%r11
3311	adcxq	%r13,%rbx
3312	movq	80(%rdi),%r12
3313	movq	88(%rdi),%r13
3314	movq	%rax,48(%rdi)
3315	movq	%rbx,56(%rdi)
3316	leaq	64(%rdi),%rdi
3317	nop
3318	jmp	L$sqrx4x_shift_n_add
3319
3320.p2align	5
3321L$sqrx4x_shift_n_add_break:
3322	adcxq	%r13,%rbx
3323	movq	%rax,48(%rdi)
3324	movq	%rbx,56(%rdi)
3325	leaq	64(%rdi),%rdi
3326.byte	102,72,15,126,213
3327__bn_sqrx8x_reduction:
3328	xorl	%eax,%eax
3329	movq	32+8(%rsp),%rbx
3330	movq	48+8(%rsp),%rdx
3331	leaq	-64(%rbp,%r9,1),%rcx
3332
3333	movq	%rcx,0+8(%rsp)
3334	movq	%rdi,8+8(%rsp)
3335
3336	leaq	48+8(%rsp),%rdi
3337	jmp	L$sqrx8x_reduction_loop
3338
3339.p2align	5
3340L$sqrx8x_reduction_loop:
3341	movq	8(%rdi),%r9
3342	movq	16(%rdi),%r10
3343	movq	24(%rdi),%r11
3344	movq	32(%rdi),%r12
3345	movq	%rdx,%r8
3346	imulq	%rbx,%rdx
3347	movq	40(%rdi),%r13
3348	movq	48(%rdi),%r14
3349	movq	56(%rdi),%r15
3350	movq	%rax,24+8(%rsp)
3351
3352	leaq	64(%rdi),%rdi
3353	xorq	%rsi,%rsi
3354	movq	$-8,%rcx
3355	jmp	L$sqrx8x_reduce
3356
3357.p2align	5
3358L$sqrx8x_reduce:
3359	movq	%r8,%rbx
3360	mulxq	0(%rbp),%rax,%r8
3361	adcxq	%rbx,%rax
3362	adoxq	%r9,%r8
3363
3364	mulxq	8(%rbp),%rbx,%r9
3365	adcxq	%rbx,%r8
3366	adoxq	%r10,%r9
3367
3368	mulxq	16(%rbp),%rbx,%r10
3369	adcxq	%rbx,%r9
3370	adoxq	%r11,%r10
3371
3372	mulxq	24(%rbp),%rbx,%r11
3373	adcxq	%rbx,%r10
3374	adoxq	%r12,%r11
3375
3376.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3377	movq	%rdx,%rax
3378	movq	%r8,%rdx
3379	adcxq	%rbx,%r11
3380	adoxq	%r13,%r12
3381
3382	mulxq	32+8(%rsp),%rbx,%rdx
3383	movq	%rax,%rdx
3384	movq	%rax,64+48+8(%rsp,%rcx,8)
3385
3386	mulxq	40(%rbp),%rax,%r13
3387	adcxq	%rax,%r12
3388	adoxq	%r14,%r13
3389
3390	mulxq	48(%rbp),%rax,%r14
3391	adcxq	%rax,%r13
3392	adoxq	%r15,%r14
3393
3394	mulxq	56(%rbp),%rax,%r15
3395	movq	%rbx,%rdx
3396	adcxq	%rax,%r14
3397	adoxq	%rsi,%r15
3398	adcxq	%rsi,%r15
3399
3400.byte	0x67,0x67,0x67
3401	incq	%rcx
3402	jnz	L$sqrx8x_reduce
3403
3404	movq	%rsi,%rax
3405	cmpq	0+8(%rsp),%rbp
3406	jae	L$sqrx8x_no_tail
3407
3408	movq	48+8(%rsp),%rdx
3409	addq	0(%rdi),%r8
3410	leaq	64(%rbp),%rbp
3411	movq	$-8,%rcx
3412	adcxq	8(%rdi),%r9
3413	adcxq	16(%rdi),%r10
3414	adcq	24(%rdi),%r11
3415	adcq	32(%rdi),%r12
3416	adcq	40(%rdi),%r13
3417	adcq	48(%rdi),%r14
3418	adcq	56(%rdi),%r15
3419	leaq	64(%rdi),%rdi
3420	sbbq	%rax,%rax
3421
3422	xorq	%rsi,%rsi
3423	movq	%rax,16+8(%rsp)
3424	jmp	L$sqrx8x_tail
3425
3426.p2align	5
3427L$sqrx8x_tail:
3428	movq	%r8,%rbx
3429	mulxq	0(%rbp),%rax,%r8
3430	adcxq	%rax,%rbx
3431	adoxq	%r9,%r8
3432
3433	mulxq	8(%rbp),%rax,%r9
3434	adcxq	%rax,%r8
3435	adoxq	%r10,%r9
3436
3437	mulxq	16(%rbp),%rax,%r10
3438	adcxq	%rax,%r9
3439	adoxq	%r11,%r10
3440
3441	mulxq	24(%rbp),%rax,%r11
3442	adcxq	%rax,%r10
3443	adoxq	%r12,%r11
3444
3445.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3446	adcxq	%rax,%r11
3447	adoxq	%r13,%r12
3448
3449	mulxq	40(%rbp),%rax,%r13
3450	adcxq	%rax,%r12
3451	adoxq	%r14,%r13
3452
3453	mulxq	48(%rbp),%rax,%r14
3454	adcxq	%rax,%r13
3455	adoxq	%r15,%r14
3456
3457	mulxq	56(%rbp),%rax,%r15
3458	movq	72+48+8(%rsp,%rcx,8),%rdx
3459	adcxq	%rax,%r14
3460	adoxq	%rsi,%r15
3461	movq	%rbx,(%rdi,%rcx,8)
3462	movq	%r8,%rbx
3463	adcxq	%rsi,%r15
3464
3465	incq	%rcx
3466	jnz	L$sqrx8x_tail
3467
3468	cmpq	0+8(%rsp),%rbp
3469	jae	L$sqrx8x_tail_done
3470
3471	subq	16+8(%rsp),%rsi
3472	movq	48+8(%rsp),%rdx
3473	leaq	64(%rbp),%rbp
3474	adcq	0(%rdi),%r8
3475	adcq	8(%rdi),%r9
3476	adcq	16(%rdi),%r10
3477	adcq	24(%rdi),%r11
3478	adcq	32(%rdi),%r12
3479	adcq	40(%rdi),%r13
3480	adcq	48(%rdi),%r14
3481	adcq	56(%rdi),%r15
3482	leaq	64(%rdi),%rdi
3483	sbbq	%rax,%rax
3484	subq	$8,%rcx
3485
3486	xorq	%rsi,%rsi
3487	movq	%rax,16+8(%rsp)
3488	jmp	L$sqrx8x_tail
3489
3490.p2align	5
3491L$sqrx8x_tail_done:
3492	xorq	%rax,%rax
3493	addq	24+8(%rsp),%r8
3494	adcq	$0,%r9
3495	adcq	$0,%r10
3496	adcq	$0,%r11
3497	adcq	$0,%r12
3498	adcq	$0,%r13
3499	adcq	$0,%r14
3500	adcq	$0,%r15
3501	adcq	$0,%rax
3502
3503	subq	16+8(%rsp),%rsi
3504L$sqrx8x_no_tail:
3505	adcq	0(%rdi),%r8
3506.byte	102,72,15,126,217
3507	adcq	8(%rdi),%r9
3508	movq	56(%rbp),%rsi
3509.byte	102,72,15,126,213
3510	adcq	16(%rdi),%r10
3511	adcq	24(%rdi),%r11
3512	adcq	32(%rdi),%r12
3513	adcq	40(%rdi),%r13
3514	adcq	48(%rdi),%r14
3515	adcq	56(%rdi),%r15
3516	adcq	$0,%rax
3517
3518	movq	32+8(%rsp),%rbx
3519	movq	64(%rdi,%rcx,1),%rdx
3520
3521	movq	%r8,0(%rdi)
3522	leaq	64(%rdi),%r8
3523	movq	%r9,8(%rdi)
3524	movq	%r10,16(%rdi)
3525	movq	%r11,24(%rdi)
3526	movq	%r12,32(%rdi)
3527	movq	%r13,40(%rdi)
3528	movq	%r14,48(%rdi)
3529	movq	%r15,56(%rdi)
3530
3531	leaq	64(%rdi,%rcx,1),%rdi
3532	cmpq	8+8(%rsp),%r8
3533	jb	L$sqrx8x_reduction_loop
3534	.byte	0xf3,0xc3
3535
3536
3537.p2align	5
3538
3539__bn_postx4x_internal:
3540
3541	movq	0(%rbp),%r12
3542	movq	%rcx,%r10
3543	movq	%rcx,%r9
3544	negq	%rax
3545	sarq	$3+2,%rcx
3546
3547.byte	102,72,15,126,202
3548.byte	102,72,15,126,206
3549	decq	%r12
3550	movq	8(%rbp),%r13
3551	xorq	%r8,%r8
3552	movq	16(%rbp),%r14
3553	movq	24(%rbp),%r15
3554	jmp	L$sqrx4x_sub_entry
3555
3556.p2align	4
3557L$sqrx4x_sub:
3558	movq	0(%rbp),%r12
3559	movq	8(%rbp),%r13
3560	movq	16(%rbp),%r14
3561	movq	24(%rbp),%r15
3562L$sqrx4x_sub_entry:
3563	andnq	%rax,%r12,%r12
3564	leaq	32(%rbp),%rbp
3565	andnq	%rax,%r13,%r13
3566	andnq	%rax,%r14,%r14
3567	andnq	%rax,%r15,%r15
3568
3569	negq	%r8
3570	adcq	0(%rdi),%r12
3571	adcq	8(%rdi),%r13
3572	adcq	16(%rdi),%r14
3573	adcq	24(%rdi),%r15
3574	movq	%r12,0(%rdx)
3575	leaq	32(%rdi),%rdi
3576	movq	%r13,8(%rdx)
3577	sbbq	%r8,%r8
3578	movq	%r14,16(%rdx)
3579	movq	%r15,24(%rdx)
3580	leaq	32(%rdx),%rdx
3581
3582	incq	%rcx
3583	jnz	L$sqrx4x_sub
3584
3585	negq	%r9
3586
3587	.byte	0xf3,0xc3
3588
3589
3590.globl	_GFp_bn_scatter5
3591.private_extern _GFp_bn_scatter5
3592
3593.p2align	4
3594_GFp_bn_scatter5:
3595
3596	cmpl	$0,%esi
3597	jz	L$scatter_epilogue
3598	leaq	(%rdx,%rcx,8),%rdx
3599L$scatter:
3600	movq	(%rdi),%rax
3601	leaq	8(%rdi),%rdi
3602	movq	%rax,(%rdx)
3603	leaq	256(%rdx),%rdx
3604	subl	$1,%esi
3605	jnz	L$scatter
3606L$scatter_epilogue:
3607	.byte	0xf3,0xc3
3608
3609
3610
3611.globl	_GFp_bn_gather5
3612.private_extern _GFp_bn_gather5
3613
3614.p2align	5
3615_GFp_bn_gather5:
3616
3617L$SEH_begin_GFp_bn_gather5:
3618
3619.byte	0x4c,0x8d,0x14,0x24
3620
3621.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3622	leaq	L$inc(%rip),%rax
3623	andq	$-16,%rsp
3624
3625	movd	%ecx,%xmm5
3626	movdqa	0(%rax),%xmm0
3627	movdqa	16(%rax),%xmm1
3628	leaq	128(%rdx),%r11
3629	leaq	128(%rsp),%rax
3630
3631	pshufd	$0,%xmm5,%xmm5
3632	movdqa	%xmm1,%xmm4
3633	movdqa	%xmm1,%xmm2
3634	paddd	%xmm0,%xmm1
3635	pcmpeqd	%xmm5,%xmm0
3636	movdqa	%xmm4,%xmm3
3637
3638	paddd	%xmm1,%xmm2
3639	pcmpeqd	%xmm5,%xmm1
3640	movdqa	%xmm0,-128(%rax)
3641	movdqa	%xmm4,%xmm0
3642
3643	paddd	%xmm2,%xmm3
3644	pcmpeqd	%xmm5,%xmm2
3645	movdqa	%xmm1,-112(%rax)
3646	movdqa	%xmm4,%xmm1
3647
3648	paddd	%xmm3,%xmm0
3649	pcmpeqd	%xmm5,%xmm3
3650	movdqa	%xmm2,-96(%rax)
3651	movdqa	%xmm4,%xmm2
3652	paddd	%xmm0,%xmm1
3653	pcmpeqd	%xmm5,%xmm0
3654	movdqa	%xmm3,-80(%rax)
3655	movdqa	%xmm4,%xmm3
3656
3657	paddd	%xmm1,%xmm2
3658	pcmpeqd	%xmm5,%xmm1
3659	movdqa	%xmm0,-64(%rax)
3660	movdqa	%xmm4,%xmm0
3661
3662	paddd	%xmm2,%xmm3
3663	pcmpeqd	%xmm5,%xmm2
3664	movdqa	%xmm1,-48(%rax)
3665	movdqa	%xmm4,%xmm1
3666
3667	paddd	%xmm3,%xmm0
3668	pcmpeqd	%xmm5,%xmm3
3669	movdqa	%xmm2,-32(%rax)
3670	movdqa	%xmm4,%xmm2
3671	paddd	%xmm0,%xmm1
3672	pcmpeqd	%xmm5,%xmm0
3673	movdqa	%xmm3,-16(%rax)
3674	movdqa	%xmm4,%xmm3
3675
3676	paddd	%xmm1,%xmm2
3677	pcmpeqd	%xmm5,%xmm1
3678	movdqa	%xmm0,0(%rax)
3679	movdqa	%xmm4,%xmm0
3680
3681	paddd	%xmm2,%xmm3
3682	pcmpeqd	%xmm5,%xmm2
3683	movdqa	%xmm1,16(%rax)
3684	movdqa	%xmm4,%xmm1
3685
3686	paddd	%xmm3,%xmm0
3687	pcmpeqd	%xmm5,%xmm3
3688	movdqa	%xmm2,32(%rax)
3689	movdqa	%xmm4,%xmm2
3690	paddd	%xmm0,%xmm1
3691	pcmpeqd	%xmm5,%xmm0
3692	movdqa	%xmm3,48(%rax)
3693	movdqa	%xmm4,%xmm3
3694
3695	paddd	%xmm1,%xmm2
3696	pcmpeqd	%xmm5,%xmm1
3697	movdqa	%xmm0,64(%rax)
3698	movdqa	%xmm4,%xmm0
3699
3700	paddd	%xmm2,%xmm3
3701	pcmpeqd	%xmm5,%xmm2
3702	movdqa	%xmm1,80(%rax)
3703	movdqa	%xmm4,%xmm1
3704
3705	paddd	%xmm3,%xmm0
3706	pcmpeqd	%xmm5,%xmm3
3707	movdqa	%xmm2,96(%rax)
3708	movdqa	%xmm4,%xmm2
3709	movdqa	%xmm3,112(%rax)
3710	jmp	L$gather
3711
3712.p2align	5
3713L$gather:
3714	pxor	%xmm4,%xmm4
3715	pxor	%xmm5,%xmm5
3716	movdqa	-128(%r11),%xmm0
3717	movdqa	-112(%r11),%xmm1
3718	movdqa	-96(%r11),%xmm2
3719	pand	-128(%rax),%xmm0
3720	movdqa	-80(%r11),%xmm3
3721	pand	-112(%rax),%xmm1
3722	por	%xmm0,%xmm4
3723	pand	-96(%rax),%xmm2
3724	por	%xmm1,%xmm5
3725	pand	-80(%rax),%xmm3
3726	por	%xmm2,%xmm4
3727	por	%xmm3,%xmm5
3728	movdqa	-64(%r11),%xmm0
3729	movdqa	-48(%r11),%xmm1
3730	movdqa	-32(%r11),%xmm2
3731	pand	-64(%rax),%xmm0
3732	movdqa	-16(%r11),%xmm3
3733	pand	-48(%rax),%xmm1
3734	por	%xmm0,%xmm4
3735	pand	-32(%rax),%xmm2
3736	por	%xmm1,%xmm5
3737	pand	-16(%rax),%xmm3
3738	por	%xmm2,%xmm4
3739	por	%xmm3,%xmm5
3740	movdqa	0(%r11),%xmm0
3741	movdqa	16(%r11),%xmm1
3742	movdqa	32(%r11),%xmm2
3743	pand	0(%rax),%xmm0
3744	movdqa	48(%r11),%xmm3
3745	pand	16(%rax),%xmm1
3746	por	%xmm0,%xmm4
3747	pand	32(%rax),%xmm2
3748	por	%xmm1,%xmm5
3749	pand	48(%rax),%xmm3
3750	por	%xmm2,%xmm4
3751	por	%xmm3,%xmm5
3752	movdqa	64(%r11),%xmm0
3753	movdqa	80(%r11),%xmm1
3754	movdqa	96(%r11),%xmm2
3755	pand	64(%rax),%xmm0
3756	movdqa	112(%r11),%xmm3
3757	pand	80(%rax),%xmm1
3758	por	%xmm0,%xmm4
3759	pand	96(%rax),%xmm2
3760	por	%xmm1,%xmm5
3761	pand	112(%rax),%xmm3
3762	por	%xmm2,%xmm4
3763	por	%xmm3,%xmm5
3764	por	%xmm5,%xmm4
3765	leaq	256(%r11),%r11
3766	pshufd	$0x4e,%xmm4,%xmm0
3767	por	%xmm4,%xmm0
3768	movq	%xmm0,(%rdi)
3769	leaq	8(%rdi),%rdi
3770	subl	$1,%esi
3771	jnz	L$gather
3772
3773	leaq	(%r10),%rsp
3774
3775	.byte	0xf3,0xc3
3776L$SEH_end_GFp_bn_gather5:
3777
3778
3779.p2align	6
3780L$inc:
3781.long	0,0, 1,1
3782.long	2,2, 2,2
3783.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3784#endif
3785