1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14.text
15
16
17
#------------------------------------------------------------------------
# bn_mul_mont_gather5 — Montgomery multiplication where the multiplier
# words are fetched from a pre-scattered table with a constant-time
# SSE2 gather (no secret-index-dependent loads).
# SysV AMD64, Mach-O naming.  From the register uses below:
#   %rdi = rp (result), %rsi = ap, %rdx = scattered bp table,
#   %rcx = np (modulus), %r8 = &n0, %r9d = num (word count),
#   8(%rsp) on entry = gather index ("power").
# NOTE(review): generated from x86_64-mont5.pl — do not hand-edit;
# confirm the argument list against the perlasm prototype.
# Clobbers: rax, rcx, rdx, rsi, r8-r15, xmm0-xmm5, flags.
#------------------------------------------------------------------------
.globl	_bn_mul_mont_gather5
.private_extern _bn_mul_mont_gather5

.p2align	6
_bn_mul_mont_gather5:

	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax		# remember caller rsp for the epilogue

# num % 8 != 0 -> this scalar path; otherwise dispatch to the 4x-unrolled
# path with the OPENSSL_ia32cap_P capability word preloaded in r11d.
	testl	$7,%r9d
	jnz	L$mul_enter
	leaq	_OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	jmp	L$mul4x_enter

.p2align	4
L$mul_enter:
	movd	8(%rsp),%xmm5		# xmm5 = gather index (stack argument)
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15


# Carve out the temporary vector below rsp (num*8 plus gather scratch),
# 1024-aligned, then probe every 4 KiB page down to it ("page walk") so
# the OS guard page is always touched.
	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10








	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

	leaq	L$inc(%rip),%r10	# {0,1} / {2,2} increment constants
	movq	%rax,8(%rsp,%r9,8)	# stash caller rsp above the tmp vector

L$mul_body:

# Constant-time gather setup: generate the counters 0..15 via repeated
# paddd, compare each against the secret index in xmm5 (pcmpeqd), and
# store the 16 resulting all-ones/all-zeros masks at 112..352(%r10).
# Every one of the 16 interleaved table rows at (%r12) is then ANDed
# with its mask and ORed together, so exactly one bp word is selected
# without any index-dependent memory access.
	leaq	128(%rdx),%r12
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5		# broadcast index to all dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	# fold the two selected halves together
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12		# advance to next interleaved table row
.byte	102,72,15,126,195		# movq %xmm0,%rbx (gathered bp word)

# --- first pass: tmp = ap * bp[0], Montgomery-reduced on the fly ---
	movq	(%r8),%r8		# n0
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# outer loop counter i
	xorq	%r15,%r15		# inner loop counter j

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tmp[0]*n0 mod 2^64
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# top carry word

	leaq	1(%r14),%r14
	jmp	L$outer
.p2align	4
# --- remaining num-1 passes: gather bp[i], accumulate into tmp ---
L$outer:
	leaq	24+128(%rsp,%r9,8),%rdx	# mask array base (mirrors %r10 above)
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195		# movq %xmm0,%rbx (gathered bp[i])

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = (tmp[0]+ap[0]*bp[i])*n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

# --- final conditional subtraction: rp = tmp - np (with borrow mask) ---
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	L$sub
.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[i] = tmp[i] - np[i] - borrow
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	sbbq	$0,%rax			# rax = 0 (no borrow) or -1 (borrow)
	movq	$-1,%rbx
	xorq	%rax,%rbx		# rbx = ~rax: selects rp when no borrow

# Constant-time select between rp (tmp-np) and tmp, and wipe the
# temporary vector by overwriting each slot with the loop counter.
L$copy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)	# zap tmp[i]
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

# Epilogue: recover caller rsp saved above the tmp vector, restore
# callee-saved registers relative to it, return 1 in rax.
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul_epilogue:
	.byte	0xf3,0xc3		# rep ret (hand-encoded)
457
458
459
#------------------------------------------------------------------------
# bn_mul4x_mont_gather5 — 4x-unrolled front end for bn_mul_mont_gather5.
# Entered either at the top (local symbol) or at L$mul4x_enter with the
# OPENSSL_ia32cap_P capability word already in r11d; if all bits in the
# 0x80108 feature mask are set it tail-dispatches to the MULX/ADX
# variant (L$mulx4x_enter, defined elsewhere in this file).
# Allocates the frame, then delegates the arithmetic to mul4x_internal.
#------------------------------------------------------------------------
.p2align	5
bn_mul4x_mont_gather5:

.byte	0x67
	movq	%rsp,%rax		# remember caller rsp

L$mul4x_enter:
	andl	$0x80108,%r11d		# capability bits for the MULX/ADX path
	cmpl	$0x80108,%r11d
	je	L$mulx4x_enter
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$mul4x_prologue:

.byte	0x67
	shll	$3,%r9d			# num in bytes
	leaq	(%r9,%r9,2),%r10	# r10 = 3*num bytes
	negq	%r9

# Frame sizing: reserve 2*num+320 bytes, adjusted so the frame's
# 4 KiB-page offset avoids aliasing rdi (rp) — mirrors the upstream
# perlasm's cache-alias avoidance.
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$mul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$mul4xsp_done

.p2align	5
L$mul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$mul4xsp_done:
	andq	$-64,%rbp		# 64-byte align the frame base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

# Probe every 4 KiB page down to the new rsp (stack-guard page walk).
L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)		# stash caller rsp for the epilogue

L$mul4x_body:

	call	mul4x_internal

# Epilogue: restore callee-saved registers relative to the saved rsp.
	movq	40(%rsp),%rsi

	movq	$1,%rax

	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret (hand-encoded)
562
563
564
565
#------------------------------------------------------------------------
# mul4x_internal — 4x-unrolled Montgomery multiplication core shared by
# bn_mul4x_mont_gather5 and bn_power5.  On entry:
#   %rsi = ap, %rdx = scattered bp table, %rcx = np, %r8 = &n0,
#   %r9 = num (words), %rdi = rp, %rax = caller's original rsp
#   (8(%rax) = gather index — read before any pushes, see callers).
# Performs the same constant-time SSE2 gather as the scalar path, then
# runs 4x-unrolled mul/reduce loops; finishes by tail-jumping into
# L$sqr4x_sub_entry (defined elsewhere) for the final conditional
# subtraction into rp.
#------------------------------------------------------------------------
.p2align	5
mul4x_internal:

	shlq	$5,%r9			# num * 32 = table stride in bytes
	movd	8(%rax),%xmm5		# xmm5 = gather index (caller stack arg)
	leaq	L$inc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13	# end of gather table (loop bound)
	shrq	$5,%r9			# restore num
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10
	leaq	128(%rdx),%r12

# Build the 16 pcmpeqd selection masks at 112..352(%r10), then AND/OR
# against the interleaved table rows — identical constant-time gather
# scheme to the one in _bn_mul_mont_gather5 above.
	pshufd	$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		# movq %xmm0,%rbx (gathered bp[0])

	movq	%r13,16+8(%rsp)		# save gather-table end (loop bound)
	movq	%rdi,56+8(%rsp)		# save rp for the tail

# --- first pass: tmp = ap * bp[0], 4 words per iteration ---
	movq	(%r8),%r8		# n0
	movq	(%rsi),%rax		# ap[0]
	leaq	(%rsi,%r9,1),%rsi	# point rsi past ap; index from -num
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tmp[0]*n0
	leaq	64+8(%rsp),%r14		# tmp vector cursor
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi		# rdi doubles as a carry register here

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	L$1st4x

.p2align	5
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$1st4x

# First-pass tail (last 2 words) and top-carry bookkeeping.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi			# rdi = top carry
	movq	%r13,-8(%r14)

	jmp	L$outer4x

# --- remaining passes: gather bp[i], accumulate 4 words per iteration ---
.p2align	5
L$outer4x:
	leaq	16+128(%r14),%rdx	# mask array relative to tmp cursor
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		# movq %xmm0,%rbx (gathered bp[i])

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11
	movq	%rdi,(%r14)		# flush previous pass's top carry

	leaq	(%r14,%r9,1),%r14	# rewind tmp cursor

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	5
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	L$inner4x

# Inner-pass tail: fold in the last partial products and the previous
# top carry, rewind np, keep the new top carry in rdi.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# done when gather cursor hits table end
	jb	L$outer4x
# Set up borrow mask and pointers, then fall into the shared
# subtract-and-copy tail (L$sqr4x_sub_entry, defined elsewhere).
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax		# rax = 0/-1 select mask
	leaq	(%r14,%r9,1),%rbx	# tmp vector base
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp		# np
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# iterations = num/4
	movq	56+8(%rsp),%rdi		# rp
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqr4x_sub_entry
1090
1091
#------------------------------------------------------------------------
# bn_power5 — per the internal call sequence below: five successive
# squarings (__bn_sqr8x_internal + __bn_post4x_internal, i.e. x^(2^5))
# followed by one Montgomery multiplication by a gathered table entry
# (mul4x_internal).  Used for fixed-window modular exponentiation.
# Dispatches to the MULX/ADX variant (L$powerx5_enter) when the 0x80108
# capability bits are all present.
#------------------------------------------------------------------------
.globl	_bn_power5
.private_extern _bn_power5

.p2align	5
_bn_power5:

	movq	%rsp,%rax		# remember caller rsp

	leaq	_OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	andl	$0x80108,%r11d		# capability bits for the MULX/ADX path
	cmpl	$0x80108,%r11d
	je	L$powerx5_enter
	pushq	%rbx

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$power5_prologue:

	shll	$3,%r9d			# num in bytes
	leal	(%r9,%r9,2),%r10d	# 3*num
	negq	%r9
	movq	(%r8),%r8		# n0

# Frame sizing with rp-alias avoidance and page walk — same scheme as
# bn_mul4x_mont_gather5 above.
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$pwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$pwr_sp_done

.p2align	5
L$pwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$pwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$pwr_page_walk
	jmp	L$pwr_page_walk_done

L$pwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$pwr_page_walk
L$pwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# n0 for the reduction code
	movq	%rax,40(%rsp)		# caller rsp for the epilogue

L$power5_body:
# Stash rdi, rcx, r10, rdx in xmm1..xmm4 so they survive the internal
# calls (hand-encoded movq reg->xmm).
.byte	102,72,15,110,207		# movq %rdi,%xmm1
.byte	102,72,15,110,209		# movq %rcx,%xmm2
.byte	102,73,15,110,218		# movq %r10,%xmm3
.byte	102,72,15,110,226		# movq %rdx,%xmm4

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209		# movq %xmm2,%rcx (restore np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx (restore bp table)
	movq	%rsi,%rdi
	movq	40(%rsp),%rax		# caller rsp (mul4x_internal reads 8(%rax))
	leaq	32(%rsp),%r8		# &n0

	call	mul4x_internal

# Epilogue: restore callee-saved registers relative to the saved rsp.
	movq	40(%rsp),%rsi

	movq	$1,%rax
	movq	-48(%rsi),%r15

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$power5_epilogue:
	.byte	0xf3,0xc3		# rep ret (hand-encoded)
1226
1227
1228
1229.globl	_bn_sqr8x_internal
1230.private_extern _bn_sqr8x_internal
1231.private_extern	_bn_sqr8x_internal
1232
1233.p2align	5
1234_bn_sqr8x_internal:
1235__bn_sqr8x_internal:
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310	leaq	32(%r10),%rbp
1311	leaq	(%rsi,%r9,1),%rsi
1312
1313	movq	%r9,%rcx
1314
1315
1316	movq	-32(%rsi,%rbp,1),%r14
1317	leaq	48+8(%rsp,%r9,2),%rdi
1318	movq	-24(%rsi,%rbp,1),%rax
1319	leaq	-32(%rdi,%rbp,1),%rdi
1320	movq	-16(%rsi,%rbp,1),%rbx
1321	movq	%rax,%r15
1322
1323	mulq	%r14
1324	movq	%rax,%r10
1325	movq	%rbx,%rax
1326	movq	%rdx,%r11
1327	movq	%r10,-24(%rdi,%rbp,1)
1328
1329	mulq	%r14
1330	addq	%rax,%r11
1331	movq	%rbx,%rax
1332	adcq	$0,%rdx
1333	movq	%r11,-16(%rdi,%rbp,1)
1334	movq	%rdx,%r10
1335
1336
1337	movq	-8(%rsi,%rbp,1),%rbx
1338	mulq	%r15
1339	movq	%rax,%r12
1340	movq	%rbx,%rax
1341	movq	%rdx,%r13
1342
1343	leaq	(%rbp),%rcx
1344	mulq	%r14
1345	addq	%rax,%r10
1346	movq	%rbx,%rax
1347	movq	%rdx,%r11
1348	adcq	$0,%r11
1349	addq	%r12,%r10
1350	adcq	$0,%r11
1351	movq	%r10,-8(%rdi,%rcx,1)
1352	jmp	L$sqr4x_1st
1353
1354.p2align	5
1355L$sqr4x_1st:
1356	movq	(%rsi,%rcx,1),%rbx
1357	mulq	%r15
1358	addq	%rax,%r13
1359	movq	%rbx,%rax
1360	movq	%rdx,%r12
1361	adcq	$0,%r12
1362
1363	mulq	%r14
1364	addq	%rax,%r11
1365	movq	%rbx,%rax
1366	movq	8(%rsi,%rcx,1),%rbx
1367	movq	%rdx,%r10
1368	adcq	$0,%r10
1369	addq	%r13,%r11
1370	adcq	$0,%r10
1371
1372
1373	mulq	%r15
1374	addq	%rax,%r12
1375	movq	%rbx,%rax
1376	movq	%r11,(%rdi,%rcx,1)
1377	movq	%rdx,%r13
1378	adcq	$0,%r13
1379
1380	mulq	%r14
1381	addq	%rax,%r10
1382	movq	%rbx,%rax
1383	movq	16(%rsi,%rcx,1),%rbx
1384	movq	%rdx,%r11
1385	adcq	$0,%r11
1386	addq	%r12,%r10
1387	adcq	$0,%r11
1388
1389	mulq	%r15
1390	addq	%rax,%r13
1391	movq	%rbx,%rax
1392	movq	%r10,8(%rdi,%rcx,1)
1393	movq	%rdx,%r12
1394	adcq	$0,%r12
1395
1396	mulq	%r14
1397	addq	%rax,%r11
1398	movq	%rbx,%rax
1399	movq	24(%rsi,%rcx,1),%rbx
1400	movq	%rdx,%r10
1401	adcq	$0,%r10
1402	addq	%r13,%r11
1403	adcq	$0,%r10
1404
1405
1406	mulq	%r15
1407	addq	%rax,%r12
1408	movq	%rbx,%rax
1409	movq	%r11,16(%rdi,%rcx,1)
1410	movq	%rdx,%r13
1411	adcq	$0,%r13
1412	leaq	32(%rcx),%rcx
1413
1414	mulq	%r14
1415	addq	%rax,%r10
1416	movq	%rbx,%rax
1417	movq	%rdx,%r11
1418	adcq	$0,%r11
1419	addq	%r12,%r10
1420	adcq	$0,%r11
1421	movq	%r10,-8(%rdi,%rcx,1)
1422
1423	cmpq	$0,%rcx
1424	jne	L$sqr4x_1st
1425
1426	mulq	%r15
1427	addq	%rax,%r13
1428	leaq	16(%rbp),%rbp
1429	adcq	$0,%rdx
1430	addq	%r11,%r13
1431	adcq	$0,%rdx
1432
1433	movq	%r13,(%rdi)
1434	movq	%rdx,%r12
1435	movq	%rdx,8(%rdi)
1436	jmp	L$sqr4x_outer
1437
1438.p2align	5
1439L$sqr4x_outer:
1440	movq	-32(%rsi,%rbp,1),%r14
1441	leaq	48+8(%rsp,%r9,2),%rdi
1442	movq	-24(%rsi,%rbp,1),%rax
1443	leaq	-32(%rdi,%rbp,1),%rdi
1444	movq	-16(%rsi,%rbp,1),%rbx
1445	movq	%rax,%r15
1446
1447	mulq	%r14
1448	movq	-24(%rdi,%rbp,1),%r10
1449	addq	%rax,%r10
1450	movq	%rbx,%rax
1451	adcq	$0,%rdx
1452	movq	%r10,-24(%rdi,%rbp,1)
1453	movq	%rdx,%r11
1454
1455	mulq	%r14
1456	addq	%rax,%r11
1457	movq	%rbx,%rax
1458	adcq	$0,%rdx
1459	addq	-16(%rdi,%rbp,1),%r11
1460	movq	%rdx,%r10
1461	adcq	$0,%r10
1462	movq	%r11,-16(%rdi,%rbp,1)
1463
1464	xorq	%r12,%r12
1465
1466	movq	-8(%rsi,%rbp,1),%rbx
1467	mulq	%r15
1468	addq	%rax,%r12
1469	movq	%rbx,%rax
1470	adcq	$0,%rdx
1471	addq	-8(%rdi,%rbp,1),%r12
1472	movq	%rdx,%r13
1473	adcq	$0,%r13
1474
1475	mulq	%r14
1476	addq	%rax,%r10
1477	movq	%rbx,%rax
1478	adcq	$0,%rdx
1479	addq	%r12,%r10
1480	movq	%rdx,%r11
1481	adcq	$0,%r11
1482	movq	%r10,-8(%rdi,%rbp,1)
1483
1484	leaq	(%rbp),%rcx
1485	jmp	L$sqr4x_inner
1486
1487.p2align	5
1488L$sqr4x_inner:
1489	movq	(%rsi,%rcx,1),%rbx
1490	mulq	%r15
1491	addq	%rax,%r13
1492	movq	%rbx,%rax
1493	movq	%rdx,%r12
1494	adcq	$0,%r12
1495	addq	(%rdi,%rcx,1),%r13
1496	adcq	$0,%r12
1497
1498.byte	0x67
1499	mulq	%r14
1500	addq	%rax,%r11
1501	movq	%rbx,%rax
1502	movq	8(%rsi,%rcx,1),%rbx
1503	movq	%rdx,%r10
1504	adcq	$0,%r10
1505	addq	%r13,%r11
1506	adcq	$0,%r10
1507
1508	mulq	%r15
1509	addq	%rax,%r12
1510	movq	%r11,(%rdi,%rcx,1)
1511	movq	%rbx,%rax
1512	movq	%rdx,%r13
1513	adcq	$0,%r13
1514	addq	8(%rdi,%rcx,1),%r12
1515	leaq	16(%rcx),%rcx
1516	adcq	$0,%r13
1517
1518	mulq	%r14
1519	addq	%rax,%r10
1520	movq	%rbx,%rax
1521	adcq	$0,%rdx
1522	addq	%r12,%r10
1523	movq	%rdx,%r11
1524	adcq	$0,%r11
1525	movq	%r10,-8(%rdi,%rcx,1)
1526
1527	cmpq	$0,%rcx
1528	jne	L$sqr4x_inner
1529
1530.byte	0x67
1531	mulq	%r15
1532	addq	%rax,%r13
1533	adcq	$0,%rdx
1534	addq	%r11,%r13
1535	adcq	$0,%rdx
1536
1537	movq	%r13,(%rdi)
1538	movq	%rdx,%r12
1539	movq	%rdx,8(%rdi)
1540
1541	addq	$16,%rbp
1542	jnz	L$sqr4x_outer
1543
1544
1545	movq	-32(%rsi),%r14
1546	leaq	48+8(%rsp,%r9,2),%rdi
1547	movq	-24(%rsi),%rax
1548	leaq	-32(%rdi,%rbp,1),%rdi
1549	movq	-16(%rsi),%rbx
1550	movq	%rax,%r15
1551
1552	mulq	%r14
1553	addq	%rax,%r10
1554	movq	%rbx,%rax
1555	movq	%rdx,%r11
1556	adcq	$0,%r11
1557
1558	mulq	%r14
1559	addq	%rax,%r11
1560	movq	%rbx,%rax
1561	movq	%r10,-24(%rdi)
1562	movq	%rdx,%r10
1563	adcq	$0,%r10
1564	addq	%r13,%r11
1565	movq	-8(%rsi),%rbx
1566	adcq	$0,%r10
1567
1568	mulq	%r15
1569	addq	%rax,%r12
1570	movq	%rbx,%rax
1571	movq	%r11,-16(%rdi)
1572	movq	%rdx,%r13
1573	adcq	$0,%r13
1574
1575	mulq	%r14
1576	addq	%rax,%r10
1577	movq	%rbx,%rax
1578	movq	%rdx,%r11
1579	adcq	$0,%r11
1580	addq	%r12,%r10
1581	adcq	$0,%r11
1582	movq	%r10,-8(%rdi)
1583
1584	mulq	%r15
1585	addq	%rax,%r13
1586	movq	-16(%rsi),%rax
1587	adcq	$0,%rdx
1588	addq	%r11,%r13
1589	adcq	$0,%rdx
1590
1591	movq	%r13,(%rdi)
1592	movq	%rdx,%r12
1593	movq	%rdx,8(%rdi)
1594
1595	mulq	%rbx
1596	addq	$16,%rbp
1597	xorq	%r14,%r14
1598	subq	%r9,%rbp
1599	xorq	%r15,%r15
1600
1601	addq	%r12,%rax
1602	adcq	$0,%rdx
1603	movq	%rax,8(%rdi)
1604	movq	%rdx,16(%rdi)
1605	movq	%r15,24(%rdi)
1606
1607	movq	-16(%rsi,%rbp,1),%rax
1608	leaq	48+8(%rsp),%rdi
1609	xorq	%r10,%r10
1610	movq	8(%rdi),%r11
1611
1612	leaq	(%r14,%r10,2),%r12
1613	shrq	$63,%r10
1614	leaq	(%rcx,%r11,2),%r13
1615	shrq	$63,%r11
1616	orq	%r10,%r13
1617	movq	16(%rdi),%r10
1618	movq	%r11,%r14
1619	mulq	%rax
1620	negq	%r15
1621	movq	24(%rdi),%r11
1622	adcq	%rax,%r12
1623	movq	-8(%rsi,%rbp,1),%rax
1624	movq	%r12,(%rdi)
1625	adcq	%rdx,%r13
1626
1627	leaq	(%r14,%r10,2),%rbx
1628	movq	%r13,8(%rdi)
1629	sbbq	%r15,%r15
1630	shrq	$63,%r10
1631	leaq	(%rcx,%r11,2),%r8
1632	shrq	$63,%r11
1633	orq	%r10,%r8
1634	movq	32(%rdi),%r10
1635	movq	%r11,%r14
1636	mulq	%rax
1637	negq	%r15
1638	movq	40(%rdi),%r11
1639	adcq	%rax,%rbx
1640	movq	0(%rsi,%rbp,1),%rax
1641	movq	%rbx,16(%rdi)
1642	adcq	%rdx,%r8
1643	leaq	16(%rbp),%rbp
1644	movq	%r8,24(%rdi)
1645	sbbq	%r15,%r15
1646	leaq	64(%rdi),%rdi
1647	jmp	L$sqr4x_shift_n_add
1648
1649.p2align	5
1650L$sqr4x_shift_n_add:
1651	leaq	(%r14,%r10,2),%r12
1652	shrq	$63,%r10
1653	leaq	(%rcx,%r11,2),%r13
1654	shrq	$63,%r11
1655	orq	%r10,%r13
1656	movq	-16(%rdi),%r10
1657	movq	%r11,%r14
1658	mulq	%rax
1659	negq	%r15
1660	movq	-8(%rdi),%r11
1661	adcq	%rax,%r12
1662	movq	-8(%rsi,%rbp,1),%rax
1663	movq	%r12,-32(%rdi)
1664	adcq	%rdx,%r13
1665
1666	leaq	(%r14,%r10,2),%rbx
1667	movq	%r13,-24(%rdi)
1668	sbbq	%r15,%r15
1669	shrq	$63,%r10
1670	leaq	(%rcx,%r11,2),%r8
1671	shrq	$63,%r11
1672	orq	%r10,%r8
1673	movq	0(%rdi),%r10
1674	movq	%r11,%r14
1675	mulq	%rax
1676	negq	%r15
1677	movq	8(%rdi),%r11
1678	adcq	%rax,%rbx
1679	movq	0(%rsi,%rbp,1),%rax
1680	movq	%rbx,-16(%rdi)
1681	adcq	%rdx,%r8
1682
1683	leaq	(%r14,%r10,2),%r12
1684	movq	%r8,-8(%rdi)
1685	sbbq	%r15,%r15
1686	shrq	$63,%r10
1687	leaq	(%rcx,%r11,2),%r13
1688	shrq	$63,%r11
1689	orq	%r10,%r13
1690	movq	16(%rdi),%r10
1691	movq	%r11,%r14
1692	mulq	%rax
1693	negq	%r15
1694	movq	24(%rdi),%r11
1695	adcq	%rax,%r12
1696	movq	8(%rsi,%rbp,1),%rax
1697	movq	%r12,0(%rdi)
1698	adcq	%rdx,%r13
1699
1700	leaq	(%r14,%r10,2),%rbx
1701	movq	%r13,8(%rdi)
1702	sbbq	%r15,%r15
1703	shrq	$63,%r10
1704	leaq	(%rcx,%r11,2),%r8
1705	shrq	$63,%r11
1706	orq	%r10,%r8
1707	movq	32(%rdi),%r10
1708	movq	%r11,%r14
1709	mulq	%rax
1710	negq	%r15
1711	movq	40(%rdi),%r11
1712	adcq	%rax,%rbx
1713	movq	16(%rsi,%rbp,1),%rax
1714	movq	%rbx,16(%rdi)
1715	adcq	%rdx,%r8
1716	movq	%r8,24(%rdi)
1717	sbbq	%r15,%r15
1718	leaq	64(%rdi),%rdi
1719	addq	$32,%rbp
1720	jnz	L$sqr4x_shift_n_add
1721
1722	leaq	(%r14,%r10,2),%r12
1723.byte	0x67
1724	shrq	$63,%r10
1725	leaq	(%rcx,%r11,2),%r13
1726	shrq	$63,%r11
1727	orq	%r10,%r13
1728	movq	-16(%rdi),%r10
1729	movq	%r11,%r14
1730	mulq	%rax
1731	negq	%r15
1732	movq	-8(%rdi),%r11
1733	adcq	%rax,%r12
1734	movq	-8(%rsi),%rax
1735	movq	%r12,-32(%rdi)
1736	adcq	%rdx,%r13
1737
1738	leaq	(%r14,%r10,2),%rbx
1739	movq	%r13,-24(%rdi)
1740	sbbq	%r15,%r15
1741	shrq	$63,%r10
1742	leaq	(%rcx,%r11,2),%r8
1743	shrq	$63,%r11
1744	orq	%r10,%r8
1745	mulq	%rax
1746	negq	%r15
1747	adcq	%rax,%rbx
1748	adcq	%rdx,%r8
1749	movq	%rbx,-16(%rdi)
1750	movq	%r8,-8(%rdi)
1751.byte	102,72,15,126,213
# __bn_sqr8x_reduction -- Montgomery reduction of the double-width square
# left in the stack temporary area by the preceding squaring code.  Entered
# by fall-through; the .byte sequence just above this label decodes to
# movq %xmm2,%rbp, i.e. %rbp is reloaded from %xmm2 on entry.
# NOTE(review): register contract appears to be %rbp = modulus pointer,
# %r9 = length in bytes (negated below), 32+8(%rsp) = n0 = -1/m[0] mod 2^64
# -- confirm against the x86_64-mont5 perlasm source.  Generated code:
# do not hand-modify instructions, comments only.
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx		# end of modulus
	leaq	48+8(%rsp,%r9,2),%rdx		# end of the t[] area
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	L$8x_reduction_loop

.p2align	5
L$8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx			# load 8 words of t[] into r8..r15/rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)			# store top-most carry bit
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx			# %rbx = t[0]*n0 -- Montgomery multiplier
	movq	0(%rbp),%rax
	movl	$8,%ecx				# 8 inner iterations
	jmp	L$8x_reduce

.p2align	5
L$8x_reduce:
	mulq	%rbx				# accumulate m[0..7]*(t[i]*n0) into r8..r15
	movq	8(%rbp),%rax
	negq	%r8				# low word cancels by construction; sets CF
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	# save multiplier for the tail pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi			# n0, for next multiplier
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi			# next multiplier = t[i+1]*n0
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx			# rotate in the next multiplier
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_reduce

	leaq	64(%rbp),%rbp			# advance modulus pointer
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx			# end-of-t[] marker
	cmpq	0+8(%rsp),%rbp			# done with the modulus?
	jae	L$8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8			# fold in next 8 words of t[]
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			# %rsi = -carry, preserved across the tail loop

	movq	48+56+8(%rsp),%rbx		# reload first saved multiplier
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	L$8x_tail

.p2align	5
L$8x_tail:
	mulq	%rbx				# tail pass: replay saved multipliers
	addq	%rax,%r8			# against the remaining modulus words
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi			# advance output by one word per pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	# next saved multiplier
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	L$8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp			# end of modulus reached?
	jae	L$8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi				# restore carry saved via sbb above
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	L$8x_tail

.p2align	5
L$8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8			# fold in top-most carry word
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
L$8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax				# top-most carry for next round
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213			# movq %xmm2,%rbp -- restore modulus pointer

	movq	%r8,0(%rdi)			# store reduced words back
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217			# movq %xmm3,%r9
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi			# more of t[] to reduce?
	jb	L$8x_reduction_loop
	.byte	0xf3,0xc3			# rep ret
2011
2012
2013
# __bn_post4x_internal -- final step after squaring/reduction: conditionally
# subtract the modulus from the reduced value and write the result to rp,
# 4 words per iteration, in constant time (mask-based, no data-dependent
# branches).  NOTE(review): on entry %rbp = modulus, %r9 = byte length,
# %rax = 0/-1 borrow mask after negq -- confirm against the perlasm source.
.p2align	5
__bn_post4x_internal:

	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		# end of source area
	movq	%r9,%rcx
.byte	102,72,15,126,207			# movq %xmm1,%rdi -- result pointer
	negq	%rax				# turn borrow flag into a full mask
.byte	102,72,15,126,206			# movq %xmm1,%rsi
	sarq	$3+2,%rcx			# iteration count = bytes / 32
	decq	%r12				# so that t[0]+(-1) sets CF iff t[0] != 0
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqr4x_sub_entry

.p2align	4
L$sqr4x_sub:
	movq	0(%rbp),%r12			# next 4 modulus words
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
L$sqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12				# t + (~m & mask) + carry == t - (m & mask)
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12			# mask selects whether m is subtracted
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10				# restore carry from previous iteration
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			# save carry for next iteration
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	L$sqr4x_sub

	movq	%r9,%r10
	negq	%r9				# restore %r9 sign convention
	.byte	0xf3,0xc3			# rep ret
2067
2068
# bn_from_montgomery -- convert out of Montgomery form.  This entry point only
# handles lengths that are a multiple of 8: it tail-jumps to bn_from_mont8x in
# that case, otherwise it returns 0 (caller falls back to another path).
.globl	_bn_from_montgomery
.private_extern _bn_from_montgomery

.p2align	5
_bn_from_montgomery:

	testl	$7,%r9d				# num % 8 == 0 ?
	jz	bn_from_mont8x			# yes: take the 8x implementation
	xorl	%eax,%eax			# no: return 0
	.byte	0xf3,0xc3			# rep ret
2079
2080
2081
2082
# bn_from_mont8x -- out-of-Montgomery conversion for num % 8 == 0.
# Copies the input into a freshly carved stack area (mul_by_1), then runs a
# single Montgomery reduction (MULX/ADX path when the CPU supports it,
# classic path otherwise) and the conditional final subtraction, and finally
# wipes the temporary area.  Args follow the bn_mul_mont convention:
# rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9=num.  Generated code: comments only.
.p2align	5
bn_from_mont8x:

.byte	0x67
	movq	%rsp,%rax			# remember original %rsp for epilogue

	pushq	%rbx				# save all callee-saved registers

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$from_prologue:

	shll	$3,%r9d				# num in bytes
	leaq	(%r9,%r9,2),%r10		# 3*num
	negq	%r9
	movq	(%r8),%r8			# load n0 value

	# Pick a stack frame position that does not alias the output buffer
	# modulo 4096 (avoids cache-bank/page conflicts with rp).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$from_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$from_sp_done

.p2align	5
L$from_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$from_sp_done:
	andq	$-64,%rbp			# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10			# probe before walking pages
	cmpq	%rbp,%rsp
	ja	L$from_page_walk
	jmp	L$from_page_walk_done

L$from_page_walk:
	leaq	-4096(%rsp),%rsp		# touch every page so the guard
	movq	(%rsp),%r10			# page is never skipped
	cmpq	%rbp,%rsp
	ja	L$from_page_walk
L$from_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	# Frame layout: +8 saved num, +32 saved n0, +40 saved original %rsp,
	# temporaries from +48 on (as in the other mont5 entry points).

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)

L$from_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	L$mul_by_1

.p2align	5
L$mul_by_1:
	movdqu	(%rsi),%xmm1			# copy 64 bytes of ap into the low
	movdqu	16(%rsi),%xmm2			# half of t[], zero the high half
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 64(%rsi),%rsi
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	L$mul_by_1

.byte	102,72,15,110,207			# movq %rdi,%xmm1 -- stash rp
.byte	102,72,15,110,209			# movq %rcx,%xmm2 -- stash np
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218			# movq %r10,%xmm3 -- stash num
	leaq	_OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	andl	$0x80108,%r11d			# presumably ADX|BMI1|BMI2 bits; verify
	cmpl	$0x80108,%r11d
	jne	L$from_mont_nox

	leaq	(%rax,%r9,1),%rdi		# MULX/ADX-capable CPU
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	L$from_mont_zero

.p2align	5
L$from_mont_nox:
	call	__bn_sqr8x_reduction		# classic mul-based path
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	L$from_mont_zero

.p2align	5
L$from_mont_zero:
	movq	40(%rsp),%rsi			# original %rsp

	movdqa	%xmm0,0(%rax)			# wipe the stack temporaries
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	L$from_mont_zero

	movq	$1,%rax				# return 1 (success)
	movq	-48(%rsi),%r15			# restore callee-saved registers

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp			# restore caller's stack pointer

L$from_epilogue:
	.byte	0xf3,0xc3			# rep ret
2248
2249
2250
# bn_mulx4x_mont_gather5 -- MULX/ADX-based Montgomery multiplication with a
# cache-timing-safe gather of the b[] operand (power-of-g table).  This is a
# wrapper: it saves registers, carves an aligned stack frame (probing each
# 4K page), then calls mulx4x_internal to do the work.  Generated code.
.p2align	5
bn_mulx4x_mont_gather5:

	movq	%rsp,%rax			# remember original %rsp

L$mulx4x_enter:
	pushq	%rbx				# save callee-saved registers

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$mulx4x_prologue:

	shll	$3,%r9d				# num in bytes
	leaq	(%r9,%r9,2),%r10		# 3*num
	negq	%r9
	movq	(%r8),%r8			# load n0 value

	# Choose a frame location that does not collide with rp modulo 4096
	# (same scheme as the other entry points in this file).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$mulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$mulx4xsp_done

L$mulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$mulx4xsp_done:
	andq	$-64,%rbp			# 64-byte align
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
	jmp	L$mulx4x_page_walk_done

L$mulx4x_page_walk:
	leaq	-4096(%rsp),%rsp		# touch every page down to the frame
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
L$mulx4x_page_walk_done:

	# Frame layout used by mulx4x_internal:
	# +8 num, +16 end of b[] table, +24 inner counter, +32 n0,
	# +40 saved original %rsp, +56 rp, temporaries from +64 on.

	movq	%r8,32(%rsp)			# save n0
	movq	%rax,40(%rsp)			# save original %rsp

L$mulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi			# original %rsp

	movq	$1,%rax				# return 1 (success)

	movq	-48(%rsi),%r15			# restore callee-saved registers

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$mulx4x_epilogue:
	.byte	0xf3,0xc3			# rep ret
2358
2359
2360
2361
# mulx4x_internal -- core of the MULX/ADX Montgomery multiplication with a
# scatter/gather-protected b[] table.  The b-words are selected from a table
# of 32 entries by building SSE equality masks against the requested index
# and AND/OR-folding every entry, so the memory access pattern is independent
# of the secret index (cache-timing safe).  Uses the dual carry chains
# (adcx = CF chain, adox = OF chain) of ADX to interleave multiply and
# reduction.  NOTE(review): entry register contract (rsi=ap, rdx=table,
# rcx=np, r9=num, rax -> index area) is inherited from the callers in this
# file -- confirm against the perlasm source.  Generated: comments only.
.p2align	5
mulx4x_internal:

	movq	%r9,8(%rsp)			# save num
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13		# end of the b[] table
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5			# presumably the gather index; verify
	subq	$1,%r9
	leaq	L$inc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)			# inner-loop trip count
	movq	%rdi,56+8(%rsp)			# save rp
	movdqa	0(%rax),%xmm0			# {0,1} counter seed
	movdqa	16(%rax),%xmm1			# {2,2} increment
	leaq	88-112(%rsp,%r10,1),%r10	# mask scratch area
	leaq	128(%rdx),%rdi			# middle of the b[] table

	# Build 16 16-byte equality masks: entry i gets all-ones iff i == index.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# Constant-time gather of b[0]: AND every table entry with its mask
	# and OR everything together -- all 32 entries are always touched.
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194			# movq %xmm0,%rdx -- gathered b[0]
	leaq	64+32+8(%rsp),%rbx

	# First pass: t[] = a[] * b[0] plus the first reduction step.
	movq	%rdx,%r9			# keep b[0]
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8			# %r8 = t[0]*n0
	xorq	%rbp,%rbp			# clear CF and OF; %rbp stays 0
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)			# save table position

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10		# reduction: add m[]*(t[0]*n0)
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi			# inner counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	L$mulx4x_1st

.p2align	5
L$mulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax		# rdx = b[0]: next 4 a-words
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx			# switch to the reduction multiplier
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_1st

	movq	8(%rsp),%rax			# num
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi		# rewind ap
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi			# table position
	adcq	%rbp,%rbp			# top carry
	movq	%r14,-8(%rbx)
	jmp	L$mulx4x_outer

.p2align	5
L$mulx4x_outer:
	# Gather the next b-word, again touching the whole table.
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194			# movq %xmm0,%rdx -- gathered b[i]

	movq	%rbp,(%rbx)			# store top carry
	leaq	32(%rbx,%rax,1),%rbx		# rewind t[]
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp			# clear CF and OF; %rbp stays 0
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8			# accumulate into existing t[]
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx		# rewind np
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8			# %r8 = t[0]*n0

	movq	%r8,%rdx
	xorq	%rbp,%rbp			# clear CF and OF again
	movq	%rdi,8+8(%rsp)			# save table position

	mulxq	0(%rcx),%rax,%r10		# reduction step
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi			# inner counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	L$mulx4x_inner

.p2align	5
L$mulx4x_inner:
	mulxq	0(%rsi),%r10,%rax		# multiply and fold in existing t[]
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx			# switch to the reduction multiplier
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_inner

	movq	0+8(%rsp),%rax			# num
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi			# %rdi was 0; pulls t[] carry into CF
	movq	8+8(%rsp),%rdi			# table position
	movq	16+8(%rsp),%r10			# end of b[] table
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi		# rewind ap
	adcq	%rbp,%rbp			# top carry
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi			# more b-words to process?
	jb	L$mulx4x_outer

	# Done multiplying: set up the conditional final subtraction and
	# branch into the shared L$sqrx4x_sub_entry tail (defined elsewhere).
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp		# rewind np
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi		# rewind t[]
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10			# compare top words
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax			# %rax = 0/-1 subtraction mask
	movq	56+8(%rsp),%rdx			# restore rp
	decq	%r12				# so that t[0]+(-1) sets CF iff nonzero
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqrx4x_sub_entry
2783
2784
2785
# bn_powerx5 -- MULX/ADX variant of the fixed-window helper: computes five
# consecutive Montgomery squarings of the input followed by one Montgomery
# multiplication (i.e. a^(2^5) * b pattern used by the exponentiation code).
# Wrapper duties: save registers, carve an aligned probed stack frame, stash
# the pointer arguments in xmm registers so they survive the internal calls,
# run square/reduce five times, then finish with mulx4x_internal.
.p2align	5
bn_powerx5:

	movq	%rsp,%rax			# remember original %rsp

L$powerx5_enter:
	pushq	%rbx				# save callee-saved registers

	pushq	%rbp

	pushq	%r12

	pushq	%r13

	pushq	%r14

	pushq	%r15

L$powerx5_prologue:

	shll	$3,%r9d				# num in bytes
	leaq	(%r9,%r9,2),%r10		# 3*num
	negq	%r9
	movq	(%r8),%r8			# load n0 value

	# Choose a frame position that does not alias rp modulo 4096.

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$pwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	L$pwrx_sp_done

.p2align	5
L$pwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$pwrx_sp_done:
	andq	$-64,%rbp			# 64-byte align
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$pwrx_page_walk
	jmp	L$pwrx_page_walk_done

L$pwrx_page_walk:
	leaq	-4096(%rsp),%rsp		# probe every page down to the frame
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$pwrx_page_walk
L$pwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	# Frame layout: +32 n0, +40 saved original %rsp; pointer args are kept
	# in xmm1..xmm4 across the internal calls below.

	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207			# movq %rdi,%xmm1 -- rp
.byte	102,72,15,110,209			# movq %rcx,%xmm2 -- np
.byte	102,73,15,110,218			# movq %r10,%xmm3 -- num
.byte	102,72,15,110,226			# movq %rdx,%xmm4 -- bp
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)

L$powerx5_body:

	call	__bn_sqrx8x_internal		# five square+reduce rounds
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209			# movq %xmm2,%rcx -- restore np
.byte	102,72,15,126,226			# movq %xmm4,%rdx -- restore bp
	movq	40(%rsp),%rax

	call	mulx4x_internal			# final multiply

	movq	40(%rsp),%rsi			# original %rsp

	movq	$1,%rax				# return 1 (success)

	movq	-48(%rsi),%r15			# restore callee-saved registers

	movq	-40(%rsi),%r14

	movq	-32(%rsi),%r13

	movq	-24(%rsi),%r12

	movq	-16(%rsi),%rbp

	movq	-8(%rsi),%rbx

	leaq	(%rsi),%rsp

L$powerx5_epilogue:
	.byte	0xf3,0xc3			# rep ret
2917
2918
2919
# bn_sqrx8x_internal -- MULX/ADX squaring kernel.  Phase 1 computes all
# off-diagonal products a[i]*a[j] (i<j) into the stack area, 8 words at a
# time, using the dual adcx/adox carry chains.  Phase 2
# (L$sqrx4x_shift_n_add) doubles that triangle while adding the diagonal
# squares a[i]^2.  Execution then falls through into __bn_sqrx8x_reduction
# (next label, past this block).  NOTE(review): entry contract appears to be
# %rsi = ap, %r9 = num in bytes, %rbp restored from %xmm2 at the end --
# confirm against the x86_64-mont5 perlasm source.  Generated: comments only.
.globl	_bn_sqrx8x_internal
.private_extern _bn_sqrx8x_internal
.private_extern	_bn_sqrx8x_internal

.p2align	5
_bn_sqrx8x_internal:
__bn_sqrx8x_internal:


	# (blank lines below are generated placeholders for the perlasm
	# commentary that BoringSSL strips; kept verbatim.)


































	leaq	48+8(%rsp),%rdi			# t[] area
	leaq	(%rsi,%r9,1),%rbp		# end of a[]
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	L$sqr8x_zero_start

.p2align	5
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
L$sqrx8x_zero:
.byte	0x3e
	movdqa	%xmm0,0(%rdi)			# zero the 2*num-word t[] area
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
L$sqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	L$sqrx8x_zero

	movq	0(%rsi),%rdx			# a[0] is the first multiplier

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp			# constant 0 for carry mop-up
	jmp	L$sqrx8x_outer_loop

.p2align	5
L$sqrx8x_outer_loop:
	# a[0..7] times the 7 words above each of them (triangle chunk).
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx 24(%rsi) (disp32 form)
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx 32(%rsi) (disp32 form)
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx			# remember top carry
	xorq	%rbp,%rbp			# clear CF and OF


	mulxq	16(%rsi),%r8,%rbx		# a[1] row
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx 40(%rsi) (disp32 form)
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 48(%rsi) (disp32 form)
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx 56(%rsi) (disp32 form)
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx		# a[2] row
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx 48(%rsi) (disp32 form)
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx 56(%rsi) (disp32 form)
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax		# a[3] row
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax			# a[4] row (operands in registers)
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx			# a[5] row
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14			# a[6]*a[7]
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi			# reached the end of a[]?
	je	L$sqrx8x_outer_break

	negq	%rcx				# restore saved top carry
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9			# fold in previously stored t[]
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax			# save top carry

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)


	xorl	%eax,%eax			# clear CF and OF
	jmp	L$sqrx8x_loop

.p2align	5
L$sqrx8x_loop:
	# Multiply the current 8 a-words by the rest of a[], 8 t-words per pass.
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 32(%rbp) (disp32 form)
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx 56(%rbp) (disp32 form)
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15			# %rbx is 0 here
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx				# loop counter runs -8..0
	jnz	L$sqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp			# done with this column block?
	je	L$sqrx8x_break

	subq	16+8(%rsp),%rbx			# pull saved carry into CF
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8			# fold in stored t[]
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax			# save top carry
	xorl	%ebx,%ebx			# clear CF and OF
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_loop

.p2align	5
L$sqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx			# pull saved carry into CF
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx			# saved t[] position
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx			# a[0] for the next outer round
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi			# already at the right position?
	je	L$sqrx8x_outer_loop

	movq	%r9,8(%rdi)			# otherwise swap the 8-word window
	movq	8(%rcx),%r9			# with what is stored at %rcx
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	L$sqrx8x_outer_loop

.p2align	5
L$sqrx8x_outer_break:
	# Phase 2: double the off-diagonal triangle and add the squares a[i]^2.
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217			# movq %xmm3,%r9
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx		# a[0]

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9			# num
	adoxq	%r11,%r11			# shift left via self-add (OF chain)
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.p2align	5
L$sqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx			# a[i]^2
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# movq 8(%rsi,%rcx,1),%rdx
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# movq 32(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	L$sqrx4x_shift_n_add_break	# exit when %rcx reaches 0
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# movq 0(%rsi,%rcx,1),%rdx
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	L$sqrx4x_shift_n_add

.p2align	5
L$sqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213			# movq %xmm2,%rbp; falls through into __bn_sqrx8x_reduction
/*
 * __bn_sqrx8x_reduction
 *
 * Montgomery reduction for the MULX/ADCX/ADOX (BMI2+ADX) squaring path.
 * Reached by fall-through from the shift-and-add code above with (as far
 * as visible here -- TODO confirm against the generating Perl script):
 *   %rbp       = modulus vector n[]
 *   %r9        = byte length of the vectors
 *   32+8(%rsp) = n0, the per-limb Montgomery constant (see imulq below)
 *   48+8(%rsp) = tp[0], first limb of the value being reduced
 * Two independent carry chains are used throughout: adcx (CF only) folds
 * the low halves of the m*n[i] products, adox (OF only) folds the running
 * accumulator limbs, so the eight products per iteration do not serialize
 * on a single flags register.
 */
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		/* top-word carry accumulator = 0 */
	movq	32+8(%rsp),%rbx		/* %rbx = n0 */
	movq	48+8(%rsp),%rdx		/* %rdx = tp[0] */
	leaq	-64(%rbp,%r9,1),%rcx	/* %rcx = end-of-modulus sentinel */

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)		/* save original tp pointer */

	leaq	48+8(%rsp),%rdi		/* reduce in place, starting at tp[0] */
	jmp	L$sqrx8x_reduction_loop

.p2align	5
L$sqrx8x_reduction_loop:
	movq	8(%rdi),%r9		/* load the next eight limbs of tp */
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8		/* %r8 = tp[0] */
	imulq	%rbx,%rdx		/* %rdx = m = tp[0]*n0 mod 2^64 */
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		/* stash incoming top-word carry */

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		/* %rsi = 0; also clears CF and OF */
	movq	$-8,%rcx		/* eight inner iterations, count up to 0 */
	jmp	L$sqrx8x_reduce

.p2align	5
L$sqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	/* m*n[0]: low half cancels the tp limb */
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9	/* m*n[1] */
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rbx,%r12 -- hand-encoded (presumably to pin the encoding length) */
	movq	%rdx,%rax		/* keep current m */
	movq	%r8,%rdx		/* %rdx = new tp[0] */
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	/* precompute next m = new tp[0]*n0 */
	movq	%rax,%rdx		/* restore current m for n[5..7] */
	movq	%rax,64+48+8(%rsp,%rcx,8)	/* record m for the tail pass below */

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		/* switch to the precomputed next m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		/* flush both carry chains into %r15 */
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67		/* address-size prefixes used purely as padding */
	incq	%rcx
	jnz	L$sqrx8x_reduce

	movq	%rsi,%rax		/* %rax = 0 */
	cmpq	0+8(%rsp),%rbp		/* reached the end of the modulus? */
	jae	L$sqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8		/* fold next tp block in; CF chains on */
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		/* %rax = 0 or -1: carry-out captured as mask */

	xorq	%rsi,%rsi		/* clear CF/OF for the tail chains */
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_tail

.p2align	5
L$sqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	/* same dual-chain pattern as above,   */
	adcxq	%rax,%rbx		/* but using the m values recorded at  */
	adoxq	%r9,%r8			/* 64+48+8(%rsp,...) per iteration     */

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rax,%r12 -- hand-encoded */
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	/* next recorded m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	/* store one reduced limb */
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	L$sqrx8x_tail

	cmpq	0+8(%rsp),%rbp		/* more modulus blocks to process? */
	jae	L$sqrx8x_tail_done

	subq	16+8(%rsp),%rsi		/* re-inject the saved carry mask as CF */
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		/* capture the new carry-out mask */
	subq	$8,%rcx

	xorq	%rsi,%rsi		/* clear CF/OF again */
	movq	%rax,16+8(%rsp)
	jmp	L$sqrx8x_tail

.p2align	5
L$sqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8		/* add back the stashed top-word carry */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		/* CF = saved carry mask */
L$sqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217	/* movq %xmm3,%rcx -- restore a value stashed in %xmm3 earlier (outside this view); mov preserves flags */
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		/* top modulus limb; mov preserves the adc chain */
	adcq	16(%rdi),%r10
.byte	102,72,15,126,213	/* movq %xmm2,%rbp -- restore a pointer stashed in %xmm2 earlier (outside this view) */
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* %rax = final top-word carry */

	movq	32+8(%rsp),%rbx		/* reload n0 */
	movq	64(%rdi,%rcx,1),%rdx	/* tp[0] of the next window */

	movq	%r8,0(%rdi)		/* write back the eight reduced limbs */
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8		/* processed the whole vector yet? */
	jb	L$sqrx8x_reduction_loop
	.byte	0xf3,0xc3	/* rep ret -- two-byte return (legacy AMD branch-predictor idiom) */
3539
3540
.p2align	5

/*
 * __bn_postx4x_internal
 *
 * Final step of the MULX Montgomery routines: a branch-free (constant-time)
 * conditional subtraction of the modulus, four limbs per iteration.
 * Visible inputs (TODO confirm against the generating Perl script):
 *   %rbp  = modulus n[]
 *   %rdi  = tp, the value to be conditionally reduced
 *   %rcx  = byte count (presumably negative; sar below preserves its sign)
 *   %rax  = top-word carry, presumably 0 or 1 -- negated into a 0/-1 mask
 *   %xmm1 = result pointer, stashed there by a caller outside this view
 */
__bn_postx4x_internal:

	movq	0(%rbp),%r12		/* n[0] */
	movq	%rcx,%r10
	movq	%rcx,%r9		/* preserve the count for the caller */
	negq	%rax			/* 0/1 carry -> 0/-1 selection mask */
	sarq	$3+2,%rcx		/* bytes -> count of 4-limb groups */

.byte	102,72,15,126,202	/* movq %xmm1,%rdx -- result pointer */
.byte	102,72,15,126,206	/* movq %xmm1,%rsi -- second copy of it */
	decq	%r12			/* n[0]-1: with CF=0 below, the adc    */
	movq	8(%rbp),%r13		/* chain then computes tp + (-n) = tp-n */
	xorq	%r8,%r8			/* initial borrow = 0 */
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	L$sqrx4x_sub_entry

.p2align	4
L$sqrx4x_sub:
	movq	0(%rbp),%r12		/* next four modulus limbs */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
L$sqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		/* BMI1 andn: %r12 = ~n[i] & mask.   */
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13		/* mask==0  -> adc copies tp as-is;  */
	andnq	%rax,%r14,%r14		/* mask==-1 -> tp + ~n (+carry) =    */
	andnq	%rax,%r15,%r15		/* tp - n, with no data-dep. branch  */

	negq	%r8			/* CF = borrow from the previous group */
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		/* store the four result limbs */
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			/* capture borrow for the next group */
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	L$sqrx4x_sub

	negq	%r9			/* restore the sign of the saved count */

	.byte	0xf3,0xc3	/* rep ret */
3592
3593
.globl	_bn_scatter5
.private_extern _bn_scatter5

/*
 * bn_scatter5(inp=%rdi, num=%esi, table=%rdx, power=%rcx)
 * (presumed C prototype -- TODO confirm against the generating Perl script)
 *
 * Stores %esi 64-bit words from (%rdi) into the table at (%rdx), starting
 * at slot %rcx (byte offset power*8) and stepping 256 bytes per word --
 * the column layout that bn_gather5 below reads back.
 */
.p2align	4
_bn_scatter5:

	cmpl	$0,%esi
	jz	L$scatter_epilogue	/* nothing to do when num == 0 */
	leaq	(%rdx,%rcx,8),%rdx	/* %rdx = &table slot for this power */
L$scatter:
	movq	(%rdi),%rax		/* copy one word... */
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* ...then advance one 256-byte stride */
	subl	$1,%esi
	jnz	L$scatter
L$scatter_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
3612
3613
3614
.globl	_bn_gather5
.private_extern _bn_gather5

/*
 * bn_gather5(out=%rdi, num=%esi, table=%rdx, power=%ecx)
 * (presumed C prototype -- TODO confirm against the generating Perl script)
 *
 * Constant-time gather: every iteration reads ALL sixteen 16-byte columns
 * of the 256-byte-strided table and masks them with pcmpeqd-generated
 * select masks, so the memory access pattern is independent of the
 * (secret) index in %ecx.
 */
.p2align	5
_bn_gather5:

L$SEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24	/* leaq (%rsp),%r10 -- save the stack pointer */

.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp -- 264-byte scratch frame for the masks */
	leaq	L$inc(%rip),%rax
	andq	$-16,%rsp		/* align for the movdqa stores below */

	movd	%ecx,%xmm5		/* requested index, to be broadcast */
	movdqa	0(%rax),%xmm0		/* L$inc counter seed {0,0,1,1} */
	movdqa	16(%rax),%xmm1		/* L$inc increment {2,2,2,2} */
	leaq	128(%rdx),%r11		/* bias both pointers by 128 so all  */
	leaq	128(%rsp),%rax		/* 16 columns are reachable via disp8 */

	/*
	 * Build 16 mask vectors on the stack: mask[i] is all-ones iff the
	 * counter matches the broadcast index.  %xmm0-%xmm3 hold running
	 * counters; each pcmpeqd consumes one and its result is spilled to
	 * -128(%rax)..112(%rax) while %xmm4 reloads the increment.
	 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	L$gather

/*
 * Per output word: AND each of the 16 table columns in the current row
 * with its mask and OR everything into %xmm4/%xmm5.  Exactly one mask is
 * non-zero, so the result is the selected entry, yet every column is read.
 */
.p2align	5
L$gather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4		/* fold the two accumulators */
	leaq	256(%r11),%r11		/* next table row */
	pshufd	$0x4e,%xmm4,%xmm0	/* swap 64-bit halves... */
	por	%xmm4,%xmm0		/* ...so the selected word is in the low half */
	movq	%xmm0,(%rdi)		/* emit one 64-bit limb */
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	L$gather

	leaq	(%r10),%rsp		/* restore the saved stack pointer */

	.byte	0xf3,0xc3	/* rep ret */
L$SEH_end_bn_gather5:
3781
3782
.p2align	6
/*
 * L$inc: SSE2 constants for the mask generation in bn_gather5 (and the
 * mul_mont_gather5 path above): counter seed {0,0,1,1} followed by the
 * per-step increment {2,2,2,2}.
 */
L$inc:
.long	0,0, 1,1
.long	2,2, 2,2
/* ASCII: "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3788#endif
3789