1#if defined(__x86_64__)
2.text
3
4
5
/*
 * _bn_mul_mont -- entry point of the Montgomery multiplication routine
 * (x86-64, AT&T syntax, Mach-O symbol naming).
 *
 * Register roles as used by the code below.  The short names are labels
 * for this commentary; they presumably match the C-level arguments of
 * bn_mul_mont -- confirm against the C prototype.
 *   %rdi  rp   result limb array (written by the L$sub / L$copy loops)
 *   %rsi  ap   first operand limb array
 *   %rdx  bp   second operand limb array (kept in %r12 in the body)
 *   %rcx  np   limb array that is multiplied by the per-row factor m and
 *              finally subtracted (presumably the modulus)
 *   %r8   pointer to one 64-bit word n0 (loaded once, kept in %r8)
 *   %r9d  num  limb count, zero-extended to 64 bits on entry
 *
 * Dispatch on num:
 *   num & 3 != 0, or num < 8      -> L$mul_enter   (generic path, below)
 *   bp != ap                      -> L$mul4x_enter (4-way unrolled path)
 *   bp == ap and num & 7 == 0     -> L$sqr8x_enter (squaring path)
 *   otherwise                     -> L$mul4x_enter
 *
 * Generic path: saves %rbx,%rbp,%r12-%r15, carves a scratch vector tp
 * (num+2 quadwords, base aligned down to 1024 bytes) out of the stack,
 * runs num passes of multiply-and-reduce, then writes either tp or
 * tp - np to rp.  Returns 1 in %rax with the callee-saved registers and
 * %rsp restored.
 */
6.globl	_bn_mul_mont
7.private_extern _bn_mul_mont
8
9.p2align	4
10_bn_mul_mont:
11
	/* Zero-extend num and remember the incoming %rsp in %rax; every
	   path reloads the saved registers relative to this value. */
12	movl	%r9d,%r9d
13	movq	%rsp,%rax
14
	/* Path selection, see the dispatch table in the header. */
15	testl	$3,%r9d
16	jnz	L$mul_enter
17	cmpl	$8,%r9d
18	jb	L$mul_enter
19	cmpq	%rsi,%rdx
20	jne	L$mul4x_enter
21	testl	$7,%r9d
22	jz	L$sqr8x_enter
23	jmp	L$mul4x_enter
24
25.p2align	4
26L$mul_enter:
	/* Save callee-saved registers; they end up at -8..-48 below the
	   entry %rsp held in %rax. */
27	pushq	%rbx
28
29	pushq	%rbp
30
31	pushq	%r12
32
33	pushq	%r13
34
35	pushq	%r14
36
37	pushq	%r15
38
39
	/* %r10 = (%rsp - 16 - 8*num) & -1024 : target base of tp.
	   %r9 is negated only to form the address and restored right after. */
40	negq	%r9
41	movq	%rsp,%r11
42	leaq	-16(%rsp,%r9,8),%r10
43	negq	%r9
44	andq	$-1024,%r10
45
46
47
48
49
50
51
52
53
	/* Move %rsp to the target in steps of at most 4096 bytes, reading
	   one word at every step so that each page between the old and the
	   new stack pointer is touched in order. */
54	subq	%r10,%r11
55	andq	$-4096,%r11
56	leaq	(%r10,%r11,1),%rsp
57	movq	(%rsp),%r11
58	cmpq	%r10,%rsp
59	ja	L$mul_page_walk
60	jmp	L$mul_page_walk_done
61
62.p2align	4
63L$mul_page_walk:
64	leaq	-4096(%rsp),%rsp
65	movq	(%rsp),%r11
66	cmpq	%r10,%rsp
67	ja	L$mul_page_walk
68L$mul_page_walk_done:
69
	/* tp[num+1] = entry %rsp, read back in the epilogue. */
70	movq	%rax,8(%rsp,%r9,8)
71
72L$mul_body:
	/* %r12 = bp (frees %rdx for mulq), %r8 = n0, %rbx = bp[0],
	   %rax = ap[0]. */
73	movq	%rdx,%r12
74	movq	(%r8),%r8
75	movq	(%r12),%rbx
76	movq	(%rsi),%rax
77
	/* %r14 = outer index i (over bp), %r15 = inner index j. */
78	xorq	%r14,%r14
79	xorq	%r15,%r15
80
	/* First pass (i = 0), tp is not read yet.
	   %r10 = low(ap[0]*bp[0]); m = %rbp = n0 * %r10 (low 64 bits);
	   %r11 = high(ap[0]*bp[0]). */
81	movq	%r8,%rbp
82	mulq	%rbx
83	movq	%rax,%r10
84	movq	(%rcx),%rax
85
86	imulq	%r10,%rbp
87	movq	%rdx,%r11
88
	/* np[0]*m is added to %r10 only to produce the carry; the low word
	   itself is not stored.  %r13 = high word plus carry. */
89	mulq	%rbp
90	addq	%rax,%r10
91	movq	8(%rsi),%rax
92	adcq	$0,%rdx
93	movq	%rdx,%r13
94
95	leaq	1(%r15),%r15
96	jmp	L$1st_enter
97
98.p2align	4
99L$1st:
	/* Finish column j-1: %r13 += low(np*m) + (ap*bp term in %r11),
	   store it to tp, carry moves on in %r13. */
100	addq	%rax,%r13
101	movq	(%rsi,%r15,8),%rax
102	adcq	$0,%rdx
103	addq	%r11,%r13
104	movq	%r10,%r11
105	adcq	$0,%rdx
106	movq	%r13,-16(%rsp,%r15,8)
107	movq	%rdx,%r13
108
109L$1st_enter:
	/* ap[j]*bp[0] accumulates into %r11 (carry word in %r10), then
	   np[j]*m is issued; its result is consumed at the top of L$1st
	   or in the tail below. */
110	mulq	%rbx
111	addq	%rax,%r11
112	movq	(%rcx,%r15,8),%rax
113	adcq	$0,%rdx
114	leaq	1(%r15),%r15
115	movq	%rdx,%r10
116
117	mulq	%rbp
118	cmpq	%r9,%r15
119	jne	L$1st
120
	/* Tail of the first pass: store the last regular word, then
	   tp[num-1] and the carry-out in tp[num].  %rax is reloaded with
	   ap[0] for the next pass. */
121	addq	%rax,%r13
122	movq	(%rsi),%rax
123	adcq	$0,%rdx
124	addq	%r11,%r13
125	adcq	$0,%rdx
126	movq	%r13,-16(%rsp,%r15,8)
127	movq	%rdx,%r13
128	movq	%r10,%r11
129
130	xorq	%rdx,%rdx
131	addq	%r11,%r13
132	adcq	$0,%rdx
133	movq	%r13,-8(%rsp,%r9,8)
134	movq	%rdx,(%rsp,%r9,8)
135
136	leaq	1(%r14),%r14
137	jmp	L$outer
138.p2align	4
139L$outer:
	/* Pass i >= 1: same structure as the first pass, but the previous
	   contents of tp are added in as well.
	   %rbx = bp[i]; %r10 = tp[0] + low(ap[0]*bp[i]); m = n0 * %r10. */
140	movq	(%r12,%r14,8),%rbx
141	xorq	%r15,%r15
142	movq	%r8,%rbp
143	movq	(%rsp),%r10
144	mulq	%rbx
145	addq	%rax,%r10
146	movq	(%rcx),%rax
147	adcq	$0,%rdx
148
149	imulq	%r10,%rbp
150	movq	%rdx,%r11
151
	/* As in the first pass the low word of np[0]*m + %r10 is dropped
	   and only the carry is kept; %r10 is then reloaded with tp[1]. */
152	mulq	%rbp
153	addq	%rax,%r10
154	movq	8(%rsi),%rax
155	adcq	$0,%rdx
156	movq	8(%rsp),%r10
157	movq	%rdx,%r13
158
159	leaq	1(%r15),%r15
160	jmp	L$inner_enter
161
162.p2align	4
163L$inner:
	/* Finish column j-1 and store it one slot lower in tp; %r10 is
	   reloaded with the next old tp word. */
164	addq	%rax,%r13
165	movq	(%rsi,%r15,8),%rax
166	adcq	$0,%rdx
167	addq	%r10,%r13
168	movq	(%rsp,%r15,8),%r10
169	adcq	$0,%rdx
170	movq	%r13,-16(%rsp,%r15,8)
171	movq	%rdx,%r13
172
173L$inner_enter:
	/* %r10 = old tp[j] + low(ap[j]*bp[i]) + previous carry (%r11);
	   the new carry word goes to %r11.  Then np[j]*m is issued. */
174	mulq	%rbx
175	addq	%rax,%r11
176	movq	(%rcx,%r15,8),%rax
177	adcq	$0,%rdx
178	addq	%r11,%r10
179	movq	%rdx,%r11
180	adcq	$0,%r11
181	leaq	1(%r15),%r15
182
183	mulq	%rbp
184	cmpq	%r9,%r15
185	jne	L$inner
186
	/* Tail of pass i: last regular word, then tp[num-1] and tp[num],
	   this time also folding in the old tp[num] (loaded into %r10). */
187	addq	%rax,%r13
188	movq	(%rsi),%rax
189	adcq	$0,%rdx
190	addq	%r10,%r13
191	movq	(%rsp,%r15,8),%r10
192	adcq	$0,%rdx
193	movq	%r13,-16(%rsp,%r15,8)
194	movq	%rdx,%r13
195
196	xorq	%rdx,%rdx
197	addq	%r11,%r13
198	adcq	$0,%rdx
199	addq	%r10,%r13
200	adcq	$0,%rdx
201	movq	%r13,-8(%rsp,%r9,8)
202	movq	%rdx,(%rsp,%r9,8)
203
204	leaq	1(%r14),%r14
205	cmpq	%r9,%r14
206	jb	L$outer
207
	/* Final subtraction: rp[] = tp[] - np[] with the borrow carried
	   across the loop.  xorq clears CF for the first sbbq; the leaq and
	   decq inside the loop leave CF untouched. */
208	xorq	%r14,%r14
209	movq	(%rsp),%rax
210	leaq	(%rsp),%rsi
211	movq	%r9,%r15
212	jmp	L$sub
213.p2align	4
214L$sub:	sbbq	(%rcx,%r14,8),%rax
215	movq	%rax,(%rdi,%r14,8)
216	movq	8(%rsi,%r14,8),%rax
217	leaq	1(%r14),%r14
218	decq	%r15
219	jnz	L$sub
220
	/* %rax holds tp[num]; subtracting the final borrow turns it into a
	   select mask (expected to be 0 or all-ones).  The source pointer
	   is then chosen without a branch:
	       %rsi = (tp & mask) | (rp & ~mask)
	   i.e. tp when the subtraction borrowed, otherwise rp, which
	   already holds tp - np. */
221	sbbq	$0,%rax
222	xorq	%r14,%r14
223	andq	%rax,%rsi
224	notq	%rax
225	movq	%rdi,%rcx
226	andq	%rax,%rcx
227	movq	%r9,%r15
228	orq	%rcx,%rsi
229.p2align	4
230L$copy:
	/* Copy the selected vector to rp and overwrite each tp word with
	   the loop index, so the intermediate value does not stay on the
	   stack. */
231	movq	(%rsi,%r14,8),%rax
232	movq	%r14,(%rsp,%r14,8)
233	movq	%rax,(%rdi,%r14,8)
234	leaq	1(%r14),%r14
235	subq	$1,%r15
236	jnz	L$copy
237
	/* Epilogue: %rsi = entry %rsp saved at tp[num+1]; return value 1;
	   reload the six saved registers and restore %rsp. */
238	movq	8(%rsp,%r9,8),%rsi
239
240	movq	$1,%rax
241	movq	-48(%rsi),%r15
242
243	movq	-40(%rsi),%r14
244
245	movq	-32(%rsi),%r13
246
247	movq	-24(%rsi),%r12
248
249	movq	-16(%rsi),%rbp
250
251	movq	-8(%rsi),%rbx
252
253	leaq	(%rsi),%rsp
254
255L$mul_epilogue:
	/* 0xf3,0xc3 is the encoding of "rep ret". */
256	.byte	0xf3,0xc3
257
258
259
/*
 * bn_mul4x_mont -- 4-way unrolled variant of the multiply-and-reduce
 * loop.  The label is not exported here; the visible code reaches this
 * path through L$mul4x_enter from _bn_mul_mont, with %r9 already
 * zero-extended and %rax holding the entry %rsp (the two instructions
 * at the top do the same for a direct entry).
 *
 * Register roles (labels for this commentary, presumably matching the
 * C-level arguments -- confirm against the prototype):
 *   %rdi rp, %rsi ap, %rdx bp, %rcx np, %r8 -> n0, %r9 num.
 * num is expected to be a multiple of 4 and at least 8 (that is what the
 * dispatcher in _bn_mul_mont guarantees; the loops below rely on it).
 *
 * Stack frame (tp = new %rsp, base aligned down to 1024 bytes):
 *   tp[0..num]   running result plus carry word
 *   tp[num+1]    entry %rsp
 *   tp[num+2]    rp (saved because %rdi is reused as an accumulator)
 *
 * Returns 1 in %rax; %rbx,%rbp,%r12-%r15 and %rsp are restored.
 */
260.p2align	4
261bn_mul4x_mont:
262
263	movl	%r9d,%r9d
264	movq	%rsp,%rax
265
266L$mul4x_enter:
	/* Save callee-saved registers at -8..-48 below the entry %rsp. */
267	pushq	%rbx
268
269	pushq	%rbp
270
271	pushq	%r12
272
273	pushq	%r13
274
275	pushq	%r14
276
277	pushq	%r15
278
279
	/* %r10 = (%rsp - 32 - 8*num) & -1024 : target base of tp. */
280	negq	%r9
281	movq	%rsp,%r11
282	leaq	-32(%rsp,%r9,8),%r10
283	negq	%r9
284	andq	$-1024,%r10
285
	/* Lower %rsp to the target in steps of at most 4096 bytes, reading
	   one word per step so every intervening page is touched in order. */
286	subq	%r10,%r11
287	andq	$-4096,%r11
288	leaq	(%r10,%r11,1),%rsp
289	movq	(%rsp),%r11
290	cmpq	%r10,%rsp
291	ja	L$mul4x_page_walk
292	jmp	L$mul4x_page_walk_done
293
294L$mul4x_page_walk:
295	leaq	-4096(%rsp),%rsp
296	movq	(%rsp),%r11
297	cmpq	%r10,%rsp
298	ja	L$mul4x_page_walk
299L$mul4x_page_walk_done:
300
	/* tp[num+1] = entry %rsp. */
301	movq	%rax,8(%rsp,%r9,8)
302
303L$mul4x_body:
	/* tp[num+2] = rp; %r12 = bp; %r8 = n0; %rbx = bp[0]; %rax = ap[0]. */
304	movq	%rdi,16(%rsp,%r9,8)
305	movq	%rdx,%r12
306	movq	(%r8),%r8
307	movq	(%r12),%rbx
308	movq	(%rsi),%rax
309
	/* %r14 = outer index i (over bp), %r15 = inner index j. */
310	xorq	%r14,%r14
311	xorq	%r15,%r15
312
	/* First pass (i = 0), columns 0 and 1 handled ahead of the loop.
	   m = %rbp = n0 * low(ap[0]*bp[0]).  The ap*bp partial sums
	   alternate between %r10 and %r11, the np*m partial sums between
	   %r13 and %rdi. */
313	movq	%r8,%rbp
314	mulq	%rbx
315	movq	%rax,%r10
316	movq	(%rcx),%rax
317
318	imulq	%r10,%rbp
319	movq	%rdx,%r11
320
	/* Low word of np[0]*m + %r10 is dropped, only the carry is kept. */
321	mulq	%rbp
322	addq	%rax,%r10
323	movq	8(%rsi),%rax
324	adcq	$0,%rdx
325	movq	%rdx,%rdi
326
327	mulq	%rbx
328	addq	%rax,%r11
329	movq	8(%rcx),%rax
330	adcq	$0,%rdx
331	movq	%rdx,%r10
332
	/* Column 1 is complete: store it as tp[0].  %r15 jumps to 4. */
333	mulq	%rbp
334	addq	%rax,%rdi
335	movq	16(%rsi),%rax
336	adcq	$0,%rdx
337	addq	%r11,%rdi
338	leaq	4(%r15),%r15
339	adcq	$0,%rdx
340	movq	%rdi,(%rsp)
341	movq	%rdx,%r13
342	jmp	L$1st4x
343.p2align	4
344L$1st4x:
	/* Four columns per iteration.  Each column is one ap[j]*bp[0]
	   product followed by one np[j]*m product, with the finished word
	   stored one slot lower in tp. */
345	mulq	%rbx
346	addq	%rax,%r10
347	movq	-16(%rcx,%r15,8),%rax
348	adcq	$0,%rdx
349	movq	%rdx,%r11
350
351	mulq	%rbp
352	addq	%rax,%r13
353	movq	-8(%rsi,%r15,8),%rax
354	adcq	$0,%rdx
355	addq	%r10,%r13
356	adcq	$0,%rdx
357	movq	%r13,-24(%rsp,%r15,8)
358	movq	%rdx,%rdi
359
360	mulq	%rbx
361	addq	%rax,%r11
362	movq	-8(%rcx,%r15,8),%rax
363	adcq	$0,%rdx
364	movq	%rdx,%r10
365
366	mulq	%rbp
367	addq	%rax,%rdi
368	movq	(%rsi,%r15,8),%rax
369	adcq	$0,%rdx
370	addq	%r11,%rdi
371	adcq	$0,%rdx
372	movq	%rdi,-16(%rsp,%r15,8)
373	movq	%rdx,%r13
374
375	mulq	%rbx
376	addq	%rax,%r10
377	movq	(%rcx,%r15,8),%rax
378	adcq	$0,%rdx
379	movq	%rdx,%r11
380
381	mulq	%rbp
382	addq	%rax,%r13
383	movq	8(%rsi,%r15,8),%rax
384	adcq	$0,%rdx
385	addq	%r10,%r13
386	adcq	$0,%rdx
387	movq	%r13,-8(%rsp,%r15,8)
388	movq	%rdx,%rdi
389
	/* %r15 advances by 4 in the middle of the last column, so the
	   displacements that follow are relative to the new index. */
390	mulq	%rbx
391	addq	%rax,%r11
392	movq	8(%rcx,%r15,8),%rax
393	adcq	$0,%rdx
394	leaq	4(%r15),%r15
395	movq	%rdx,%r10
396
397	mulq	%rbp
398	addq	%rax,%rdi
399	movq	-16(%rsi,%r15,8),%rax
400	adcq	$0,%rdx
401	addq	%r11,%rdi
402	adcq	$0,%rdx
403	movq	%rdi,-32(%rsp,%r15,8)
404	movq	%rdx,%r13
405	cmpq	%r9,%r15
406	jb	L$1st4x
407
	/* Tail of the first pass: the remaining two columns. */
408	mulq	%rbx
409	addq	%rax,%r10
410	movq	-16(%rcx,%r15,8),%rax
411	adcq	$0,%rdx
412	movq	%rdx,%r11
413
414	mulq	%rbp
415	addq	%rax,%r13
416	movq	-8(%rsi,%r15,8),%rax
417	adcq	$0,%rdx
418	addq	%r10,%r13
419	adcq	$0,%rdx
420	movq	%r13,-24(%rsp,%r15,8)
421	movq	%rdx,%rdi
422
423	mulq	%rbx
424	addq	%rax,%r11
425	movq	-8(%rcx,%r15,8),%rax
426	adcq	$0,%rdx
427	movq	%rdx,%r10
428
	/* %rax is reloaded with ap[0] for the next pass. */
429	mulq	%rbp
430	addq	%rax,%rdi
431	movq	(%rsi),%rax
432	adcq	$0,%rdx
433	addq	%r11,%rdi
434	adcq	$0,%rdx
435	movq	%rdi,-16(%rsp,%r15,8)
436	movq	%rdx,%r13
437
	/* tp[num-1] = sum of the two carry words, tp[num] = carry-out. */
438	xorq	%rdi,%rdi
439	addq	%r10,%r13
440	adcq	$0,%rdi
441	movq	%r13,-8(%rsp,%r15,8)
442	movq	%rdi,(%rsp,%r15,8)
443
444	leaq	1(%r14),%r14
445.p2align	2
446L$outer4x:
	/* Pass i >= 1: same schedule as the first pass, with the previous
	   tp words added into the ap*bp partial sums.
	   %rbx = bp[i]; %r10 = tp[0] + low(ap[0]*bp[i]); m = n0 * %r10. */
447	movq	(%r12,%r14,8),%rbx
448	xorq	%r15,%r15
449	movq	(%rsp),%r10
450	movq	%r8,%rbp
451	mulq	%rbx
452	addq	%rax,%r10
453	movq	(%rcx),%rax
454	adcq	$0,%rdx
455
456	imulq	%r10,%rbp
457	movq	%rdx,%r11
458
	/* Low word of np[0]*m + %r10 is dropped, only the carry is kept. */
459	mulq	%rbp
460	addq	%rax,%r10
461	movq	8(%rsi),%rax
462	adcq	$0,%rdx
463	movq	%rdx,%rdi
464
465	mulq	%rbx
466	addq	%rax,%r11
467	movq	8(%rcx),%rax
468	adcq	$0,%rdx
469	addq	8(%rsp),%r11
470	adcq	$0,%rdx
471	movq	%rdx,%r10
472
473	mulq	%rbp
474	addq	%rax,%rdi
475	movq	16(%rsi),%rax
476	adcq	$0,%rdx
477	addq	%r11,%rdi
478	leaq	4(%r15),%r15
479	adcq	$0,%rdx
480	movq	%rdi,(%rsp)
481	movq	%rdx,%r13
482	jmp	L$inner4x
483.p2align	4
484L$inner4x:
	/* Four columns per iteration; each ap[j]*bp[i] partial sum also
	   takes in the old tp word before np[j]*m is added and the result
	   is stored one slot lower. */
485	mulq	%rbx
486	addq	%rax,%r10
487	movq	-16(%rcx,%r15,8),%rax
488	adcq	$0,%rdx
489	addq	-16(%rsp,%r15,8),%r10
490	adcq	$0,%rdx
491	movq	%rdx,%r11
492
493	mulq	%rbp
494	addq	%rax,%r13
495	movq	-8(%rsi,%r15,8),%rax
496	adcq	$0,%rdx
497	addq	%r10,%r13
498	adcq	$0,%rdx
499	movq	%r13,-24(%rsp,%r15,8)
500	movq	%rdx,%rdi
501
502	mulq	%rbx
503	addq	%rax,%r11
504	movq	-8(%rcx,%r15,8),%rax
505	adcq	$0,%rdx
506	addq	-8(%rsp,%r15,8),%r11
507	adcq	$0,%rdx
508	movq	%rdx,%r10
509
510	mulq	%rbp
511	addq	%rax,%rdi
512	movq	(%rsi,%r15,8),%rax
513	adcq	$0,%rdx
514	addq	%r11,%rdi
515	adcq	$0,%rdx
516	movq	%rdi,-16(%rsp,%r15,8)
517	movq	%rdx,%r13
518
519	mulq	%rbx
520	addq	%rax,%r10
521	movq	(%rcx,%r15,8),%rax
522	adcq	$0,%rdx
523	addq	(%rsp,%r15,8),%r10
524	adcq	$0,%rdx
525	movq	%rdx,%r11
526
527	mulq	%rbp
528	addq	%rax,%r13
529	movq	8(%rsi,%r15,8),%rax
530	adcq	$0,%rdx
531	addq	%r10,%r13
532	adcq	$0,%rdx
533	movq	%r13,-8(%rsp,%r15,8)
534	movq	%rdx,%rdi
535
	/* %r15 advances by 4 here; later displacements use the new index. */
536	mulq	%rbx
537	addq	%rax,%r11
538	movq	8(%rcx,%r15,8),%rax
539	adcq	$0,%rdx
540	addq	8(%rsp,%r15,8),%r11
541	adcq	$0,%rdx
542	leaq	4(%r15),%r15
543	movq	%rdx,%r10
544
545	mulq	%rbp
546	addq	%rax,%rdi
547	movq	-16(%rsi,%r15,8),%rax
548	adcq	$0,%rdx
549	addq	%r11,%rdi
550	adcq	$0,%rdx
551	movq	%rdi,-32(%rsp,%r15,8)
552	movq	%rdx,%r13
553	cmpq	%r9,%r15
554	jb	L$inner4x
555
	/* Tail of pass i: the remaining two columns. */
556	mulq	%rbx
557	addq	%rax,%r10
558	movq	-16(%rcx,%r15,8),%rax
559	adcq	$0,%rdx
560	addq	-16(%rsp,%r15,8),%r10
561	adcq	$0,%rdx
562	movq	%rdx,%r11
563
564	mulq	%rbp
565	addq	%rax,%r13
566	movq	-8(%rsi,%r15,8),%rax
567	adcq	$0,%rdx
568	addq	%r10,%r13
569	adcq	$0,%rdx
570	movq	%r13,-24(%rsp,%r15,8)
571	movq	%rdx,%rdi
572
	/* The outer index is bumped here, inside the carry chain; leaq does
	   not modify the flags. */
573	mulq	%rbx
574	addq	%rax,%r11
575	movq	-8(%rcx,%r15,8),%rax
576	adcq	$0,%rdx
577	addq	-8(%rsp,%r15,8),%r11
578	adcq	$0,%rdx
579	leaq	1(%r14),%r14
580	movq	%rdx,%r10
581
582	mulq	%rbp
583	addq	%rax,%rdi
584	movq	(%rsi),%rax
585	adcq	$0,%rdx
586	addq	%r11,%rdi
587	adcq	$0,%rdx
588	movq	%rdi,-16(%rsp,%r15,8)
589	movq	%rdx,%r13
590
	/* tp[num-1] = carries + old tp[num]; tp[num] = carry-out. */
591	xorq	%rdi,%rdi
592	addq	%r10,%r13
593	adcq	$0,%rdi
594	addq	(%rsp,%r9,8),%r13
595	adcq	$0,%rdi
596	movq	%r13,-8(%rsp,%r15,8)
597	movq	%rdi,(%rsp,%r15,8)
598
599	cmpq	%r9,%r14
600	jb	L$outer4x
	/* Final subtraction rp[] = tp[] - np[], four limbs per iteration.
	   %rdi = rp (reloaded from tp[num+2]); %r15 = (num-4)/4 iterations;
	   %xmm0 = 0 for wiping tp later.  The first two limbs are handled
	   ahead of the loop, which fixes the initial borrow. */
601	movq	16(%rsp,%r9,8),%rdi
602	leaq	-4(%r9),%r15
603	movq	0(%rsp),%rax
604	pxor	%xmm0,%xmm0
605	movq	8(%rsp),%rdx
606	shrq	$2,%r15
607	leaq	(%rsp),%rsi
608	xorq	%r14,%r14
609
610	subq	0(%rcx),%rax
611	movq	16(%rsi),%rbx
612	movq	24(%rsi),%rbp
613	sbbq	8(%rcx),%rdx
614	jmp	L$sub4x
615.p2align	4
616L$sub4x:
	/* Only movq/leaq/decq sit between the sbbq instructions, so the
	   borrow in CF survives from one limb to the next. */
617	movq	%rax,0(%rdi,%r14,8)
618	movq	%rdx,8(%rdi,%r14,8)
619	sbbq	16(%rcx,%r14,8),%rbx
620	movq	32(%rsi,%r14,8),%rax
621	movq	40(%rsi,%r14,8),%rdx
622	sbbq	24(%rcx,%r14,8),%rbp
623	movq	%rbx,16(%rdi,%r14,8)
624	movq	%rbp,24(%rdi,%r14,8)
625	sbbq	32(%rcx,%r14,8),%rax
626	movq	48(%rsi,%r14,8),%rbx
627	movq	56(%rsi,%r14,8),%rbp
628	sbbq	40(%rcx,%r14,8),%rdx
629	leaq	4(%r14),%r14
630	decq	%r15
631	jnz	L$sub4x
632
	/* Last four limbs; %rax is then loaded with tp[num]. */
633	movq	%rax,0(%rdi,%r14,8)
634	movq	32(%rsi,%r14,8),%rax
635	sbbq	16(%rcx,%r14,8),%rbx
636	movq	%rdx,8(%rdi,%r14,8)
637	sbbq	24(%rcx,%r14,8),%rbp
638	movq	%rbx,16(%rdi,%r14,8)
639
	/* tp[num] minus the final borrow becomes the select mask (expected
	   to be 0 or all-ones).  Branch-free source selection:
	       %rsi = (tp & mask) | (rp & ~mask)
	   i.e. tp when the subtraction borrowed, otherwise rp. */
640	sbbq	$0,%rax
641	movq	%rbp,24(%rdi,%r14,8)
642	xorq	%r14,%r14
643	andq	%rax,%rsi
644	notq	%rax
645	movq	%rdi,%rcx
646	andq	%rax,%rcx
647	leaq	-4(%r9),%r15
648	orq	%rcx,%rsi
649	shrq	$2,%r15
650
	/* Copy the selected vector to rp 16 bytes at a time and overwrite
	   tp with zeros as it goes.  Stores to tp use movdqa (tp is
	   1024-byte aligned); rp and the selected source use unaligned
	   moves.  %r14 is a byte offset in this loop. */
651	movdqu	(%rsi),%xmm1
652	movdqa	%xmm0,(%rsp)
653	movdqu	%xmm1,(%rdi)
654	jmp	L$copy4x
655.p2align	4
656L$copy4x:
657	movdqu	16(%rsi,%r14,1),%xmm2
658	movdqu	32(%rsi,%r14,1),%xmm1
659	movdqa	%xmm0,16(%rsp,%r14,1)
660	movdqu	%xmm2,16(%rdi,%r14,1)
661	movdqa	%xmm0,32(%rsp,%r14,1)
662	movdqu	%xmm1,32(%rdi,%r14,1)
663	leaq	32(%r14),%r14
664	decq	%r15
665	jnz	L$copy4x
666
	/* Final 16 bytes, then %rsi = entry %rsp from tp[num+1]. */
667	movdqu	16(%rsi,%r14,1),%xmm2
668	movdqa	%xmm0,16(%rsp,%r14,1)
669	movdqu	%xmm2,16(%rdi,%r14,1)
670	movq	8(%rsp,%r9,8),%rsi
671
	/* Epilogue: return 1, reload the saved registers, restore %rsp. */
672	movq	$1,%rax
673	movq	-48(%rsi),%r15
674
675	movq	-40(%rsi),%r14
676
677	movq	-32(%rsi),%r13
678
679	movq	-24(%rsi),%r12
680
681	movq	-16(%rsi),%rbp
682
683	movq	-8(%rsi),%rbx
684
685	leaq	(%rsi),%rsp
686
687L$mul4x_epilogue:
	/* 0xf3,0xc3 is the encoding of "rep ret". */
688	.byte	0xf3,0xc3
689
690
691
692
693
/*
 * bn_sqr8x_mont -- squaring path.  The visible code reaches it through
 * L$sqr8x_enter from _bn_mul_mont, which takes this branch only when
 * the two operand pointers are equal and num is a multiple of 8; %rax
 * holds the entry %rsp.
 *
 * Register roles on entry (labels for this commentary, presumably the
 * C-level arguments -- confirm against the prototype):
 *   %rdi rp, %rsi ap, %rcx np, %r8 -> n0, %r9d num.
 *
 * The routine builds a 64-byte aligned stack frame, parks a few values
 * in %xmm1-%xmm3 and in the frame, and calls _bn_sqr8x_internal (not
 * defined in this part of the file) to do the multiply-and-reduce work.
 * On return it subtracts the vector at %rbp from the helper result into
 * rp, then uses a mask to keep either that difference or the
 * unsubtracted value, zeroing the scratch area as it goes.
 * Returns 1 in %rax; %rbx,%rbp,%r12-%r15 and %rsp are restored.
 *
 * NOTE(review): everything after the call depends on the register state
 * left by _bn_sqr8x_internal (%rax, %rbp, %rdi, %r9); the comments on
 * that part describe what the code here implies and should be checked
 * against the helper.
 */
694.p2align	5
695bn_sqr8x_mont:
696
697	movq	%rsp,%rax
698
699L$sqr8x_enter:
	/* Save callee-saved registers at -8..-48 below the entry %rsp. */
700	pushq	%rbx
701
702	pushq	%rbp
703
704	pushq	%r12
705
706	pushq	%r13
707
708	pushq	%r14
709
710	pushq	%r15
711
712L$sqr8x_prologue:
713
	/* %r10 = 32*num, %r9 = -8*num (both 32-bit writes zero-extend
	   before the 64-bit shift and negate). */
714	movl	%r9d,%r10d
715	shll	$3,%r9d
716	shlq	$3+2,%r10
717	negq	%r9
718
719
720
721
722
723
	/* Choose the frame address.  %r11 = ((%rsp - 64 - 16*num) - ap)
	   mod 4096 is the distance of the candidate frame from ap within a
	   page; the frame is then shifted down depending on how that
	   compares with 32*num.  Presumably this keeps the frame from
	   sharing page offsets with the input -- TODO confirm the intent.
	   %r8 = n0 is loaded along the way. */
724	leaq	-64(%rsp,%r9,2),%r11
725	movq	%rsp,%rbp
726	movq	(%r8),%r8
727	subq	%rsi,%r11
728	andq	$4095,%r11
729	cmpq	%r11,%r10
730	jb	L$sqr8x_sp_alt
	/* 32*num >= distance: frame = %rsp - distance - 64 - 16*num. */
731	subq	%r11,%rbp
732	leaq	-64(%rbp,%r9,2),%rbp
733	jmp	L$sqr8x_sp_done
734
735.p2align	5
736L$sqr8x_sp_alt:
	/* 32*num < distance: frame = %rsp - 64 - 16*num, lowered further by
	   max(distance - (4096 - 64 - 16*num), 0); cmovc clamps the extra
	   amount to zero when the subtraction borrows. */
737	leaq	4096-64(,%r9,2),%r10
738	leaq	-64(%rbp,%r9,2),%rbp
739	subq	%r10,%r11
740	movq	$0,%r10
741	cmovcq	%r10,%r11
742	subq	%r11,%rbp
743L$sqr8x_sp_done:
	/* Align the frame to 64 bytes, then lower %rsp to it in steps of
	   at most 4096 bytes, reading one word per step so every page in
	   between is touched in order. */
744	andq	$-64,%rbp
745	movq	%rsp,%r11
746	subq	%rbp,%r11
747	andq	$-4096,%r11
748	leaq	(%r11,%rbp,1),%rsp
749	movq	(%rsp),%r10
750	cmpq	%rbp,%rsp
751	ja	L$sqr8x_page_walk
752	jmp	L$sqr8x_page_walk_done
753
754.p2align	4
755L$sqr8x_page_walk:
756	leaq	-4096(%rsp),%rsp
757	movq	(%rsp),%r10
758	cmpq	%rbp,%rsp
759	ja	L$sqr8x_page_walk
760L$sqr8x_page_walk_done:
761
	/* %r10 = -8*num, %r9 = 8*num. */
762	movq	%r9,%r10
763	negq	%r9
764
	/* Frame slots: 32(%rsp) = n0, 40(%rsp) = entry %rsp. */
765	movq	%r8,32(%rsp)
766	movq	%rax,40(%rsp)
767
768L$sqr8x_body:
769
	/* The .byte sequences are hand-encoded movq instructions between
	   general-purpose and XMM registers:
	     102,72,15,110,209  movq %rcx,%xmm2   (np)
	     102,72,15,110,207  movq %rdi,%xmm1   (rp)
	     102,73,15,110,218  movq %r10,%xmm3   (-8*num)
	   %xmm0 is cleared in between. */
770.byte	102,72,15,110,209
771	pxor	%xmm0,%xmm0
772.byte	102,72,15,110,207
773.byte	102,73,15,110,218
774	call	_bn_sqr8x_internal
775
776
777
778
	/* State used from here on, as left by the helper (see NOTE in the
	   header): %rdi and %r9 locate the result, %rbp points at the
	   vector to subtract, %rax feeds the select mask.  The loop below
	   counts %rcx up to zero, so %r9 is expected to be negative here.
	   %rbx = %rdi + %r9 is the source; %rdx keeps a copy of %r9;
	   102,72,15,126,207 is movq %xmm1,%rdi, restoring rp;
	   %rcx = %r9 >> 5 (arithmetic) = iteration count, 32 bytes each. */
779	leaq	(%rdi,%r9,1),%rbx
780	movq	%r9,%rcx
781	movq	%r9,%rdx
782.byte	102,72,15,126,207
783	sarq	$3+2,%rcx
784	jmp	L$sqr8x_sub
785
786.p2align	5
787L$sqr8x_sub:
	/* rp[] = (%rbx)[] - (%rbp)[], four limbs per iteration.  The
	   borrow is chained through CF; leaq, movq and incq do not change
	   CF.  The initial CF is whatever the sarq above left. */
788	movq	0(%rbx),%r12
789	movq	8(%rbx),%r13
790	movq	16(%rbx),%r14
791	movq	24(%rbx),%r15
792	leaq	32(%rbx),%rbx
793	sbbq	0(%rbp),%r12
794	sbbq	8(%rbp),%r13
795	sbbq	16(%rbp),%r14
796	sbbq	24(%rbp),%r15
797	leaq	32(%rbp),%rbp
798	movq	%r12,0(%rdi)
799	movq	%r13,8(%rdi)
800	movq	%r14,16(%rdi)
801	movq	%r15,24(%rdi)
802	leaq	32(%rdi),%rdi
803	incq	%rcx
804	jnz	L$sqr8x_sub
805
	/* %rax minus the final borrow is the select mask (expected to be 0
	   or all-ones).  Adding %r9 moves %rbx and %rdi back by the amount
	   the loop advanced them. */
806	sbbq	$0,%rax
807	leaq	(%rbx,%r9,1),%rbx
808	leaq	(%rdi,%r9,1),%rdi
809
	/* 102,72,15,110,200 is movq %rax,%xmm1; pshufd $0 replicates the
	   low 32 bits of the mask into all four lanes.  %rsi = entry %rsp
	   from the frame. */
810.byte	102,72,15,110,200
811	pxor	%xmm0,%xmm0
812	pshufd	$0,%xmm1,%xmm1
813	movq	40(%rsp),%rsi
814
815	jmp	L$sqr8x_cond_copy
816
817.p2align	5
818L$sqr8x_cond_copy:
	/* Per 32 bytes: load the unsubtracted value from the scratch area
	   (%rbx, aligned loads) and the difference from rp (%rdi), zero
	   the scratch just read plus the 32 bytes at offset %rdx from it,
	   then write back (scratch & mask) | (rp & ~mask).  pcmpeqd
	   against the zeroed %xmm0 yields the inverted mask; %xmm0 is
	   cleared again before the next round. */
819	movdqa	0(%rbx),%xmm2
820	movdqa	16(%rbx),%xmm3
821	leaq	32(%rbx),%rbx
822	movdqu	0(%rdi),%xmm4
823	movdqu	16(%rdi),%xmm5
824	leaq	32(%rdi),%rdi
825	movdqa	%xmm0,-32(%rbx)
826	movdqa	%xmm0,-16(%rbx)
827	movdqa	%xmm0,-32(%rbx,%rdx,1)
828	movdqa	%xmm0,-16(%rbx,%rdx,1)
829	pcmpeqd	%xmm1,%xmm0
830	pand	%xmm1,%xmm2
831	pand	%xmm1,%xmm3
832	pand	%xmm0,%xmm4
833	pand	%xmm0,%xmm5
834	pxor	%xmm0,%xmm0
835	por	%xmm2,%xmm4
836	por	%xmm3,%xmm5
837	movdqu	%xmm4,-32(%rdi)
838	movdqu	%xmm5,-16(%rdi)
839	addq	$32,%r9
840	jnz	L$sqr8x_cond_copy
841
	/* Epilogue: return 1, reload the saved registers, restore %rsp. */
842	movq	$1,%rax
843	movq	-48(%rsi),%r15
844
845	movq	-40(%rsi),%r14
846
847	movq	-32(%rsi),%r13
848
849	movq	-24(%rsi),%r12
850
851	movq	-16(%rsi),%rbp
852
853	movq	-8(%rsi),%rbx
854
855	leaq	(%rsi),%rsp
856
857L$sqr8x_epilogue:
	/* 0xf3,0xc3 is the encoding of "rep ret". */
858	.byte	0xf3,0xc3
859
860
/* NUL-terminated ASCII identification string embedded in the text
   section: "Montgomery Multiplication for x86_64, CRYPTOGAMS by
   <appro@openssl.org>". */
861.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
862.p2align	4
863#endif
864