1#if defined(__x86_64__)
2.text
3
4
5
/*
 * _rsaz_512_sqr
 *
 * Repeatedly squares an 8-limb (8 x 64-bit) value.  Each pass builds the
 * 16-limb square on the stack, then brings it back to 8 limbs with
 * __rsaz_512_reduce followed by __rsaz_512_subtract.
 *
 * Register arguments, as read by the code:
 *   rdi - result buffer (8 qwords).  __rsaz_512_subtract stores to it,
 *         and it becomes the input pointer of the next pass.
 *   rsi - input (8 qwords).
 *   rdx - pointer copied to rbp; the helpers read 8 qwords through it
 *         (presumably the modulus - confirm against the C prototype).
 *   rcx - 64-bit constant saved at 128(%rsp); __rsaz_512_reduce
 *         multiplies the low limb by it (presumably -1/modulus
 *         mod 2^64 - confirm).
 *   r8d - number of passes, saved at 128+8(%rsp) while r8 is in use.
 *         The loop body runs before the counter is decremented and
 *         tested, so a count of 0 does not mean "no passes".
 *
 * Stack frame: rbx, rbp, r12-r15 are pushed, then 128+24 bytes are
 * reserved.  (%rsp)..120(%rsp) receives the 16-limb square.
 *
 * NOTE(review): in the doubling steps, the carry out of the last
 * "adcq $0" of each step is not consumed by any later instruction (the
 * next flag-writing instruction is a shrq).  Whether that carry can be
 * set for valid inputs needs checking; this looks like the routine that
 * upstream OpenSSL reworked for CVE-2019-1551 - confirm whether this
 * copy predates that fix.
 */
6.globl	_rsaz_512_sqr
7.private_extern _rsaz_512_sqr
8
9.p2align	5
10_rsaz_512_sqr:
11	pushq	%rbx
12	pushq	%rbp
13	pushq	%r12
14	pushq	%r13
15	pushq	%r14
16	pushq	%r15
17
18	subq	$128+24,%rsp
19L$sqr_body:
/* rbp = third argument; preload a[0] into rdx and a[1] into rax. */
20	movq	%rdx,%rbp
21	movq	(%rsi),%rdx
22	movq	8(%rsi),%rax
/* Fourth argument goes to the slot __rsaz_512_reduce reads as 128+8(%rsp). */
23	movq	%rcx,128(%rsp)
24	jmp	L$oop_sqr
25
26.p2align	5
27L$oop_sqr:
/* Park the remaining pass count; r8 is used as an accumulator below. */
28	movl	%r8d,128+8(%rsp)
29
/* Row 0: a[0]*a[1..7], accumulated into r8..r15. */
30	movq	%rdx,%rbx
31	mulq	%rdx
32	movq	%rax,%r8
33	movq	16(%rsi),%rax
34	movq	%rdx,%r9
35
36	mulq	%rbx
37	addq	%rax,%r9
38	movq	24(%rsi),%rax
39	movq	%rdx,%r10
40	adcq	$0,%r10
41
42	mulq	%rbx
43	addq	%rax,%r10
44	movq	32(%rsi),%rax
45	movq	%rdx,%r11
46	adcq	$0,%r11
47
48	mulq	%rbx
49	addq	%rax,%r11
50	movq	40(%rsi),%rax
51	movq	%rdx,%r12
52	adcq	$0,%r12
53
54	mulq	%rbx
55	addq	%rax,%r12
56	movq	48(%rsi),%rax
57	movq	%rdx,%r13
58	adcq	$0,%r13
59
60	mulq	%rbx
61	addq	%rax,%r13
62	movq	56(%rsi),%rax
63	movq	%rdx,%r14
64	adcq	$0,%r14
65
66	mulq	%rbx
67	addq	%rax,%r14
68	movq	%rbx,%rax
69	movq	%rdx,%r15
70	adcq	$0,%r15
71
/*
 * Double r8 and r9 (rcx keeps the undoubled r9 so its top bit can be
 * carried into the next doubling), add a[0]^2, store limbs 0 and 1.
 */
72	addq	%r8,%r8
73	movq	%r9,%rcx
74	adcq	%r9,%r9
75
76	mulq	%rax
77	movq	%rax,(%rsp)
78	addq	%rdx,%r8
79	adcq	$0,%r9
80
81	movq	%r8,8(%rsp)
82	shrq	$63,%rcx
83
84
/* Row 1: a[1]*a[2..7] added into r10..r15; r8 receives the new top limb. */
85	movq	8(%rsi),%r8
86	movq	16(%rsi),%rax
87	mulq	%r8
88	addq	%rax,%r10
89	movq	24(%rsi),%rax
90	movq	%rdx,%rbx
91	adcq	$0,%rbx
92
93	mulq	%r8
94	addq	%rax,%r11
95	movq	32(%rsi),%rax
96	adcq	$0,%rdx
97	addq	%rbx,%r11
98	movq	%rdx,%rbx
99	adcq	$0,%rbx
100
101	mulq	%r8
102	addq	%rax,%r12
103	movq	40(%rsi),%rax
104	adcq	$0,%rdx
105	addq	%rbx,%r12
106	movq	%rdx,%rbx
107	adcq	$0,%rbx
108
109	mulq	%r8
110	addq	%rax,%r13
111	movq	48(%rsi),%rax
112	adcq	$0,%rdx
113	addq	%rbx,%r13
114	movq	%rdx,%rbx
115	adcq	$0,%rbx
116
117	mulq	%r8
118	addq	%rax,%r14
119	movq	56(%rsi),%rax
120	adcq	$0,%rdx
121	addq	%rbx,%r14
122	movq	%rdx,%rbx
123	adcq	$0,%rbx
124
125	mulq	%r8
126	addq	%rax,%r15
127	movq	%r8,%rax
128	adcq	$0,%rdx
129	addq	%rbx,%r15
130	movq	%rdx,%r8
131	movq	%r10,%rdx
132	adcq	$0,%r8
133
/*
 * Double r10 and r11.  The addq on the rdx copy of r10 only produces CF
 * (top bit of r10) for the adcq on r11; lea does the actual doubling of
 * r10 and folds in rcx without touching flags.  Then add a[1]^2 and
 * store limbs 2 and 3.  rbx keeps the undoubled r11 for its top bit.
 */
134	addq	%rdx,%rdx
135	leaq	(%rcx,%r10,2),%r10
136	movq	%r11,%rbx
137	adcq	%r11,%r11
138
139	mulq	%rax
140	addq	%rax,%r9
141	adcq	%rdx,%r10
142	adcq	$0,%r11
143
144	movq	%r9,16(%rsp)
145	movq	%r10,24(%rsp)
146	shrq	$63,%rbx
147
148
/* Row 2: a[2]*a[3..7] added into r12..r15 and r8; r9 receives the top limb. */
149	movq	16(%rsi),%r9
150	movq	24(%rsi),%rax
151	mulq	%r9
152	addq	%rax,%r12
153	movq	32(%rsi),%rax
154	movq	%rdx,%rcx
155	adcq	$0,%rcx
156
157	mulq	%r9
158	addq	%rax,%r13
159	movq	40(%rsi),%rax
160	adcq	$0,%rdx
161	addq	%rcx,%r13
162	movq	%rdx,%rcx
163	adcq	$0,%rcx
164
165	mulq	%r9
166	addq	%rax,%r14
167	movq	48(%rsi),%rax
168	adcq	$0,%rdx
169	addq	%rcx,%r14
170	movq	%rdx,%rcx
171	adcq	$0,%rcx
172
173	mulq	%r9
/* r12 is final here: double it (plus the bit saved in rbx) via lea. */
174	movq	%r12,%r10
175	leaq	(%rbx,%r12,2),%r12
176	addq	%rax,%r15
177	movq	56(%rsi),%rax
178	adcq	$0,%rdx
179	addq	%rcx,%r15
180	movq	%rdx,%rcx
181	adcq	$0,%rcx
182
183	mulq	%r9
184	shrq	$63,%r10
185	addq	%rax,%r8
186	movq	%r9,%rax
187	adcq	$0,%rdx
188	addq	%rcx,%r8
189	movq	%rdx,%r9
190	adcq	$0,%r9
191
/* Double r13, add a[2]^2, store limbs 4 and 5. */
192	movq	%r13,%rcx
193	leaq	(%r10,%r13,2),%r13
194
195	mulq	%rax
196	addq	%rax,%r11
197	adcq	%rdx,%r12
198	adcq	$0,%r13
199
200	movq	%r11,32(%rsp)
201	movq	%r12,40(%rsp)
202	shrq	$63,%rcx
203
204
/* Row 3: a[3]*a[4..7] added into r14, r15, r8, r9; r10 receives the top limb. */
205	movq	24(%rsi),%r10
206	movq	32(%rsi),%rax
207	mulq	%r10
208	addq	%rax,%r14
209	movq	40(%rsi),%rax
210	movq	%rdx,%rbx
211	adcq	$0,%rbx
212
213	mulq	%r10
214	addq	%rax,%r15
215	movq	48(%rsi),%rax
216	adcq	$0,%rdx
217	addq	%rbx,%r15
218	movq	%rdx,%rbx
219	adcq	$0,%rbx
220
221	mulq	%r10
222	movq	%r14,%r12
223	leaq	(%rcx,%r14,2),%r14
224	addq	%rax,%r8
225	movq	56(%rsi),%rax
226	adcq	$0,%rdx
227	addq	%rbx,%r8
228	movq	%rdx,%rbx
229	adcq	$0,%rbx
230
231	mulq	%r10
232	shrq	$63,%r12
233	addq	%rax,%r9
234	movq	%r10,%rax
235	adcq	$0,%rdx
236	addq	%rbx,%r9
237	movq	%rdx,%r10
238	adcq	$0,%r10
239
/* Double r15, add a[3]^2, store limbs 6 and 7. */
240	movq	%r15,%rbx
241	leaq	(%r12,%r15,2),%r15
242
243	mulq	%rax
244	addq	%rax,%r13
245	adcq	%rdx,%r14
246	adcq	$0,%r15
247
248	movq	%r13,48(%rsp)
249	movq	%r14,56(%rsp)
250	shrq	$63,%rbx
251
252
/* Row 4: a[4]*a[5..7] added into r8, r9, r10; r11 receives the top limb. */
253	movq	32(%rsi),%r11
254	movq	40(%rsi),%rax
255	mulq	%r11
256	addq	%rax,%r8
257	movq	48(%rsi),%rax
258	movq	%rdx,%rcx
259	adcq	$0,%rcx
260
261	mulq	%r11
262	addq	%rax,%r9
263	movq	56(%rsi),%rax
264	adcq	$0,%rdx
265	movq	%r8,%r12
266	leaq	(%rbx,%r8,2),%r8
267	addq	%rcx,%r9
268	movq	%rdx,%rcx
269	adcq	$0,%rcx
270
271	mulq	%r11
272	shrq	$63,%r12
273	addq	%rax,%r10
274	movq	%r11,%rax
275	adcq	$0,%rdx
276	addq	%rcx,%r10
277	movq	%rdx,%r11
278	adcq	$0,%r11
279
/* Double r9, add a[4]^2, store limbs 8 and 9. */
280	movq	%r9,%rcx
281	leaq	(%r12,%r9,2),%r9
282
283	mulq	%rax
284	addq	%rax,%r15
285	adcq	%rdx,%r8
286	adcq	$0,%r9
287
288	movq	%r15,64(%rsp)
289	movq	%r8,72(%rsp)
290	shrq	$63,%rcx
291
292
/* Row 5: a[5]*a[6..7] added into r10, r11; r12 receives the top limb. */
293	movq	40(%rsi),%r12
294	movq	48(%rsi),%rax
295	mulq	%r12
296	addq	%rax,%r10
297	movq	56(%rsi),%rax
298	movq	%rdx,%rbx
299	adcq	$0,%rbx
300
301	mulq	%r12
302	addq	%rax,%r11
303	movq	%r12,%rax
304	movq	%r10,%r15
305	leaq	(%rcx,%r10,2),%r10
306	adcq	$0,%rdx
307	shrq	$63,%r15
308	addq	%rbx,%r11
309	movq	%rdx,%r12
310	adcq	$0,%r12
311
/* Double r11, add a[5]^2, store limbs 10 and 11. */
312	movq	%r11,%rbx
313	leaq	(%r15,%r11,2),%r11
314
315	mulq	%rax
316	addq	%rax,%r9
317	adcq	%rdx,%r10
318	adcq	$0,%r11
319
320	movq	%r9,80(%rsp)
321	movq	%r10,88(%rsp)
322
323
/* Row 6: a[6]*a[7] added into r12; r13 receives the top limb. */
324	movq	48(%rsi),%r13
325	movq	56(%rsi),%rax
326	mulq	%r13
327	addq	%rax,%r12
328	movq	%r13,%rax
329	movq	%rdx,%r13
330	adcq	$0,%r13
331
/*
 * Final doubling uses a plain shl/adc chain: the shl moves the top bit
 * of the undoubled r11 (held in rbx) into CF, then r12 and r13 are
 * doubled with carry and r14 collects the last carry bit.
 */
332	xorq	%r14,%r14
333	shlq	$1,%rbx
334	adcq	%r12,%r12
335	adcq	%r13,%r13
336	adcq	%r14,%r14
337
/* Add a[6]^2, store limbs 12 and 13. */
338	mulq	%rax
339	addq	%rax,%r11
340	adcq	%rdx,%r12
341	adcq	$0,%r13
342
343	movq	%r11,96(%rsp)
344	movq	%r12,104(%rsp)
345
346
/* Add a[7]^2, store limbs 14 and 15. */
347	movq	56(%rsi),%rax
348	mulq	%rax
349	addq	%rax,%r13
350	adcq	$0,%rdx
351
352	addq	%rdx,%r14
353
354	movq	%r13,112(%rsp)
355	movq	%r14,120(%rsp)
356
/* Reload the low eight limbs of the square for the reduction helper. */
357	movq	(%rsp),%r8
358	movq	8(%rsp),%r9
359	movq	16(%rsp),%r10
360	movq	24(%rsp),%r11
361	movq	32(%rsp),%r12
362	movq	40(%rsp),%r13
363	movq	48(%rsp),%r14
364	movq	56(%rsp),%r15
365
366	call	__rsaz_512_reduce
367
/* Add the high eight limbs; rcx = 0 or all-ones from the final carry. */
368	addq	64(%rsp),%r8
369	adcq	72(%rsp),%r9
370	adcq	80(%rsp),%r10
371	adcq	88(%rsp),%r11
372	adcq	96(%rsp),%r12
373	adcq	104(%rsp),%r13
374	adcq	112(%rsp),%r14
375	adcq	120(%rsp),%r15
376	sbbq	%rcx,%rcx
377
/* Stores the result to (%rdi), applying the masked correction. */
378	call	__rsaz_512_subtract
379
/*
 * Set up the next pass: limbs 0 and 1 of the result are already in
 * r8/r9, the count is reloaded, and the output buffer becomes the input.
 */
380	movq	%r8,%rdx
381	movq	%r9,%rax
382	movl	128+8(%rsp),%r8d
383	movq	%rdi,%rsi
384
385	decl	%r8d
386	jnz	L$oop_sqr
387
/* Restore callee-saved registers from above the local frame and unwind. */
388	leaq	128+24+48(%rsp),%rax
389	movq	-48(%rax),%r15
390	movq	-40(%rax),%r14
391	movq	-32(%rax),%r13
392	movq	-24(%rax),%r12
393	movq	-16(%rax),%rbp
394	movq	-8(%rax),%rbx
395	leaq	(%rax),%rsp
396L$sqr_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
397	.byte	0xf3,0xc3
398
/*
 * _rsaz_512_mul
 *
 * Multiplies two 8-limb values with __rsaz_512_mul, then brings the
 * 16-limb product back to 8 limbs with __rsaz_512_reduce and
 * __rsaz_512_subtract.
 *
 * Register arguments, as read by the code:
 *   rdi - result buffer (8 qwords), written by __rsaz_512_subtract
 *   rsi - first operand (8 qwords)
 *   rdx - second operand (8 qwords)
 *   rcx - pointer the helpers read 8 qwords through once it is in rbp
 *         (presumably the modulus - confirm against the C prototype)
 *   r8  - 64-bit constant saved at 128(%rsp) for __rsaz_512_reduce
 *         (presumably -1/modulus mod 2^64 - confirm)
 *
 * rdi and rcx are parked in xmm0/xmm1 across the multiply because
 * __rsaz_512_mul uses rdi and rbp itself.
 */
399.globl	_rsaz_512_mul
400.private_extern _rsaz_512_mul
401
402.p2align	5
403_rsaz_512_mul:
404	pushq	%rbx
405	pushq	%rbp
406	pushq	%r12
407	pushq	%r13
408	pushq	%r14
409	pushq	%r15
410
411	subq	$128+24,%rsp
412L$mul_body:
/* The two .byte sequences encode: movq %rdi,%xmm0 ; movq %rcx,%xmm1 */
413.byte	102,72,15,110,199
414.byte	102,72,15,110,201
415	movq	%r8,128(%rsp)
/* rbx = first limb of the second operand, rbp = pointer to it. */
416	movq	(%rdx),%rbx
417	movq	%rdx,%rbp
/* Leaves the 16-limb product at (%rsp)..120(%rsp). */
418	call	__rsaz_512_mul
419
/* The two .byte sequences encode: movq %xmm0,%rdi ; movq %xmm1,%rbp */
420.byte	102,72,15,126,199
421.byte	102,72,15,126,205
422
/* Low eight limbs of the product go to the reduction helper. */
423	movq	(%rsp),%r8
424	movq	8(%rsp),%r9
425	movq	16(%rsp),%r10
426	movq	24(%rsp),%r11
427	movq	32(%rsp),%r12
428	movq	40(%rsp),%r13
429	movq	48(%rsp),%r14
430	movq	56(%rsp),%r15
431
432	call	__rsaz_512_reduce
/* Add the high eight limbs; rcx = 0 or all-ones from the final carry. */
433	addq	64(%rsp),%r8
434	adcq	72(%rsp),%r9
435	adcq	80(%rsp),%r10
436	adcq	88(%rsp),%r11
437	adcq	96(%rsp),%r12
438	adcq	104(%rsp),%r13
439	adcq	112(%rsp),%r14
440	adcq	120(%rsp),%r15
441	sbbq	%rcx,%rcx
442
/* Stores the result to (%rdi), applying the masked correction. */
443	call	__rsaz_512_subtract
444
/* Restore callee-saved registers from above the local frame and unwind. */
445	leaq	128+24+48(%rsp),%rax
446	movq	-48(%rax),%r15
447	movq	-40(%rax),%r14
448	movq	-32(%rax),%r13
449	movq	-24(%rax),%r12
450	movq	-16(%rax),%rbp
451	movq	-8(%rax),%rbx
452	leaq	(%rax),%rsp
453L$mul_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
454	.byte	0xf3,0xc3
455
/*
 * _rsaz_512_mul_gather4
 *
 * Multiplies an 8-limb value by an 8-limb value that is gathered on the
 * fly from an interleaved table, then reduces the 16-limb product with
 * __rsaz_512_reduce and __rsaz_512_subtract.
 *
 * Register arguments, as read by the code:
 *   rdi - result buffer (8 qwords), written by __rsaz_512_subtract
 *   rsi - first operand (8 qwords)
 *   rdx - table base
 *   rcx - pointer the helpers read 8 qwords through once it is in rbp
 *         (presumably the modulus - confirm against the C prototype)
 *   r8  - 64-bit constant saved at 128(%rsp) for __rsaz_512_reduce
 *   r9d - table index (zero-extended to 64 bits on entry)
 *
 * Table layout implied by the loads: limb i of entry "index" has its low
 * dword at table + 128*i + 4*index and its high dword 64 bytes later.
 *
 * NOTE(review): the loads use an address that depends on the index; if
 * the index is secret this is not an index-independent access pattern -
 * confirm the threat model.
 * NOTE(review): the last loop pass gathers a ninth limb (offset 1024
 * from the first) whose value is never multiplied; the table memory must
 * remain readable that far - confirm against the caller's allocation.
 */
456.globl	_rsaz_512_mul_gather4
457.private_extern _rsaz_512_mul_gather4
458
459.p2align	5
460_rsaz_512_mul_gather4:
461	pushq	%rbx
462	pushq	%rbp
463	pushq	%r12
464	pushq	%r13
465	pushq	%r14
466	pushq	%r15
467
/* Zero-extend the 32-bit index so it can be used in addressing. */
468	movl	%r9d,%r9d
469	subq	$128+24,%rsp
470L$mul_gather4_body:
/*
 * Gather limb 0 of the table entry into rbx (high dword from +64).
 * The interleaved .byte sequences encode movq %rdi,%xmm0 and
 * movq %rcx,%xmm1.
 */
471	movl	64(%rdx,%r9,4),%eax
472.byte	102,72,15,110,199
473	movl	(%rdx,%r9,4),%ebx
474.byte	102,72,15,110,201
475	movq	%r8,128(%rsp)
476
477	shlq	$32,%rax
478	orq	%rax,%rbx
479	movq	(%rsi),%rax
480	movq	8(%rsi),%rcx
/* rbp = address of the low dword of the next limb in the table. */
481	leaq	128(%rdx,%r9,4),%rbp
/* First row: a[0..7] * b[0]; limb 0 of the product goes to (%rsp). */
482	mulq	%rbx
483	movq	%rax,(%rsp)
484	movq	%rcx,%rax
485	movq	%rdx,%r8
486
487	mulq	%rbx
/* Between the multiplies, xmm4 is assembled from the next limb's halves. */
488	movd	(%rbp),%xmm4
489	addq	%rax,%r8
490	movq	16(%rsi),%rax
491	movq	%rdx,%r9
492	adcq	$0,%r9
493
494	mulq	%rbx
495	movd	64(%rbp),%xmm5
496	addq	%rax,%r9
497	movq	24(%rsi),%rax
498	movq	%rdx,%r10
499	adcq	$0,%r10
500
501	mulq	%rbx
502	pslldq	$4,%xmm5
503	addq	%rax,%r10
504	movq	32(%rsi),%rax
505	movq	%rdx,%r11
506	adcq	$0,%r11
507
508	mulq	%rbx
509	por	%xmm5,%xmm4
510	addq	%rax,%r11
511	movq	40(%rsi),%rax
512	movq	%rdx,%r12
513	adcq	$0,%r12
514
515	mulq	%rbx
516	addq	%rax,%r12
517	movq	48(%rsi),%rax
518	movq	%rdx,%r13
519	adcq	$0,%r13
520
521	mulq	%rbx
522	leaq	128(%rbp),%rbp
523	addq	%rax,%r13
524	movq	56(%rsi),%rax
525	movq	%rdx,%r14
526	adcq	$0,%r14
527
528	mulq	%rbx
/* Encodes movq %xmm4,%rbx: the gathered limb becomes the next multiplier. */
529.byte	102,72,15,126,227
530	addq	%rax,%r14
531	movq	(%rsi),%rax
532	movq	%rdx,%r15
533	adcq	$0,%r15
534
/* rdi walks the product buffer; seven more rows follow. */
535	leaq	8(%rsp),%rdi
536	movl	$7,%ecx
537	jmp	L$oop_mul_gather
538
539.p2align	5
540L$oop_mul_gather:
/*
 * One row per pass: add a[0..7] * rbx into r8..r15, store the finished
 * low limb at (%rdi), and shift the accumulators down by one limb.
 */
541	mulq	%rbx
542	addq	%rax,%r8
543	movq	8(%rsi),%rax
544	movq	%r8,(%rdi)
545	movq	%rdx,%r8
546	adcq	$0,%r8
547
548	mulq	%rbx
549	movd	(%rbp),%xmm4
550	addq	%rax,%r9
551	movq	16(%rsi),%rax
552	adcq	$0,%rdx
553	addq	%r9,%r8
554	movq	%rdx,%r9
555	adcq	$0,%r9
556
557	mulq	%rbx
558	movd	64(%rbp),%xmm5
559	addq	%rax,%r10
560	movq	24(%rsi),%rax
561	adcq	$0,%rdx
562	addq	%r10,%r9
563	movq	%rdx,%r10
564	adcq	$0,%r10
565
566	mulq	%rbx
567	pslldq	$4,%xmm5
568	addq	%rax,%r11
569	movq	32(%rsi),%rax
570	adcq	$0,%rdx
571	addq	%r11,%r10
572	movq	%rdx,%r11
573	adcq	$0,%r11
574
575	mulq	%rbx
576	por	%xmm5,%xmm4
577	addq	%rax,%r12
578	movq	40(%rsi),%rax
579	adcq	$0,%rdx
580	addq	%r12,%r11
581	movq	%rdx,%r12
582	adcq	$0,%r12
583
584	mulq	%rbx
585	addq	%rax,%r13
586	movq	48(%rsi),%rax
587	adcq	$0,%rdx
588	addq	%r13,%r12
589	movq	%rdx,%r13
590	adcq	$0,%r13
591
592	mulq	%rbx
593	addq	%rax,%r14
594	movq	56(%rsi),%rax
595	adcq	$0,%rdx
596	addq	%r14,%r13
597	movq	%rdx,%r14
598	adcq	$0,%r14
599
600	mulq	%rbx
/* Encodes movq %xmm4,%rbx (multiplier for the next pass). */
601.byte	102,72,15,126,227
602	addq	%rax,%r15
603	movq	(%rsi),%rax
604	adcq	$0,%rdx
605	addq	%r15,%r14
606	movq	%rdx,%r15
607	adcq	$0,%r15
608
609	leaq	128(%rbp),%rbp
610	leaq	8(%rdi),%rdi
611
612	decl	%ecx
613	jnz	L$oop_mul_gather
614
/* Store the remaining eight accumulator limbs (the high half). */
615	movq	%r8,(%rdi)
616	movq	%r9,8(%rdi)
617	movq	%r10,16(%rdi)
618	movq	%r11,24(%rdi)
619	movq	%r12,32(%rdi)
620	movq	%r13,40(%rdi)
621	movq	%r14,48(%rdi)
622	movq	%r15,56(%rdi)
623
/* The two .byte sequences encode: movq %xmm0,%rdi ; movq %xmm1,%rbp */
624.byte	102,72,15,126,199
625.byte	102,72,15,126,205
626
/* Low eight limbs of the product go to the reduction helper. */
627	movq	(%rsp),%r8
628	movq	8(%rsp),%r9
629	movq	16(%rsp),%r10
630	movq	24(%rsp),%r11
631	movq	32(%rsp),%r12
632	movq	40(%rsp),%r13
633	movq	48(%rsp),%r14
634	movq	56(%rsp),%r15
635
636	call	__rsaz_512_reduce
/* Add the high eight limbs; rcx = 0 or all-ones from the final carry. */
637	addq	64(%rsp),%r8
638	adcq	72(%rsp),%r9
639	adcq	80(%rsp),%r10
640	adcq	88(%rsp),%r11
641	adcq	96(%rsp),%r12
642	adcq	104(%rsp),%r13
643	adcq	112(%rsp),%r14
644	adcq	120(%rsp),%r15
645	sbbq	%rcx,%rcx
646
/* Stores the result to (%rdi), applying the masked correction. */
647	call	__rsaz_512_subtract
648
/* Restore callee-saved registers from above the local frame and unwind. */
649	leaq	128+24+48(%rsp),%rax
650	movq	-48(%rax),%r15
651	movq	-40(%rax),%r14
652	movq	-32(%rax),%r13
653	movq	-24(%rax),%r12
654	movq	-16(%rax),%rbp
655	movq	-8(%rax),%rbx
656	leaq	(%rax),%rsp
657L$mul_gather4_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
658	.byte	0xf3,0xc3
659
/*
 * _rsaz_512_mul_scatter4
 *
 * Multiplies the 8-limb value at rsi by the 8-limb value currently in
 * the result buffer at rdi, reduces the product, writes it back to rdi,
 * and also scatters it into an interleaved table.
 *
 * Register arguments, as read by the code:
 *   rdi - in/out buffer (8 qwords): used as the second multiplicand and
 *         overwritten with the result by __rsaz_512_subtract
 *   rsi - first operand (8 qwords)
 *   rdx - pointer the helpers read 8 qwords through once it is in rbp
 *         (presumably the modulus - confirm against the C prototype)
 *   rcx - 64-bit constant saved at 128(%rsp) for __rsaz_512_reduce
 *   r8  - table base
 *   r9d - table index (zero-extended to 64 bits on entry)
 *
 * Table layout produced by the stores: limb i has its low dword at
 * table + 128*i + 4*index and its high dword 64 bytes later.
 */
660.globl	_rsaz_512_mul_scatter4
661.private_extern _rsaz_512_mul_scatter4
662
663.p2align	5
664_rsaz_512_mul_scatter4:
665	pushq	%rbx
666	pushq	%rbp
667	pushq	%r12
668	pushq	%r13
669	pushq	%r14
670	pushq	%r15
671
/* Zero-extend the 32-bit index so it can be used in addressing. */
672	movl	%r9d,%r9d
673	subq	$128+24,%rsp
674L$mul_scatter4_body:
/* r8 = address of the low dword of limb 0 for this index. */
675	leaq	(%r8,%r9,4),%r8
/*
 * The three .byte sequences encode:
 *   movq %rdi,%xmm0 ; movq %rdx,%xmm1 ; movq %r8,%xmm2
 */
676.byte	102,72,15,110,199
677.byte	102,72,15,110,202
678.byte	102,73,15,110,208
679	movq	%rcx,128(%rsp)
680
/* Second multiplicand is the current content of the output buffer. */
681	movq	%rdi,%rbp
682	movq	(%rdi),%rbx
/* Leaves the 16-limb product at (%rsp)..120(%rsp). */
683	call	__rsaz_512_mul
684
/* The two .byte sequences encode: movq %xmm0,%rdi ; movq %xmm1,%rbp */
685.byte	102,72,15,126,199
686.byte	102,72,15,126,205
687
/* Low eight limbs of the product go to the reduction helper. */
688	movq	(%rsp),%r8
689	movq	8(%rsp),%r9
690	movq	16(%rsp),%r10
691	movq	24(%rsp),%r11
692	movq	32(%rsp),%r12
693	movq	40(%rsp),%r13
694	movq	48(%rsp),%r14
695	movq	56(%rsp),%r15
696
697	call	__rsaz_512_reduce
/* Add the high eight limbs of the product. */
698	addq	64(%rsp),%r8
699	adcq	72(%rsp),%r9
700	adcq	80(%rsp),%r10
701	adcq	88(%rsp),%r11
702	adcq	96(%rsp),%r12
703	adcq	104(%rsp),%r13
704	adcq	112(%rsp),%r14
705	adcq	120(%rsp),%r15
/*
 * Encodes movq %xmm2,%rsi (table address).  It sits between the adcq
 * chain and the sbbq; the sbbq below still consumes the carry produced
 * by the last adcq.
 */
706.byte	102,72,15,126,214
707	sbbq	%rcx,%rcx
708
/* Stores the result to (%rdi), applying the masked correction. */
709	call	__rsaz_512_subtract
710
/*
 * Scatter r8..r15: low dwords first (shifting each register right by 32
 * as it goes), then the high dwords at +64 from each low dword.
 */
711	movl	%r8d,0(%rsi)
712	shrq	$32,%r8
713	movl	%r9d,128(%rsi)
714	shrq	$32,%r9
715	movl	%r10d,256(%rsi)
716	shrq	$32,%r10
717	movl	%r11d,384(%rsi)
718	shrq	$32,%r11
719	movl	%r12d,512(%rsi)
720	shrq	$32,%r12
721	movl	%r13d,640(%rsi)
722	shrq	$32,%r13
723	movl	%r14d,768(%rsi)
724	shrq	$32,%r14
725	movl	%r15d,896(%rsi)
726	shrq	$32,%r15
727	movl	%r8d,64(%rsi)
728	movl	%r9d,192(%rsi)
729	movl	%r10d,320(%rsi)
730	movl	%r11d,448(%rsi)
731	movl	%r12d,576(%rsi)
732	movl	%r13d,704(%rsi)
733	movl	%r14d,832(%rsi)
734	movl	%r15d,960(%rsi)
735
/* Restore callee-saved registers from above the local frame and unwind. */
736	leaq	128+24+48(%rsp),%rax
737	movq	-48(%rax),%r15
738	movq	-40(%rax),%r14
739	movq	-32(%rax),%r13
740	movq	-24(%rax),%r12
741	movq	-16(%rax),%rbp
742	movq	-8(%rax),%rbx
743	leaq	(%rax),%rsp
744L$mul_scatter4_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
745	.byte	0xf3,0xc3
746
/*
 * _rsaz_512_mul_by_one
 *
 * Runs __rsaz_512_reduce on an 8-limb input and stores the eight
 * resulting limbs.  No high half is added and __rsaz_512_subtract is not
 * called.  (Presumably this converts a value out of Montgomery form -
 * confirm against the caller.)
 *
 * Register arguments, as read by the code:
 *   rdi - result buffer (8 qwords)
 *   rsi - input (8 qwords)
 *   rdx - pointer copied to rbp, read by __rsaz_512_reduce as 8 qwords
 *         (presumably the modulus - confirm)
 *   rcx - 64-bit constant saved at 128(%rsp) for __rsaz_512_reduce
 */
747.globl	_rsaz_512_mul_by_one
748.private_extern _rsaz_512_mul_by_one
749
750.p2align	5
751_rsaz_512_mul_by_one:
752	pushq	%rbx
753	pushq	%rbp
754	pushq	%r12
755	pushq	%r13
756	pushq	%r14
757	pushq	%r15
758
759	subq	$128+24,%rsp
760L$mul_by_one_body:
761	movq	%rdx,%rbp
762	movq	%rcx,128(%rsp)
763
/* Load the input limbs into r8..r15; xmm0 is cleared along the way. */
764	movq	(%rsi),%r8
765	pxor	%xmm0,%xmm0
766	movq	8(%rsi),%r9
767	movq	16(%rsi),%r10
768	movq	24(%rsi),%r11
769	movq	32(%rsi),%r12
770	movq	40(%rsi),%r13
771	movq	48(%rsi),%r14
772	movq	56(%rsi),%r15
773
/*
 * Zero (%rsp)..111(%rsp) with aligned 16-byte stores.  Nothing in this
 * function or in __rsaz_512_reduce reads these slots back; the reason
 * for clearing them is not evident from this file.
 */
774	movdqa	%xmm0,(%rsp)
775	movdqa	%xmm0,16(%rsp)
776	movdqa	%xmm0,32(%rsp)
777	movdqa	%xmm0,48(%rsp)
778	movdqa	%xmm0,64(%rsp)
779	movdqa	%xmm0,80(%rsp)
780	movdqa	%xmm0,96(%rsp)
781	call	__rsaz_512_reduce
/* Store the reduced limbs. */
782	movq	%r8,(%rdi)
783	movq	%r9,8(%rdi)
784	movq	%r10,16(%rdi)
785	movq	%r11,24(%rdi)
786	movq	%r12,32(%rdi)
787	movq	%r13,40(%rdi)
788	movq	%r14,48(%rdi)
789	movq	%r15,56(%rdi)
790
/* Restore callee-saved registers from above the local frame and unwind. */
791	leaq	128+24+48(%rsp),%rax
792	movq	-48(%rax),%r15
793	movq	-40(%rax),%r14
794	movq	-32(%rax),%r13
795	movq	-24(%rax),%r12
796	movq	-16(%rax),%rbp
797	movq	-8(%rax),%rbx
798	leaq	(%rax),%rsp
799L$mul_by_one_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
800	.byte	0xf3,0xc3
801
802
/*
 * __rsaz_512_reduce (file-local helper, custom register convention)
 *
 * Eight passes of a word-by-word reduction over the limbs in r8..r15.
 * Each pass multiplies the 8 qwords at (%rbp) by a factor derived from
 * the current low limb, adds the products to the accumulators, and
 * shifts the accumulators down by one limb.
 *
 * In:   r8..r15     - eight limbs, least significant in r8
 *       rbp         - pointer to 8 qwords (presumably the modulus)
 *       128+8(%rsp) - 64-bit constant, i.e. the caller's 128(%rsp) slot
 *                     seen past the return address (presumably
 *                     -1/modulus mod 2^64 - confirm)
 * Out:  r8..r15     - eight result limbs
 * Clobbers: rax, rbx, rcx, rdx, rsi, flags
 */
803.p2align	5
804__rsaz_512_reduce:
/* rbx = low 64 bits of (r8 * constant): the multiplier for pass 1. */
805	movq	%r8,%rbx
806	imulq	128+8(%rsp),%rbx
807	movq	0(%rbp),%rax
808	movl	$8,%ecx
809	jmp	L$reduction_loop
810
811.p2align	5
812L$reduction_loop:
/*
 * The low half of this first product is never added: rax is reloaded
 * right away and negq is used only to set CF when r8 is nonzero.
 * Presumably the constant is chosen so that the low word would cancel
 * r8 exactly, leaving just that carry - confirm.
 */
813	mulq	%rbx
814	movq	8(%rbp),%rax
815	negq	%r8
816	movq	%rdx,%r8
817	adcq	$0,%r8
818
819	mulq	%rbx
820	addq	%rax,%r9
821	movq	16(%rbp),%rax
822	adcq	$0,%rdx
823	addq	%r9,%r8
824	movq	%rdx,%r9
825	adcq	$0,%r9
826
827	mulq	%rbx
828	addq	%rax,%r10
829	movq	24(%rbp),%rax
830	adcq	$0,%rdx
831	addq	%r10,%r9
832	movq	%rdx,%r10
833	adcq	$0,%r10
834
835	mulq	%rbx
836	addq	%rax,%r11
837	movq	32(%rbp),%rax
838	adcq	$0,%rdx
839	addq	%r11,%r10
/* Reload the constant; mov does not disturb CF from the addq above. */
840	movq	128+8(%rsp),%rsi
841
842
843	adcq	$0,%rdx
844	movq	%rdx,%r11
845
846	mulq	%rbx
847	addq	%rax,%r12
848	movq	40(%rbp),%rax
849	adcq	$0,%rdx
/*
 * rsi = constant * new low limb (r8 is already final for this pass):
 * the multiplier for the next pass.  imul rewrites flags, but the
 * preceding adcq has already consumed CF and the following addq sets
 * it afresh.
 */
850	imulq	%r8,%rsi
851	addq	%r12,%r11
852	movq	%rdx,%r12
853	adcq	$0,%r12
854
855	mulq	%rbx
856	addq	%rax,%r13
857	movq	48(%rbp),%rax
858	adcq	$0,%rdx
859	addq	%r13,%r12
860	movq	%rdx,%r13
861	adcq	$0,%r13
862
863	mulq	%rbx
864	addq	%rax,%r14
865	movq	56(%rbp),%rax
866	adcq	$0,%rdx
867	addq	%r14,%r13
868	movq	%rdx,%r14
869	adcq	$0,%r14
870
871	mulq	%rbx
/* Switch to the next pass's multiplier (unused after the last pass). */
872	movq	%rsi,%rbx
873	addq	%rax,%r15
874	movq	0(%rbp),%rax
875	adcq	$0,%rdx
876	addq	%r15,%r14
877	movq	%rdx,%r15
878	adcq	$0,%r15
879
880	decl	%ecx
881	jne	L$reduction_loop
882
/* 0xf3,0xc3 encodes "rep ret". */
883	.byte	0xf3,0xc3
884
885
/*
 * __rsaz_512_subtract (file-local helper, custom register convention)
 *
 * Stores the eight limbs in r8..r15 to (%rdi), then adds the masked
 * negation of the 8 qwords at (%rbp) and stores the sum again.  With
 * rcx = 0 the stored value is unchanged; with rcx = all-ones the result
 * is value - [rbp] modulo 2^512.  The same instructions execute for
 * either mask value (no branches).
 *
 * The negation is built as "negq" on limb 0 and "notq" on limbs 1..7,
 * which equals the two's-complement negation only when limb 0 of the
 * subtrahend is nonzero (presumably guaranteed because the modulus is
 * odd - confirm).
 *
 * In:   r8..r15 - value, least significant limb in r8
 *       rdi     - output buffer (8 qwords)
 *       rbp     - pointer to the 8-qword subtrahend
 *       rcx     - mask, 0 or all-ones (callers produce it with sbbq)
 * Out:  (%rdi) and r8..r15 hold the result
 * Clobbers: flags
 */
886.p2align	5
887__rsaz_512_subtract:
/* Park the value in the output buffer; r8..r15 are reused below. */
888	movq	%r8,(%rdi)
889	movq	%r9,8(%rdi)
890	movq	%r10,16(%rdi)
891	movq	%r11,24(%rdi)
892	movq	%r12,32(%rdi)
893	movq	%r13,40(%rdi)
894	movq	%r14,48(%rdi)
895	movq	%r15,56(%rdi)
896
/* r8..r15 = (negated subtrahend) AND mask. */
897	movq	0(%rbp),%r8
898	movq	8(%rbp),%r9
899	negq	%r8
900	notq	%r9
901	andq	%rcx,%r8
902	movq	16(%rbp),%r10
903	andq	%rcx,%r9
904	notq	%r10
905	movq	24(%rbp),%r11
906	andq	%rcx,%r10
907	notq	%r11
908	movq	32(%rbp),%r12
909	andq	%rcx,%r11
910	notq	%r12
911	movq	40(%rbp),%r13
912	andq	%rcx,%r12
913	notq	%r13
914	movq	48(%rbp),%r14
915	andq	%rcx,%r13
916	notq	%r14
917	movq	56(%rbp),%r15
918	andq	%rcx,%r14
919	notq	%r15
920	andq	%rcx,%r15
921
/* Add the parked value back with full carry propagation. */
922	addq	(%rdi),%r8
923	adcq	8(%rdi),%r9
924	adcq	16(%rdi),%r10
925	adcq	24(%rdi),%r11
926	adcq	32(%rdi),%r12
927	adcq	40(%rdi),%r13
928	adcq	48(%rdi),%r14
929	adcq	56(%rdi),%r15
930
931	movq	%r8,(%rdi)
932	movq	%r9,8(%rdi)
933	movq	%r10,16(%rdi)
934	movq	%r11,24(%rdi)
935	movq	%r12,32(%rdi)
936	movq	%r13,40(%rdi)
937	movq	%r14,48(%rdi)
938	movq	%r15,56(%rdi)
939
/* 0xf3,0xc3 encodes "rep ret". */
940	.byte	0xf3,0xc3
941
942
/*
 * __rsaz_512_mul (file-local helper, custom register convention)
 *
 * Schoolbook 8x8-limb multiplication.  The 16-limb product is written to
 * the caller's stack frame starting at the caller's (%rsp), which is
 * 8(%rsp) here because of the return address.
 *
 * In:   rsi - first operand (8 qwords)
 *       rbp - second operand (8 qwords)
 *       rbx - limb 0 of the second operand (preloaded by the caller)
 * Out:  16 qwords at the caller's (%rsp)..120(%rsp)
 * Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8..r15, flags
 */
943.p2align	5
944__rsaz_512_mul:
945	leaq	8(%rsp),%rdi
946
/* First row: a[0..7] * b[0]; product limb 0 is stored immediately. */
947	movq	(%rsi),%rax
948	mulq	%rbx
949	movq	%rax,(%rdi)
950	movq	8(%rsi),%rax
951	movq	%rdx,%r8
952
953	mulq	%rbx
954	addq	%rax,%r8
955	movq	16(%rsi),%rax
956	movq	%rdx,%r9
957	adcq	$0,%r9
958
959	mulq	%rbx
960	addq	%rax,%r9
961	movq	24(%rsi),%rax
962	movq	%rdx,%r10
963	adcq	$0,%r10
964
965	mulq	%rbx
966	addq	%rax,%r10
967	movq	32(%rsi),%rax
968	movq	%rdx,%r11
969	adcq	$0,%r11
970
971	mulq	%rbx
972	addq	%rax,%r11
973	movq	40(%rsi),%rax
974	movq	%rdx,%r12
975	adcq	$0,%r12
976
977	mulq	%rbx
978	addq	%rax,%r12
979	movq	48(%rsi),%rax
980	movq	%rdx,%r13
981	adcq	$0,%r13
982
983	mulq	%rbx
984	addq	%rax,%r13
985	movq	56(%rsi),%rax
986	movq	%rdx,%r14
987	adcq	$0,%r14
988
989	mulq	%rbx
990	addq	%rax,%r14
991	movq	(%rsi),%rax
992	movq	%rdx,%r15
993	adcq	$0,%r15
994
/* Advance to b[1] and to product limb 1; seven more rows follow. */
995	leaq	8(%rbp),%rbp
996	leaq	8(%rdi),%rdi
997
998	movl	$7,%ecx
999	jmp	L$oop_mul
1000
1001.p2align	5
1002L$oop_mul:
/*
 * One row per pass: add a[0..7] * b[k] into r8..r15, store the finished
 * low limb at (%rdi), and shift the accumulators down by one limb.
 */
1003	movq	(%rbp),%rbx
1004	mulq	%rbx
1005	addq	%rax,%r8
1006	movq	8(%rsi),%rax
1007	movq	%r8,(%rdi)
1008	movq	%rdx,%r8
1009	adcq	$0,%r8
1010
1011	mulq	%rbx
1012	addq	%rax,%r9
1013	movq	16(%rsi),%rax
1014	adcq	$0,%rdx
1015	addq	%r9,%r8
1016	movq	%rdx,%r9
1017	adcq	$0,%r9
1018
1019	mulq	%rbx
1020	addq	%rax,%r10
1021	movq	24(%rsi),%rax
1022	adcq	$0,%rdx
1023	addq	%r10,%r9
1024	movq	%rdx,%r10
1025	adcq	$0,%r10
1026
1027	mulq	%rbx
1028	addq	%rax,%r11
1029	movq	32(%rsi),%rax
1030	adcq	$0,%rdx
1031	addq	%r11,%r10
1032	movq	%rdx,%r11
1033	adcq	$0,%r11
1034
1035	mulq	%rbx
1036	addq	%rax,%r12
1037	movq	40(%rsi),%rax
1038	adcq	$0,%rdx
1039	addq	%r12,%r11
1040	movq	%rdx,%r12
1041	adcq	$0,%r12
1042
1043	mulq	%rbx
1044	addq	%rax,%r13
1045	movq	48(%rsi),%rax
1046	adcq	$0,%rdx
1047	addq	%r13,%r12
1048	movq	%rdx,%r13
1049	adcq	$0,%r13
1050
1051	mulq	%rbx
1052	addq	%rax,%r14
1053	movq	56(%rsi),%rax
1054	adcq	$0,%rdx
1055	addq	%r14,%r13
1056	movq	%rdx,%r14
/* lea advances to the next b limb without disturbing CF for the adcq. */
1057	leaq	8(%rbp),%rbp
1058	adcq	$0,%r14
1059
1060	mulq	%rbx
1061	addq	%rax,%r15
1062	movq	(%rsi),%rax
1063	adcq	$0,%rdx
1064	addq	%r15,%r14
1065	movq	%rdx,%r15
1066	adcq	$0,%r15
1067
1068	leaq	8(%rdi),%rdi
1069
1070	decl	%ecx
1071	jnz	L$oop_mul
1072
/* Store the remaining eight accumulator limbs (the high half). */
1073	movq	%r8,(%rdi)
1074	movq	%r9,8(%rdi)
1075	movq	%r10,16(%rdi)
1076	movq	%r11,24(%rdi)
1077	movq	%r12,32(%rdi)
1078	movq	%r13,40(%rdi)
1079	movq	%r14,48(%rdi)
1080	movq	%r15,56(%rdi)
1081
/* 0xf3,0xc3 encodes "rep ret". */
1082	.byte	0xf3,0xc3
1083
/*
 * _rsaz_512_scatter4
 *
 * Copies an 8-limb value into an interleaved table, splitting each
 * 64-bit limb into two 32-bit halves.
 *
 * Register arguments, as read by the code:
 *   rdi - table base
 *   rsi - source value (8 qwords)
 *   edx - table index (32-bit)
 *
 * Layout produced by the stores: limb i has its low dword at
 * table + 128*i + 4*index and its high dword 64 bytes later.
 *
 * Leaf routine: uses only rax, rdx, rsi, rdi, r9 and the flags, and does
 * not touch the stack.
 */
1084.globl	_rsaz_512_scatter4
1085.private_extern _rsaz_512_scatter4
1086
1087.p2align	4
1088_rsaz_512_scatter4:
/*
 * Zero-extend the 32-bit index before it is used in a 64-bit address.
 * The upper half of rdx is not guaranteed to be clear for a 32-bit
 * argument; _rsaz_512_mul_gather4 and _rsaz_512_mul_scatter4 in this
 * file do the same for their index register.
 */
	movl	%edx,%edx
1089	leaq	(%rdi,%rdx,4),%rdi
1090	movl	$8,%r9d
1091	jmp	L$oop_scatter
1092.p2align	4
1093L$oop_scatter:
/* One limb per pass: low dword at (%rdi), high dword at 64(%rdi). */
1094	movq	(%rsi),%rax
1095	leaq	8(%rsi),%rsi
1096	movl	%eax,(%rdi)
1097	shrq	$32,%rax
1098	movl	%eax,64(%rdi)
1099	leaq	128(%rdi),%rdi
1100	decl	%r9d
1101	jnz	L$oop_scatter
/* 0xf3,0xc3 encodes "rep ret". */
1102	.byte	0xf3,0xc3
1103
1104
/*
 * _rsaz_512_gather4
 *
 * Reassembles an 8-limb value from an interleaved table in which each
 * 64-bit limb is stored as two 32-bit halves.
 *
 * Register arguments, as read by the code:
 *   rdi - destination (8 qwords)
 *   rsi - table base
 *   edx - table index (32-bit)
 *
 * Layout read by the loads: limb i has its low dword at
 * table + 128*i + 4*index and its high dword 64 bytes later.
 *
 * Leaf routine: uses only rax, rdx, rsi, rdi, r8, r9 and the flags, and
 * does not touch the stack.
 *
 * NOTE(review): the loads use an address that depends on the index; if
 * the index is secret this is not an index-independent access pattern -
 * confirm the threat model.
 */
1105.globl	_rsaz_512_gather4
1106.private_extern _rsaz_512_gather4
1107
1108.p2align	4
1109_rsaz_512_gather4:
/*
 * Zero-extend the 32-bit index before it is used in a 64-bit address.
 * The upper half of rdx is not guaranteed to be clear for a 32-bit
 * argument; _rsaz_512_mul_gather4 and _rsaz_512_mul_scatter4 in this
 * file do the same for their index register.
 */
	movl	%edx,%edx
1110	leaq	(%rsi,%rdx,4),%rsi
1111	movl	$8,%r9d
1112	jmp	L$oop_gather
1113.p2align	4
1114L$oop_gather:
/*
 * One limb per pass.  The 32-bit loads clear the upper halves of rax
 * and r8, so shifting r8 and OR-ing yields high:low as one qword.
 */
1115	movl	(%rsi),%eax
1116	movl	64(%rsi),%r8d
1117	leaq	128(%rsi),%rsi
1118	shlq	$32,%r8
1119	orq	%r8,%rax
1120	movq	%rax,(%rdi)
1121	leaq	8(%rdi),%rdi
1122	decl	%r9d
1123	jnz	L$oop_gather
/* 0xf3,0xc3 encodes "rep ret". */
1124	.byte	0xf3,0xc3
1125
1126#endif
1127