# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
.extern	GFp_ia32cap_P
.hidden GFp_ia32cap_P


.align	64
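# The P-256 field prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
# as four little-endian 64-bit limbs.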
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

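# Broadcast constants 1, 2 and 3 across eight 32-bit lanes (used by the
# constant-time table-select routines below), followed by 1 in
# Montgomery form, i.e. 2^256 mod p.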
.LOne:
.long	1,1,1,1,1,1,1,1
.LTwo:
.long	2,2,2,2,2,2,2,2
.LThree:
.long	3,3,3,3,3,3,3,3
.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe


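# The order n of the P-256 group, little-endian, and the Montgomery
# reduction constant n0 = -n^-1 mod 2^64.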
.Lord:
.quad	0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f



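# GFp_nistz256_add(res, a, b): res = a + b mod p, with rdi = res,
# rsi = a, rdx = b (four 64-bit limbs each).  The .byte 0xf3,0xc3
# sequences used as returns throughout this file are "repz ret".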
.globl	GFp_nistz256_add
.hidden GFp_nistz256_add
.type	GFp_nistz256_add,@function
.align	32
GFp_nistz256_add:
	pushq	%r12
	pushq	%r13

	movq	0(%rsi),%r8
	xorq	%r13,%r13
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi

	addq	0(%rdx),%r8
	adcq	8(%rdx),%r9
	movq	%r8,%rax
	adcq	16(%rdx),%r10
	adcq	24(%rdx),%r11
	movq	%r9,%rdx
	adcq	$0,%r13

	subq	0(%rsi),%r8
	movq	%r10,%rcx
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r12
	sbbq	24(%rsi),%r11
	sbbq	$0,%r13

	cmovcq	%rax,%r8
	cmovcq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovcq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovcq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3
.size	GFp_nistz256_add,.-GFp_nistz256_add


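# GFp_nistz256_neg(res, a): res = -a mod p.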
.globl	GFp_nistz256_neg
.hidden GFp_nistz256_neg
.type	GFp_nistz256_neg,@function
.align	32
GFp_nistz256_neg:
.cfi_startproc
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-16
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-24
.Lneg_body:

	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13

	subq	0(%rsi),%r8
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax
	sbbq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13

	addq	0(%rsi),%r8
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13

	cmovzq	%rax,%r8
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	movq	0(%rsp),%r13
.cfi_restore	%r13
	movq	8(%rsp),%r12
.cfi_restore	%r12
	leaq	16(%rsp),%rsp
.cfi_adjust_cfa_offset	-16
.Lneg_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_neg,.-GFp_nistz256_neg





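# GFp_p256_scalar_mul_mont(res, a, b): Montgomery multiplication modulo
# the group order, res = a*b*2^-256 mod n.  Dispatches to the
# mulx/adcx/adox variant when the BMI2 and ADX feature bits (0x80100 in
# the second GFp_ia32cap_P word) are both set.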
.globl	GFp_p256_scalar_mul_mont
.hidden GFp_p256_scalar_mul_mont
.type	GFp_p256_scalar_mul_mont,@function
.align	32
GFp_p256_scalar_mul_mont:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
	cmpl	$0x80100,%ecx
	je	.Lecp_nistz256_ord_mul_montx
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lord_mul_body:

	movq	0(%rdx),%rax
	movq	%rdx,%rbx
	leaq	.Lord(%rip),%r14
	movq	.LordK(%rip),%r15


	movq	%rax,%rcx
	mulq	0(%rsi)
	movq	%rax,%r8
	movq	%rcx,%rax
	movq	%rdx,%r9

	mulq	8(%rsi)
	addq	%rax,%r9
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	16(%rsi)
	addq	%rax,%r10
	movq	%rcx,%rax
	adcq	$0,%rdx

	movq	%r8,%r13
	imulq	%r15,%r8

	movq	%rdx,%r11
	mulq	24(%rsi)
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12


	mulq	0(%r14)
	movq	%r8,%rbp
	addq	%rax,%r13
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	subq	%r8,%r10
	sbbq	$0,%r8

	mulq	8(%r14)
	addq	%rcx,%r9
	adcq	$0,%rdx
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	%rdx,%r10
	movq	%rbp,%rdx
	adcq	$0,%r8

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r11
	movq	8(%rbx),%rax
	sbbq	%rdx,%rbp

	addq	%r8,%r11
	adcq	%rbp,%r12
	adcq	$0,%r13


	movq	%rax,%rcx
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	8(%rsi)
	addq	%rbp,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	16(%rsi)
	addq	%rbp,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rcx,%rax
	adcq	$0,%rdx

	movq	%r9,%rcx
	imulq	%r15,%r9

	movq	%rdx,%rbp
	mulq	24(%rsi)
	addq	%rbp,%r12
	adcq	$0,%rdx
	xorq	%r8,%r8
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8


	mulq	0(%r14)
	movq	%r9,%rbp
	addq	%rax,%rcx
	movq	%r9,%rax
	adcq	%rdx,%rcx

	subq	%r9,%r11
	sbbq	$0,%r9

	mulq	8(%r14)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	%rdx,%r11
	movq	%rbp,%rdx
	adcq	$0,%r9

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r12
	movq	16(%rbx),%rax
	sbbq	%rdx,%rbp

	addq	%r9,%r12
	adcq	%rbp,%r13
	adcq	$0,%r8


	movq	%rax,%rcx
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	8(%rsi)
	addq	%rbp,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	16(%rsi)
	addq	%rbp,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rcx,%rax
	adcq	$0,%rdx

	movq	%r10,%rcx
	imulq	%r15,%r10

	movq	%rdx,%rbp
	mulq	24(%rsi)
	addq	%rbp,%r13
	adcq	$0,%rdx
	xorq	%r9,%r9
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9


	mulq	0(%r14)
	movq	%r10,%rbp
	addq	%rax,%rcx
	movq	%r10,%rax
	adcq	%rdx,%rcx

	subq	%r10,%r12
	sbbq	$0,%r10

	mulq	8(%r14)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	%rdx,%r12
	movq	%rbp,%rdx
	adcq	$0,%r10

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r13
	movq	24(%rbx),%rax
	sbbq	%rdx,%rbp

	addq	%r10,%r13
	adcq	%rbp,%r8
	adcq	$0,%r9


	movq	%rax,%rcx
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	8(%rsi)
	addq	%rbp,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rcx,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	16(%rsi)
	addq	%rbp,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rcx,%rax
	adcq	$0,%rdx

	movq	%r11,%rcx
	imulq	%r15,%r11

	movq	%rdx,%rbp
	mulq	24(%rsi)
	addq	%rbp,%r8
	adcq	$0,%rdx
	xorq	%r10,%r10
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10


	mulq	0(%r14)
	movq	%r11,%rbp
	addq	%rax,%rcx
	movq	%r11,%rax
	adcq	%rdx,%rcx

	subq	%r11,%r13
	sbbq	$0,%r11

	mulq	8(%r14)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	%rdx,%r13
	movq	%rbp,%rdx
	adcq	$0,%r11

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r8
	sbbq	%rdx,%rbp

	addq	%r11,%r8
	adcq	%rbp,%r9
	adcq	$0,%r10


	movq	%r12,%rsi
	subq	0(%r14),%r12
	movq	%r13,%r11
	sbbq	8(%r14),%r13
	movq	%r8,%rcx
	sbbq	16(%r14),%r8
	movq	%r9,%rbp
	sbbq	24(%r14),%r9
	sbbq	$0,%r10

	cmovcq	%rsi,%r12
	cmovcq	%r11,%r13
	cmovcq	%rcx,%r8
	cmovcq	%rbp,%r9

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_p256_scalar_mul_mont,.-GFp_p256_scalar_mul_mont






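# GFp_p256_scalar_sqr_rep_mont(res, a, rep): repeated Montgomery
# squaring modulo the group order; with inputs and outputs in
# Montgomery form, res = a^(2^rep) mod n.  rdx carries the repetition
# count and is moved to rbx as the loop counter.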
.globl	GFp_p256_scalar_sqr_rep_mont
.hidden GFp_p256_scalar_sqr_rep_mont
.type	GFp_p256_scalar_sqr_rep_mont,@function
.align	32
GFp_p256_scalar_sqr_rep_mont:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
	cmpl	$0x80100,%ecx
	je	.Lecp_nistz256_ord_sqr_montx
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lord_sqr_body:

	movq	0(%rsi),%r8
	movq	8(%rsi),%rax
	movq	16(%rsi),%r14
	movq	24(%rsi),%r15
	leaq	.Lord(%rip),%rsi
	movq	%rdx,%rbx
	jmp	.Loop_ord_sqr

.align	32
.Loop_ord_sqr:

	movq	%rax,%rbp
	mulq	%r8
	movq	%rax,%r9
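# The .byte sequences below are hand-assembled movq instructions that
# stash a[1], a[2], a[3] in xmm1-xmm3 so they survive the loop body
# (perlasm emits them as raw bytes for old-assembler compatibility).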
.byte	102,72,15,110,205
	movq	%r14,%rax
	movq	%rdx,%r10

	mulq	%r8
	addq	%rax,%r10
	movq	%r15,%rax
.byte	102,73,15,110,214
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r8
	addq	%rax,%r11
	movq	%r15,%rax
.byte	102,73,15,110,223
	adcq	$0,%rdx
	movq	%rdx,%r12


	mulq	%r14
	movq	%rax,%r13
	movq	%r14,%rax
	movq	%rdx,%r14


	mulq	%rbp
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r15

	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx

	addq	%r15,%r12
	adcq	%rdx,%r13
	adcq	$0,%r14


	xorq	%r15,%r15
	movq	%r8,%rax
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15


	mulq	%rax
	movq	%rax,%r8
.byte	102,72,15,126,200
	movq	%rdx,%rbp

	mulq	%rax
	addq	%rbp,%r9
	adcq	%rax,%r10
.byte	102,72,15,126,208
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%rax
	addq	%rbp,%r11
	adcq	%rax,%r12
.byte	102,72,15,126,216
	adcq	$0,%rdx
	movq	%rdx,%rbp

	movq	%r8,%rcx
	imulq	32(%rsi),%r8

	mulq	%rax
	addq	%rbp,%r13
	adcq	%rax,%r14
	movq	0(%rsi),%rax
	adcq	%rdx,%r15


	mulq	%r8
	movq	%r8,%rbp
	addq	%rax,%rcx
	movq	8(%rsi),%rax
	adcq	%rdx,%rcx

	subq	%r8,%r10
	sbbq	$0,%rbp

	mulq	%r8
	addq	%rcx,%r9
	adcq	$0,%rdx
	addq	%rax,%r9
	movq	%r8,%rax
	adcq	%rdx,%r10
	movq	%r8,%rdx
	adcq	$0,%rbp

	movq	%r9,%rcx
	imulq	32(%rsi),%r9

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r11
	movq	0(%rsi),%rax
	sbbq	%rdx,%r8

	addq	%rbp,%r11
	adcq	$0,%r8


	mulq	%r9
	movq	%r9,%rbp
	addq	%rax,%rcx
	movq	8(%rsi),%rax
	adcq	%rdx,%rcx

	subq	%r9,%r11
	sbbq	$0,%rbp

	mulq	%r9
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%r9,%rax
	adcq	%rdx,%r11
	movq	%r9,%rdx
	adcq	$0,%rbp

	movq	%r10,%rcx
	imulq	32(%rsi),%r10

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r8
	movq	0(%rsi),%rax
	sbbq	%rdx,%r9

	addq	%rbp,%r8
	adcq	$0,%r9


	mulq	%r10
	movq	%r10,%rbp
	addq	%rax,%rcx
	movq	8(%rsi),%rax
	adcq	%rdx,%rcx

	subq	%r10,%r8
	sbbq	$0,%rbp

	mulq	%r10
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%r10,%rax
	adcq	%rdx,%r8
	movq	%r10,%rdx
	adcq	$0,%rbp

	movq	%r11,%rcx
	imulq	32(%rsi),%r11

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r9
	movq	0(%rsi),%rax
	sbbq	%rdx,%r10

	addq	%rbp,%r9
	adcq	$0,%r10


	mulq	%r11
	movq	%r11,%rbp
	addq	%rax,%rcx
	movq	8(%rsi),%rax
	adcq	%rdx,%rcx

	subq	%r11,%r9
	sbbq	$0,%rbp

	mulq	%r11
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	movq	%r11,%rdx
	adcq	$0,%rbp

	shlq	$32,%rax
	shrq	$32,%rdx
	subq	%rax,%r10
	sbbq	%rdx,%r11

	addq	%rbp,%r10
	adcq	$0,%r11


	xorq	%rdx,%rdx
	addq	%r12,%r8
	adcq	%r13,%r9
	movq	%r8,%r12
	adcq	%r14,%r10
	adcq	%r15,%r11
	movq	%r9,%rax
	adcq	$0,%rdx


	subq	0(%rsi),%r8
	movq	%r10,%r14
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r15
	sbbq	24(%rsi),%r11
	sbbq	$0,%rdx

	cmovcq	%r12,%r8
	cmovncq	%r9,%rax
	cmovncq	%r10,%r14
	cmovncq	%r11,%r15

	decq	%rbx
	jnz	.Loop_ord_sqr

	movq	%r8,0(%rdi)
	movq	%rax,8(%rdi)
	pxor	%xmm1,%xmm1
	movq	%r14,16(%rdi)
	pxor	%xmm2,%xmm2
	movq	%r15,24(%rdi)
	pxor	%xmm3,%xmm3

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqr_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_p256_scalar_sqr_rep_mont,.-GFp_p256_scalar_sqr_rep_mont

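# ecp_nistz256_ord_mul_montx: BMI2/ADX path for scalar Montgomery
# multiplication, using mulx with the adcx/adox dual carry chains.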
.type	ecp_nistz256_ord_mul_montx,@function
.align	32
ecp_nistz256_ord_mul_montx:
.cfi_startproc
.Lecp_nistz256_ord_mul_montx:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lord_mulx_body:

	movq	%rdx,%rbx
	movq	0(%rdx),%rdx
	movq	0(%rsi),%r9
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12
	leaq	-128(%rsi),%rsi
	leaq	.Lord-128(%rip),%r14
	movq	.LordK(%rip),%r15


	mulxq	%r9,%r8,%r9
	mulxq	%r10,%rcx,%r10
	mulxq	%r11,%rbp,%r11
	addq	%rcx,%r9
	mulxq	%r12,%rcx,%r12
	movq	%r8,%rdx
	mulxq	%r15,%rdx,%rax
	adcq	%rbp,%r10
	adcq	%rcx,%r11
	adcq	$0,%r12


	xorq	%r13,%r13
	mulxq	0+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r8
	adoxq	%rbp,%r9

	mulxq	8+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10

	mulxq	16+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	24+128(%r14),%rcx,%rbp
	movq	8(%rbx),%rdx
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12
	adcxq	%r8,%r12
	adoxq	%r8,%r13
	adcq	$0,%r13


	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r9,%rdx
	mulxq	%r15,%rdx,%rax
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	adcxq	%r8,%r13
	adoxq	%r8,%r8
	adcq	$0,%r8


	mulxq	0+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10

	mulxq	8+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	16+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	24+128(%r14),%rcx,%rbp
	movq	16(%rbx),%rdx
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13
	adcxq	%r9,%r13
	adoxq	%r9,%r8
	adcq	$0,%r8


	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r10,%rdx
	mulxq	%r15,%rdx,%rax
	adcxq	%rcx,%r13
	adoxq	%rbp,%r8

	adcxq	%r9,%r8
	adoxq	%r9,%r9
	adcq	$0,%r9


	mulxq	0+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	8+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	16+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	24+128(%r14),%rcx,%rbp
	movq	24(%rbx),%rdx
	adcxq	%rcx,%r13
	adoxq	%rbp,%r8
	adcxq	%r10,%r8
	adoxq	%r10,%r9
	adcq	$0,%r9


	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r13
	adoxq	%rbp,%r8

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r11,%rdx
	mulxq	%r15,%rdx,%rax
	adcxq	%rcx,%r8
	adoxq	%rbp,%r9

	adcxq	%r10,%r9
	adoxq	%r10,%r10
	adcq	$0,%r10


	mulxq	0+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	8+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	16+128(%r14),%rcx,%rbp
	adcxq	%rcx,%r13
	adoxq	%rbp,%r8

	mulxq	24+128(%r14),%rcx,%rbp
	leaq	128(%r14),%r14
	movq	%r12,%rbx
	adcxq	%rcx,%r8
	adoxq	%rbp,%r9
	movq	%r13,%rdx
	adcxq	%r11,%r9
	adoxq	%r11,%r10
	adcq	$0,%r10



	movq	%r8,%rcx
	subq	0(%r14),%r12
	sbbq	8(%r14),%r13
	sbbq	16(%r14),%r8
	movq	%r9,%rbp
	sbbq	24(%r14),%r9
	sbbq	$0,%r10

	cmovcq	%rbx,%r12
	cmovcq	%rdx,%r13
	cmovcq	%rcx,%r8
	cmovcq	%rbp,%r9

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mulx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx

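# ecp_nistz256_ord_sqr_montx: BMI2/ADX path for repeated scalar
# Montgomery squaring.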
.type	ecp_nistz256_ord_sqr_montx,@function
.align	32
ecp_nistz256_ord_sqr_montx:
.cfi_startproc
.Lecp_nistz256_ord_sqr_montx:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lord_sqrx_body:

	movq	%rdx,%rbx
	movq	0(%rsi),%rdx
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8
	leaq	.Lord(%rip),%rsi
	jmp	.Loop_ord_sqrx

.align	32
.Loop_ord_sqrx:
	mulxq	%r14,%r9,%r10
	mulxq	%r15,%rcx,%r11
	movq	%rdx,%rax
.byte	102,73,15,110,206
	mulxq	%r8,%rbp,%r12
	movq	%r14,%rdx
	addq	%rcx,%r10
.byte	102,73,15,110,215
	adcq	%rbp,%r11
	adcq	$0,%r12
	xorq	%r13,%r13

	mulxq	%r15,%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	%r8,%rcx,%rbp
	movq	%r15,%rdx
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13
	adcq	$0,%r13

	mulxq	%r8,%rcx,%r14
	movq	%rax,%rdx
.byte	102,73,15,110,216
	xorq	%r15,%r15
	adcxq	%r9,%r9
	adoxq	%rcx,%r13
	adcxq	%r10,%r10
	adoxq	%r15,%r14


	mulxq	%rdx,%r8,%rbp
.byte	102,72,15,126,202
	adcxq	%r11,%r11
	adoxq	%rbp,%r9
	adcxq	%r12,%r12
	mulxq	%rdx,%rcx,%rax
.byte	102,72,15,126,210
	adcxq	%r13,%r13
	adoxq	%rcx,%r10
	adcxq	%r14,%r14
	mulxq	%rdx,%rcx,%rbp
.byte	0x67
.byte	102,72,15,126,218
	adoxq	%rax,%r11
	adcxq	%r15,%r15
	adoxq	%rcx,%r12
	adoxq	%rbp,%r13
	mulxq	%rdx,%rcx,%rax
	adoxq	%rcx,%r14
	adoxq	%rax,%r15


	movq	%r8,%rdx
	mulxq	32(%rsi),%rdx,%rcx

	xorq	%rax,%rax
	mulxq	0(%rsi),%rcx,%rbp
	adcxq	%rcx,%r8
	adoxq	%rbp,%r9
	mulxq	8(%rsi),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10
	mulxq	16(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11
	mulxq	24(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r8
	adcxq	%rax,%r8


	movq	%r9,%rdx
	mulxq	32(%rsi),%rdx,%rcx

	mulxq	0(%rsi),%rcx,%rbp
	adoxq	%rcx,%r9
	adcxq	%rbp,%r10
	mulxq	8(%rsi),%rcx,%rbp
	adoxq	%rcx,%r10
	adcxq	%rbp,%r11
	mulxq	16(%rsi),%rcx,%rbp
	adoxq	%rcx,%r11
	adcxq	%rbp,%r8
	mulxq	24(%rsi),%rcx,%rbp
	adoxq	%rcx,%r8
	adcxq	%rbp,%r9
	adoxq	%rax,%r9


	movq	%r10,%rdx
	mulxq	32(%rsi),%rdx,%rcx

	mulxq	0(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11
	mulxq	8(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r8
	mulxq	16(%rsi),%rcx,%rbp
	adcxq	%rcx,%r8
	adoxq	%rbp,%r9
	mulxq	24(%rsi),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10
	adcxq	%rax,%r10


	movq	%r11,%rdx
	mulxq	32(%rsi),%rdx,%rcx

	mulxq	0(%rsi),%rcx,%rbp
	adoxq	%rcx,%r11
	adcxq	%rbp,%r8
	mulxq	8(%rsi),%rcx,%rbp
	adoxq	%rcx,%r8
	adcxq	%rbp,%r9
	mulxq	16(%rsi),%rcx,%rbp
	adoxq	%rcx,%r9
	adcxq	%rbp,%r10
	mulxq	24(%rsi),%rcx,%rbp
	adoxq	%rcx,%r10
	adcxq	%rbp,%r11
	adoxq	%rax,%r11


	addq	%r8,%r12
	adcq	%r13,%r9
	movq	%r12,%rdx
	adcq	%r14,%r10
	adcq	%r15,%r11
	movq	%r9,%r14
	adcq	$0,%rax


	subq	0(%rsi),%r12
	movq	%r10,%r15
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r11,%r8
	sbbq	24(%rsi),%r11
	sbbq	$0,%rax

	cmovncq	%r12,%rdx
	cmovncq	%r9,%r14
	cmovncq	%r10,%r15
	cmovncq	%r11,%r8

	decq	%rbx
	jnz	.Loop_ord_sqrx

	movq	%rdx,0(%rdi)
	movq	%r14,8(%rdi)
	pxor	%xmm1,%xmm1
	movq	%r15,16(%rdi)
	pxor	%xmm2,%xmm2
	movq	%r8,24(%rdi)
	pxor	%xmm3,%xmm3

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqrx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx





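# GFp_nistz256_mul_mont(res, a, b): field Montgomery multiplication,
# res = a*b*2^-256 mod p.  Chooses the mulq or mulx core at run time.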
.globl	GFp_nistz256_mul_mont
.hidden GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,@function
.align	32
GFp_nistz256_mul_mont:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
.Lmul_mont:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lmul_body:
	cmpl	$0x80100,%ecx
	je	.Lmul_montx
	movq	%rdx,%rbx
	movq	0(%rdx),%rax
	movq	0(%rsi),%r9
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	movq	%rdx,%rbx
	movq	0(%rdx),%rdx
	movq	0(%rsi),%r9
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12
	leaq	-128(%rsi),%rsi

	call	__ecp_nistz256_mul_montx
.Lmul_mont_done:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

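# __ecp_nistz256_mul_montq: mulq-based multiplication core.  On entry
# rbx = b, rax = b[0] and r9..r12 = a[0..3]; the reduced product is
# stored to (%rdi).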
.type	__ecp_nistz256_mul_montq,@function
.align	32
__ecp_nistz256_mul_montq:
.cfi_startproc

	movq	%rax,%rbp
	mulq	%r9
	movq	.Lpoly+8(%rip),%r14
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	.Lpoly+24(%rip),%r15
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12









	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8



	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8



	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9



	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9



	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10



	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10



	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10



	subq	$-1,%r12
	movq	%r8,%rbx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%rdx
	sbbq	%r15,%r9
	sbbq	$0,%r10

	cmovcq	%rcx,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq







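# GFp_nistz256_sqr_mont(res, a): field Montgomery squaring,
# res = a*a*2^-256 mod p.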
.globl	GFp_nistz256_sqr_mont
.hidden GFp_nistz256_sqr_mont
.type	GFp_nistz256_sqr_mont,@function
.align	32
GFp_nistz256_sqr_mont:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lsqr_body:
	cmpl	$0x80100,%ecx
	je	.Lsqr_montx
	movq	0(%rsi),%rax
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	movq	0(%rsi),%rdx
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8
	leaq	-128(%rsi),%rsi

	call	__ecp_nistz256_sqr_montx
.Lsqr_mont_done:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbx
.cfi_restore	%rbx
	movq	40(%rsp),%rbp
.cfi_restore	%rbp
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lsqr_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont

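# __ecp_nistz256_sqr_montq: mulq-based squaring core.  On entry
# rax = a[0] and r14, r15, r8 = a[1..3]; the result is stored to
# (%rdi).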
.type	__ecp_nistz256_sqr_montq,@function
.align	32
__ecp_nistz256_sqr_montq:
.cfi_startproc
	movq	%rax,%r13
	mulq	%r14
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12


	mulq	%r14
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13


	mulq	%r15
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	movq	.Lpoly+8(%rip),%rsi
	movq	.Lpoly+24(%rip),%rbp




	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx



	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx



	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx



	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11



	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
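# __ecp_nistz256_mul_montx: mulx-based multiplication core.  On entry
# rdx = b[0], rbx = b, r9..r12 = a[0..3] and rsi = a - 128 (the caller
# biases the pointer; all loads below add the 128 back).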
.type	__ecp_nistz256_mul_montx,@function
.align	32
__ecp_nistz256_mul_montx:
.cfi_startproc

	mulxq	%r9,%r8,%r9
	mulxq	%r10,%rcx,%r10
	movq	$32,%r14
	xorq	%r13,%r13
	mulxq	%r11,%rbp,%r11
	movq	.Lpoly+24(%rip),%r15
	adcq	%rcx,%r9
	mulxq	%r12,%rcx,%r12
	movq	%r8,%rdx
	adcq	%rbp,%r10
	shlxq	%r14,%r8,%rbp
	adcq	%rcx,%r11
	shrxq	%r14,%r8,%rcx
	adcq	$0,%r12



	addq	%rbp,%r9
	adcq	%rcx,%r10

	mulxq	%r15,%rcx,%rbp
	movq	8(%rbx),%rdx
	adcq	%rcx,%r11
	adcq	%rbp,%r12
	adcq	$0,%r13
	xorq	%r8,%r8



	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r9
	adoxq	%rbp,%r10

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r9,%rdx
	adcxq	%rcx,%r12
	shlxq	%r14,%r9,%rcx
	adoxq	%rbp,%r13
	shrxq	%r14,%r9,%rbp

	adcxq	%r8,%r13
	adoxq	%r8,%r8
	adcq	$0,%r8



	addq	%rcx,%r10
	adcq	%rbp,%r11

	mulxq	%r15,%rcx,%rbp
	movq	16(%rbx),%rdx
	adcq	%rcx,%r12
	adcq	%rbp,%r13
	adcq	$0,%r8
	xorq	%r9,%r9



	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r10
	adoxq	%rbp,%r11

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r10,%rdx
	adcxq	%rcx,%r13
	shlxq	%r14,%r10,%rcx
	adoxq	%rbp,%r8
	shrxq	%r14,%r10,%rbp

	adcxq	%r9,%r8
	adoxq	%r9,%r9
	adcq	$0,%r9



	addq	%rcx,%r11
	adcq	%rbp,%r12

	mulxq	%r15,%rcx,%rbp
	movq	24(%rbx),%rdx
	adcq	%rcx,%r13
	adcq	%rbp,%r8
	adcq	$0,%r9
	xorq	%r10,%r10



	mulxq	0+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	8+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13

	mulxq	16+128(%rsi),%rcx,%rbp
	adcxq	%rcx,%r13
	adoxq	%rbp,%r8

	mulxq	24+128(%rsi),%rcx,%rbp
	movq	%r11,%rdx
	adcxq	%rcx,%r8
	shlxq	%r14,%r11,%rcx
	adoxq	%rbp,%r9
	shrxq	%r14,%r11,%rbp

	adcxq	%r10,%r9
	adoxq	%r10,%r10
	adcq	$0,%r10



	addq	%rcx,%r12
	adcq	%rbp,%r13

	mulxq	%r15,%rcx,%rbp
	movq	%r12,%rbx
	movq	.Lpoly+8(%rip),%r14
	adcq	%rcx,%r8
	movq	%r13,%rdx
	adcq	%rbp,%r9
	adcq	$0,%r10



	xorl	%eax,%eax
	movq	%r8,%rcx
	sbbq	$-1,%r12
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%rbp
	sbbq	%r15,%r9
	sbbq	$0,%r10

	cmovcq	%rbx,%r12
	cmovcq	%rdx,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rbp,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx

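# __ecp_nistz256_sqr_montx: mulx-based squaring core.  On entry
# rdx = a[0], r14, r15, r8 = a[1..3] and rsi = a - 128.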
.type	__ecp_nistz256_sqr_montx,@function
.align	32
__ecp_nistz256_sqr_montx:
.cfi_startproc
	mulxq	%r14,%r9,%r10
	mulxq	%r15,%rcx,%r11
	xorl	%eax,%eax
	adcq	%rcx,%r10
	mulxq	%r8,%rbp,%r12
	movq	%r14,%rdx
	adcq	%rbp,%r11
	adcq	$0,%r12
	xorq	%r13,%r13


	mulxq	%r15,%rcx,%rbp
	adcxq	%rcx,%r11
	adoxq	%rbp,%r12

	mulxq	%r8,%rcx,%rbp
	movq	%r15,%rdx
	adcxq	%rcx,%r12
	adoxq	%rbp,%r13
	adcq	$0,%r13


	mulxq	%r8,%rcx,%r14
	movq	0+128(%rsi),%rdx
	xorq	%r15,%r15
	adcxq	%r9,%r9
	adoxq	%rcx,%r13
	adcxq	%r10,%r10
	adoxq	%r15,%r14

	mulxq	%rdx,%r8,%rbp
	movq	8+128(%rsi),%rdx
	adcxq	%r11,%r11
	adoxq	%rbp,%r9
	adcxq	%r12,%r12
	mulxq	%rdx,%rcx,%rax
	movq	16+128(%rsi),%rdx
	adcxq	%r13,%r13
	adoxq	%rcx,%r10
	adcxq	%r14,%r14
.byte	0x67
	mulxq	%rdx,%rcx,%rbp
	movq	24+128(%rsi),%rdx
	adoxq	%rax,%r11
	adcxq	%r15,%r15
	adoxq	%rcx,%r12
	movq	$32,%rsi
	adoxq	%rbp,%r13
.byte	0x67,0x67
	mulxq	%rdx,%rcx,%rax
	movq	.Lpoly+24(%rip),%rdx
	adoxq	%rcx,%r14
	shlxq	%rsi,%r8,%rcx
	adoxq	%rax,%r15
	shrxq	%rsi,%r8,%rax
	movq	%rdx,%rbp


	addq	%rcx,%r9
	adcq	%rax,%r10

	mulxq	%r8,%rcx,%r8
	adcq	%rcx,%r11
	shlxq	%rsi,%r9,%rcx
	adcq	$0,%r8
	shrxq	%rsi,%r9,%rax


	addq	%rcx,%r10
	adcq	%rax,%r11

	mulxq	%r9,%rcx,%r9
	adcq	%rcx,%r8
	shlxq	%rsi,%r10,%rcx
	adcq	$0,%r9
	shrxq	%rsi,%r10,%rax


	addq	%rcx,%r11
	adcq	%rax,%r8

	mulxq	%r10,%rcx,%r10
	adcq	%rcx,%r9
	shlxq	%rsi,%r11,%rcx
	adcq	$0,%r10
	shrxq	%rsi,%r11,%rax


	addq	%rcx,%r8
	adcq	%rax,%r9

	mulxq	%r11,%rcx,%r11
	adcq	%rcx,%r10
	adcq	$0,%r11

	xorq	%rdx,%rdx
	addq	%r8,%r12
	movq	.Lpoly+8(%rip),%rsi
	adcq	%r9,%r13
	movq	%r12,%r8
	adcq	%r10,%r14
	adcq	%r11,%r15
	movq	%r13,%r9
	adcq	$0,%rdx

	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%r11
	sbbq	%rbp,%r15
	sbbq	$0,%rdx

	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%r11,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx


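# GFp_nistz256_select_w5(out, table, index): constant-time lookup of
# entry `index` from a table of 16 projective points (window width 5).
# Every entry is read and masked, so the memory access pattern is
# independent of the secret index.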
.globl	GFp_nistz256_select_w5
.hidden GFp_nistz256_select_w5
.type	GFp_nistz256_select_w5,@function
.align	32
GFp_nistz256_select_w5:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax
	jnz	.Lavx2_select_w5
	movdqa	.LOne(%rip),%xmm0
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	movdqa	%xmm0,%xmm8
	pshufd	$0,%xmm1,%xmm1

	movq	$16,%rax
.Lselect_loop_sse_w5:

	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3
.cfi_endproc
.LSEH_end_GFp_nistz256_select_w5:
.size	GFp_nistz256_select_w5,.-GFp_nistz256_select_w5



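# GFp_nistz256_select_w7(out, table, index): constant-time lookup of
# entry `index` from a table of 64 affine points (window width 7).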
.globl	GFp_nistz256_select_w7
.hidden GFp_nistz256_select_w7
.type	GFp_nistz256_select_w7,@function
.align	32
GFp_nistz256_select_w7:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rax
	movq	8(%rax),%rax
	testl	$32,%eax
	jnz	.Lavx2_select_w7
	movdqa	.LOne(%rip),%xmm8
	movd	%edx,%xmm1

	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	movdqa	%xmm8,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	$64,%rax

.Lselect_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)
	por	%xmm12,%xmm5

	decq	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3
.cfi_endproc
.LSEH_end_GFp_nistz256_select_w7:
.size	GFp_nistz256_select_w7,.-GFp_nistz256_select_w7


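# AVX2 variants of the two table-select routines above, used when the
# AVX2 feature bit is set.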
.type	GFp_nistz256_avx2_select_w5,@function
.align	32
GFp_nistz256_avx2_select_w5:
.cfi_startproc
.Lavx2_select_w5:
	vzeroupper
	vmovdqa	.LTwo(%rip),%ymm0

	vpxor	%ymm2,%ymm2,%ymm2
	vpxor	%ymm3,%ymm3,%ymm3
	vpxor	%ymm4,%ymm4,%ymm4

	vmovdqa	.LOne(%rip),%ymm5
	vmovdqa	.LTwo(%rip),%ymm10

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1

	movq	$8,%rax
.Lselect_loop_avx2_w5:

	vmovdqa	0(%rsi),%ymm6
	vmovdqa	32(%rsi),%ymm7
	vmovdqa	64(%rsi),%ymm8

	vmovdqa	96(%rsi),%ymm11
	vmovdqa	128(%rsi),%ymm12
	vmovdqa	160(%rsi),%ymm13

	vpcmpeqd	%ymm1,%ymm5,%ymm9
	vpcmpeqd	%ymm1,%ymm10,%ymm14

	vpaddd	%ymm0,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm10,%ymm10
	leaq	192(%rsi),%rsi

	vpand	%ymm9,%ymm6,%ymm6
	vpand	%ymm9,%ymm7,%ymm7
	vpand	%ymm9,%ymm8,%ymm8
	vpand	%ymm14,%ymm11,%ymm11
	vpand	%ymm14,%ymm12,%ymm12
	vpand	%ymm14,%ymm13,%ymm13

	vpxor	%ymm6,%ymm2,%ymm2
	vpxor	%ymm7,%ymm3,%ymm3
	vpxor	%ymm8,%ymm4,%ymm4
	vpxor	%ymm11,%ymm2,%ymm2
	vpxor	%ymm12,%ymm3,%ymm3
	vpxor	%ymm13,%ymm4,%ymm4

	decq	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vmovdqu	%ymm4,64(%rdi)
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.LSEH_end_GFp_nistz256_avx2_select_w5:
.size	GFp_nistz256_avx2_select_w5,.-GFp_nistz256_avx2_select_w5


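# The AVX2 w7 loop scans three table entries per iteration (63 entries
# in 21 iterations); the 64th entry is folded in after the loop.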
.globl	GFp_nistz256_avx2_select_w7
.hidden GFp_nistz256_avx2_select_w7
.type	GFp_nistz256_avx2_select_w7,@function
.align	32
GFp_nistz256_avx2_select_w7:
.cfi_startproc
.Lavx2_select_w7:
	vzeroupper
	vmovdqa	.LThree(%rip),%ymm0

	vpxor	%ymm2,%ymm2,%ymm2
	vpxor	%ymm3,%ymm3,%ymm3

	vmovdqa	.LOne(%rip),%ymm4
	vmovdqa	.LTwo(%rip),%ymm8
	vmovdqa	.LThree(%rip),%ymm12

	vmovd	%edx,%xmm1
	vpermd	%ymm1,%ymm2,%ymm1


	movq	$21,%rax
.Lselect_loop_avx2_w7:

	vmovdqa	0(%rsi),%ymm5
	vmovdqa	32(%rsi),%ymm6

	vmovdqa	64(%rsi),%ymm9
	vmovdqa	96(%rsi),%ymm10

	vmovdqa	128(%rsi),%ymm13
	vmovdqa	160(%rsi),%ymm14

	vpcmpeqd	%ymm1,%ymm4,%ymm7
	vpcmpeqd	%ymm1,%ymm8,%ymm11
	vpcmpeqd	%ymm1,%ymm12,%ymm15

	vpaddd	%ymm0,%ymm4,%ymm4
	vpaddd	%ymm0,%ymm8,%ymm8
	vpaddd	%ymm0,%ymm12,%ymm12
	leaq	192(%rsi),%rsi

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6
	vpand	%ymm11,%ymm9,%ymm9
	vpand	%ymm11,%ymm10,%ymm10
	vpand	%ymm15,%ymm13,%ymm13
	vpand	%ymm15,%ymm14,%ymm14

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3
	vpxor	%ymm9,%ymm2,%ymm2
	vpxor	%ymm10,%ymm3,%ymm3
	vpxor	%ymm13,%ymm2,%ymm2
	vpxor	%ymm14,%ymm3,%ymm3

	decq	%rax
	jnz	.Lselect_loop_avx2_w7


	vmovdqa	0(%rsi),%ymm5
	vmovdqa	32(%rsi),%ymm6

	vpcmpeqd	%ymm1,%ymm4,%ymm7

	vpand	%ymm7,%ymm5,%ymm5
	vpand	%ymm7,%ymm6,%ymm6

	vpxor	%ymm5,%ymm2,%ymm2
	vpxor	%ymm6,%ymm3,%ymm3

	vmovdqu	%ymm2,0(%rdi)
	vmovdqu	%ymm3,32(%rdi)
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.LSEH_end_GFp_nistz256_avx2_select_w7:
.size	GFp_nistz256_avx2_select_w7,.-GFp_nistz256_avx2_select_w7
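# __ecp_nistz256_add_toq: adds (%rbx) to the limbs in r12,r13,r8,r9
# mod p; the result is kept in those registers and stored to (%rdi).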
.type	__ecp_nistz256_add_toq,@function
.align	32
__ecp_nistz256_add_toq:
.cfi_startproc
	xorq	%r11,%r11
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

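# __ecp_nistz256_sub_fromq: subtracts (%rbx) from the limbs in
# r12,r13,r8,r9 mod p; the result is stored to (%rdi).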
.type	__ecp_nistz256_sub_fromq,@function
.align	32
__ecp_nistz256_sub_fromq:
.cfi_startproc
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11

	addq	$-1,%r12
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11

	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq

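# __ecp_nistz256_subq: computes (rax,rbp,rcx,r10) - (r12,r13,r8,r9)
# mod p, leaving the result in r12,r13,r8,r9 (nothing is stored).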
.type	__ecp_nistz256_subq,@function
.align	32
__ecp_nistz256_subq:
.cfi_startproc
	subq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11

	addq	$-1,%rax
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11

	cmovnzq	%rax,%r12
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq

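# __ecp_nistz256_mul_by_2q: doubles the limbs in r12,r13,r8,r9 mod p
# and stores the result to (%rdi).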
.type	__ecp_nistz256_mul_by_2q,@function
.align	32
__ecp_nistz256_mul_by_2q:
.cfi_startproc
	xorq	%r11,%r11
	addq	%r12,%r12
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
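# GFp_nistz256_point_double(res, a): point doubling in Jacobian
# coordinates, with the usual run-time dispatch to the mulx path.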
.globl	GFp_nistz256_point_double
.hidden GFp_nistz256_point_double
.type	GFp_nistz256_point_double,@function
.align	32
GFp_nistz256_point_double:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
	cmpl	$0x80100,%ecx
	je	.Lpoint_doublex
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$160+8,%rsp
.cfi_adjust_cfa_offset	32*5+8
.Lpoint_doubleq_body:

.Lpoint_double_shortcutq:
	movdqu	0(%rsi),%xmm0
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	.Lpoly+8(%rip),%r14
	movq	.Lpoly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199
.byte	102,73,15,110,202
.byte	102,73,15,110,211

	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	32(%rbx),%rax
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q

	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207
	call	__ecp_nistz256_sqr_montq
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax

	cmovzq	%rax,%r12
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	movq	64(%rsp),%rax
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	movq	0+32(%rsp),%rax
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199
	call	__ecp_nistz256_sqr_montq

	leaq	128(%rsp),%rbx
	movq	%r14,%r8
	movq	%r15,%r9
	movq	%rsi,%r14
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	movq	32(%rsp),%rax
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

.byte	102,72,15,126,203
.byte	102,72,15,126,207
	call	__ecp_nistz256_sub_fromq

	leaq	160+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpoint_doubleq_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_double,.-GFp_nistz256_point_double
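# GFp_nistz256_point_add(res, a, b): full point addition in Jacobian
# coordinates.  When the inputs turn out to be the same point it
# tail-jumps into the doubling path above.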
2693.globl	GFp_nistz256_point_add
2694.hidden GFp_nistz256_point_add
2695.type	GFp_nistz256_point_add,@function
2696.align	32
2697GFp_nistz256_point_add:
2698.cfi_startproc
2699	leaq	GFp_ia32cap_P(%rip),%rcx
2700	movq	8(%rcx),%rcx
2701	andl	$0x80100,%ecx
2702	cmpl	$0x80100,%ecx
2703	je	.Lpoint_addx
2704	pushq	%rbp
2705.cfi_adjust_cfa_offset	8
2706.cfi_offset	%rbp,-16
2707	pushq	%rbx
2708.cfi_adjust_cfa_offset	8
2709.cfi_offset	%rbx,-24
2710	pushq	%r12
2711.cfi_adjust_cfa_offset	8
2712.cfi_offset	%r12,-32
2713	pushq	%r13
2714.cfi_adjust_cfa_offset	8
2715.cfi_offset	%r13,-40
2716	pushq	%r14
2717.cfi_adjust_cfa_offset	8
2718.cfi_offset	%r14,-48
2719	pushq	%r15
2720.cfi_adjust_cfa_offset	8
2721.cfi_offset	%r15,-56
2722	subq	$576+8,%rsp
2723.cfi_adjust_cfa_offset	32*18+8
2724.Lpoint_addq_body:
2725
2726	movdqu	0(%rsi),%xmm0
2727	movdqu	16(%rsi),%xmm1
2728	movdqu	32(%rsi),%xmm2
2729	movdqu	48(%rsi),%xmm3
2730	movdqu	64(%rsi),%xmm4
2731	movdqu	80(%rsi),%xmm5
2732	movq	%rsi,%rbx
2733	movq	%rdx,%rsi
2734	movdqa	%xmm0,384(%rsp)
2735	movdqa	%xmm1,384+16(%rsp)
2736	movdqa	%xmm2,416(%rsp)
2737	movdqa	%xmm3,416+16(%rsp)
2738	movdqa	%xmm4,448(%rsp)
2739	movdqa	%xmm5,448+16(%rsp)
2740	por	%xmm4,%xmm5
2741
2742	movdqu	0(%rsi),%xmm0
2743	pshufd	$0xb1,%xmm5,%xmm3
2744	movdqu	16(%rsi),%xmm1
2745	movdqu	32(%rsi),%xmm2
2746	por	%xmm3,%xmm5
2747	movdqu	48(%rsi),%xmm3
2748	movq	64+0(%rsi),%rax
2749	movq	64+8(%rsi),%r14
2750	movq	64+16(%rsi),%r15
2751	movq	64+24(%rsi),%r8
2752	movdqa	%xmm0,480(%rsp)
2753	pshufd	$0x1e,%xmm5,%xmm4
2754	movdqa	%xmm1,480+16(%rsp)
2755	movdqu	64(%rsi),%xmm0
2756	movdqu	80(%rsi),%xmm1
2757	movdqa	%xmm2,512(%rsp)
2758	movdqa	%xmm3,512+16(%rsp)
2759	por	%xmm4,%xmm5
2760	pxor	%xmm4,%xmm4
2761	por	%xmm0,%xmm1
2762.byte	102,72,15,110,199
2763
2764	leaq	64-0(%rsi),%rsi
2765	movq	%rax,544+0(%rsp)
2766	movq	%r14,544+8(%rsp)
2767	movq	%r15,544+16(%rsp)
2768	movq	%r8,544+24(%rsp)
2769	leaq	96(%rsp),%rdi
2770	call	__ecp_nistz256_sqr_montq
2771
2772	pcmpeqd	%xmm4,%xmm5
2773	pshufd	$0xb1,%xmm1,%xmm4
2774	por	%xmm1,%xmm4
2775	pshufd	$0,%xmm5,%xmm5
2776	pshufd	$0x1e,%xmm4,%xmm3
2777	por	%xmm3,%xmm4
2778	pxor	%xmm3,%xmm3
2779	pcmpeqd	%xmm3,%xmm4
2780	pshufd	$0,%xmm4,%xmm4
2781	movq	64+0(%rbx),%rax
2782	movq	64+8(%rbx),%r14
2783	movq	64+16(%rbx),%r15
2784	movq	64+24(%rbx),%r8
2785.byte	102,72,15,110,203
2786
2787	leaq	64-0(%rbx),%rsi
2788	leaq	32(%rsp),%rdi
2789	call	__ecp_nistz256_sqr_montq
2790
2791	movq	544(%rsp),%rax
2792	leaq	544(%rsp),%rbx
2793	movq	0+96(%rsp),%r9
2794	movq	8+96(%rsp),%r10
2795	leaq	0+96(%rsp),%rsi
2796	movq	16+96(%rsp),%r11
2797	movq	24+96(%rsp),%r12
2798	leaq	224(%rsp),%rdi
2799	call	__ecp_nistz256_mul_montq
2800
2801	movq	448(%rsp),%rax
2802	leaq	448(%rsp),%rbx
2803	movq	0+32(%rsp),%r9
2804	movq	8+32(%rsp),%r10
2805	leaq	0+32(%rsp),%rsi
2806	movq	16+32(%rsp),%r11
2807	movq	24+32(%rsp),%r12
2808	leaq	256(%rsp),%rdi
2809	call	__ecp_nistz256_mul_montq
2810
2811	movq	416(%rsp),%rax
2812	leaq	416(%rsp),%rbx
2813	movq	0+224(%rsp),%r9
2814	movq	8+224(%rsp),%r10
2815	leaq	0+224(%rsp),%rsi
2816	movq	16+224(%rsp),%r11
2817	movq	24+224(%rsp),%r12
2818	leaq	224(%rsp),%rdi
2819	call	__ecp_nistz256_mul_montq
2820
2821	movq	512(%rsp),%rax
2822	leaq	512(%rsp),%rbx
2823	movq	0+256(%rsp),%r9
2824	movq	8+256(%rsp),%r10
2825	leaq	0+256(%rsp),%rsi
2826	movq	16+256(%rsp),%r11
2827	movq	24+256(%rsp),%r12
2828	leaq	256(%rsp),%rdi
2829	call	__ecp_nistz256_mul_montq
2830
2831	leaq	224(%rsp),%rbx
2832	leaq	64(%rsp),%rdi
2833	call	__ecp_nistz256_sub_fromq
2834
2835	orq	%r13,%r12
2836	movdqa	%xmm4,%xmm2
2837	orq	%r8,%r12
2838	orq	%r9,%r12
2839	por	%xmm5,%xmm2
2840.byte	102,73,15,110,220
2841
2842	movq	384(%rsp),%rax
2843	leaq	384(%rsp),%rbx
2844	movq	0+96(%rsp),%r9
2845	movq	8+96(%rsp),%r10
2846	leaq	0+96(%rsp),%rsi
2847	movq	16+96(%rsp),%r11
2848	movq	24+96(%rsp),%r12
2849	leaq	160(%rsp),%rdi
2850	call	__ecp_nistz256_mul_montq
2851
2852	movq	480(%rsp),%rax
2853	leaq	480(%rsp),%rbx
2854	movq	0+32(%rsp),%r9
2855	movq	8+32(%rsp),%r10
2856	leaq	0+32(%rsp),%rsi
2857	movq	16+32(%rsp),%r11
2858	movq	24+32(%rsp),%r12
2859	leaq	192(%rsp),%rdi
2860	call	__ecp_nistz256_mul_montq
2861
2862	leaq	160(%rsp),%rbx
2863	leaq	0(%rsp),%rdi
2864	call	__ecp_nistz256_sub_fromq
2865
2866	orq	%r13,%r12
2867	orq	%r8,%r12
2868	orq	%r9,%r12
2869
2870.byte	102,73,15,126,208
2871.byte	102,73,15,126,217
2872	orq	%r8,%r12
2873.byte	0x3e
2874	jnz	.Ladd_proceedq
2875
2876
2877
2878	testq	%r9,%r9
2879	jz	.Ladd_doubleq
2880
2881
2882
2883
2884
2885
2886.byte	102,72,15,126,199
2887	pxor	%xmm0,%xmm0
2888	movdqu	%xmm0,0(%rdi)
2889	movdqu	%xmm0,16(%rdi)
2890	movdqu	%xmm0,32(%rdi)
2891	movdqu	%xmm0,48(%rdi)
2892	movdqu	%xmm0,64(%rdi)
2893	movdqu	%xmm0,80(%rdi)
2894	jmp	.Ladd_doneq
2895
2896.align	32
2897.Ladd_doubleq:
2898.byte	102,72,15,126,206
2899.byte	102,72,15,126,199
2900	addq	$416,%rsp
2901.cfi_adjust_cfa_offset	-416
2902	jmp	.Lpoint_double_shortcutq
2903.cfi_adjust_cfa_offset	416
2904
2905.align	32
2906.Ladd_proceedq:
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	544(%rsp),%rax
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	0(%rsp),%rax
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	160(%rsp),%rax
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq




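# Double U2 modulo p inline (the product is still in %r12,%r13,%r8,%r9),
# load Rsqr into %rax,%rbp,%rcx,%r10, and let __ecp_nistz256_subq leave
# Rsqr - 2*U2 in %r12,%r13,%r8,%r9.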
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq

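# res_x -= Hcub, then res_y = (U2 - res_x)*R - S1*Hcub, computed as a
# subtraction, two Montgomery multiplications and a final subtraction.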
	leaq	128(%rsp),%rbx
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	movq	192+0(%rsp),%rax
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	256(%rsp),%rbx
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199

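# Constant-time result selection: with the result pointer restored from
# %xmm0, each coordinate is picked from res_*, in2_* or in1_* according to
# the in1infty (%xmm5) and in2infty (%xmm4) masks.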
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

.Ladd_doneq:
	leaq	576+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpoint_addq_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_add,.-GFp_nistz256_point_add
.globl	GFp_nistz256_point_add_affine
.hidden GFp_nistz256_point_add_affine
.type	GFp_nistz256_point_add_affine,@function
.align	32
GFp_nistz256_point_add_affine:
.cfi_startproc
	leaq	GFp_ia32cap_P(%rip),%rcx
	movq	8(%rcx),%rcx
	andl	$0x80100,%ecx
	cmpl	$0x80100,%ecx
	je	.Lpoint_add_affinex
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$480+8,%rsp
.cfi_adjust_cfa_offset	32*15+8
.Ladd_affineq_body:

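# Copy the projective in1 to 320..384(%rsp) and the affine in2 to
# 416/448(%rsp), accumulating the "is infinity" masks as the words stream
# in. Inferred slot map: U2=0, S2/Z1sqr=32, H=64, R=96, Hsqr=128,
# Hcub=160, Rsqr=192, res_x=224, res_y=256, res_z=288.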
	movdqu	0(%rsi),%xmm0
	movq	%rdx,%rbx
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,320(%rsp)
	movdqa	%xmm1,320+16(%rsp)
	movdqa	%xmm2,352(%rsp)
	movdqa	%xmm3,352+16(%rsp)
	movdqa	%xmm4,384(%rsp)
	movdqa	%xmm5,384+16(%rsp)
	por	%xmm4,%xmm5

	movdqu	0(%rbx),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rbx),%xmm1
	movdqu	32(%rbx),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rbx),%xmm3
	movdqa	%xmm0,416(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,416+16(%rsp)
	por	%xmm0,%xmm1
.byte	102,72,15,110,199
	movdqa	%xmm2,448(%rsp)
	movdqa	%xmm3,448+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3

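# p256_sqr_mont(Z1sqr, in1_z); the in1infty/in2infty masks are finished in
# %xmm5/%xmm4 while the squaring output is shuffled into the multiplier's
# input registers; then U2 = Z1sqr*in2_x and H = U2 - in1_x.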
	leaq	64-0(%rsi),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm3,%xmm4
	movq	0(%rbx),%rax

	movq	%r12,%r9
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	movq	%r13,%r10
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	movq	%r14,%r11
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4

	leaq	32-0(%rsp),%rsi
	movq	%r15,%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	320(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

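# S2 = Z1sqr*in1_z (= Z1^3); res_z = H*in1_z; S2 *= in2_y; then
# p256_sub(R, S2, in1_y).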
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	352(%rsp),%rbx
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

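# Hsqr = H^2; Rsqr = R^2; Hcub = Hsqr*H; U2 = in1_x*Hsqr.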
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	0+96(%rsp),%rax
	movq	8+96(%rsp),%r14
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r15
	movq	24+96(%rsp),%r8
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+128(%rsp),%r9
	movq	8+128(%rsp),%r10
	leaq	0+128(%rsp),%rsi
	movq	16+128(%rsp),%r11
	movq	24+128(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq




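# Double U2 modulo p inline, load Rsqr, then form
# res_x = Rsqr - 2*U2 - Hcub with the subq/sub_fromq pair below.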
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	192(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subq

	leaq	160(%rsp),%rbx
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

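# res_y = (U2 - res_x)*R - in1_y*Hcub: the difference is staged in the H
# slot, multiplied by R, and S2 (= in1_y*Hcub) is subtracted at the end.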
	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	64(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	352(%rsp),%rax
	leaq	352(%rsp),%rbx
	movq	0+160(%rsp),%r9
	movq	8+160(%rsp),%r10
	leaq	0+160(%rsp),%rsi
	movq	16+160(%rsp),%r11
	movq	24+160(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	leaq	32(%rsp),%rbx
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

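# Constant-time selection as in point_add, except that an affine in2 has an
# implicit Z of 1, so res_z falls back to .LONE_mont (1 in Montgomery form)
# when in1 is the point at infinity.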
.byte	102,72,15,126,199

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	.LONE_mont(%rip),%xmm2
	pand	.LONE_mont+16(%rip),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	224(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	224+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	320(%rsp),%xmm2
	pand	320+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	256(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	256+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	352(%rsp),%xmm2
	pand	352+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

	leaq	480+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Ladd_affineq_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
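# The __ecp_nistz256_*x helpers below are the MULX/ADCX/ADOX (BMI2+ADX)
# counterparts of the *q helpers earlier in the file: operands arrive in
# %r12,%r13,%r8,%r9 with .Lpoly[1]/.Lpoly[3] expected in %r14/%r15.
# add_tox and mul_by_2x rely on the leading xor clearing CF, so their
# first adc acts as a plain add.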
.type	__ecp_nistz256_add_tox,@function
.align	32
__ecp_nistz256_add_tox:
.cfi_startproc
	xorq	%r11,%r11
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	xorq	%r10,%r10
	sbbq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

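# a - b mod p: subtract, then conditionally add p back based on bit 0 of
# the borrow word in %r11.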
.type	__ecp_nistz256_sub_fromx,@function
.align	32
__ecp_nistz256_sub_fromx:
.cfi_startproc
	xorq	%r11,%r11
	sbbq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	$0,%r11

	xorq	%r10,%r10
	adcq	$-1,%r12
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9

	btq	$0,%r11
	cmovncq	%rax,%r12
	cmovncq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovncq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovncq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

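# Subtracts %r12,%r13,%r8,%r9 from the values loaded in %rax,%rbp,%rcx,%r10
# and leaves the reduced result in %r12,%r13,%r8,%r9 without storing it.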
.type	__ecp_nistz256_subx,@function
.align	32
__ecp_nistz256_subx:
.cfi_startproc
	xorq	%r11,%r11
	sbbq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	$0,%r11

	xorq	%r9,%r9
	adcq	$-1,%rax
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10

	btq	$0,%r11
	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	cmovcq	%rcx,%r8
	cmovcq	%r10,%r9

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

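# 2*a mod p, using the same add-then-conditionally-subtract-p pattern as
# __ecp_nistz256_add_tox.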
.type	__ecp_nistz256_mul_by_2x,@function
.align	32
__ecp_nistz256_mul_by_2x:
.cfi_startproc
	xorq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	xorq	%r10,%r10
	sbbq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
.type	GFp_nistz256_point_doublex,@function
.align	32
GFp_nistz256_point_doublex:
.cfi_startproc
.Lpoint_doublex:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$160+8,%rsp
.cfi_adjust_cfa_offset	32*5+8
.Lpoint_doublex_body:

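# MULX/ADX point doubling. Inferred frame slots: S=0, M=32, Zsqr=64,
# in_x=96, tmp0=128. .Lpoint_double_shortcutx is the re-entry point used by
# point_addx when it detects P == Q.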
.Lpoint_double_shortcutx:
	movdqu	0(%rsi),%xmm0
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	.Lpoly+8(%rip),%r14
	movq	.Lpoly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199
.byte	102,73,15,110,202
.byte	102,73,15,110,211

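# S = 2*in_y; Zsqr = in_z^2; S = S^2 (= 4*Y^2).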
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2x

	movq	64+0(%rsi),%rdx
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-128(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	0+0(%rsp),%rdx
	movq	8+0(%rsp),%r14
	leaq	-128+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

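# res_z = 2*in_z*in_y: a multiply into res_z (whose pointer was parked in
# %xmm2) followed by an in-place doubling.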
	movq	32(%rbx),%rdx
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-128(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215
	call	__ecp_nistz256_mul_montx
	call	__ecp_nistz256_mul_by_2x

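# M = in_x + Zsqr and Zsqr = in_x - Zsqr (their product is formed later),
# then res_y = S^2, still to be halved.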
	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_tox

	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	movq	0+0(%rsp),%rdx
	movq	8+0(%rsp),%r14
	leaq	-128+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207
	call	__ecp_nistz256_sqr_montx
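# Halve res_y modulo p: add p if the value is odd, then shift the 256-bit
# result right one bit (res_y ends up as 8*Y^4).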
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax

	cmovzq	%rax,%r12
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
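# M = (in_x + Zsqr)*(in_x - Zsqr), then M = 3*M via a doubling into tmp0
# plus an add.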
	movq	64(%rsp),%rdx
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2x

	leaq	32(%rsp),%rbx
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_tox

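# S = S*in_x; tmp0 = 2*S; then res_x = M^2 - tmp0 (the squaring output is
# shuffled into the subtraction's inputs before sub_fromx).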
	movq	96(%rsp),%rdx
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	-128+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2x

	movq	0+32(%rsp),%rdx
	movq	8+32(%rsp),%r14
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199
	call	__ecp_nistz256_sqr_montx

	leaq	128(%rsp),%rbx
	movq	%r14,%r8
	movq	%r15,%r9
	movq	%rsi,%r14
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromx

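# S = S - res_x, S = S*M, then res_y = S - res_y. The xorl below merely
# clears %ecx and sets ZF, so the cmovzq moves that follow always execute;
# they are register shuffles matching the multiplier's expected inputs.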
	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subx

	movq	32(%rsp),%rdx
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11
	movq	%r8,0+16(%rsp)
	leaq	0-128(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

.byte	102,72,15,126,203
.byte	102,72,15,126,207
	call	__ecp_nistz256_sub_fromx

	leaq	160+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpoint_doublex_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_doublex,.-GFp_nistz256_point_doublex
.type	GFp_nistz256_point_addx,@function
.align	32
GFp_nistz256_point_addx:
.cfi_startproc
.Lpoint_addx:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$576+8,%rsp
.cfi_adjust_cfa_offset	32*18+8
.Lpoint_addx_body:

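# MULX/ADX twin of GFp_nistz256_point_add: same frame layout, algebra and
# special-case dispatch (see the q-path annotations above); multiplicand
# pointers carry the -128 bias expected by the mulx helpers.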
	movdqu	0(%rsi),%xmm0
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5

	movdqu	0(%rsi),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rdx
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1
.byte	102,72,15,110,199

	leaq	64-128(%rsi),%rsi
	movq	%rdx,544+0(%rsp)
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rdx
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203

	leaq	64-128(%rbx),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	544(%rsp),%rdx
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	-128+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	448(%rsp),%rdx
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	416(%rsp),%rdx
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	-128+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	512(%rsp),%rdx
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	-128+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	224(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	orq	%r13,%r12
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2
.byte	102,73,15,110,220

	movq	384(%rsp),%rdx
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	-128+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	480(%rsp),%rdx
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	160(%rsp),%rbx
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	orq	%r13,%r12
	orq	%r8,%r12
	orq	%r9,%r12

.byte	102,73,15,126,208
.byte	102,73,15,126,217
	orq	%r8,%r12
.byte	0x3e
	jnz	.Ladd_proceedx



	testq	%r9,%r9
	jz	.Ladd_doublex






.byte	102,72,15,126,199
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	.Ladd_donex

.align	32
.Ladd_doublex:
.byte	102,72,15,126,206
.byte	102,72,15,126,199
	addq	$416,%rsp
.cfi_adjust_cfa_offset	-416
	jmp	.Lpoint_double_shortcutx
.cfi_adjust_cfa_offset	416

.align	32
.Ladd_proceedx:
	movq	0+64(%rsp),%rdx
	movq	8+64(%rsp),%r14
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	448(%rsp),%rdx
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	-128+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	0+0(%rsp),%rdx
	movq	8+0(%rsp),%r14
	leaq	-128+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	544(%rsp),%rdx
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	-128+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	0(%rsp),%rdx
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	160(%rsp),%rdx
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montx




	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subx

	leaq	128(%rsp),%rbx
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	movq	192+0(%rsp),%rax
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subx

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	128(%rsp),%rdx
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	-128+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	320(%rsp),%rdx
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	256(%rsp),%rbx
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

.byte	102,72,15,126,199

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

.Ladd_donex:
	leaq	576+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpoint_addx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_addx,.-GFp_nistz256_point_addx
.type	GFp_nistz256_point_add_affinex,@function
.align	32
GFp_nistz256_point_add_affinex:
.cfi_startproc
.Lpoint_add_affinex:
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-16
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$480+8,%rsp
.cfi_adjust_cfa_offset	32*15+8
.Ladd_affinex_body:

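# MULX/ADX twin of GFp_nistz256_point_add_affine; see the q-path
# annotations for the slot names and step-by-step algebra.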
	movdqu	0(%rsi),%xmm0
	movq	%rdx,%rbx
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	64+0(%rsi),%rdx
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,320(%rsp)
	movdqa	%xmm1,320+16(%rsp)
	movdqa	%xmm2,352(%rsp)
	movdqa	%xmm3,352+16(%rsp)
	movdqa	%xmm4,384(%rsp)
	movdqa	%xmm5,384+16(%rsp)
	por	%xmm4,%xmm5

	movdqu	0(%rbx),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rbx),%xmm1
	movdqu	32(%rbx),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rbx),%xmm3
	movdqa	%xmm0,416(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,416+16(%rsp)
	por	%xmm0,%xmm1
.byte	102,72,15,110,199
	movdqa	%xmm2,448(%rsp)
	movdqa	%xmm3,448+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3

	leaq	64-128(%rsi),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm3,%xmm4
	movq	0(%rbx),%rdx

	movq	%r12,%r9
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	movq	%r13,%r10
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	movq	%r14,%r11
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4

	leaq	32-128(%rsp),%rsi
	movq	%r15,%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	320(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	movq	384(%rsp),%rdx
	leaq	384(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	384(%rsp),%rdx
	leaq	384(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	448(%rsp),%rdx
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	-128+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	352(%rsp),%rbx
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	movq	0+64(%rsp),%rdx
	movq	8+64(%rsp),%r14
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	0+96(%rsp),%rdx
	movq	8+96(%rsp),%r14
	leaq	-128+96(%rsp),%rsi
	movq	16+96(%rsp),%r15
	movq	24+96(%rsp),%r8
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_sqr_montx

	movq	128(%rsp),%rdx
	leaq	128(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	320(%rsp),%rdx
	leaq	320(%rsp),%rbx
	movq	0+128(%rsp),%r9
	movq	8+128(%rsp),%r10
	leaq	-128+128(%rsp),%rsi
	movq	16+128(%rsp),%r11
	movq	24+128(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montx




	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	192(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	call	__ecp_nistz256_subx

	leaq	160(%rsp),%rbx
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	64(%rsp),%rdi

	call	__ecp_nistz256_subx

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	movq	352(%rsp),%rdx
	leaq	352(%rsp),%rbx
	movq	0+160(%rsp),%r9
	movq	8+160(%rsp),%r10
	leaq	-128+160(%rsp),%rsi
	movq	16+160(%rsp),%r11
	movq	24+160(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	movq	96(%rsp),%rdx
	leaq	96(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	-128+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_mul_montx

	leaq	32(%rsp),%rbx
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_sub_fromx

.byte	102,72,15,126,199

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	.LONE_mont(%rip),%xmm2
	pand	.LONE_mont+16(%rip),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	224(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	224+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	320(%rsp),%xmm2
	pand	320+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	256(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	256+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	352(%rsp),%xmm2
	pand	352+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

	leaq	480+56(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbx
.cfi_restore	%rbx
	movq	-8(%rsi),%rbp
.cfi_restore	%rbp
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Ladd_affinex_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_nistz256_point_add_affinex,.-GFp_nistz256_point_add_affinex
#endif
.section	.note.GNU-stack,"",@progbits
