1default	rel
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
7
8EXTERN	OPENSSL_ia32cap_P
9
10global	bn_mul_mont_gather5
11
12ALIGN	64
;-----------------------------------------------------------------------
; bn_mul_mont_gather5 -- Montgomery multiplication in which one operand
; is gathered word by word out of a table (see the ident string after
; $L$magic_masks).
; ABI:   Win64 on entry; the arguments are immediately moved into
;        rdi/rsi/rdx/rcx/r8/r9 and the 7th is read from DWORD[56+rsp].
; After the move (roles as used by the code below):
;        rdi = result words, rsi = multiplicand words, rdx = table base,
;        rcx = modulus words, r8 = pointer to the reduction constant,
;        r9d = word count, r10d = table index.
; Out:   rax = 1
; Word counts that are a multiple of 8 are handed to $L$mul4x_enter;
; all others run the generic one-word-at-a-time loop in this routine.
;-----------------------------------------------------------------------
13bn_mul_mont_gather5:
14	mov	QWORD[8+rsp],rdi	;WIN64 prologue
15	mov	QWORD[16+rsp],rsi
16	mov	rax,rsp
17$L$SEH_begin_bn_mul_mont_gather5:
18	mov	rdi,rcx
19	mov	rsi,rdx
20	mov	rdx,r8
21	mov	rcx,r9
22	mov	r8,QWORD[40+rsp]
23	mov	r9,QWORD[48+rsp]
24
25
; (count & 7) != 0 -> generic path, otherwise the 4x-unrolled path.
26	test	r9d,7
27	jnz	NEAR $L$mul_enter
28	jmp	NEAR $L$mul4x_enter
29
30ALIGN	16
31$L$mul_enter:
; Zero-extend the count, remember the incoming rsp in rax and fetch the
; table index before rsp moves.
32	mov	r9d,r9d
33	mov	rax,rsp
34	mov	r10d,DWORD[56+rsp]
35	push	rbx
36	push	rbp
37	push	r12
38	push	r13
39	push	r14
40	push	r15
; 40 bytes below the pushes: xmm6 lands at rax-88, xmm7 at rax-72.
41	lea	rsp,[((-40))+rsp]
42	movaps	XMMWORD[rsp],xmm6
43	movaps	XMMWORD[16+rsp],xmm7
; Reserve (count+2) qwords of scratch, then round rsp down to a
; 1024-byte boundary.
44	lea	r11,[2+r9]
45	neg	r11
46	lea	rsp,[r11*8+rsp]
47	and	rsp,-1024
48
; scratch[count+1] = incoming rsp, reloaded by the epilogue.
49	mov	QWORD[8+r9*8+rsp],rax
50$L$mul_body:
; Gather set-up: r11 = index & 7 selects the qword column inside a
; 64-byte row; r10 = ~(index >> 3) & 3 selects which four consecutive
; qwords of $L$magic_masks go to xmm4..xmm7, so exactly one of the four
; masks is all-ones.
51	mov	r12,rdx
52	mov	r11,r10
53	shr	r10,3
54	and	r11,7
55	not	r10
56	lea	rax,[$L$magic_masks]
57	and	r10,3
58	lea	r12,[96+r11*8+r12]
59	movq	xmm4,QWORD[r10*8+rax]
60	movq	xmm5,QWORD[8+r10*8+rax]
61	movq	xmm6,QWORD[16+r10*8+rax]
62	movq	xmm7,QWORD[24+r10*8+rax]
63
; Gather word 0: all four candidates (64 bytes apart) are read, masked
; and OR-ed together; r12 then advances 256 bytes to the next word.
64	movq	xmm0,QWORD[(((-96)))+r12]
65	movq	xmm1,QWORD[((-32))+r12]
66	pand	xmm0,xmm4
67	movq	xmm2,QWORD[32+r12]
68	pand	xmm1,xmm5
69	movq	xmm3,QWORD[96+r12]
70	pand	xmm2,xmm6
71	por	xmm0,xmm1
72	pand	xmm3,xmm7
73	por	xmm0,xmm2
74	lea	r12,[256+r12]
75	por	xmm0,xmm3
76
; The bytes below encode "movq rbx,xmm0": rbx = gathered word.
77DB	102,72,15,126,195
78
; r8 = reduction constant (dereferenced), rax = first multiplicand word.
79	mov	r8,QWORD[r8]
80	mov	rax,QWORD[rsi]
81
; r14 = outer (row) counter, r15 = inner (column) counter.
82	xor	r14,r14
83	xor	r15,r15
84
; The gather of the next table word is interleaved with the first row.
85	movq	xmm0,QWORD[(((-96)))+r12]
86	movq	xmm1,QWORD[((-32))+r12]
87	pand	xmm0,xmm4
88	movq	xmm2,QWORD[32+r12]
89	pand	xmm1,xmm5
90
91	mov	rbp,r8
92	mul	rbx
93	mov	r10,rax
94	mov	rax,QWORD[rcx]
95
96	movq	xmm3,QWORD[96+r12]
97	pand	xmm2,xmm6
98	por	xmm0,xmm1
99	pand	xmm3,xmm7
100
; rbp = (low word of the first product) * reduction constant: the
; multiplier applied to the modulus words for this row.
101	imul	rbp,r10
102	mov	r11,rdx
103
104	por	xmm0,xmm2
105	lea	r12,[256+r12]
106	por	xmm0,xmm3
107
; Only the carry out of this addition is kept; r10 is reloaded later.
108	mul	rbp
109	add	r10,rax
110	mov	rax,QWORD[8+rsi]
111	adc	rdx,0
112	mov	r13,rdx
113
114	lea	r15,[1+r15]
115	jmp	NEAR $L$1st_enter
116
117ALIGN	16
; First row: each iteration stores one scratch word at index r15-2.
118$L$1st:
119	add	r13,rax
120	mov	rax,QWORD[r15*8+rsi]
121	adc	rdx,0
122	add	r13,r11
123	mov	r11,r10
124	adc	rdx,0
125	mov	QWORD[((-16))+r15*8+rsp],r13
126	mov	r13,rdx
127
128$L$1st_enter:
129	mul	rbx
130	add	r11,rax
131	mov	rax,QWORD[r15*8+rcx]
132	adc	rdx,0
133	lea	r15,[1+r15]
134	mov	r10,rdx
135
136	mul	rbp
137	cmp	r15,r9
138	jne	NEAR $L$1st
139
; "movq rbx,xmm0": pick up the table word gathered during the row.
140DB	102,72,15,126,195
141
142	add	r13,rax
143	mov	rax,QWORD[rsi]
144	adc	rdx,0
145	add	r13,r11
146	adc	rdx,0
147	mov	QWORD[((-16))+r15*8+rsp],r13
148	mov	r13,rdx
149	mov	r11,r10
150
; Top two scratch words of the row: scratch[count-1] and the carry word
; scratch[count].
151	xor	rdx,rdx
152	add	r13,r11
153	adc	rdx,0
154	mov	QWORD[((-8))+r9*8+rsp],r13
155	mov	QWORD[r9*8+rsp],rdx
156
157	lea	r14,[1+r14]
158	jmp	NEAR $L$outer
159ALIGN	16
; Remaining rows: same shape as the first row, but every product is
; accumulated on top of the scratch words written by the previous row.
160$L$outer:
161	xor	r15,r15
162	mov	rbp,r8
163	mov	r10,QWORD[rsp]
164
165	movq	xmm0,QWORD[(((-96)))+r12]
166	movq	xmm1,QWORD[((-32))+r12]
167	pand	xmm0,xmm4
168	movq	xmm2,QWORD[32+r12]
169	pand	xmm1,xmm5
170
171	mul	rbx
172	add	r10,rax
173	mov	rax,QWORD[rcx]
174	adc	rdx,0
175
176	movq	xmm3,QWORD[96+r12]
177	pand	xmm2,xmm6
178	por	xmm0,xmm1
179	pand	xmm3,xmm7
180
181	imul	rbp,r10
182	mov	r11,rdx
183
184	por	xmm0,xmm2
185	lea	r12,[256+r12]
186	por	xmm0,xmm3
187
188	mul	rbp
189	add	r10,rax
190	mov	rax,QWORD[8+rsi]
191	adc	rdx,0
192	mov	r10,QWORD[8+rsp]
193	mov	r13,rdx
194
195	lea	r15,[1+r15]
196	jmp	NEAR $L$inner_enter
197
198ALIGN	16
199$L$inner:
200	add	r13,rax
201	mov	rax,QWORD[r15*8+rsi]
202	adc	rdx,0
203	add	r13,r10
204	mov	r10,QWORD[r15*8+rsp]
205	adc	rdx,0
206	mov	QWORD[((-16))+r15*8+rsp],r13
207	mov	r13,rdx
208
209$L$inner_enter:
210	mul	rbx
211	add	r11,rax
212	mov	rax,QWORD[r15*8+rcx]
213	adc	rdx,0
214	add	r10,r11
215	mov	r11,rdx
216	adc	r11,0
217	lea	r15,[1+r15]
218
219	mul	rbp
220	cmp	r15,r9
221	jne	NEAR $L$inner
222
; "movq rbx,xmm0": table word for the next row.
223DB	102,72,15,126,195
224
225	add	r13,rax
226	mov	rax,QWORD[rsi]
227	adc	rdx,0
228	add	r13,r10
229	mov	r10,QWORD[r15*8+rsp]
230	adc	rdx,0
231	mov	QWORD[((-16))+r15*8+rsp],r13
232	mov	r13,rdx
233
234	xor	rdx,rdx
235	add	r13,r11
236	adc	rdx,0
237	add	r13,r10
238	adc	rdx,0
239	mov	QWORD[((-8))+r9*8+rsp],r13
240	mov	QWORD[r9*8+rsp],rdx
241
242	lea	r14,[1+r14]
243	cmp	r14,r9
244	jb	NEAR $L$outer
245
; Final subtraction: result[i] = scratch[i] - modulus[i] with borrow.
; The xor clears CF for the first sbb; lea/dec inside the loop leave CF
; untouched between iterations.
246	xor	r14,r14
247	mov	rax,QWORD[rsp]
248	lea	rsi,[rsp]
249	mov	r15,r9
250	jmp	NEAR $L$sub
251ALIGN	16
252$L$sub:	sbb	rax,QWORD[r14*8+rcx]
253	mov	QWORD[r14*8+rdi],rax
254	mov	rax,QWORD[8+r14*8+rsi]
255	lea	r14,[1+r14]
256	dec	r15
257	jnz	NEAR $L$sub
258
; rax = scratch[count] minus the final borrow; it is used below as a
; bit mask (presumably either 0 or all-ones -- confirm against the
; value range of the top scratch word).
259	sbb	rax,0
260	xor	r14,r14
261	mov	r15,r9
262ALIGN	16
; Branch-free select: result[i] = ((scratch[i] ^ result[i]) & rax) ^
; result[i]; each scratch word is overwritten (with the loop index) as
; it is consumed.
263$L$copy:
264	mov	rsi,QWORD[r14*8+rsp]
265	mov	rcx,QWORD[r14*8+rdi]
266	xor	rsi,rcx
267	and	rsi,rax
268	xor	rsi,rcx
269	mov	QWORD[r14*8+rsp],r14
270	mov	QWORD[r14*8+rdi],rsi
271	lea	r14,[1+r14]
272	sub	r15,1
273	jnz	NEAR $L$copy
274
; Epilogue: rsi = incoming rsp saved in scratch[count+1]; reload xmm6,
; xmm7 and the six pushed registers relative to it, return 1.
275	mov	rsi,QWORD[8+r9*8+rsp]
276	mov	rax,1
277	movaps	xmm6,XMMWORD[((-88))+rsi]
278	movaps	xmm7,XMMWORD[((-72))+rsi]
279	mov	r15,QWORD[((-48))+rsi]
280	mov	r14,QWORD[((-40))+rsi]
281	mov	r13,QWORD[((-32))+rsi]
282	mov	r12,QWORD[((-24))+rsi]
283	mov	rbp,QWORD[((-16))+rsi]
284	mov	rbx,QWORD[((-8))+rsi]
285	lea	rsp,[rsi]
286$L$mul_epilogue:
287	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
288	mov	rsi,QWORD[16+rsp]
289	DB	0F3h,0C3h		;repret
290$L$SEH_end_bn_mul_mont_gather5:
291
292ALIGN	32
;-----------------------------------------------------------------------
; bn_mul4x_mont_gather5 -- frame set-up and tear-down around
; mul4x_internal.  bn_mul_mont_gather5 jumps straight to $L$mul4x_enter
; (after doing the same argument shuffle) when the word count is a
; multiple of 8.
; ABI:   Win64 on entry; arguments are moved to rdi/rsi/rdx/rcx/r8/r9.
; Out:   rax = 1
; Saves and restores rbx, rbp, r12-r15, xmm6, xmm7 (and rdi/rsi through
; their home slots).
;-----------------------------------------------------------------------
293bn_mul4x_mont_gather5:
294	mov	QWORD[8+rsp],rdi	;WIN64 prologue
295	mov	QWORD[16+rsp],rsi
296	mov	rax,rsp
297$L$SEH_begin_bn_mul4x_mont_gather5:
298	mov	rdi,rcx
299	mov	rsi,rdx
300	mov	rdx,r8
301	mov	rcx,r9
302	mov	r8,QWORD[40+rsp]
303	mov	r9,QWORD[48+rsp]
304
305
306$L$mul4x_enter:
; The lone 0x67 bytes in this routine are address-size prefixes emitted
; in front of register-only instructions (presumably code padding).
307DB	0x67
308	mov	rax,rsp
309	push	rbx
310	push	rbp
311	push	r12
312	push	r13
313	push	r14
314	push	r15
; xmm6 is stored at rax-88 and xmm7 at rax-72.
315	lea	rsp,[((-40))+rsp]
316	movaps	XMMWORD[rsp],xmm6
317	movaps	XMMWORD[16+rsp],xmm7
318DB	0x67
; r9 = -(count*8), r10d = count*32.
319	mov	r10d,r9d
320	shl	r9d,3
321	shl	r10d,3+2
322	neg	r9
323
324
325
326
327
328
329
330
; Frame placement: r11 = (candidate frame base - rsi) mod 4096.  When
; that distance is at most count*32 the frame is simply moved down by
; it; otherwise $L$mul4xsp_alt moves it down only by the amount r11
; exceeds 4096-64-2*count*8 (clamped at 0).  Presumably this keeps the
; frame from aliasing the input modulo 4096 -- TODO confirm.
331	lea	r11,[((-64))+r9*2+rsp]
332	sub	r11,rsi
333	and	r11,4095
334	cmp	r10,r11
335	jb	NEAR $L$mul4xsp_alt
336	sub	rsp,r11
337	lea	rsp,[((-64))+r9*2+rsp]
338	jmp	NEAR $L$mul4xsp_done
339
340ALIGN	32
341$L$mul4xsp_alt:
342	lea	r10,[((4096-64))+r9*2]
343	lea	rsp,[((-64))+r9*2+rsp]
344	sub	r11,r10
345	mov	r10,0
346	cmovc	r11,r10
347	sub	rsp,r11
348$L$mul4xsp_done:
; 64-byte align the frame, make r9 positive again (count*8) and keep
; the incoming rsp in frame slot 40.
349	and	rsp,-64
350	neg	r9
351
352	mov	QWORD[40+rsp],rax
353$L$mul4x_body:
354
355	call	mul4x_internal
356
; Epilogue: rsi = incoming rsp; reload xmm6/xmm7 and the pushed
; registers relative to it, return 1.
357	mov	rsi,QWORD[40+rsp]
358	mov	rax,1
359	movaps	xmm6,XMMWORD[((-88))+rsi]
360	movaps	xmm7,XMMWORD[((-72))+rsi]
361	mov	r15,QWORD[((-48))+rsi]
362	mov	r14,QWORD[((-40))+rsi]
363	mov	r13,QWORD[((-32))+rsi]
364	mov	r12,QWORD[((-24))+rsi]
365	mov	rbp,QWORD[((-16))+rsi]
366	mov	rbx,QWORD[((-8))+rsi]
367	lea	rsp,[rsi]
368$L$mul4x_epilogue:
369	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
370	mov	rsi,QWORD[16+rsp]
371	DB	0F3h,0C3h		;repret
372$L$SEH_end_bn_mul4x_mont_gather5:
373
374
375ALIGN	32
;-----------------------------------------------------------------------
; mul4x_internal -- 4-way unrolled Montgomery multiplication with table
; gather; reached by `call` from bn_mul4x_mont_gather5 and bn_power5.
; In (as used below):
;   rax = the caller's rsp at its own entry; DWORD[56+rax] = table index
;   rdx = table base          rsi = multiplicand words
;   rcx = modulus words, read at a 16-byte stride
;   r8  = pointer to the reduction constant
;   r9  = operand length in bytes      rdi = result pointer
; Frame slots are addressed as (N+8)+rsp because the return address lies
; on top of the caller's frame.
; Clobbers xmm0-xmm7 (xmm4-xmm7 hold the gather masks) and every integer
; register it uses; it does not return by itself but jumps to
; $L$sqr4x_sub, which does the final subtraction and the `ret`.
;-----------------------------------------------------------------------
376mul4x_internal:
; r13 = table base + 256 + length*32: end marker for the outer loop
; (compared with r12 at the bottom of $L$inner4x's tail).
377	shl	r9,5
378	mov	r10d,DWORD[56+rax]
379	lea	r13,[256+r9*1+rdx]
380	shr	r9,5
; Gather masks as in bn_mul_mont_gather5: r11 = index & 7 (column),
; r10 = ~(index >> 3) & 3 (which mask qwords are loaded).
381	mov	r11,r10
382	shr	r10,3
383	and	r11,7
384	not	r10
385	lea	rax,[$L$magic_masks]
386	and	r10,3
387	lea	r12,[96+r11*8+rdx]
388	movq	xmm4,QWORD[r10*8+rax]
389	movq	xmm5,QWORD[8+r10*8+rax]
; r11 = (index + 7) & 7: later scales the start offset of the scratch
; vector (see the lea into r14 below).
390	add	r11,7
391	movq	xmm6,QWORD[16+r10*8+rax]
392	movq	xmm7,QWORD[24+r10*8+rax]
393	and	r11,7
394
; Gather table word 0 via r12 and start on word 1 via r14 = r12+256.
395	movq	xmm0,QWORD[(((-96)))+r12]
396	lea	r14,[256+r12]
397	movq	xmm1,QWORD[((-32))+r12]
398	pand	xmm0,xmm4
399	movq	xmm2,QWORD[32+r12]
400	pand	xmm1,xmm5
401	movq	xmm3,QWORD[96+r12]
402	pand	xmm2,xmm6
403DB	0x67
404	por	xmm0,xmm1
405	movq	xmm1,QWORD[((-96))+r14]
406DB	0x67
407	pand	xmm3,xmm7
408DB	0x67
409	por	xmm0,xmm2
410	movq	xmm2,QWORD[((-32))+r14]
411DB	0x67
412	pand	xmm1,xmm4
413DB	0x67
414	por	xmm0,xmm3
415	movq	xmm3,QWORD[32+r14]
416
; "movq rbx,xmm0": rbx = gathered word 0.
417DB	102,72,15,126,195
418	movq	xmm0,QWORD[96+r14]
; Park the loop end marker and the result pointer in the frame.
419	mov	QWORD[((16+8))+rsp],r13
420	mov	QWORD[((56+8))+rsp],rdi
421
; r8 = reduction constant; rsi is moved to the END of the multiplicand
; and r9 negated, so words are addressed as [rsi + negative offset].
422	mov	r8,QWORD[r8]
423	mov	rax,QWORD[rsi]
424	lea	rsi,[r9*1+rsi]
425	neg	r9
426
427	mov	rbp,r8
428	mul	rbx
429	mov	r10,rax
430	mov	rax,QWORD[rcx]
431
432	pand	xmm2,xmm5
433	pand	xmm3,xmm6
434	por	xmm1,xmm2
435
; rbp = (low word of first product) * reduction constant.
436	imul	rbp,r10
437
438
439
440
441
442
443
; r14 = write cursor into the scratch vector, starting at
; 64+8+rsp + ((index+7)&7)*8.
444	lea	r14,[((64+8))+r11*8+rsp]
445	mov	r11,rdx
446
447	pand	xmm0,xmm7
448	por	xmm1,xmm3
449	lea	r12,[512+r12]
450	por	xmm0,xmm1
451
452	mul	rbp
453	add	r10,rax
454	mov	rax,QWORD[8+r9*1+rsi]
455	adc	rdx,0
456	mov	rdi,rdx
457
458	mul	rbx
459	add	r11,rax
460	mov	rax,QWORD[16+rcx]
461	adc	rdx,0
462	mov	r10,rdx
463
464	mul	rbp
465	add	rdi,rax
466	mov	rax,QWORD[16+r9*1+rsi]
467	adc	rdx,0
468	add	rdi,r11
469	lea	r15,[32+r9]
470	lea	rcx,[64+rcx]
471	adc	rdx,0
472	mov	QWORD[r14],rdi
473	mov	r13,rdx
474	jmp	NEAR $L$1st4x
475
476ALIGN	32
; First row, four words per iteration.  r15 is a negative byte offset
; that counts up to zero; rcx advances 64 bytes (4 modulus words at
; stride 16) and r14 32 bytes per iteration.
477$L$1st4x:
478	mul	rbx
479	add	r10,rax
480	mov	rax,QWORD[((-32))+rcx]
481	lea	r14,[32+r14]
482	adc	rdx,0
483	mov	r11,rdx
484
485	mul	rbp
486	add	r13,rax
487	mov	rax,QWORD[((-8))+r15*1+rsi]
488	adc	rdx,0
489	add	r13,r10
490	adc	rdx,0
491	mov	QWORD[((-24))+r14],r13
492	mov	rdi,rdx
493
494	mul	rbx
495	add	r11,rax
496	mov	rax,QWORD[((-16))+rcx]
497	adc	rdx,0
498	mov	r10,rdx
499
500	mul	rbp
501	add	rdi,rax
502	mov	rax,QWORD[r15*1+rsi]
503	adc	rdx,0
504	add	rdi,r11
505	adc	rdx,0
506	mov	QWORD[((-16))+r14],rdi
507	mov	r13,rdx
508
509	mul	rbx
510	add	r10,rax
511	mov	rax,QWORD[rcx]
512	adc	rdx,0
513	mov	r11,rdx
514
515	mul	rbp
516	add	r13,rax
517	mov	rax,QWORD[8+r15*1+rsi]
518	adc	rdx,0
519	add	r13,r10
520	adc	rdx,0
521	mov	QWORD[((-8))+r14],r13
522	mov	rdi,rdx
523
524	mul	rbx
525	add	r11,rax
526	mov	rax,QWORD[16+rcx]
527	adc	rdx,0
528	mov	r10,rdx
529
530	mul	rbp
531	add	rdi,rax
532	mov	rax,QWORD[16+r15*1+rsi]
533	adc	rdx,0
534	add	rdi,r11
535	lea	rcx,[64+rcx]
536	adc	rdx,0
537	mov	QWORD[r14],rdi
538	mov	r13,rdx
539
540	add	r15,32
541	jnz	NEAR $L$1st4x
542
; Tail of the first row: last two words, then rewind rax to the first
; multiplicand word ([r9+rsi]) for the next row.
543	mul	rbx
544	add	r10,rax
545	mov	rax,QWORD[((-32))+rcx]
546	lea	r14,[32+r14]
547	adc	rdx,0
548	mov	r11,rdx
549
550	mul	rbp
551	add	r13,rax
552	mov	rax,QWORD[((-8))+rsi]
553	adc	rdx,0
554	add	r13,r10
555	adc	rdx,0
556	mov	QWORD[((-24))+r14],r13
557	mov	rdi,rdx
558
559	mul	rbx
560	add	r11,rax
561	mov	rax,QWORD[((-16))+rcx]
562	adc	rdx,0
563	mov	r10,rdx
564
565	mul	rbp
566	add	rdi,rax
567	mov	rax,QWORD[r9*1+rsi]
568	adc	rdx,0
569	add	rdi,r11
570	adc	rdx,0
571	mov	QWORD[((-16))+r14],rdi
572	mov	r13,rdx
573
; "movq rbx,xmm0": next gathered table word; rcx is rewound to the
; start of the modulus (r9 is negative here).
574DB	102,72,15,126,195
575	lea	rcx,[r9*2+rcx]
576
; rdi = carry out of the row; it is written to the scratch vector at
; the top of $L$outer4x.
577	xor	rdi,rdi
578	add	r13,r10
579	adc	rdi,0
580	mov	QWORD[((-8))+r14],r13
581
582	jmp	NEAR $L$outer4x
583
584ALIGN	32
; Remaining rows: as the first row, plus accumulation of the scratch
; words produced by the previous row.
585$L$outer4x:
586	mov	r10,QWORD[r9*1+r14]
587	mov	rbp,r8
588	mul	rbx
589	add	r10,rax
590	mov	rax,QWORD[rcx]
591	adc	rdx,0
592
593	movq	xmm0,QWORD[(((-96)))+r12]
594	movq	xmm1,QWORD[((-32))+r12]
595	pand	xmm0,xmm4
596	movq	xmm2,QWORD[32+r12]
597	pand	xmm1,xmm5
598	movq	xmm3,QWORD[96+r12]
599
600	imul	rbp,r10
601DB	0x67
602	mov	r11,rdx
603	mov	QWORD[r14],rdi
604
605	pand	xmm2,xmm6
606	por	xmm0,xmm1
607	pand	xmm3,xmm7
608	por	xmm0,xmm2
609	lea	r14,[r9*1+r14]
610	lea	r12,[256+r12]
611	por	xmm0,xmm3
612
613	mul	rbp
614	add	r10,rax
615	mov	rax,QWORD[8+r9*1+rsi]
616	adc	rdx,0
617	mov	rdi,rdx
618
619	mul	rbx
620	add	r11,rax
621	mov	rax,QWORD[16+rcx]
622	adc	rdx,0
623	add	r11,QWORD[8+r14]
624	adc	rdx,0
625	mov	r10,rdx
626
627	mul	rbp
628	add	rdi,rax
629	mov	rax,QWORD[16+r9*1+rsi]
630	adc	rdx,0
631	add	rdi,r11
632	lea	r15,[32+r9]
633	lea	rcx,[64+rcx]
634	adc	rdx,0
635	mov	r13,rdx
636	jmp	NEAR $L$inner4x
637
638ALIGN	32
639$L$inner4x:
640	mul	rbx
641	add	r10,rax
642	mov	rax,QWORD[((-32))+rcx]
643	adc	rdx,0
644	add	r10,QWORD[16+r14]
645	lea	r14,[32+r14]
646	adc	rdx,0
647	mov	r11,rdx
648
649	mul	rbp
650	add	r13,rax
651	mov	rax,QWORD[((-8))+r15*1+rsi]
652	adc	rdx,0
653	add	r13,r10
654	adc	rdx,0
655	mov	QWORD[((-32))+r14],rdi
656	mov	rdi,rdx
657
658	mul	rbx
659	add	r11,rax
660	mov	rax,QWORD[((-16))+rcx]
661	adc	rdx,0
662	add	r11,QWORD[((-8))+r14]
663	adc	rdx,0
664	mov	r10,rdx
665
666	mul	rbp
667	add	rdi,rax
668	mov	rax,QWORD[r15*1+rsi]
669	adc	rdx,0
670	add	rdi,r11
671	adc	rdx,0
672	mov	QWORD[((-24))+r14],r13
673	mov	r13,rdx
674
675	mul	rbx
676	add	r10,rax
677	mov	rax,QWORD[rcx]
678	adc	rdx,0
679	add	r10,QWORD[r14]
680	adc	rdx,0
681	mov	r11,rdx
682
683	mul	rbp
684	add	r13,rax
685	mov	rax,QWORD[8+r15*1+rsi]
686	adc	rdx,0
687	add	r13,r10
688	adc	rdx,0
689	mov	QWORD[((-16))+r14],rdi
690	mov	rdi,rdx
691
692	mul	rbx
693	add	r11,rax
694	mov	rax,QWORD[16+rcx]
695	adc	rdx,0
696	add	r11,QWORD[8+r14]
697	adc	rdx,0
698	mov	r10,rdx
699
700	mul	rbp
701	add	rdi,rax
702	mov	rax,QWORD[16+r15*1+rsi]
703	adc	rdx,0
704	add	rdi,r11
705	lea	rcx,[64+rcx]
706	adc	rdx,0
707	mov	QWORD[((-8))+r14],r13
708	mov	r13,rdx
709
710	add	r15,32
711	jnz	NEAR $L$inner4x
712
713	mul	rbx
714	add	r10,rax
715	mov	rax,QWORD[((-32))+rcx]
716	adc	rdx,0
717	add	r10,QWORD[16+r14]
718	lea	r14,[32+r14]
719	adc	rdx,0
720	mov	r11,rdx
721
722	mul	rbp
723	add	r13,rax
724	mov	rax,QWORD[((-8))+rsi]
725	adc	rdx,0
726	add	r13,r10
727	adc	rdx,0
728	mov	QWORD[((-32))+r14],rdi
729	mov	rdi,rdx
730
; The roles of rax and rbp are swapped for the last product of the row:
; rax = row multiplier, rbp = top modulus word.  rbp therefore still
; holds the top modulus word when the loop exits below.
731	mul	rbx
732	add	r11,rax
733	mov	rax,rbp
734	mov	rbp,QWORD[((-16))+rcx]
735	adc	rdx,0
736	add	r11,QWORD[((-8))+r14]
737	adc	rdx,0
738	mov	r10,rdx
739
740	mul	rbp
741	add	rdi,rax
742	mov	rax,QWORD[r9*1+rsi]
743	adc	rdx,0
744	add	rdi,r11
745	adc	rdx,0
746	mov	QWORD[((-24))+r14],r13
747	mov	r13,rdx
748
; "movq rbx,xmm0": next gathered table word.
749DB	102,72,15,126,195
750	mov	QWORD[((-16))+r14],rdi
751	lea	rcx,[r9*2+rcx]
752
753	xor	rdi,rdi
754	add	r13,r10
755	adc	rdi,0
756	add	r13,QWORD[r14]
757	adc	rdi,0
758	mov	QWORD[((-8))+r14],r13
759
; Loop until the table cursor reaches the end marker stored on entry.
760	cmp	r12,QWORD[((16+8))+rsp]
761	jb	NEAR $L$outer4x
; Decide which subtrahend $L$sqr4x_sub uses:
;   rdi = ((carry word) | (top modulus word < top result word)) ^ 1
; and rbp = rcx + rdi*8, i.e. the even qwords when a subtraction is
; called for and the odd qwords otherwise (presumably the odd qwords
; of the modulus buffer are zero -- verify against the caller).
; rbx = start of the scratch vector, rcx = -(length/32) loop counter,
; rdi = result pointer saved on entry.
762	sub	rbp,r13
763	adc	r15,r15
764	or	rdi,r15
765	xor	rdi,1
766	lea	rbx,[r9*1+r14]
767	lea	rbp,[rdi*8+rcx]
768	mov	rcx,r9
769	sar	rcx,3+2
770	mov	rdi,QWORD[((56+8))+rsp]
771	jmp	NEAR $L$sqr4x_sub
772
773global	bn_power5
774
775ALIGN	32
;-----------------------------------------------------------------------
; bn_power5 -- five back-to-back calls of __bn_sqr8x_internal followed
; by one mul4x_internal (a gathered multiplication).
; ABI:   Win64 on entry; arguments are moved to rdi/rsi/rdx/rcx/r8/r9.
;        The 7th argument (table index) is read by mul4x_internal from
;        DWORD[56+rax], rax being the rsp value captured below.
; Out:   rax = 1
; Saves and restores rbx, rbp, r12-r15, xmm6, xmm7 (and rdi/rsi through
; their home slots).
; Fix:   xmm6/xmm7 were stored in the prologue and overwritten by
;        mul4x_internal (gather masks) but never reloaded, violating the
;        Win64 rule that XMM6-XMM15 are callee-saved.  The epilogue now
;        reloads them exactly as bn_mul4x_mont_gather5 does.
;-----------------------------------------------------------------------
776bn_power5:
777	mov	QWORD[8+rsp],rdi	;WIN64 prologue
778	mov	QWORD[16+rsp],rsi
779	mov	rax,rsp
780$L$SEH_begin_bn_power5:
781	mov	rdi,rcx
782	mov	rsi,rdx
783	mov	rdx,r8
784	mov	rcx,r9
785	mov	r8,QWORD[40+rsp]
786	mov	r9,QWORD[48+rsp]
787
788
789	mov	rax,rsp
790	push	rbx
791	push	rbp
792	push	r12
793	push	r13
794	push	r14
795	push	r15
; xmm6 is stored at rax-88 and xmm7 at rax-72 (16-byte aligned because
; rax is 8 mod 16 at function entry).
796	lea	rsp,[((-40))+rsp]
797	movaps	XMMWORD[rsp],xmm6
798	movaps	XMMWORD[16+rsp],xmm7
; r9 = -(count*8), r10d = count*32, r8 = reduction constant (value).
799	mov	r10d,r9d
800	shl	r9d,3
801	shl	r10d,3+2
802	neg	r9
803	mov	r8,QWORD[r8]
804
805
806
807
808
809
810
; Frame placement relative to rsi modulo 4096; same scheme as in
; bn_mul4x_mont_gather5.
811	lea	r11,[((-64))+r9*2+rsp]
812	sub	r11,rsi
813	and	r11,4095
814	cmp	r10,r11
815	jb	NEAR $L$pwr_sp_alt
816	sub	rsp,r11
817	lea	rsp,[((-64))+r9*2+rsp]
818	jmp	NEAR $L$pwr_sp_done
819
820ALIGN	32
821$L$pwr_sp_alt:
822	lea	r10,[((4096-64))+r9*2]
823	lea	rsp,[((-64))+r9*2+rsp]
824	sub	r11,r10
825	mov	r10,0
826	cmovc	r11,r10
827	sub	rsp,r11
828$L$pwr_sp_done:
; r10 = -(count*8), r9 = +(count*8): the pair __bn_sqr8x_internal expects.
829	and	rsp,-64
830	mov	r10,r9
831	neg	r9
832
833
834
835
836
837
838
839
840
841
; Frame slot 32 = reduction constant, slot 40 = incoming rsp.
842	mov	QWORD[32+rsp],r8
843	mov	QWORD[40+rsp],rax
844$L$power5_body:
; Stash values in xmm registers across the calls:
;   movq xmm1,rdi / movq xmm2,rcx / movq xmm3,r10 / movq xmm4,rdx
845DB	102,72,15,110,207
846DB	102,72,15,110,209
847DB	102,73,15,110,218
848DB	102,72,15,110,226
849
850	call	__bn_sqr8x_internal
851	call	__bn_sqr8x_internal
852	call	__bn_sqr8x_internal
853	call	__bn_sqr8x_internal
854	call	__bn_sqr8x_internal
855
; movq rcx,xmm2 / movq rdx,xmm4, then set up mul4x_internal's inputs:
; rdi = rsi, rax = incoming rsp, r8 = address of the stored constant.
856DB	102,72,15,126,209
857DB	102,72,15,126,226
858	mov	rdi,rsi
859	mov	rax,QWORD[40+rsp]
860	lea	r8,[32+rsp]
861
862	call	mul4x_internal
863
; Epilogue: rsi = incoming rsp.  xmm6/xmm7 must be reloaded here because
; mul4x_internal left gather masks in them.
864	mov	rsi,QWORD[40+rsp]
865	mov	rax,1
	movaps	xmm6,XMMWORD[((-88))+rsi]
	movaps	xmm7,XMMWORD[((-72))+rsi]
866	mov	r15,QWORD[((-48))+rsi]
867	mov	r14,QWORD[((-40))+rsi]
868	mov	r13,QWORD[((-32))+rsi]
869	mov	r12,QWORD[((-24))+rsi]
870	mov	rbp,QWORD[((-16))+rsi]
871	mov	rbx,QWORD[((-8))+rsi]
872	lea	rsp,[rsi]
873$L$power5_epilogue:
874	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
875	mov	rsi,QWORD[16+rsp]
876	DB	0F3h,0C3h		;repret
877$L$SEH_end_bn_power5:
878
879global	bn_sqr8x_internal
880
881
882ALIGN	32
;-----------------------------------------------------------------------
; bn_sqr8x_internal / __bn_sqr8x_internal -- squares the operand at rsi
; into the double-length scratch vector that starts at 48+8+rsp, then
; FALLS THROUGH into sqr8x_reduction (there is no `ret` in this part).
; In (as used below):
;   rsi = operand words, r9 = operand length in bytes, r10 = -r9
;   xmm2 = value reloaded into rbp just before the fall-through
; Frame slots are addressed as (N+8)+rsp because of the return address.
; Phases:
;   1. $L$sqr4x_1st / $L$sqr4x_outer / $L$sqr4x_inner accumulate the
;      cross products a[i]*a[j], two rows (r14, r15) at a time;
;   2. $L$sqr4x_shift_n_add doubles that sum (shift left by one bit
;      across words) and adds the squares a[i]*a[i].
;-----------------------------------------------------------------------
883bn_sqr8x_internal:
884__bn_sqr8x_internal:
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
; rbp = negative byte index (starts at 32 - length); rsi is moved to the
; END of the operand so that [rbp+rsi] walks it front to back.
958	lea	rbp,[32+r10]
959	lea	rsi,[r9*1+rsi]
960
961	mov	rcx,r9
962
963
; r14 = a[0], r15 = a[1]: the two multipliers of the first row pair.
; rdi is set so that [rbp+rdi] addresses the scratch word being written.
964	mov	r14,QWORD[((-32))+rbp*1+rsi]
965	lea	rdi,[((48+8))+r9*2+rsp]
966	mov	rax,QWORD[((-24))+rbp*1+rsi]
967	lea	rdi,[((-32))+rbp*1+rdi]
968	mov	rbx,QWORD[((-16))+rbp*1+rsi]
969	mov	r15,rax
970
971	mul	r14
972	mov	r10,rax
973	mov	rax,rbx
974	mov	r11,rdx
975	mov	QWORD[((-24))+rbp*1+rdi],r10
976
977	mul	r14
978	add	r11,rax
979	mov	rax,rbx
980	adc	rdx,0
981	mov	QWORD[((-16))+rbp*1+rdi],r11
982	mov	r10,rdx
983
984
985	mov	rbx,QWORD[((-8))+rbp*1+rsi]
986	mul	r15
987	mov	r12,rax
988	mov	rax,rbx
989	mov	r13,rdx
990
991	lea	rcx,[rbp]
992	mul	r14
993	add	r10,rax
994	mov	rax,rbx
995	mov	r11,rdx
996	adc	r11,0
997	add	r10,r12
998	adc	r11,0
999	mov	QWORD[((-8))+rcx*1+rdi],r10
1000	jmp	NEAR $L$sqr4x_1st
1001
1002ALIGN	32
; First row pair: products are stored (not accumulated); rcx is a
; negative byte index stepping by 32 up to zero.
1003$L$sqr4x_1st:
1004	mov	rbx,QWORD[rcx*1+rsi]
1005	mul	r15
1006	add	r13,rax
1007	mov	rax,rbx
1008	mov	r12,rdx
1009	adc	r12,0
1010
1011	mul	r14
1012	add	r11,rax
1013	mov	rax,rbx
1014	mov	rbx,QWORD[8+rcx*1+rsi]
1015	mov	r10,rdx
1016	adc	r10,0
1017	add	r11,r13
1018	adc	r10,0
1019
1020
1021	mul	r15
1022	add	r12,rax
1023	mov	rax,rbx
1024	mov	QWORD[rcx*1+rdi],r11
1025	mov	r13,rdx
1026	adc	r13,0
1027
1028	mul	r14
1029	add	r10,rax
1030	mov	rax,rbx
1031	mov	rbx,QWORD[16+rcx*1+rsi]
1032	mov	r11,rdx
1033	adc	r11,0
1034	add	r10,r12
1035	adc	r11,0
1036
1037	mul	r15
1038	add	r13,rax
1039	mov	rax,rbx
1040	mov	QWORD[8+rcx*1+rdi],r10
1041	mov	r12,rdx
1042	adc	r12,0
1043
1044	mul	r14
1045	add	r11,rax
1046	mov	rax,rbx
1047	mov	rbx,QWORD[24+rcx*1+rsi]
1048	mov	r10,rdx
1049	adc	r10,0
1050	add	r11,r13
1051	adc	r10,0
1052
1053
1054	mul	r15
1055	add	r12,rax
1056	mov	rax,rbx
1057	mov	QWORD[16+rcx*1+rdi],r11
1058	mov	r13,rdx
1059	adc	r13,0
1060	lea	rcx,[32+rcx]
1061
1062	mul	r14
1063	add	r10,rax
1064	mov	rax,rbx
1065	mov	r11,rdx
1066	adc	r11,0
1067	add	r10,r12
1068	adc	r11,0
1069	mov	QWORD[((-8))+rcx*1+rdi],r10
1070
1071	cmp	rcx,0
1072	jne	NEAR $L$sqr4x_1st
1073
1074	mul	r15
1075	add	r13,rax
1076	lea	rbp,[16+rbp]
1077	adc	rdx,0
1078	add	r13,r11
1079	adc	rdx,0
1080
1081	mov	QWORD[rdi],r13
1082	mov	r12,rdx
1083	mov	QWORD[8+rdi],rdx
1084	jmp	NEAR $L$sqr4x_outer
1085
1086ALIGN	32
; Following row pairs: rbp advances 16 bytes (two words) per pass and
; the products are added to what the scratch vector already holds.
1087$L$sqr4x_outer:
1088	mov	r14,QWORD[((-32))+rbp*1+rsi]
1089	lea	rdi,[((48+8))+r9*2+rsp]
1090	mov	rax,QWORD[((-24))+rbp*1+rsi]
1091	lea	rdi,[((-32))+rbp*1+rdi]
1092	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1093	mov	r15,rax
1094
1095	mul	r14
1096	mov	r10,QWORD[((-24))+rbp*1+rdi]
1097	add	r10,rax
1098	mov	rax,rbx
1099	adc	rdx,0
1100	mov	QWORD[((-24))+rbp*1+rdi],r10
1101	mov	r11,rdx
1102
1103	mul	r14
1104	add	r11,rax
1105	mov	rax,rbx
1106	adc	rdx,0
1107	add	r11,QWORD[((-16))+rbp*1+rdi]
1108	mov	r10,rdx
1109	adc	r10,0
1110	mov	QWORD[((-16))+rbp*1+rdi],r11
1111
1112	xor	r12,r12
1113
1114	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1115	mul	r15
1116	add	r12,rax
1117	mov	rax,rbx
1118	adc	rdx,0
1119	add	r12,QWORD[((-8))+rbp*1+rdi]
1120	mov	r13,rdx
1121	adc	r13,0
1122
1123	mul	r14
1124	add	r10,rax
1125	mov	rax,rbx
1126	adc	rdx,0
1127	add	r10,r12
1128	mov	r11,rdx
1129	adc	r11,0
1130	mov	QWORD[((-8))+rbp*1+rdi],r10
1131
1132	lea	rcx,[rbp]
1133	jmp	NEAR $L$sqr4x_inner
1134
1135ALIGN	32
; Two words per iteration; rcx steps by 16 up to zero.
1136$L$sqr4x_inner:
1137	mov	rbx,QWORD[rcx*1+rsi]
1138	mul	r15
1139	add	r13,rax
1140	mov	rax,rbx
1141	mov	r12,rdx
1142	adc	r12,0
1143	add	r13,QWORD[rcx*1+rdi]
1144	adc	r12,0
1145
1146DB	0x67
1147	mul	r14
1148	add	r11,rax
1149	mov	rax,rbx
1150	mov	rbx,QWORD[8+rcx*1+rsi]
1151	mov	r10,rdx
1152	adc	r10,0
1153	add	r11,r13
1154	adc	r10,0
1155
1156	mul	r15
1157	add	r12,rax
1158	mov	QWORD[rcx*1+rdi],r11
1159	mov	rax,rbx
1160	mov	r13,rdx
1161	adc	r13,0
1162	add	r12,QWORD[8+rcx*1+rdi]
1163	lea	rcx,[16+rcx]
1164	adc	r13,0
1165
1166	mul	r14
1167	add	r10,rax
1168	mov	rax,rbx
1169	adc	rdx,0
1170	add	r10,r12
1171	mov	r11,rdx
1172	adc	r11,0
1173	mov	QWORD[((-8))+rcx*1+rdi],r10
1174
1175	cmp	rcx,0
1176	jne	NEAR $L$sqr4x_inner
1177
1178DB	0x67
1179	mul	r15
1180	add	r13,rax
1181	adc	rdx,0
1182	add	r13,r11
1183	adc	rdx,0
1184
1185	mov	QWORD[rdi],r13
1186	mov	r12,rdx
1187	mov	QWORD[8+rdi],rdx
1188
1189	add	rbp,16
1190	jnz	NEAR $L$sqr4x_outer
1191
1192
; Last pass (rbp == 0): the final four operand words, addressed with
; fixed negative offsets from the end pointer in rsi.
1193	mov	r14,QWORD[((-32))+rsi]
1194	lea	rdi,[((48+8))+r9*2+rsp]
1195	mov	rax,QWORD[((-24))+rsi]
1196	lea	rdi,[((-32))+rbp*1+rdi]
1197	mov	rbx,QWORD[((-16))+rsi]
1198	mov	r15,rax
1199
1200	mul	r14
1201	add	r10,rax
1202	mov	rax,rbx
1203	mov	r11,rdx
1204	adc	r11,0
1205
1206	mul	r14
1207	add	r11,rax
1208	mov	rax,rbx
1209	mov	QWORD[((-24))+rdi],r10
1210	mov	r10,rdx
1211	adc	r10,0
1212	add	r11,r13
1213	mov	rbx,QWORD[((-8))+rsi]
1214	adc	r10,0
1215
1216	mul	r15
1217	add	r12,rax
1218	mov	rax,rbx
1219	mov	QWORD[((-16))+rdi],r11
1220	mov	r13,rdx
1221	adc	r13,0
1222
1223	mul	r14
1224	add	r10,rax
1225	mov	rax,rbx
1226	mov	r11,rdx
1227	adc	r11,0
1228	add	r10,r12
1229	adc	r11,0
1230	mov	QWORD[((-8))+rdi],r10
1231
1232	mul	r15
1233	add	r13,rax
1234	mov	rax,QWORD[((-16))+rsi]
1235	adc	rdx,0
1236	add	r13,r11
1237	adc	rdx,0
1238
1239	mov	QWORD[rdi],r13
1240	mov	r12,rdx
1241	mov	QWORD[8+rdi],rdx
1242
; Last cross product; rbp is reset to 16 - length for phase 2 and
; r14/r15 are cleared (r14 = bit shifted in, r15 = saved carry).
1243	mul	rbx
1244	add	rbp,16
1245	xor	r14,r14
1246	sub	rbp,r9
1247	xor	r15,r15
1248
1249	add	rax,r12
1250	adc	rdx,0
1251	mov	QWORD[8+rdi],rax
1252	mov	QWORD[16+rdi],rdx
1253	mov	QWORD[24+rdi],r15
1254
; Phase 2.  Per word pair: lea [x*2 + bit] doubles a word and shifts in
; the top bit of the word below it; `mul rax` squares an operand word;
; neg r15 turns the saved carry (0 or -1) back into CF for the adc
; pair, and sbb r15,r15 saves the new carry.
1255	mov	rax,QWORD[((-16))+rbp*1+rsi]
1256	lea	rdi,[((48+8))+rsp]
1257	xor	r10,r10
1258	mov	r11,QWORD[8+rdi]
1259
1260	lea	r12,[r10*2+r14]
1261	shr	r10,63
1262	lea	r13,[r11*2+rcx]
1263	shr	r11,63
1264	or	r13,r10
1265	mov	r10,QWORD[16+rdi]
1266	mov	r14,r11
1267	mul	rax
1268	neg	r15
1269	mov	r11,QWORD[24+rdi]
1270	adc	r12,rax
1271	mov	rax,QWORD[((-8))+rbp*1+rsi]
1272	mov	QWORD[rdi],r12
1273	adc	r13,rdx
1274
1275	lea	rbx,[r10*2+r14]
1276	mov	QWORD[8+rdi],r13
1277	sbb	r15,r15
1278	shr	r10,63
1279	lea	r8,[r11*2+rcx]
1280	shr	r11,63
1281	or	r8,r10
1282	mov	r10,QWORD[32+rdi]
1283	mov	r14,r11
1284	mul	rax
1285	neg	r15
1286	mov	r11,QWORD[40+rdi]
1287	adc	rbx,rax
1288	mov	rax,QWORD[rbp*1+rsi]
1289	mov	QWORD[16+rdi],rbx
1290	adc	r8,rdx
1291	lea	rbp,[16+rbp]
1292	mov	QWORD[24+rdi],r8
1293	sbb	r15,r15
1294	lea	rdi,[64+rdi]
1295	jmp	NEAR $L$sqr4x_shift_n_add
1296
1297ALIGN	32
; Four operand words (eight scratch words) per iteration; rbp steps by
; 32 up to zero, rdi by 64.
1298$L$sqr4x_shift_n_add:
1299	lea	r12,[r10*2+r14]
1300	shr	r10,63
1301	lea	r13,[r11*2+rcx]
1302	shr	r11,63
1303	or	r13,r10
1304	mov	r10,QWORD[((-16))+rdi]
1305	mov	r14,r11
1306	mul	rax
1307	neg	r15
1308	mov	r11,QWORD[((-8))+rdi]
1309	adc	r12,rax
1310	mov	rax,QWORD[((-8))+rbp*1+rsi]
1311	mov	QWORD[((-32))+rdi],r12
1312	adc	r13,rdx
1313
1314	lea	rbx,[r10*2+r14]
1315	mov	QWORD[((-24))+rdi],r13
1316	sbb	r15,r15
1317	shr	r10,63
1318	lea	r8,[r11*2+rcx]
1319	shr	r11,63
1320	or	r8,r10
1321	mov	r10,QWORD[rdi]
1322	mov	r14,r11
1323	mul	rax
1324	neg	r15
1325	mov	r11,QWORD[8+rdi]
1326	adc	rbx,rax
1327	mov	rax,QWORD[rbp*1+rsi]
1328	mov	QWORD[((-16))+rdi],rbx
1329	adc	r8,rdx
1330
1331	lea	r12,[r10*2+r14]
1332	mov	QWORD[((-8))+rdi],r8
1333	sbb	r15,r15
1334	shr	r10,63
1335	lea	r13,[r11*2+rcx]
1336	shr	r11,63
1337	or	r13,r10
1338	mov	r10,QWORD[16+rdi]
1339	mov	r14,r11
1340	mul	rax
1341	neg	r15
1342	mov	r11,QWORD[24+rdi]
1343	adc	r12,rax
1344	mov	rax,QWORD[8+rbp*1+rsi]
1345	mov	QWORD[rdi],r12
1346	adc	r13,rdx
1347
1348	lea	rbx,[r10*2+r14]
1349	mov	QWORD[8+rdi],r13
1350	sbb	r15,r15
1351	shr	r10,63
1352	lea	r8,[r11*2+rcx]
1353	shr	r11,63
1354	or	r8,r10
1355	mov	r10,QWORD[32+rdi]
1356	mov	r14,r11
1357	mul	rax
1358	neg	r15
1359	mov	r11,QWORD[40+rdi]
1360	adc	rbx,rax
1361	mov	rax,QWORD[16+rbp*1+rsi]
1362	mov	QWORD[16+rdi],rbx
1363	adc	r8,rdx
1364	mov	QWORD[24+rdi],r8
1365	sbb	r15,r15
1366	lea	rdi,[64+rdi]
1367	add	rbp,32
1368	jnz	NEAR $L$sqr4x_shift_n_add
1369
; Tail: the last two operand words.
1370	lea	r12,[r10*2+r14]
1371DB	0x67
1372	shr	r10,63
1373	lea	r13,[r11*2+rcx]
1374	shr	r11,63
1375	or	r13,r10
1376	mov	r10,QWORD[((-16))+rdi]
1377	mov	r14,r11
1378	mul	rax
1379	neg	r15
1380	mov	r11,QWORD[((-8))+rdi]
1381	adc	r12,rax
1382	mov	rax,QWORD[((-8))+rsi]
1383	mov	QWORD[((-32))+rdi],r12
1384	adc	r13,rdx
1385
1386	lea	rbx,[r10*2+r14]
1387	mov	QWORD[((-24))+rdi],r13
1388	sbb	r15,r15
1389	shr	r10,63
1390	lea	r8,[r11*2+rcx]
1391	shr	r11,63
1392	or	r8,r10
1393	mul	rax
1394	neg	r15
1395	adc	rbx,rax
1396	adc	r8,rdx
1397	mov	QWORD[((-16))+rdi],rbx
1398	mov	QWORD[((-8))+rdi],r8
; "movq rbp,xmm2": reload the pointer stashed by the caller, then fall
; through into sqr8x_reduction.
1399DB	102,72,15,126,213
;-----------------------------------------------------------------------
; sqr8x_reduction -- Montgomery reduction of the double-length vector at
; 48+8+rsp, eight words at a time, followed by the final subtraction in
; $L$sqr4x_sub (which is also the exit path of mul4x_internal).
; Entered by fall-through from bn_sqr8x_internal and by `call` from
; bn_from_mont8x.
; In (as used below):
;   rbp = modulus words (read at a 16-byte stride), r9 = length in bytes
;   QWORD[32+8+rsp] = reduction constant
;   xmm1 = result pointer, xmm2 = modulus pointer, xmm3 = -length
; Out: r9 = length in bytes, r10 = -length (set just before the ret).
; Fix: at $L$8x_tail_done the carry out of "add r8,[rdx]" used to be
;   thrown away by the following "xor rax,rax".  It is now rippled
;   through r9..r15 first.  This is the carry-propagation bug published
;   as CVE-2015-3193 (wrong results from modular exponentiation).
;-----------------------------------------------------------------------
1400sqr8x_reduction:
; Frame slot 0 = end of the modulus (rbp + 2*length), slot 8 = end of
; the vector; rdi = middle of the vector; r9 becomes -length.
1401	xor	rax,rax
1402	lea	rcx,[r9*2+rbp]
1403	lea	rdx,[((48+8))+r9*2+rsp]
1404	mov	QWORD[((0+8))+rsp],rcx
1405	lea	rdi,[((48+8))+r9*1+rsp]
1406	mov	QWORD[((8+8))+rsp],rdx
1407	neg	r9
1408	jmp	NEAR $L$8x_reduction_loop
1409
1410ALIGN	32
; One pass per group of eight low words.  The carry word of the previous
; pass (rax) is parked at [rdx], the end of the vector.
1411$L$8x_reduction_loop:
1412	lea	rdi,[r9*1+rdi]
1413DB	0x66
1414	mov	rbx,QWORD[rdi]
1415	mov	r9,QWORD[8+rdi]
1416	mov	r10,QWORD[16+rdi]
1417	mov	r11,QWORD[24+rdi]
1418	mov	r12,QWORD[32+rdi]
1419	mov	r13,QWORD[40+rdi]
1420	mov	r14,QWORD[48+rdi]
1421	mov	r15,QWORD[56+rdi]
1422	mov	QWORD[rdx],rax
1423	lea	rdi,[64+rdi]
1424
1425DB	0x67
; rbx = lowest word * reduction constant = multiplier for this step.
1426	mov	r8,rbx
1427	imul	rbx,QWORD[((32+8))+rsp]
1428	mov	rax,QWORD[rbp]
1429	mov	ecx,8
1430	jmp	NEAR $L$8x_reduce
1431
1432ALIGN	32
; Eight steps; each cancels the current lowest word and shifts the
; window r8..r15 down by one word.  "neg r8" only produces the carry
; (CF = r8 != 0); the low product word itself is not needed.  The
; multiplier of every step is saved in the frame for $L$8x_tail, and
; the next multiplier (rsi) is computed in the middle of the step.
1433$L$8x_reduce:
1434	mul	rbx
1435	mov	rax,QWORD[16+rbp]
1436	neg	r8
1437	mov	r8,rdx
1438	adc	r8,0
1439
1440	mul	rbx
1441	add	r9,rax
1442	mov	rax,QWORD[32+rbp]
1443	adc	rdx,0
1444	add	r8,r9
1445	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx
1446	mov	r9,rdx
1447	adc	r9,0
1448
1449	mul	rbx
1450	add	r10,rax
1451	mov	rax,QWORD[48+rbp]
1452	adc	rdx,0
1453	add	r9,r10
1454	mov	rsi,QWORD[((32+8))+rsp]
1455	mov	r10,rdx
1456	adc	r10,0
1457
1458	mul	rbx
1459	add	r11,rax
1460	mov	rax,QWORD[64+rbp]
1461	adc	rdx,0
1462	imul	rsi,r8
1463	add	r10,r11
1464	mov	r11,rdx
1465	adc	r11,0
1466
1467	mul	rbx
1468	add	r12,rax
1469	mov	rax,QWORD[80+rbp]
1470	adc	rdx,0
1471	add	r11,r12
1472	mov	r12,rdx
1473	adc	r12,0
1474
1475	mul	rbx
1476	add	r13,rax
1477	mov	rax,QWORD[96+rbp]
1478	adc	rdx,0
1479	add	r12,r13
1480	mov	r13,rdx
1481	adc	r13,0
1482
1483	mul	rbx
1484	add	r14,rax
1485	mov	rax,QWORD[112+rbp]
1486	adc	rdx,0
1487	add	r13,r14
1488	mov	r14,rdx
1489	adc	r14,0
1490
1491	mul	rbx
1492	mov	rbx,rsi
1493	add	r15,rax
1494	mov	rax,QWORD[rbp]
1495	adc	rdx,0
1496	add	r14,r15
1497	mov	r15,rdx
1498	adc	r15,0
1499
1500	dec	ecx
1501	jnz	NEAR $L$8x_reduce
1502
; Next 8 modulus words (128 bytes at stride 16).  If that was all of
; the modulus there is no tail to process.
1503	lea	rbp,[128+rbp]
1504	xor	rax,rax
1505	mov	rdx,QWORD[((8+8))+rsp]
1506	cmp	rbp,QWORD[((0+8))+rsp]
1507	jae	NEAR $L$8x_no_tail
1508
1509DB	0x66
; Fold in the next eight vector words; rsi = 0 / -1 remembers the carry.
1510	add	r8,QWORD[rdi]
1511	adc	r9,QWORD[8+rdi]
1512	adc	r10,QWORD[16+rdi]
1513	adc	r11,QWORD[24+rdi]
1514	adc	r12,QWORD[32+rdi]
1515	adc	r13,QWORD[40+rdi]
1516	adc	r14,QWORD[48+rdi]
1517	adc	r15,QWORD[56+rdi]
1518	sbb	rsi,rsi
1519
1520	mov	rbx,QWORD[((48+56+8))+rsp]
1521	mov	ecx,8
1522	mov	rax,QWORD[rbp]
1523	jmp	NEAR $L$8x_tail
1524
1525ALIGN	32
; Tail: multiply the remaining modulus words by the eight saved
; multipliers (reloaded from the frame one per step) and accumulate.
1526$L$8x_tail:
1527	mul	rbx
1528	add	r8,rax
1529	mov	rax,QWORD[16+rbp]
1530	mov	QWORD[rdi],r8
1531	mov	r8,rdx
1532	adc	r8,0
1533
1534	mul	rbx
1535	add	r9,rax
1536	mov	rax,QWORD[32+rbp]
1537	adc	rdx,0
1538	add	r8,r9
1539	lea	rdi,[8+rdi]
1540	mov	r9,rdx
1541	adc	r9,0
1542
1543	mul	rbx
1544	add	r10,rax
1545	mov	rax,QWORD[48+rbp]
1546	adc	rdx,0
1547	add	r9,r10
1548	mov	r10,rdx
1549	adc	r10,0
1550
1551	mul	rbx
1552	add	r11,rax
1553	mov	rax,QWORD[64+rbp]
1554	adc	rdx,0
1555	add	r10,r11
1556	mov	r11,rdx
1557	adc	r11,0
1558
1559	mul	rbx
1560	add	r12,rax
1561	mov	rax,QWORD[80+rbp]
1562	adc	rdx,0
1563	add	r11,r12
1564	mov	r12,rdx
1565	adc	r12,0
1566
1567	mul	rbx
1568	add	r13,rax
1569	mov	rax,QWORD[96+rbp]
1570	adc	rdx,0
1571	add	r12,r13
1572	mov	r13,rdx
1573	adc	r13,0
1574
1575	mul	rbx
1576	add	r14,rax
1577	mov	rax,QWORD[112+rbp]
1578	adc	rdx,0
1579	add	r13,r14
1580	mov	r14,rdx
1581	adc	r14,0
1582
1583	mul	rbx
1584	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]
1585	add	r15,rax
1586	adc	rdx,0
1587	add	r14,r15
1588	mov	rax,QWORD[rbp]
1589	mov	r15,rdx
1590	adc	r15,0
1591
1592	dec	ecx
1593	jnz	NEAR $L$8x_tail
1594
1595	lea	rbp,[128+rbp]
1596	mov	rdx,QWORD[((8+8))+rsp]
1597	cmp	rbp,QWORD[((0+8))+rsp]
1598	jae	NEAR $L$8x_tail_done
1599
; More modulus words left: "neg rsi" restores the saved carry into CF,
; the next eight vector words are added, the new carry is saved again.
1600	mov	rbx,QWORD[((48+56+8))+rsp]
1601	neg	rsi
1602	mov	rax,QWORD[rbp]
1603	adc	r8,QWORD[rdi]
1604	adc	r9,QWORD[8+rdi]
1605	adc	r10,QWORD[16+rdi]
1606	adc	r11,QWORD[24+rdi]
1607	adc	r12,QWORD[32+rdi]
1608	adc	r13,QWORD[40+rdi]
1609	adc	r14,QWORD[48+rdi]
1610	adc	r15,QWORD[56+rdi]
1611	sbb	rsi,rsi
1612
1613	mov	ecx,8
1614	jmp	NEAR $L$8x_tail
1615
1616ALIGN	32
1617$L$8x_tail_done:
; Add the carry word parked at [rdx] by the previous pass.  This
; addition can overflow, so its carry has to be propagated through the
; rest of the window BEFORE "xor rax,rax" clears CF.
1618	add	r8,QWORD[rdx]
	adc	r9,0
	adc	r10,0
	adc	r11,0
	adc	r12,0
	adc	r13,0
	adc	r14,0
	adc	r15,0
1619	xor	rax,rax
1620
; CF = carry saved in rsi by the tail loop.
1621	neg	rsi
1622$L$8x_no_tail:
1623	adc	r8,QWORD[rdi]
1624	adc	r9,QWORD[8+rdi]
1625	adc	r10,QWORD[16+rdi]
1626	adc	r11,QWORD[24+rdi]
1627	adc	r12,QWORD[32+rdi]
1628	adc	r13,QWORD[40+rdi]
1629	adc	r14,QWORD[48+rdi]
1630	adc	r15,QWORD[56+rdi]
; rax = top-most carry of this pass; rcx = top modulus word.
1631	adc	rax,0
1632	mov	rcx,QWORD[((-16))+rbp]
1633	xor	rsi,rsi
1634
; "movq rbp,xmm2": rewind to the start of the modulus.
1635DB	102,72,15,126,213
1636
1637	mov	QWORD[rdi],r8
1638	mov	QWORD[8+rdi],r9
; "movq r9,xmm3": r9 = -length again (it served as a window register).
1639DB	102,73,15,126,217
1640	mov	QWORD[16+rdi],r10
1641	mov	QWORD[24+rdi],r11
1642	mov	QWORD[32+rdi],r12
1643	mov	QWORD[40+rdi],r13
1644	mov	QWORD[48+rdi],r14
1645	mov	QWORD[56+rdi],r15
1646	lea	rdi,[64+rdi]
1647
1648	cmp	rdi,rdx
1649	jb	NEAR $L$8x_reduction_loop
1650
; Choose the subtrahend for $L$sqr4x_sub:
;   rax = ((top carry) | (top modulus word < top result word)) ^ 1
;   rbp = modulus + rax*8 -> even qwords when a subtraction is called
;   for, odd qwords otherwise (presumably those are zero -- verify
;   against the caller's modulus layout).
; rbx = start of the reduced value, rcx = -(length/32) loop counter,
; rdi = rsi = result pointer (the two DBs are movq rdi,xmm1 and
; movq rsi,xmm1).
1651	sub	rcx,r15
1652	lea	rbx,[r9*1+rdi]
1653	adc	rsi,rsi
1654	mov	rcx,r9
1655	or	rax,rsi
1656DB	102,72,15,126,207
1657	xor	rax,1
1658DB	102,72,15,126,206
1659	lea	rbp,[rax*8+rbp]
1660	sar	rcx,3+2
1661	jmp	NEAR $L$sqr4x_sub
1662
1663ALIGN	32
; Final subtraction, four words per iteration: [rdi] = [rbx] - [rbp]
; (rbp at stride 16).  CF is carried from one iteration to the next;
; lea/mov/inc in between do not modify it.
1664$L$sqr4x_sub:
1665DB	0x66
1666	mov	r12,QWORD[rbx]
1667	mov	r13,QWORD[8+rbx]
1668	sbb	r12,QWORD[rbp]
1669	mov	r14,QWORD[16+rbx]
1670	sbb	r13,QWORD[16+rbp]
1671	mov	r15,QWORD[24+rbx]
1672	lea	rbx,[32+rbx]
1673	sbb	r14,QWORD[32+rbp]
1674	mov	QWORD[rdi],r12
1675	sbb	r15,QWORD[48+rbp]
1676	lea	rbp,[64+rbp]
1677	mov	QWORD[8+rdi],r13
1678	mov	QWORD[16+rdi],r14
1679	mov	QWORD[24+rdi],r15
1680	lea	rdi,[32+rdi]
1681
1682	inc	rcx
1683	jnz	NEAR $L$sqr4x_sub
; Leave r10 = -length and r9 = +length for the caller.
1684	mov	r10,r9
1685	neg	r9
1686	DB	0F3h,0C3h		;repret
1687
1688global	bn_from_montgomery
1689
1690ALIGN	32
;-----------------------------------------------------------------------
; bn_from_montgomery -- dispatcher.  Tests the low three bits of the
; DWORD at [48+rsp] (the 6th argument slot of the Win64 caller; the
; code in bn_from_mont8x treats it as the word count).
;   multiple of 8 -> tail-jump to bn_from_mont8x with all argument
;                    registers and the stack untouched
;   otherwise     -> return eax = 0
;-----------------------------------------------------------------------
1691bn_from_montgomery:
1692	test	DWORD[48+rsp],7
1693	jz	NEAR bn_from_mont8x
1694	xor	eax,eax
1695	DB	0F3h,0C3h		;repret
1696
1697
1698
1699ALIGN	32
;-----------------------------------------------------------------------
; bn_from_mont8x -- copies the input into the low half of a zero-padded
; double-length scratch vector, runs sqr8x_reduction on it, then wipes
; the scratch vector.  Reached by tail-jump from bn_from_montgomery.
; ABI:   Win64 on entry; arguments are moved to rdi/rsi/rdx/rcx/r8/r9.
; After the move (as used below): rdi = result pointer, rsi = input
;        words, rcx = modulus, r8 = pointer to the reduction constant,
;        r9d = word count.
; Out:   rax = 1
; xmm6/xmm7 are stored in the prologue; nothing in this routine or in
; sqr8x_reduction writes to them, so the epilogue does not reload them.
;-----------------------------------------------------------------------
1700bn_from_mont8x:
1701	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1702	mov	QWORD[16+rsp],rsi
1703	mov	rax,rsp
1704$L$SEH_begin_bn_from_mont8x:
1705	mov	rdi,rcx
1706	mov	rsi,rdx
1707	mov	rdx,r8
1708	mov	rcx,r9
1709	mov	r8,QWORD[40+rsp]
1710	mov	r9,QWORD[48+rsp]
1711
1712
1713DB	0x67
1714	mov	rax,rsp
1715	push	rbx
1716	push	rbp
1717	push	r12
1718	push	r13
1719	push	r14
1720	push	r15
1721	lea	rsp,[((-40))+rsp]
1722	movaps	XMMWORD[rsp],xmm6
1723	movaps	XMMWORD[16+rsp],xmm7
1724DB	0x67
; r9 = -(count*8), r10d = count*32, r8 = reduction constant (value).
1725	mov	r10d,r9d
1726	shl	r9d,3
1727	shl	r10d,3+2
1728	neg	r9
1729	mov	r8,QWORD[r8]
1730
1731
1732
1733
1734
1735
1736
; Frame placement relative to rsi modulo 4096; same scheme as in
; bn_mul4x_mont_gather5 and bn_power5.
1737	lea	r11,[((-64))+r9*2+rsp]
1738	sub	r11,rsi
1739	and	r11,4095
1740	cmp	r10,r11
1741	jb	NEAR $L$from_sp_alt
1742	sub	rsp,r11
1743	lea	rsp,[((-64))+r9*2+rsp]
1744	jmp	NEAR $L$from_sp_done
1745
1746ALIGN	32
1747$L$from_sp_alt:
1748	lea	r10,[((4096-64))+r9*2]
1749	lea	rsp,[((-64))+r9*2+rsp]
1750	sub	r11,r10
1751	mov	r10,0
1752	cmovc	r11,r10
1753	sub	rsp,r11
1754$L$from_sp_done:
1755	and	rsp,-64
1756	mov	r10,r9
1757	neg	r9
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
; Frame slot 32 = reduction constant, slot 40 = incoming rsp.
1768	mov	QWORD[32+rsp],r8
1769	mov	QWORD[40+rsp],rax
1770$L$from_body:
1771	mov	r11,r9
1772	lea	rax,[48+rsp]
1773	pxor	xmm0,xmm0
1774	jmp	NEAR $L$mul_by_1
1775
1776ALIGN	32
; 64 bytes per iteration: copy input -> [rax] and store zeros at
; [rax + r9], i.e. into the upper half of the scratch vector.  The DB
; line is "lea rsi,[64+rsi]" encoded with a 32-bit displacement.
1777$L$mul_by_1:
1778	movdqu	xmm1,XMMWORD[rsi]
1779	movdqu	xmm2,XMMWORD[16+rsi]
1780	movdqu	xmm3,XMMWORD[32+rsi]
1781	movdqa	XMMWORD[r9*1+rax],xmm0
1782	movdqu	xmm4,XMMWORD[48+rsi]
1783	movdqa	XMMWORD[16+r9*1+rax],xmm0
1784DB	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1785	movdqa	XMMWORD[rax],xmm1
1786	movdqa	XMMWORD[32+r9*1+rax],xmm0
1787	movdqa	XMMWORD[16+rax],xmm2
1788	movdqa	XMMWORD[48+r9*1+rax],xmm0
1789	movdqa	XMMWORD[32+rax],xmm3
1790	movdqa	XMMWORD[48+rax],xmm4
1791	lea	rax,[64+rax]
1792	sub	r11,64
1793	jnz	NEAR $L$mul_by_1
1794
; Inputs for sqr8x_reduction: movq xmm1,rdi (result pointer),
; movq xmm2,rcx and rbp = rcx (modulus), movq xmm3,r10 (-length).
1795DB	102,72,15,110,207
1796DB	102,72,15,110,209
1797DB	0x67
1798	mov	rbp,rcx
1799DB	102,73,15,110,218
1800	call	sqr8x_reduction
1801
1802	pxor	xmm0,xmm0
1803	lea	rax,[48+rsp]
1804	mov	rsi,QWORD[40+rsp]
1805	jmp	NEAR $L$from_mont_zero
1806
1807ALIGN	32
; Wipe the scratch vector: 64 bytes are cleared per iteration while r9
; drops by 32, so 2*r9 bytes are cleared in total.
1808$L$from_mont_zero:
1809	movdqa	XMMWORD[rax],xmm0
1810	movdqa	XMMWORD[16+rax],xmm0
1811	movdqa	XMMWORD[32+rax],xmm0
1812	movdqa	XMMWORD[48+rax],xmm0
1813	lea	rax,[64+rax]
1814	sub	r9,32
1815	jnz	NEAR $L$from_mont_zero
1816
; Epilogue: rsi = incoming rsp (loaded above); reload the pushed
; registers relative to it, return 1.
1817	mov	rax,1
1818	mov	r15,QWORD[((-48))+rsi]
1819	mov	r14,QWORD[((-40))+rsi]
1820	mov	r13,QWORD[((-32))+rsi]
1821	mov	r12,QWORD[((-24))+rsi]
1822	mov	rbp,QWORD[((-16))+rsi]
1823	mov	rbx,QWORD[((-8))+rsi]
1824	lea	rsp,[rsi]
1825$L$from_epilogue:
1826	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1827	mov	rsi,QWORD[16+rsp]
1828	DB	0F3h,0C3h		;repret
1829$L$SEH_end_bn_from_mont8x:
global	bn_scatter5

; bn_scatter5 -- spread a vector of qwords into a strided table.
;
; Registers on entry (as consumed below):
;   rcx  source; one qword is read per iteration, pointer advances by 8
;   edx  number of qwords to copy; 0 returns immediately
;   r8   table base
;   r9   slot index; the first store goes to r8 + r9*8
; Successive stores are 256 bytes apart.
; Overwrites rax, rcx, edx, r8 and the flags.
ALIGN	16
bn_scatter5:
	test	edx,edx			; nothing to copy?
	jz	NEAR $L$scatter_epilogue
	lea	r8,[r8+r9*8]		; r8 = &table[idx]
$L$scatter:
	mov	rax,QWORD[rcx]		; fetch next source word
	mov	QWORD[r8],rax		; drop it into the current row
	add	rcx,8			; next source word
	add	r8,256			; next row of the table
	sub	edx,1
	jnz	NEAR $L$scatter
$L$scatter_epilogue:
	DB	0F3h,0C3h		;repret
1846
1847
global	bn_gather5

; bn_gather5 -- collect one entry from a strided table into a flat vector.
;
; Registers on entry (as consumed below):
;   rcx  destination; one qword is stored per iteration, pointer advances by 8
;   edx  number of qwords to produce; 0 returns without storing anything
;   r8   table base; consecutive rows are 256 bytes apart
;   r9d  entry index: bits 0..2 select the qword inside a 64-byte group,
;        bits 3..4 select which of the four groups of a row is kept
;
; For every output word all four candidates (64 bytes apart) are loaded and
; combined with pand/por under masks, so the addresses touched do not depend
; on index bits 3..4 (presumably a cache-timing precaution -- confirm against
; the generator script).
;
; xmm6/xmm7 are saved in the prologue and restored before returning.
; Overwrites rax, rcx, edx, r8, r9, r11, xmm0-xmm5 and the flags.
ALIGN	16
bn_gather5:
$L$SEH_begin_bn_gather5:

; Prologue written as raw bytes so its encoding (13 bytes) is fixed; the
; unwind codes in $L$SEH_info_bn_gather5 are keyed to these byte offsets.
DB	0x48,0x83,0xec,0x28		;sub	rsp,0x28
DB	0x0f,0x29,0x34,0x24		;movaps	XMMWORD[rsp],xmm6
DB	0x0f,0x29,0x7c,0x24,0x10	;movaps	XMMWORD[16+rsp],xmm7

; num == 0: leave through the normal restore path. Without this check the
; loop below would store once and then count edx down from 0xFFFFFFFF.
; bn_scatter5 already has the equivalent guard.
	test	edx,edx
	jz	NEAR $L$gather_done

	mov	r11d,r9d
	shr	r9d,3			; r9d = idx >> 3
	and	r11,7			; r11 = idx & 7
	not	r9d
	lea	rax,[$L$magic_masks]
	and	r9d,3			; r9d = ~(idx >> 3) & 3
	lea	r8,[128+r11*8+r8]	; bias by 128 so the loop uses +-disp8
; $L$magic_masks has a single all-ones qword at byte offset 24 and zeros
; elsewhere; starting the four reads at r9*8 leaves the all-ones qword in
; xmm(4 + ((idx >> 3) & 3)) and zeros in the other three.
	movq	xmm4,QWORD[r9*8+rax]
	movq	xmm5,QWORD[8+r9*8+rax]
	movq	xmm6,QWORD[16+r9*8+rax]
	movq	xmm7,QWORD[24+r9*8+rax]
	jmp	NEAR $L$gather
ALIGN	16
$L$gather:
	movq	xmm0,QWORD[(((-128)))+r8]	; candidate from group 0
	movq	xmm1,QWORD[((-64))+r8]		; candidate from group 1
	pand	xmm0,xmm4
	movq	xmm2,QWORD[r8]			; candidate from group 2
	pand	xmm1,xmm5
	movq	xmm3,QWORD[64+r8]		; candidate from group 3
	pand	xmm2,xmm6
	por	xmm0,xmm1
	pand	xmm3,xmm7
DB	0x67,0x67			; two prefix bytes on the next por (presumably padding)
	por	xmm0,xmm2
	lea	r8,[256+r8]		; advance to the next row
	por	xmm0,xmm3		; xmm0 = the one candidate whose mask is all ones

	movq	QWORD[rcx],xmm0
	lea	rcx,[8+rcx]
	sub	edx,1
	jnz	NEAR $L$gather
$L$gather_done:
	movaps	xmm6,XMMWORD[rsp]
	movaps	xmm7,XMMWORD[16+rsp]
	lea	rsp,[40+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_bn_gather5:
1894
ALIGN	64
; Mask table read as four consecutive qwords starting at a variable qword
; offset 0..3 (see the movq xmm4..xmm7 loads in bn_gather5 and $L$mul_body).
; Only the qword at byte offset 24 is all ones; the rest are zero.
$L$magic_masks:
	DQ	0,0,0,-1
	DQ	0,0,0,0
; NUL-terminated identification string.
	DB	'Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>',0
EXTERN	__imp_RtlVirtualUnwind

; mul_handler -- Win64 language-specific exception handler named by the
; UNW_FLAG_EHANDLER records in .xdata (bn_mul_mont_gather5,
; bn_mul4x_mont_gather5, bn_power5, bn_from_mont8x).
;
; Registers on entry (as consumed below):
;   r8   CONTEXT of the interrupted code (Rax @120, Rbx @144, Rsp @152,
;        Rbp @160, Rsi @168, Rdi @176, R9 @192, R12..R15 @216..240,
;        Rip @248, Xmm6 @512, Xmm7 @528)
;   r9   DISPATCHER_CONTEXT (ControlPc @0, ImageBase @8, FunctionEntry @16,
;        EstablisherFrame @24, ContextRecord @40, HandlerData @56)
; HandlerData points at two image-relative labels: [0] = end of prologue
; ("body" label), [4] = epilogue label.
;
; The handler rebuilds the caller's non-volatile registers in the CONTEXT,
; copies it to disp->ContextRecord, calls RtlVirtualUnwind and returns 1 in
; eax (ExceptionContinueSearch).
ALIGN	16
mul_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64			; shadow space + stack args for the call below

	mov	rax,QWORD[120+r8]	; context->Rax (copy of entry rsp in the prologue)
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]		; disp->ImageBase
	mov	r11,QWORD[56+r9]	; disp->HandlerData

	mov	r10d,DWORD[r11]		; HandlerData[0]
	lea	r10,[r10*1+rsi]		; absolute address of the body label
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail	; still in the prologue: rax is the frame

	mov	rax,QWORD[152+r8]	; context->Rsp

	mov	r10d,DWORD[4+r11]	; HandlerData[1]
	lea	r10,[r10*1+rsi]		; absolute address of the epilogue label
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail	; at/after the epilogue: rsp is already restored

; Rip is inside a function body: locate the saved caller rsp.
; bn_mul_mont_gather5 -- the first function in the file, which owns
; $L$mul_epilogue -- stores it at [8+num*8+rsp] with num in r9. Code placed
; after it (bn_from_mont8x visibly does) stores it at [40+rsp]. Hence any
; Rip above $L$mul_epilogue must use the fixed 40-byte slot. The previous
; `jb` had the two cases swapped.
	lea	r10,[$L$mul_epilogue]
	cmp	rbx,r10
	ja	NEAR $L$body_40

	mov	r10,QWORD[192+r8]	; context->R9 = num
	mov	rax,QWORD[8+r10*8+rax]	; saved caller rsp
	jmp	NEAR $L$body_proceed

$L$body_40:
	mov	rax,QWORD[40+rax]	; saved caller rsp
$L$body_proceed:

; rax = rsp value captured at function entry; the pushed registers and the
; xmm6/xmm7 spill area lie directly below it.
	movaps	xmm0,XMMWORD[((-88))+rax]
	movaps	xmm1,XMMWORD[((-72))+rax]

	mov	rbx,QWORD[((-8))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	QWORD[144+r8],rbx	; context->Rbx
	mov	QWORD[160+r8],rbp	; context->Rbp
	mov	QWORD[216+r8],r12	; context->R12
	mov	QWORD[224+r8],r13	; context->R13
	mov	QWORD[232+r8],r14	; context->R14
	mov	QWORD[240+r8],r15	; context->R15
	movups	XMMWORD[512+r8],xmm0	; context->Xmm6
	movups	XMMWORD[528+r8],xmm1	; context->Xmm7

$L$common_seh_tail:
; rdi/rsi were parked in the caller's home slots by the WIN64 prologue.
	mov	rdi,QWORD[8+rax]
	mov	rsi,QWORD[16+rax]
	mov	QWORD[152+r8],rax	; context->Rsp
	mov	QWORD[168+r8],rsi	; context->Rsi
	mov	QWORD[176+r8],rdi	; context->Rdi

	mov	rdi,QWORD[40+r9]	; disp->ContextRecord (copy destination)
	mov	rsi,r8			; copy source
	mov	ecx,154			; 154 qwords = 1232 bytes
	DD	0xa548f3fc		; cld; rep movsq

	mov	rsi,r9
	xor	rcx,rcx			; arg1 = 0
	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
	mov	r8,QWORD[rsi]		; arg3 = disp->ControlPc
	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
	lea	r11,[56+rsi]		; &disp->HandlerData
	lea	r12,[24+rsi]		; &disp->EstablisherFrame
	mov	QWORD[32+rsp],r10	; arg5
	mov	QWORD[40+rsp],r11	; arg6
	mov	QWORD[48+rsp],r12	; arg7
	mov	QWORD[56+rsp],rcx	; arg8 = NULL
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1			; ExceptionContinueSearch
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret
2006
2007
section	.pdata rdata align=4
ALIGN	4
; Function table: one record per routine, each made of three image-relative
; addresses -- first byte of the function, first byte past it, and its
; unwind-info record in .xdata.
	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase,$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase,$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase,$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase,$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
	DD	$L$SEH_begin_bn_power5 wrt ..imagebase,$L$SEH_end_bn_power5 wrt ..imagebase,$L$SEH_info_bn_power5 wrt ..imagebase
	DD	$L$SEH_begin_bn_from_mont8x wrt ..imagebase,$L$SEH_end_bn_from_mont8x wrt ..imagebase,$L$SEH_info_bn_from_mont8x wrt ..imagebase
	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase,$L$SEH_end_bn_gather5 wrt ..imagebase,$L$SEH_info_bn_gather5 wrt ..imagebase

2028
section	.xdata rdata align=8
ALIGN	8
; Unwind-info records referenced from .pdata.
;
; The first four use header byte 9 (version 1 with the exception-handler
; flag), no unwind codes, then the handler address and two handler-data
; words that mul_handler reads as HandlerData[0] (body label) and
; HandlerData[1] (epilogue label).
$L$SEH_info_bn_mul_mont_gather5:
	DB	0x09,0x00,0x00,0x00
	DD	mul_handler wrt ..imagebase
	DD	$L$mul_body wrt ..imagebase
	DD	$L$mul_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_mul4x_mont_gather5:
	DB	0x09,0x00,0x00,0x00
	DD	mul_handler wrt ..imagebase
	DD	$L$mul4x_body wrt ..imagebase
	DD	$L$mul4x_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_power5:
	DB	0x09,0x00,0x00,0x00
	DD	mul_handler wrt ..imagebase
	DD	$L$power5_body wrt ..imagebase
	DD	$L$power5_epilogue wrt ..imagebase
ALIGN	8
$L$SEH_info_bn_from_mont8x:
	DB	0x09,0x00,0x00,0x00
	DD	mul_handler wrt ..imagebase
	DD	$L$from_body wrt ..imagebase
	DD	$L$from_epilogue wrt ..imagebase
ALIGN	8
; bn_gather5 has no handler; its fixed 13-byte prologue is described with
; plain unwind codes (five 16-bit slots plus one slot of padding).
$L$SEH_info_bn_gather5:
	DB	0x01,0x0d,0x05,0x00	; version 1, prologue 0x0d bytes, 5 code slots, no frame register
	DW	0x780d,0x0001		; at offset 0x0d: xmm7 saved at [rsp+0x10]
	DW	0x6808,0x0000		; at offset 0x08: xmm6 saved at [rsp]
	DW	0x4204,0x0000		; at offset 0x04: sub rsp,0x28; trailing slot is padding
ALIGN	8
2057