1#if defined(__aarch64__)
2.text
3
// bn_mul_mont: Montgomery multiplication,
//	rp[] = ap[] * bp[] * 2^(-64*num) mod np[].
// Register roles (as established by the in-line comments below):
//	x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//	x4 = &n0 (Montgomery constant -np^(-1) mod 2^64 — presumably;
//	confirm against the C prototype), x5 = num (words).
// Dispatch: num%8==0 goes to the 8x squaring path, num%4==0 to the
// 4x path; everything else runs the generic one-word loop below.
// Returns x0 = 1.
4.globl	bn_mul_mont
5.type	bn_mul_mont,%function
6.align	5
7bn_mul_mont:
8	tst	x5,#7
9	b.eq	__bn_sqr8x_mont
10	tst	x5,#3
11	b.eq	__bn_mul4x_mont
.Lmul_mont:
// Generic path: save callee-saved x19-x24 in a 64-byte frame
// (x29 = frame pointer, used to unwind the alloca below).
13	stp	x29,x30,[sp,#-64]!
14	add	x29,sp,#0
15	stp	x19,x20,[sp,#16]
16	stp	x21,x22,[sp,#32]
17	stp	x23,x24,[sp,#48]
18
19	ldr	x9,[x2],#8		// bp[0]
20	sub	x22,sp,x5,lsl#3
21	ldp	x7,x8,[x1],#16	// ap[0..1]
22	lsl	x5,x5,#3
23	ldr	x4,[x4]		// *n0
24	and	x22,x22,#-16		// ABI says so
25	ldp	x13,x14,[x3],#16	// np[0..1]
26
// First pass: tp[] = ap[]*bp[0] + m1*np[], m1 = tp[0]*n0,
// so that the low word of the sum is zero and can be dropped.
27	mul	x6,x7,x9		// ap[0]*bp[0]
28	sub	x21,x5,#16		// j=num-2
29	umulh	x7,x7,x9
30	mul	x10,x8,x9		// ap[1]*bp[0]
31	umulh	x11,x8,x9
32
33	mul	x15,x6,x4		// "tp[0]"*n0
34	mov	sp,x22			// alloca
35
36	// (*)	mul	x12,x13,x15	// np[0]*m1
37	umulh	x13,x13,x15
38	mul	x16,x14,x15		// np[1]*m1
39	// (*)	adds	x12,x12,x6	// discarded
40	// (*)	As for removal of first multiplication and addition
41	//	instructions. The outcome of first addition is
42	//	guaranteed to be zero, which leaves two computationally
43	//	significant outcomes: it either carries or not. Then
44	//	question is when does it carry? Is there alternative
45	//	way to deduce it? If you follow operations, you can
46	//	observe that condition for carry is quite simple:
47	//	x6 being non-zero. So that carry can be calculated
48	//	by adding -1 to x6. That's what next instruction does.
49	subs	xzr,x6,#1		// (*)
50	umulh	x17,x14,x15
51	adc	x13,x13,xzr
52	cbz	x21,.L1st_skip
53
54.L1st:
55	ldr	x8,[x1],#8
56	adds	x6,x10,x7
57	sub	x21,x21,#8		// j--
58	adc	x7,x11,xzr
59
60	ldr	x14,[x3],#8
61	adds	x12,x16,x13
62	mul	x10,x8,x9		// ap[j]*bp[0]
63	adc	x13,x17,xzr
64	umulh	x11,x8,x9
65
66	adds	x12,x12,x6
67	mul	x16,x14,x15		// np[j]*m1
68	adc	x13,x13,xzr
69	umulh	x17,x14,x15
70	str	x12,[x22],#8		// tp[j-1]
71	cbnz	x21,.L1st
72
73.L1st_skip:
74	adds	x6,x10,x7
75	sub	x1,x1,x5		// rewind x1
76	adc	x7,x11,xzr
77
78	adds	x12,x16,x13
79	sub	x3,x3,x5		// rewind x3
80	adc	x13,x17,xzr
81
82	adds	x12,x12,x6
83	sub	x20,x5,#8		// i=num-1
84	adcs	x13,x13,x7
85
86	adc	x19,xzr,xzr		// upmost overflow bit
87	stp	x12,x13,[x22]
88
// Outer loop over remaining b-words: tp[] += ap[]*bp[i] + m1*np[],
// dropping the (zero) low word each iteration.  x19 accumulates the
// top-most overflow bit across iterations.
89.Louter:
90	ldr	x9,[x2],#8		// bp[i]
91	ldp	x7,x8,[x1],#16
92	ldr	x23,[sp]		// tp[0]
93	add	x22,sp,#8
94
95	mul	x6,x7,x9		// ap[0]*bp[i]
96	sub	x21,x5,#16		// j=num-2
97	umulh	x7,x7,x9
98	ldp	x13,x14,[x3],#16
99	mul	x10,x8,x9		// ap[1]*bp[i]
100	adds	x6,x6,x23
101	umulh	x11,x8,x9
102	adc	x7,x7,xzr
103
104	mul	x15,x6,x4
105	sub	x20,x20,#8		// i--
106
107	// (*)	mul	x12,x13,x15	// np[0]*m1
108	umulh	x13,x13,x15
109	mul	x16,x14,x15		// np[1]*m1
110	// (*)	adds	x12,x12,x6
111	subs	xzr,x6,#1		// (*)
112	umulh	x17,x14,x15
113	cbz	x21,.Linner_skip
114
115.Linner:
116	ldr	x8,[x1],#8
117	adc	x13,x13,xzr
118	ldr	x23,[x22],#8		// tp[j]
119	adds	x6,x10,x7
120	sub	x21,x21,#8		// j--
121	adc	x7,x11,xzr
122
123	adds	x12,x16,x13
124	ldr	x14,[x3],#8
125	adc	x13,x17,xzr
126
127	mul	x10,x8,x9		// ap[j]*bp[i]
128	adds	x6,x6,x23
129	umulh	x11,x8,x9
130	adc	x7,x7,xzr
131
132	mul	x16,x14,x15		// np[j]*m1
133	adds	x12,x12,x6
134	umulh	x17,x14,x15
135	str	x12,[x22,#-16]		// tp[j-1]
136	cbnz	x21,.Linner
137
138.Linner_skip:
139	ldr	x23,[x22],#8		// tp[j]
140	adc	x13,x13,xzr
141	adds	x6,x10,x7
142	sub	x1,x1,x5		// rewind x1
143	adc	x7,x11,xzr
144
145	adds	x12,x16,x13
146	sub	x3,x3,x5		// rewind x3
147	adcs	x13,x17,x19
148	adc	x19,xzr,xzr
149
150	adds	x6,x6,x23
151	adc	x7,x7,xzr
152
153	adds	x12,x12,x6
154	adcs	x13,x13,x7
155	adc	x19,x19,xzr		// upmost overflow bit
156	stp	x12,x13,[x22,#-16]
157
158	cbnz	x20,.Louter
159
160	// Final step. We see if result is larger than modulus, and
161	// if it is, subtract the modulus. But comparison implies
162	// subtraction. So we subtract modulus, see if it borrowed,
163	// and conditionally copy original value.
164	ldr	x23,[sp]		// tp[0]
165	add	x22,sp,#8
166	ldr	x14,[x3],#8		// np[0]
167	subs	x21,x5,#8		// j=num-1 and clear borrow
168	mov	x1,x0
169.Lsub:
170	sbcs	x8,x23,x14		// tp[j]-np[j]
171	ldr	x23,[x22],#8
172	sub	x21,x21,#8		// j--
173	ldr	x14,[x3],#8
174	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
175	cbnz	x21,.Lsub
176
177	sbcs	x8,x23,x14
178	sbcs	x19,x19,xzr		// did it borrow?
179	str	x8,[x1],#8		// rp[num-1]
180
// Branchless (csel-based) select between tp[] and tp[]-np[] driven by
// the borrow flag, wiping the stack copy of tp[] as we go.
181	ldr	x23,[sp]		// tp[0]
182	add	x22,sp,#8
183	ldr	x8,[x0],#8		// rp[0]
184	sub	x5,x5,#8		// num--
185	nop
186.Lcond_copy:
187	sub	x5,x5,#8		// num--
188	csel	x14,x23,x8,lo		// did it borrow?
189	ldr	x23,[x22],#8
190	ldr	x8,[x0],#8
191	str	xzr,[x22,#-16]		// wipe tp
192	str	x14,[x0,#-16]
193	cbnz	x5,.Lcond_copy
194
195	csel	x14,x23,x8,lo
196	str	xzr,[x22,#-8]		// wipe tp
197	str	x14,[x0,#-8]
198
// Epilogue: restore callee-saved registers, undo alloca via x29,
// return 1 in x0.
199	ldp	x19,x20,[x29,#16]
200	mov	sp,x29
201	ldp	x21,x22,[x29,#32]
202	mov	x0,#1
203	ldp	x23,x24,[x29,#48]
204	ldr	x29,[sp],#64
205	ret
206.size	bn_mul_mont,.-bn_mul_mont
// __bn_sqr8x_mont: Montgomery squaring for num being a multiple of 8.
// Reached from bn_mul_mont when num%8==0; if x1!=x2 (ap!=bp) this is
// not a squaring and control transfers to __bn_mul4x_mont instead.
// Strategy: compute the off-diagonal products a[i]*a[j] (i<j) once,
// double them by shifting, add the squares a[i]*a[i], then perform
// Montgomery reduction 512 bits (8 words) per iteration.
207.type	__bn_sqr8x_mont,%function
208.align	5
209__bn_sqr8x_mont:
210	cmp	x1,x2
211	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
// Prologue: 128-byte frame, save x19-x28; rp/np offloaded at [x29,#96].
213	stp	x29,x30,[sp,#-128]!
214	add	x29,sp,#0
215	stp	x19,x20,[sp,#16]
216	stp	x21,x22,[sp,#32]
217	stp	x23,x24,[sp,#48]
218	stp	x25,x26,[sp,#64]
219	stp	x27,x28,[sp,#80]
220	stp	x0,x3,[sp,#96]	// offload rp and np
221
222	ldp	x6,x7,[x1,#8*0]
223	ldp	x8,x9,[x1,#8*2]
224	ldp	x10,x11,[x1,#8*4]
225	ldp	x12,x13,[x1,#8*6]
226
// Allocate a 2*num-word temporary t[] on the stack and zero it,
// 16 words per pass.
227	sub	x2,sp,x5,lsl#4
228	lsl	x5,x5,#3
229	ldr	x4,[x4]		// *n0
230	mov	sp,x2			// alloca
231	sub	x27,x5,#8*8
232	b	.Lsqr8x_zero_start
233
234.Lsqr8x_zero:
235	sub	x27,x27,#8*8
236	stp	xzr,xzr,[x2,#8*0]
237	stp	xzr,xzr,[x2,#8*2]
238	stp	xzr,xzr,[x2,#8*4]
239	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
241	stp	xzr,xzr,[x2,#8*8]
242	stp	xzr,xzr,[x2,#8*10]
243	stp	xzr,xzr,[x2,#8*12]
244	stp	xzr,xzr,[x2,#8*14]
245	add	x2,x2,#8*16
246	cbnz	x27,.Lsqr8x_zero
247
// x3 = &a[num] (end sentinel), x19-x26 = 8-word accumulator window,
// x2 = t[] write pointer.
248	add	x3,x1,x5
249	add	x1,x1,#8*8
250	mov	x19,xzr
251	mov	x20,xzr
252	mov	x21,xzr
253	mov	x22,xzr
254	mov	x23,xzr
255	mov	x24,xzr
256	mov	x25,xzr
257	mov	x26,xzr
258	mov	x2,sp
259	str	x4,[x29,#112]		// offload n0
260
261	// Multiply everything but a[i]*a[i]
262.align	4
263.Lsqr8x_outer_loop:
264        //                                                 a[1]a[0]	(i)
265        //                                             a[2]a[0]
266        //                                         a[3]a[0]
267        //                                     a[4]a[0]
268        //                                 a[5]a[0]
269        //                             a[6]a[0]
270        //                         a[7]a[0]
271        //                                         a[2]a[1]		(ii)
272        //                                     a[3]a[1]
273        //                                 a[4]a[1]
274        //                             a[5]a[1]
275        //                         a[6]a[1]
276        //                     a[7]a[1]
277        //                                 a[3]a[2]			(iii)
278        //                             a[4]a[2]
279        //                         a[5]a[2]
280        //                     a[6]a[2]
281        //                 a[7]a[2]
282        //                         a[4]a[3]				(iv)
283        //                     a[5]a[3]
284        //                 a[6]a[3]
285        //             a[7]a[3]
286        //                 a[5]a[4]					(v)
287        //             a[6]a[4]
288        //         a[7]a[4]
289        //         a[6]a[5]						(vi)
290        //     a[7]a[5]
291        // a[7]a[6]							(vii)
292
293	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
294	mul	x15,x8,x6
295	mul	x16,x9,x6
296	mul	x17,x10,x6
297	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
298	mul	x14,x11,x6
299	adcs	x21,x21,x15
300	mul	x15,x12,x6
301	adcs	x22,x22,x16
302	mul	x16,x13,x6
303	adcs	x23,x23,x17
304	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
305	adcs	x24,x24,x14
306	umulh	x14,x8,x6
307	adcs	x25,x25,x15
308	umulh	x15,x9,x6
309	adcs	x26,x26,x16
310	umulh	x16,x10,x6
311	stp	x19,x20,[x2],#8*2	// t[0..1]
312	adc	x19,xzr,xzr		// t[8]
313	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
314	umulh	x17,x11,x6
315	adcs	x22,x22,x14
316	umulh	x14,x12,x6
317	adcs	x23,x23,x15
318	umulh	x15,x13,x6
319	adcs	x24,x24,x16
320	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
321	adcs	x25,x25,x17
322	mul	x17,x9,x7
323	adcs	x26,x26,x14
324	mul	x14,x10,x7
325	adc	x19,x19,x15
326
327	mul	x15,x11,x7
328	adds	x22,x22,x16
329	mul	x16,x12,x7
330	adcs	x23,x23,x17
331	mul	x17,x13,x7
332	adcs	x24,x24,x14
333	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
334	adcs	x25,x25,x15
335	umulh	x15,x9,x7
336	adcs	x26,x26,x16
337	umulh	x16,x10,x7
338	adcs	x19,x19,x17
339	umulh	x17,x11,x7
340	stp	x21,x22,[x2],#8*2	// t[2..3]
341	adc	x20,xzr,xzr		// t[9]
342	adds	x23,x23,x14
343	umulh	x14,x12,x7
344	adcs	x24,x24,x15
345	umulh	x15,x13,x7
346	adcs	x25,x25,x16
347	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
348	adcs	x26,x26,x17
349	mul	x17,x10,x8
350	adcs	x19,x19,x14
351	mul	x14,x11,x8
352	adc	x20,x20,x15
353
354	mul	x15,x12,x8
355	adds	x24,x24,x16
356	mul	x16,x13,x8
357	adcs	x25,x25,x17
358	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
359	adcs	x26,x26,x14
360	umulh	x14,x10,x8
361	adcs	x19,x19,x15
362	umulh	x15,x11,x8
363	adcs	x20,x20,x16
364	umulh	x16,x12,x8
365	stp	x23,x24,[x2],#8*2	// t[4..5]
366	adc	x21,xzr,xzr		// t[10]
367	adds	x25,x25,x17
368	umulh	x17,x13,x8
369	adcs	x26,x26,x14
370	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
371	adcs	x19,x19,x15
372	mul	x15,x11,x9
373	adcs	x20,x20,x16
374	mul	x16,x12,x9
375	adc	x21,x21,x17
376
377	mul	x17,x13,x9
378	adds	x26,x26,x14
379	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
380	adcs	x19,x19,x15
381	umulh	x15,x11,x9
382	adcs	x20,x20,x16
383	umulh	x16,x12,x9
384	adcs	x21,x21,x17
385	umulh	x17,x13,x9
386	stp	x25,x26,[x2],#8*2	// t[6..7]
387	adc	x22,xzr,xzr		// t[11]
388	adds	x19,x19,x14
389	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
390	adcs	x20,x20,x15
391	mul	x15,x12,x10
392	adcs	x21,x21,x16
393	mul	x16,x13,x10
394	adc	x22,x22,x17
395
396	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
397	adds	x20,x20,x14
398	umulh	x14,x12,x10
399	adcs	x21,x21,x15
400	umulh	x15,x13,x10
401	adcs	x22,x22,x16
402	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
403	adc	x23,xzr,xzr		// t[12]
404	adds	x21,x21,x17
405	mul	x17,x13,x11
406	adcs	x22,x22,x14
407	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
408	adc	x23,x23,x15
409
410	umulh	x15,x13,x11
411	adds	x22,x22,x16
412	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
413	adcs	x23,x23,x17
414	umulh	x17,x13,x12		// hi(a[7]*a[6])
415	adc	x24,xzr,xzr		// t[13]
416	adds	x23,x23,x14
417	sub	x27,x3,x1	// done yet?
418	adc	x24,x24,x15
419
420	adds	x24,x24,x16
421	sub	x14,x3,x5	// rewinded ap
422	adc	x25,xzr,xzr		// t[14]
423	add	x25,x25,x17
424
425	cbz	x27,.Lsqr8x_outer_break
426
// Fold previously stored t[] words into the accumulator window and
// fetch the next 8 a-words; x0 remembers the current a-window base.
427	mov	x4,x6
428	ldp	x6,x7,[x2,#8*0]
429	ldp	x8,x9,[x2,#8*2]
430	ldp	x10,x11,[x2,#8*4]
431	ldp	x12,x13,[x2,#8*6]
432	adds	x19,x19,x6
433	adcs	x20,x20,x7
434	ldp	x6,x7,[x1,#8*0]
435	adcs	x21,x21,x8
436	adcs	x22,x22,x9
437	ldp	x8,x9,[x1,#8*2]
438	adcs	x23,x23,x10
439	adcs	x24,x24,x11
440	ldp	x10,x11,[x1,#8*4]
441	adcs	x25,x25,x12
442	mov	x0,x1
443	adcs	x26,xzr,x13
444	ldp	x12,x13,[x1,#8*6]
445	add	x1,x1,#8*8
446	//adc	x28,xzr,xzr		// moved below
447	mov	x27,#-8*8
448
449	//                                                         a[8]a[0]
450	//                                                     a[9]a[0]
451	//                                                 a[a]a[0]
452	//                                             a[b]a[0]
453	//                                         a[c]a[0]
454	//                                     a[d]a[0]
455	//                                 a[e]a[0]
456	//                             a[f]a[0]
457	//                                                     a[8]a[1]
458	//                         a[f]a[1]........................
459	//                                                 a[8]a[2]
460	//                     a[f]a[2]........................
461	//                                             a[8]a[3]
462	//                 a[f]a[3]........................
463	//                                         a[8]a[4]
464	//             a[f]a[4]........................
465	//                                     a[8]a[5]
466	//         a[f]a[5]........................
467	//                                 a[8]a[6]
468	//     a[f]a[6]........................
469	//                             a[8]a[7]
470	// a[f]a[7]........................
.Lsqr8x_mul:
472	mul	x14,x6,x4
473	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
474	mul	x15,x7,x4
475	add	x27,x27,#8
476	mul	x16,x8,x4
477	mul	x17,x9,x4
478	adds	x19,x19,x14
479	mul	x14,x10,x4
480	adcs	x20,x20,x15
481	mul	x15,x11,x4
482	adcs	x21,x21,x16
483	mul	x16,x12,x4
484	adcs	x22,x22,x17
485	mul	x17,x13,x4
486	adcs	x23,x23,x14
487	umulh	x14,x6,x4
488	adcs	x24,x24,x15
489	umulh	x15,x7,x4
490	adcs	x25,x25,x16
491	umulh	x16,x8,x4
492	adcs	x26,x26,x17
493	umulh	x17,x9,x4
494	adc	x28,x28,xzr
495	str	x19,[x2],#8
496	adds	x19,x20,x14
497	umulh	x14,x10,x4
498	adcs	x20,x21,x15
499	umulh	x15,x11,x4
500	adcs	x21,x22,x16
501	umulh	x16,x12,x4
502	adcs	x22,x23,x17
503	umulh	x17,x13,x4
504	ldr	x4,[x0,x27]
505	adcs	x23,x24,x14
506	adcs	x24,x25,x15
507	adcs	x25,x26,x16
508	adcs	x26,x28,x17
509	//adc	x28,xzr,xzr		// moved above
510	cbnz	x27,.Lsqr8x_mul
511					// note that carry flag is guaranteed
512					// to be zero at this point
513	cmp	x1,x3		// done yet?
514	b.eq	.Lsqr8x_break
515
516	ldp	x6,x7,[x2,#8*0]
517	ldp	x8,x9,[x2,#8*2]
518	ldp	x10,x11,[x2,#8*4]
519	ldp	x12,x13,[x2,#8*6]
520	adds	x19,x19,x6
521	ldr	x4,[x0,#-8*8]
522	adcs	x20,x20,x7
523	ldp	x6,x7,[x1,#8*0]
524	adcs	x21,x21,x8
525	adcs	x22,x22,x9
526	ldp	x8,x9,[x1,#8*2]
527	adcs	x23,x23,x10
528	adcs	x24,x24,x11
529	ldp	x10,x11,[x1,#8*4]
530	adcs	x25,x25,x12
531	mov	x27,#-8*8
532	adcs	x26,x26,x13
533	ldp	x12,x13,[x1,#8*6]
534	add	x1,x1,#8*8
535	//adc	x28,xzr,xzr		// moved above
536	b	.Lsqr8x_mul
537
538.align	4
.Lsqr8x_break:
// End of one pass over the a-vector: advance the a-window by 8 words
// and, unless this was the last iteration, slide the t[] window too.
540	ldp	x6,x7,[x0,#8*0]
541	add	x1,x0,#8*8
542	ldp	x8,x9,[x0,#8*2]
543	sub	x14,x3,x1		// is it last iteration?
544	ldp	x10,x11,[x0,#8*4]
545	sub	x15,x2,x14
546	ldp	x12,x13,[x0,#8*6]
547	cbz	x14,.Lsqr8x_outer_loop
548
549	stp	x19,x20,[x2,#8*0]
550	ldp	x19,x20,[x15,#8*0]
551	stp	x21,x22,[x2,#8*2]
552	ldp	x21,x22,[x15,#8*2]
553	stp	x23,x24,[x2,#8*4]
554	ldp	x23,x24,[x15,#8*4]
555	stp	x25,x26,[x2,#8*6]
556	mov	x2,x15
557	ldp	x25,x26,[x15,#8*6]
558	b	.Lsqr8x_outer_loop
559
560.align	4
.Lsqr8x_outer_break:
561	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
563	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
564	ldp	x15,x16,[sp,#8*1]
565	ldp	x11,x13,[x14,#8*2]
566	add	x1,x14,#8*4
567	ldp	x17,x14,[sp,#8*3]
568
569	stp	x19,x20,[x2,#8*0]
570	mul	x19,x7,x7
571	stp	x21,x22,[x2,#8*2]
572	umulh	x7,x7,x7
573	stp	x23,x24,[x2,#8*4]
574	mul	x8,x9,x9
575	stp	x25,x26,[x2,#8*6]
576	mov	x2,sp
577	umulh	x9,x9,x9
578	adds	x20,x7,x15,lsl#1
579	extr	x15,x16,x15,#63
580	sub	x27,x5,#8*4
581
// Shift-and-add: double the off-diagonal words via extr (shift left
// by 1 across word boundaries) while adding in the squares a[i]^2.
.Lsqr4x_shift_n_add:
583	adcs	x21,x8,x15
584	extr	x16,x17,x16,#63
585	sub	x27,x27,#8*4
586	adcs	x22,x9,x16
587	ldp	x15,x16,[x2,#8*5]
588	mul	x10,x11,x11
589	ldp	x7,x9,[x1],#8*2
590	umulh	x11,x11,x11
591	mul	x12,x13,x13
592	umulh	x13,x13,x13
593	extr	x17,x14,x17,#63
594	stp	x19,x20,[x2,#8*0]
595	adcs	x23,x10,x17
596	extr	x14,x15,x14,#63
597	stp	x21,x22,[x2,#8*2]
598	adcs	x24,x11,x14
599	ldp	x17,x14,[x2,#8*7]
600	extr	x15,x16,x15,#63
601	adcs	x25,x12,x15
602	extr	x16,x17,x16,#63
603	adcs	x26,x13,x16
604	ldp	x15,x16,[x2,#8*9]
605	mul	x6,x7,x7
606	ldp	x11,x13,[x1],#8*2
607	umulh	x7,x7,x7
608	mul	x8,x9,x9
609	umulh	x9,x9,x9
610	stp	x23,x24,[x2,#8*4]
611	extr	x17,x14,x17,#63
612	stp	x25,x26,[x2,#8*6]
613	add	x2,x2,#8*8
614	adcs	x19,x6,x17
615	extr	x14,x15,x14,#63
616	adcs	x20,x7,x14
617	ldp	x17,x14,[x2,#8*3]
618	extr	x15,x16,x15,#63
619	cbnz	x27,.Lsqr4x_shift_n_add
620	ldp	x1,x4,[x29,#104]	// pull np and n0
621
622	adcs	x21,x8,x15
623	extr	x16,x17,x16,#63
624	adcs	x22,x9,x16
625	ldp	x15,x16,[x2,#8*5]
626	mul	x10,x11,x11
627	umulh	x11,x11,x11
628	stp	x19,x20,[x2,#8*0]
629	mul	x12,x13,x13
630	umulh	x13,x13,x13
631	stp	x21,x22,[x2,#8*2]
632	extr	x17,x14,x17,#63
633	adcs	x23,x10,x17
634	extr	x14,x15,x14,#63
635	ldp	x19,x20,[sp,#8*0]
636	adcs	x24,x11,x14
637	extr	x15,x16,x15,#63
638	ldp	x6,x7,[x1,#8*0]
639	adcs	x25,x12,x15
640	extr	x16,xzr,x16,#63
641	ldp	x8,x9,[x1,#8*2]
642	adc	x26,x13,x16
643	ldp	x10,x11,[x1,#8*4]
644
645	// Reduce by 512 bits per iteration
646	mul	x28,x4,x19		// t[0]*n0
647	ldp	x12,x13,[x1,#8*6]
648	add	x3,x1,x5
649	ldp	x21,x22,[sp,#8*2]
650	stp	x23,x24,[x2,#8*4]
651	ldp	x23,x24,[sp,#8*4]
652	stp	x25,x26,[x2,#8*6]
653	ldp	x25,x26,[sp,#8*6]
654	add	x1,x1,#8*8
655	mov	x30,xzr		// initial top-most carry
656	mov	x2,sp
657	mov	x27,#8
658
// Montgomery reduction: per iteration, zero one low word of t[] by
// adding n[0..7]*(t[0]*n0); uses the same subs-xzr carry trick as the
// generic path (the discarded low sum carries iff t[0] != 0).
.Lsqr8x_reduction:
660	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
661	mul	x15,x7,x28
662	sub	x27,x27,#1
663	mul	x16,x8,x28
664	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
665	mul	x17,x9,x28
666	// (*)	adds	xzr,x19,x14
667	subs	xzr,x19,#1		// (*)
668	mul	x14,x10,x28
669	adcs	x19,x20,x15
670	mul	x15,x11,x28
671	adcs	x20,x21,x16
672	mul	x16,x12,x28
673	adcs	x21,x22,x17
674	mul	x17,x13,x28
675	adcs	x22,x23,x14
676	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
677	adcs	x23,x24,x15
678	umulh	x15,x7,x28
679	adcs	x24,x25,x16
680	umulh	x16,x8,x28
681	adcs	x25,x26,x17
682	umulh	x17,x9,x28
683	adc	x26,xzr,xzr
684	adds	x19,x19,x14
685	umulh	x14,x10,x28
686	adcs	x20,x20,x15
687	umulh	x15,x11,x28
688	adcs	x21,x21,x16
689	umulh	x16,x12,x28
690	adcs	x22,x22,x17
691	umulh	x17,x13,x28
692	mul	x28,x4,x19		// next t[0]*n0
693	adcs	x23,x23,x14
694	adcs	x24,x24,x15
695	adcs	x25,x25,x16
696	adc	x26,x26,x17
697	cbnz	x27,.Lsqr8x_reduction
698
699	ldp	x14,x15,[x2,#8*0]
700	ldp	x16,x17,[x2,#8*2]
701	mov	x0,x2
702	sub	x27,x3,x1	// done yet?
703	adds	x19,x19,x14
704	adcs	x20,x20,x15
705	ldp	x14,x15,[x2,#8*4]
706	adcs	x21,x21,x16
707	adcs	x22,x22,x17
708	ldp	x16,x17,[x2,#8*6]
709	adcs	x23,x23,x14
710	adcs	x24,x24,x15
711	adcs	x25,x25,x16
712	adcs	x26,x26,x17
713	//adc	x28,xzr,xzr		// moved below
714	cbz	x27,.Lsqr8x8_post_condition
715
716	ldr	x4,[x2,#-8*8]
717	ldp	x6,x7,[x1,#8*0]
718	ldp	x8,x9,[x1,#8*2]
719	ldp	x10,x11,[x1,#8*4]
720	mov	x27,#-8*8
721	ldp	x12,x13,[x1,#8*6]
722	add	x1,x1,#8*8
723
// Tail: propagate the reduction (saved t[0]*n0 values times the
// remaining n-words) through the rest of the t[] vector.
.Lsqr8x_tail:
725	mul	x14,x6,x4
726	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
727	mul	x15,x7,x4
728	add	x27,x27,#8
729	mul	x16,x8,x4
730	mul	x17,x9,x4
731	adds	x19,x19,x14
732	mul	x14,x10,x4
733	adcs	x20,x20,x15
734	mul	x15,x11,x4
735	adcs	x21,x21,x16
736	mul	x16,x12,x4
737	adcs	x22,x22,x17
738	mul	x17,x13,x4
739	adcs	x23,x23,x14
740	umulh	x14,x6,x4
741	adcs	x24,x24,x15
742	umulh	x15,x7,x4
743	adcs	x25,x25,x16
744	umulh	x16,x8,x4
745	adcs	x26,x26,x17
746	umulh	x17,x9,x4
747	adc	x28,x28,xzr
748	str	x19,[x2],#8
749	adds	x19,x20,x14
750	umulh	x14,x10,x4
751	adcs	x20,x21,x15
752	umulh	x15,x11,x4
753	adcs	x21,x22,x16
754	umulh	x16,x12,x4
755	adcs	x22,x23,x17
756	umulh	x17,x13,x4
757	ldr	x4,[x0,x27]
758	adcs	x23,x24,x14
759	adcs	x24,x25,x15
760	adcs	x25,x26,x16
761	adcs	x26,x28,x17
762	//adc	x28,xzr,xzr		// moved above
763	cbnz	x27,.Lsqr8x_tail
764					// note that carry flag is guaranteed
765					// to be zero at this point
766	ldp	x6,x7,[x2,#8*0]
767	sub	x27,x3,x1	// done yet?
768	sub	x16,x3,x5	// rewinded np
769	ldp	x8,x9,[x2,#8*2]
770	ldp	x10,x11,[x2,#8*4]
771	ldp	x12,x13,[x2,#8*6]
772	cbz	x27,.Lsqr8x_tail_break
773
774	ldr	x4,[x0,#-8*8]
775	adds	x19,x19,x6
776	adcs	x20,x20,x7
777	ldp	x6,x7,[x1,#8*0]
778	adcs	x21,x21,x8
779	adcs	x22,x22,x9
780	ldp	x8,x9,[x1,#8*2]
781	adcs	x23,x23,x10
782	adcs	x24,x24,x11
783	ldp	x10,x11,[x1,#8*4]
784	adcs	x25,x25,x12
785	mov	x27,#-8*8
786	adcs	x26,x26,x13
787	ldp	x12,x13,[x1,#8*6]
788	add	x1,x1,#8*8
789	//adc	x28,xzr,xzr		// moved above
790	b	.Lsqr8x_tail
791
792.align	4
.Lsqr8x_tail_break:
// End of one reduction window: fold in the top-most carry (x30),
// slide the t[] window down and loop until the bottom of t[] (x29
// frame base) is reached.
794	ldr	x4,[x29,#112]		// pull n0
795	add	x27,x2,#8*8		// end of current t[num] window
796
797	subs	xzr,x30,#1		// "move" top-most carry to carry bit
798	adcs	x14,x19,x6
799	adcs	x15,x20,x7
800	ldp	x19,x20,[x0,#8*0]
801	adcs	x21,x21,x8
802	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
803	adcs	x22,x22,x9
804	ldp	x8,x9,[x16,#8*2]
805	adcs	x23,x23,x10
806	adcs	x24,x24,x11
807	ldp	x10,x11,[x16,#8*4]
808	adcs	x25,x25,x12
809	adcs	x26,x26,x13
810	ldp	x12,x13,[x16,#8*6]
811	add	x1,x16,#8*8
812	adc	x30,xzr,xzr	// top-most carry
813	mul	x28,x4,x19
814	stp	x14,x15,[x2,#8*0]
815	stp	x21,x22,[x2,#8*2]
816	ldp	x21,x22,[x0,#8*2]
817	stp	x23,x24,[x2,#8*4]
818	ldp	x23,x24,[x0,#8*4]
819	cmp	x27,x29		// did we hit the bottom?
820	stp	x25,x26,[x2,#8*6]
821	mov	x2,x0			// slide the window
822	ldp	x25,x26,[x0,#8*6]
823	mov	x27,#8
824	b.ne	.Lsqr8x_reduction
825
826	// Final step. We see if result is larger than modulus, and
827	// if it is, subtract the modulus. But comparison implies
828	// subtraction. So we subtract modulus, see if it borrowed,
829	// and conditionally copy original value.
830	ldr	x0,[x29,#96]		// pull rp
831	add	x2,x2,#8*8
832	subs	x14,x19,x6
833	sbcs	x15,x20,x7
834	sub	x27,x5,#8*8
835	mov	x3,x0		// x0 copy
836
.Lsqr8x_sub:
838	sbcs	x16,x21,x8
839	ldp	x6,x7,[x1,#8*0]
840	sbcs	x17,x22,x9
841	stp	x14,x15,[x0,#8*0]
842	sbcs	x14,x23,x10
843	ldp	x8,x9,[x1,#8*2]
844	sbcs	x15,x24,x11
845	stp	x16,x17,[x0,#8*2]
846	sbcs	x16,x25,x12
847	ldp	x10,x11,[x1,#8*4]
848	sbcs	x17,x26,x13
849	ldp	x12,x13,[x1,#8*6]
850	add	x1,x1,#8*8
851	ldp	x19,x20,[x2,#8*0]
852	sub	x27,x27,#8*8
853	ldp	x21,x22,[x2,#8*2]
854	ldp	x23,x24,[x2,#8*4]
855	ldp	x25,x26,[x2,#8*6]
856	add	x2,x2,#8*8
857	stp	x14,x15,[x0,#8*4]
858	sbcs	x14,x19,x6
859	stp	x16,x17,[x0,#8*6]
860	add	x0,x0,#8*8
861	sbcs	x15,x20,x7
862	cbnz	x27,.Lsqr8x_sub
863
864	sbcs	x16,x21,x8
865	mov	x2,sp
866	add	x1,sp,x5
867	ldp	x6,x7,[x3,#8*0]
868	sbcs	x17,x22,x9
869	stp	x14,x15,[x0,#8*0]
870	sbcs	x14,x23,x10
871	ldp	x8,x9,[x3,#8*2]
872	sbcs	x15,x24,x11
873	stp	x16,x17,[x0,#8*2]
874	sbcs	x16,x25,x12
875	ldp	x19,x20,[x1,#8*0]
876	sbcs	x17,x26,x13
877	ldp	x21,x22,[x1,#8*2]
878	sbcs	xzr,x30,xzr	// did it borrow?
879	ldr	x30,[x29,#8]		// pull return address
880	stp	x14,x15,[x0,#8*4]
881	stp	x16,x17,[x0,#8*6]
882
// Branchless csel-based select between t[] and t[]-n[] driven by the
// borrow flag, wiping the stack temporaries as we go.
883	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
885	sub	x27,x27,#8*4
886	csel	x14,x19,x6,lo
887	stp	xzr,xzr,[x2,#8*0]
888	csel	x15,x20,x7,lo
889	ldp	x6,x7,[x3,#8*4]
890	ldp	x19,x20,[x1,#8*4]
891	csel	x16,x21,x8,lo
892	stp	xzr,xzr,[x2,#8*2]
893	add	x2,x2,#8*4
894	csel	x17,x22,x9,lo
895	ldp	x8,x9,[x3,#8*6]
896	ldp	x21,x22,[x1,#8*6]
897	add	x1,x1,#8*4
898	stp	x14,x15,[x3,#8*0]
899	stp	x16,x17,[x3,#8*2]
900	add	x3,x3,#8*4
901	stp	xzr,xzr,[x1,#8*0]
902	stp	xzr,xzr,[x1,#8*2]
903	cbnz	x27,.Lsqr4x_cond_copy
904
905	csel	x14,x19,x6,lo
906	stp	xzr,xzr,[x2,#8*0]
907	csel	x15,x20,x7,lo
908	stp	xzr,xzr,[x2,#8*2]
909	csel	x16,x21,x8,lo
910	csel	x17,x22,x9,lo
911	stp	x14,x15,[x3,#8*0]
912	stp	x16,x17,[x3,#8*2]
913
914	b	.Lsqr8x_done
915
916.align	4
.Lsqr8x8_post_condition:
// Special case num==8: the whole result is still in registers;
// subtract the modulus, select on borrow, wipe the stack temporaries.
918	adc	x28,xzr,xzr
919	ldr	x30,[x29,#8]		// pull return address
920	// x19-7,x28 hold result, x6-7 hold modulus
921	subs	x6,x19,x6
922	ldr	x1,[x29,#96]		// pull rp
923	sbcs	x7,x20,x7
924	stp	xzr,xzr,[sp,#8*0]
925	sbcs	x8,x21,x8
926	stp	xzr,xzr,[sp,#8*2]
927	sbcs	x9,x22,x9
928	stp	xzr,xzr,[sp,#8*4]
929	sbcs	x10,x23,x10
930	stp	xzr,xzr,[sp,#8*6]
931	sbcs	x11,x24,x11
932	stp	xzr,xzr,[sp,#8*8]
933	sbcs	x12,x25,x12
934	stp	xzr,xzr,[sp,#8*10]
935	sbcs	x13,x26,x13
936	stp	xzr,xzr,[sp,#8*12]
937	sbcs	x28,x28,xzr	// did it borrow?
938	stp	xzr,xzr,[sp,#8*14]
939
940	// x6-7 hold result-modulus
941	csel	x6,x19,x6,lo
942	csel	x7,x20,x7,lo
943	csel	x8,x21,x8,lo
944	csel	x9,x22,x9,lo
945	stp	x6,x7,[x1,#8*0]
946	csel	x10,x23,x10,lo
947	csel	x11,x24,x11,lo
948	stp	x8,x9,[x1,#8*2]
949	csel	x12,x25,x12,lo
950	csel	x13,x26,x13,lo
951	stp	x10,x11,[x1,#8*4]
952	stp	x12,x13,[x1,#8*6]
953
.Lsqr8x_done:
// Epilogue: restore x19-x28, undo alloca via x29, return 1 in x0.
955	ldp	x19,x20,[x29,#16]
956	mov	sp,x29
957	ldp	x21,x22,[x29,#32]
958	mov	x0,#1
959	ldp	x23,x24,[x29,#48]
960	ldp	x25,x26,[x29,#64]
961	ldp	x27,x28,[x29,#80]
962	ldr	x29,[sp],#128
963	ret
964.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont: Montgomery multiplication processing 4 words per
// inner iteration.  Reached from bn_mul_mont when num%4==0 (and from
// __bn_sqr8x_mont when num%8==0 but ap!=bp).  Same argument layout as
// bn_mul_mont: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
// Interleaves multiplication by b[i] with Montgomery reduction, putting
// the per-word t[0]*n0 factors aside on the stack for tail processing.
965.type	__bn_mul4x_mont,%function
966.align	5
967__bn_mul4x_mont:
// Prologue: 128-byte frame, save x19-x28; t[] (num+4 words) below sp.
968	stp	x29,x30,[sp,#-128]!
969	add	x29,sp,#0
970	stp	x19,x20,[sp,#16]
971	stp	x21,x22,[sp,#32]
972	stp	x23,x24,[sp,#48]
973	stp	x25,x26,[sp,#64]
974	stp	x27,x28,[sp,#80]
975
976	sub	x26,sp,x5,lsl#3
977	lsl	x5,x5,#3
978	ldr	x4,[x4]		// *n0
979	sub	sp,x26,#8*4		// alloca
980
981	add	x10,x2,x5
982	add	x27,x1,x5
983	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
984
985	ldr	x24,[x2,#8*0]		// b[0]
986	ldp	x6,x7,[x1,#8*0]	// a[0..3]
987	ldp	x8,x9,[x1,#8*2]
988	add	x1,x1,#8*4
989	mov	x19,xzr
990	mov	x20,xzr
991	mov	x21,xzr
992	mov	x22,xzr
993	ldp	x14,x15,[x3,#8*0]	// n[0..3]
994	ldp	x16,x17,[x3,#8*2]
995	adds	x3,x3,#8*4		// clear carry bit
996	mov	x0,xzr
997	mov	x28,#0
998	mov	x26,sp
999
// First pass over b[0..3]: t[] = a[0..3]*b[i] + (t[0]*n0)*n[0..3].
// x28 cycles 8,16,24,0 (and x28,#31) so [x2,x28] walks b[1..3] then
// wraps to b[0]; the subs-xzr trick replaces the discarded low word
// of the reduction exactly as in the generic path.
.Loop_mul4x_1st_reduction:
1001	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1002	adc	x0,x0,xzr	// modulo-scheduled
1003	mul	x11,x7,x24
1004	add	x28,x28,#8
1005	mul	x12,x8,x24
1006	and	x28,x28,#31
1007	mul	x13,x9,x24
1008	adds	x19,x19,x10
1009	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1010	adcs	x20,x20,x11
1011	mul	x25,x19,x4		// t[0]*n0
1012	adcs	x21,x21,x12
1013	umulh	x11,x7,x24
1014	adcs	x22,x22,x13
1015	umulh	x12,x8,x24
1016	adc	x23,xzr,xzr
1017	umulh	x13,x9,x24
1018	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1019	adds	x20,x20,x10
1020	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1021	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1022	adcs	x21,x21,x11
1023	mul	x11,x15,x25
1024	adcs	x22,x22,x12
1025	mul	x12,x16,x25
1026	adc	x23,x23,x13		// can't overflow
1027	mul	x13,x17,x25
1028	// (*)	adds	xzr,x19,x10
1029	subs	xzr,x19,#1		// (*)
1030	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1031	adcs	x19,x20,x11
1032	umulh	x11,x15,x25
1033	adcs	x20,x21,x12
1034	umulh	x12,x16,x25
1035	adcs	x21,x22,x13
1036	umulh	x13,x17,x25
1037	adcs	x22,x23,x0
1038	adc	x0,xzr,xzr
1039	adds	x19,x19,x10
1040	sub	x10,x27,x1
1041	adcs	x20,x20,x11
1042	adcs	x21,x21,x12
1043	adcs	x22,x22,x13
1044	//adc	x0,x0,xzr
1045	cbnz	x28,.Loop_mul4x_1st_reduction
1046
1047	cbz	x10,.Lmul4x4_post_condition
1048
1049	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1050	ldp	x8,x9,[x1,#8*2]
1051	add	x1,x1,#8*4
1052	ldr	x25,[sp]		// a[0]*n0
1053	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1054	ldp	x16,x17,[x3,#8*2]
1055	add	x3,x3,#8*4
1056
// First-pass tail: extend a[4..]*b[i] + n[4..]*(saved t[0]*n0) over
// the remaining words, streaming results out through x26.
.Loop_mul4x_1st_tail:
1058	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1059	adc	x0,x0,xzr	// modulo-scheduled
1060	mul	x11,x7,x24
1061	add	x28,x28,#8
1062	mul	x12,x8,x24
1063	and	x28,x28,#31
1064	mul	x13,x9,x24
1065	adds	x19,x19,x10
1066	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1067	adcs	x20,x20,x11
1068	umulh	x11,x7,x24
1069	adcs	x21,x21,x12
1070	umulh	x12,x8,x24
1071	adcs	x22,x22,x13
1072	umulh	x13,x9,x24
1073	adc	x23,xzr,xzr
1074	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1075	adds	x20,x20,x10
1076	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1077	adcs	x21,x21,x11
1078	mul	x11,x15,x25
1079	adcs	x22,x22,x12
1080	mul	x12,x16,x25
1081	adc	x23,x23,x13		// can't overflow
1082	mul	x13,x17,x25
1083	adds	x19,x19,x10
1084	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1085	adcs	x20,x20,x11
1086	umulh	x11,x15,x25
1087	adcs	x21,x21,x12
1088	umulh	x12,x16,x25
1089	adcs	x22,x22,x13
1090	adcs	x23,x23,x0
1091	umulh	x13,x17,x25
1092	adc	x0,xzr,xzr
1093	ldr	x25,[sp,x28]		// next t[0]*n0
1094	str	x19,[x26],#8		// result!!!
1095	adds	x19,x20,x10
1096	sub	x10,x27,x1		// done yet?
1097	adcs	x20,x21,x11
1098	adcs	x21,x22,x12
1099	adcs	x22,x23,x13
1100	//adc	x0,x0,xzr
1101	cbnz	x28,.Loop_mul4x_1st_tail
1102
1103	sub	x11,x27,x5	// rewinded x1
1104	cbz	x10,.Lmul4x_proceed
1105
1106	ldp	x6,x7,[x1,#8*0]
1107	ldp	x8,x9,[x1,#8*2]
1108	add	x1,x1,#8*4
1109	ldp	x14,x15,[x3,#8*0]
1110	ldp	x16,x17,[x3,#8*2]
1111	add	x3,x3,#8*4
1112	b	.Loop_mul4x_1st_tail
1113
1114.align	5
.Lmul4x_proceed:
// Advance to the next group of 4 b-words; rewind ap/np and reload the
// accumulator window from t[]; x30 carries the group's overflow.
1116	ldr	x24,[x2,#8*4]!		// *++b
1117	adc	x30,x0,xzr
1118	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1119	sub	x3,x3,x5		// rewind np
1120	ldp	x8,x9,[x11,#8*2]
1121	add	x1,x11,#8*4
1122
1123	stp	x19,x20,[x26,#8*0]	// result!!!
1124	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1125	stp	x21,x22,[x26,#8*2]	// result!!!
1126	ldp	x21,x22,[sp,#8*6]
1127
1128	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1129	mov	x26,sp
1130	ldp	x16,x17,[x3,#8*2]
1131	adds	x3,x3,#8*4		// clear carry bit
1132	mov	x0,xzr
1133
1134.align	4
.Loop_mul4x_reduction:
1136	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1137	adc	x0,x0,xzr	// modulo-scheduled
1138	mul	x11,x7,x24
1139	add	x28,x28,#8
1140	mul	x12,x8,x24
1141	and	x28,x28,#31
1142	mul	x13,x9,x24
1143	adds	x19,x19,x10
1144	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1145	adcs	x20,x20,x11
1146	mul	x25,x19,x4		// t[0]*n0
1147	adcs	x21,x21,x12
1148	umulh	x11,x7,x24
1149	adcs	x22,x22,x13
1150	umulh	x12,x8,x24
1151	adc	x23,xzr,xzr
1152	umulh	x13,x9,x24
1153	ldr	x24,[x2,x28]		// next b[i]
1154	adds	x20,x20,x10
1155	// (*)	mul	x10,x14,x25
1156	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1157	adcs	x21,x21,x11
1158	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1159	adcs	x22,x22,x12
1160	mul	x12,x16,x25
1161	adc	x23,x23,x13		// can't overflow
1162	mul	x13,x17,x25
1163	// (*)	adds	xzr,x19,x10
1164	subs	xzr,x19,#1		// (*)
1165	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1166	adcs	x19,x20,x11
1167	umulh	x11,x15,x25
1168	adcs	x20,x21,x12
1169	umulh	x12,x16,x25
1170	adcs	x21,x22,x13
1171	umulh	x13,x17,x25
1172	adcs	x22,x23,x0
1173	adc	x0,xzr,xzr
1174	adds	x19,x19,x10
1175	adcs	x20,x20,x11
1176	adcs	x21,x21,x12
1177	adcs	x22,x22,x13
1178	//adc	x0,x0,xzr
1179	cbnz	x28,.Loop_mul4x_reduction
1180
1181	adc	x0,x0,xzr
1182	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1183	ldp	x12,x13,[x26,#8*6]
1184	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1185	ldp	x8,x9,[x1,#8*2]
1186	add	x1,x1,#8*4
1187	adds	x19,x19,x10
1188	adcs	x20,x20,x11
1189	adcs	x21,x21,x12
1190	adcs	x22,x22,x13
1191	//adc	x0,x0,xzr
1192
1193	ldr	x25,[sp]		// t[0]*n0
1194	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1195	ldp	x16,x17,[x3,#8*2]
1196	add	x3,x3,#8*4
1197
1198.align	4
.Loop_mul4x_tail:
1200	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1201	adc	x0,x0,xzr	// modulo-scheduled
1202	mul	x11,x7,x24
1203	add	x28,x28,#8
1204	mul	x12,x8,x24
1205	and	x28,x28,#31
1206	mul	x13,x9,x24
1207	adds	x19,x19,x10
1208	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1209	adcs	x20,x20,x11
1210	umulh	x11,x7,x24
1211	adcs	x21,x21,x12
1212	umulh	x12,x8,x24
1213	adcs	x22,x22,x13
1214	umulh	x13,x9,x24
1215	adc	x23,xzr,xzr
1216	ldr	x24,[x2,x28]		// next b[i]
1217	adds	x20,x20,x10
1218	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1219	adcs	x21,x21,x11
1220	mul	x11,x15,x25
1221	adcs	x22,x22,x12
1222	mul	x12,x16,x25
1223	adc	x23,x23,x13		// can't overflow
1224	mul	x13,x17,x25
1225	adds	x19,x19,x10
1226	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1227	adcs	x20,x20,x11
1228	umulh	x11,x15,x25
1229	adcs	x21,x21,x12
1230	umulh	x12,x16,x25
1231	adcs	x22,x22,x13
1232	umulh	x13,x17,x25
1233	adcs	x23,x23,x0
1234	ldr	x25,[sp,x28]		// next a[0]*n0
1235	adc	x0,xzr,xzr
1236	str	x19,[x26],#8		// result!!!
1237	adds	x19,x20,x10
1238	sub	x10,x27,x1		// done yet?
1239	adcs	x20,x21,x11
1240	adcs	x21,x22,x12
1241	adcs	x22,x23,x13
1242	//adc	x0,x0,xzr
1243	cbnz	x28,.Loop_mul4x_tail
1244
1245	sub	x11,x3,x5		// rewinded np?
1246	adc	x0,x0,xzr
1247	cbz	x10,.Loop_mul4x_break
1248
1249	ldp	x10,x11,[x26,#8*4]
1250	ldp	x12,x13,[x26,#8*6]
1251	ldp	x6,x7,[x1,#8*0]
1252	ldp	x8,x9,[x1,#8*2]
1253	add	x1,x1,#8*4
1254	adds	x19,x19,x10
1255	adcs	x20,x20,x11
1256	adcs	x21,x21,x12
1257	adcs	x22,x22,x13
1258	//adc	x0,x0,xzr
1259	ldp	x14,x15,[x3,#8*0]
1260	ldp	x16,x17,[x3,#8*2]
1261	add	x3,x3,#8*4
1262	b	.Loop_mul4x_tail
1263
1264.align	4
.Loop_mul4x_break:
// End of one 4-word b-group: fold in the previous group's overflow
// (x30), store the top words, and loop until b is exhausted.
1266	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
1267	adds	x19,x19,x30
1268	add	x2,x2,#8*4		// bp++
1269	adcs	x20,x20,xzr
1270	sub	x1,x1,x5		// rewind ap
1271	adcs	x21,x21,xzr
1272	stp	x19,x20,[x26,#8*0]	// result!!!
1273	adcs	x22,x22,xzr
1274	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1275	adc	x30,x0,xzr
1276	stp	x21,x22,[x26,#8*2]	// result!!!
1277	cmp	x2,x13			// done yet?
1278	ldp	x21,x22,[sp,#8*6]
1279	ldp	x14,x15,[x11,#8*0]	// n[0..3]
1280	ldp	x16,x17,[x11,#8*2]
1281	add	x3,x11,#8*4
1282	b.eq	.Lmul4x_post
1283
1284	ldr	x24,[x2]
1285	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1286	ldp	x8,x9,[x1,#8*2]
1287	adds	x1,x1,#8*4		// clear carry bit
1288	mov	x0,xzr
1289	mov	x26,sp
1290	b	.Loop_mul4x_reduction
1291
1292.align	4
.Lmul4x_post:
1294	// Final step. We see if result is larger than modulus, and
1295	// if it is, subtract the modulus. But comparison implies
1296	// subtraction. So we subtract modulus, see if it borrowed,
1297	// and conditionally copy original value.
1298	mov	x0,x12
1299	mov	x27,x12		// x0 copy
1300	subs	x10,x19,x14
1301	add	x26,sp,#8*8
1302	sbcs	x11,x20,x15
1303	sub	x28,x5,#8*4
1304
.Lmul4x_sub:
1306	sbcs	x12,x21,x16
1307	ldp	x14,x15,[x3,#8*0]
1308	sub	x28,x28,#8*4
1309	ldp	x19,x20,[x26,#8*0]
1310	sbcs	x13,x22,x17
1311	ldp	x16,x17,[x3,#8*2]
1312	add	x3,x3,#8*4
1313	ldp	x21,x22,[x26,#8*2]
1314	add	x26,x26,#8*4
1315	stp	x10,x11,[x0,#8*0]
1316	sbcs	x10,x19,x14
1317	stp	x12,x13,[x0,#8*2]
1318	add	x0,x0,#8*4
1319	sbcs	x11,x20,x15
1320	cbnz	x28,.Lmul4x_sub
1321
1322	sbcs	x12,x21,x16
1323	mov	x26,sp
1324	add	x1,sp,#8*4
1325	ldp	x6,x7,[x27,#8*0]
1326	sbcs	x13,x22,x17
1327	stp	x10,x11,[x0,#8*0]
1328	ldp	x8,x9,[x27,#8*2]
1329	stp	x12,x13,[x0,#8*2]
1330	ldp	x19,x20,[x1,#8*0]
1331	ldp	x21,x22,[x1,#8*2]
1332	sbcs	xzr,x30,xzr	// did it borrow?
1333	ldr	x30,[x29,#8]		// pull return address
1334
// Branchless csel-based select between t[] and t[]-n[] on the borrow
// flag, wiping the stack temporaries as we go.
1335	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
1337	sub	x28,x28,#8*4
1338	csel	x10,x19,x6,lo
1339	stp	xzr,xzr,[x26,#8*0]
1340	csel	x11,x20,x7,lo
1341	ldp	x6,x7,[x27,#8*4]
1342	ldp	x19,x20,[x1,#8*4]
1343	csel	x12,x21,x8,lo
1344	stp	xzr,xzr,[x26,#8*2]
1345	add	x26,x26,#8*4
1346	csel	x13,x22,x9,lo
1347	ldp	x8,x9,[x27,#8*6]
1348	ldp	x21,x22,[x1,#8*6]
1349	add	x1,x1,#8*4
1350	stp	x10,x11,[x27,#8*0]
1351	stp	x12,x13,[x27,#8*2]
1352	add	x27,x27,#8*4
1353	cbnz	x28,.Lmul4x_cond_copy
1354
1355	csel	x10,x19,x6,lo
1356	stp	xzr,xzr,[x26,#8*0]
1357	csel	x11,x20,x7,lo
1358	stp	xzr,xzr,[x26,#8*2]
1359	csel	x12,x21,x8,lo
1360	stp	xzr,xzr,[x26,#8*3]
1361	csel	x13,x22,x9,lo
1362	stp	xzr,xzr,[x26,#8*4]
1363	stp	x10,x11,[x27,#8*0]
1364	stp	x12,x13,[x27,#8*2]
1365
1366	b	.Lmul4x_done
1367
1368.align	4
.Lmul4x4_post_condition:
// Special case num==4: the whole result is still in registers;
// subtract the modulus, select on borrow, wipe the stack temporaries.
1370	adc	x0,x0,xzr
1371	ldr	x1,[x29,#96]		// pull rp
1372	// x19-3,x0 hold result, x14-7 hold modulus
1373	subs	x6,x19,x14
1374	ldr	x30,[x29,#8]		// pull return address
1375	sbcs	x7,x20,x15
1376	stp	xzr,xzr,[sp,#8*0]
1377	sbcs	x8,x21,x16
1378	stp	xzr,xzr,[sp,#8*2]
1379	sbcs	x9,x22,x17
1380	stp	xzr,xzr,[sp,#8*4]
1381	sbcs	xzr,x0,xzr		// did it borrow?
1382	stp	xzr,xzr,[sp,#8*6]
1383
1384	// x6-3 hold result-modulus
1385	csel	x6,x19,x6,lo
1386	csel	x7,x20,x7,lo
1387	csel	x8,x21,x8,lo
1388	csel	x9,x22,x9,lo
1389	stp	x6,x7,[x1,#8*0]
1390	stp	x8,x9,[x1,#8*2]
1391
.Lmul4x_done:
// Epilogue: restore x19-x28, undo alloca via x29, return 1 in x0.
1393	ldp	x19,x20,[x29,#16]
1394	mov	sp,x29
1395	ldp	x21,x22,[x29,#32]
1396	mov	x0,#1
1397	ldp	x23,x24,[x29,#48]
1398	ldp	x25,x26,[x29,#64]
1399	ldp	x27,x28,[x29,#80]
1400	ldr	x29,[sp],#128
1401	ret
1402.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1403.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1404.align	2
1405.align	4
1406#endif