#if defined(__aarch64__)
.text

.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	The first multiplication and addition are removed: the
	//	outcome of the first addition is guaranteed to be zero,
	//	which leaves only two computationally significant
	//	outcomes: it either carries or it doesn't. So when does
	//	it carry? Is there an alternative way to deduce it? If
	//	you follow the operations, you can observe that the
	//	condition for carry is quite simple: x6 being non-zero.
	//	So the carry can be calculated by adding -1 to x6, which
	//	is what the next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
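	// A rough C model of the (*) shortcut above (illustrative sketch
	// only; t0 and m1 are hypothetical stand-ins for x6 and x15):
	//	// lo64(t0 + m1*np[0]) == 0 by choice of n0, so the sum
	//	// either wraps (carry) or both halves were zero:
	//	carry = (t0 != 0);
	// "subs xzr,x6,#1" borrows exactly when x6 == 0, i.e. it sets the
	// carry flag exactly when x6 != 0, which the adc above consumes.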
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// topmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// topmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if the result is larger than the modulus
	// and, if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check if it
	// borrowed, and conditionally copy the original value.
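	// A rough C model of the subtract-and-select below (illustrative
	// only; num/tp/np/rp mirror x5, the stack buffer, x3 and x0, and
	// sub_words is a hypothetical multi-word subtraction helper):
	//	borrow = sub_words(rp, tp, np, num);	// rp = tp - np
	//	for (j = 0; j < num; j++)
	//		rp[j] = borrow ? tp[j] : rp[j];	// constant-time
	// The "lo" condition on csel is the borrow left by the sbcs
	// chain, so no data-dependent branch is taken.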
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
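        // A rough scalar model of this pass (illustrative only): it
        // accumulates the off-diagonal products of one 8-limb window,
        //	for (i = 0; i < 8; i++)
        //		for (j = i+1; j < 8; j++)
        //			t[i+j] += (uint128_t)a[i]*a[j];
        // the doubling of these cross products and the a[i]*a[i]
        // diagonal are folded in later, at .Lsqr8x_outer_break.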

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it the last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
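	// A rough C model of the shift-and-add below (illustrative only;
	// num counts limbs here, while x5 holds num*8 bytes):
	//	for (k = 2*num-1; k > 0; k--)	// t *= 2, done via extr
	//		t[k] = (t[k] << 1) | (t[k-1] >> 63);
	//	t[0] <<= 1;
	//	for (i = 0; i < num; i++)	// add the squared diagonal
	//		add_2limb(&t[2*i], (uint128_t)a[i]*a[i]);
	// "extr xD,xHI,xLO,#63" produces (xHI<<1)|(xLO>>63) in one step;
	// add_2limb is a hypothetical carry-propagating helper.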
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
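	// A rough model of one reduction pass (illustrative only):
	//	m = t[0] * n0;		// mod 2^64, kept in x28
	//	t += m * n;		// forces t[0] to zero
	//	t >>= 64;		// discard it, slide the window
	// Repeating this eight times per pass retires 512 bits of t.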
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus
	// and, if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check if it
	// borrowed, and conditionally copy the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus
	// and, if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, check if it
	// borrowed, and conditionally copy the original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif
