1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if !defined(OPENSSL_NO_ASM)
11#if defined(__aarch64__)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
.text

// -----------------------------------------------------------------------
// bn_mul_mont(rp, ap, bp, np, n0, num)
//
// Montgomery multiplication: rp = ap * bp / 2^(64*num) mod np.
// Register map on entry (AAPCS64): x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0 (pointer to the precomputed -np^-1 mod 2^64), x5=num (limbs).
// Dispatches to the unrolled paths when num is a multiple of 8 or 4;
// otherwise falls through to the generic one-word-at-a-time loop.
// Temporary storage for t[num] is allocated on the stack (alloca style).
// Returns 1 in x0.
// -----------------------------------------------------------------------
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num%8==0: take the 8x path (squares or mul4x)
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num%4==0: take the 4x-unrolled path
.Lmul_mont:
	// Prologue: 64-byte frame, save fp/lr and the callee-saved
	// registers x19-x24 that the loops below use.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// reserve num*8 bytes for t[]
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// num is in bytes from here on
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

// First pass: t[] = ap[]*bp[0] + np[]*m1, one limb per iteration.
// Carry chain lives in x7 (mul half) and x13 (reduction half).
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	// Drain the two carry chains and record the top limbs.
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

// Outer loop over the remaining b limbs: t[] += ap[]*bp[i] + np[]*m1.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0 for this iteration
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	// Fold in the previous iteration's overflow bit (x19) and
	// store the two top limbs of t[].
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
// Constant-time select between t[] and t[]-np[] (borrow flag still
// live from .Lsub), and wipe the t[] scratch area as we go.
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue: restore callee-saved registers, pop the frame,
	// return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
// -----------------------------------------------------------------------
// __bn_sqr8x_mont: Montgomery squaring, taken when num%8==0.
// Requires ap==bp (checked on entry; otherwise falls through to the
// 4x multiplication path). Same argument registers as bn_mul_mont.
// Strategy: compute all cross products a[i]*a[j] (i!=j), double them
// via a shift-and-add pass, add the squares a[i]^2, then reduce
// 512 bits (8 limbs) per iteration.
// -----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont		// not a squaring: use the mul4x path
.Lsqr8x_mont:
	// Prologue: 128-byte frame, save fp/lr and x19-x28; rp and np
	// are offloaded to the frame because all GPRs are needed below.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// 2*num limbs of scratch for t[]
	lsl	x5,x5,#3		// num in bytes from here on
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

// Zero the 2*num-limb t[] scratch area, 16 limbs per iteration.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num] (end sentinel)
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	// More limbs remain: accumulate cross products of the current
	// 8-limb window against the rest of a[].
	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	// Current 8-limb window exhausted: advance to the next window
	// or, when done, drop back into the outer loop.
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// extr shifts the 127-bit pair left by 1
	sub	x27,x5,#8*4

// Double the cross-product sum (shift left by one bit via extr) and
// add in the squares a[i]^2, four limbs per iteration.
.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	// Tail of the shift-and-add: last four limbs, top bit from xzr.
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

// Montgomery reduction: fold n[0..7]*m into the 8-limb window,
// one multiplier m = t[0]*n0 per iteration.
.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) carry trick, see bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

// Tail: multiply the remaining n limbs by the saved t[0]*n0 values.
.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	// One reduction window finished: propagate the top-most carry
	// (kept in x30) and slide the t[] window down by 8 limbs.
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
// Constant-time select between t[] and t[]-np[] (borrow flag still
// live), wiping the t[] scratch area as we go.
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	// num==8 special case: whole result is still in registers.
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// wipe t[] while subtracting
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	// Epilogue: restore callee-saved registers, pop the frame,
	// return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// -----------------------------------------------------------------------
// __bn_mul4x_mont: Montgomery multiplication, 4 limbs at a time.
// Taken when num%4==0 (and for num%8==0 when ap!=bp). Same argument
// registers as bn_mul_mont: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
// Interleaves multiplication by b[i] with Montgomery reduction,
// processing a 4-limb window per inner-loop pass. Returns 1 in x0.
// -----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Prologue: 128-byte frame, save fp/lr and x19-x28.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num limbs of scratch for t[]
	lsl	x5,x5,#3		// num in bytes from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num] (end sentinel)
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// x28 cycles 0,8,16,24 to index b[i]
	mov	x26,sp

// First 4x4 block: t[0..3] = a[0..3]*b[0..3] with reduction folded in.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// wrap b-index back to 0 after 4 limbs
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry trick, see bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// remaining a bytes (0 when num==4)
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: result in registers

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// First-pass tail: extend the multiply/reduce over the rest of a[]/n[].
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	// First pass complete: advance to b[4], rewind a and n,
	// reload the low t[] window and continue with the main loop.
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 = running top-most carry
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

// Main loop: same structure as the first pass, for each subsequent
// group of four b limbs.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry trick, see bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Main-loop tail: extend the current b window over the rest of a[]/n[].
.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	// Finished one full pass over a[]: fold in the previous
	// top-most carry, store the window, advance bp and check
	// whether all b limbs have been consumed.
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Constant-time select between t[] and t[]-np[] (borrow flag still
// live), wiping the t[] scratch area as we go.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// num==4 special case: whole result is still in registers.
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe t[] while subtracting
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	// Epilogue: restore callee-saved registers, pop the frame,
	// return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1417.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1418.align	2
1419.align	4
1420#endif
1421#endif  // !OPENSSL_NO_ASM
1422