1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#include <GFp/arm_arch.h>
14
15.text
16
.globl	GFp_bn_mul_mont
.hidden	GFp_bn_mul_mont
.type	GFp_bn_mul_mont,%function
.align	5

// int GFp_bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                     const BN_ULONG *np, const BN_ULONG *n0, int num)
//
// Montgomery multiplication: rp = ap * bp / 2^(64*num) mod np.
// In:  x0 = rp, x1 = ap, x2 = bp, x3 = np (modulus),
//      x4 = &n0 where n0 = -np^-1 mod 2^64, x5 = num (limb count).
// Out: x0 = 1.
// Dispatches to the specialized 8x (squaring-capable) or 4x paths when
// num is a multiple of 8 or 4; otherwise runs the generic one-limb-at-a-
// time loop below. Temporary vector tp[] lives on the stack (alloca).
GFp_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num%8==0 -> 8x path
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num%4==0 -> 4x path
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!	// frame: fp/lr + callee-saved x19-x24
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// reserve num*8 bytes for tp[]
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// num *= 8; x5 is a byte count from here on
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	// First outer iteration (i=0): tp[] = ap[]*bp[0] + m1*np[]
	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Outer loop over remaining bp[i]: tp[] += ap[]*bp[i] + m1*np[]
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// same carry trick as above
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Constant-time select between tp[] and rp[] based on the borrow
	// flag (no data-dependent branch); tp[] is wiped as we go.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue: restore callee-saved registers, return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	GFp_bn_mul_mont,.-GFp_bn_mul_mont
.type	__bn_sqr8x_mont,%function
.align	5

// Montgomery squaring, 8 limbs per iteration. Reached only from
// GFp_bn_mul_mont when num%8==0; if ap!=bp this is really a
// multiplication and control transfers to __bn_mul4x_mont instead.
// In:  x0=rp, x1=ap, x2=bp (==ap for the squaring path), x3=np,
//      x4=&n0, x5=num. Temporary t[] of 2*num limbs lives on the stack.
// Strategy: compute all cross products a[i]*a[j] (i!=j), double them
// with a shift-and-add pass while adding the squares a[i]*a[i], then
// perform the Montgomery reduction 512 bits (8 limbs) per iteration.
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!	// frame: fp/lr + callee-saved x19-x28
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]	// preload a[0..7]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// reserve 2*num limbs for t[]
	lsl	x5,x5,#3		// num *= 8; byte count from here on
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Zero the whole 2*num-limb t[] area, 16 limbs per pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = &a[num] (end-of-ap sentinel)
	add	x1,x1,#8*8
	mov	x19,xzr			// x19-x26: 8-limb accumulator window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp			// x2 walks t[]
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	// More limbs remain: fold previously-stored t[] into the window
	// and multiply the current 8 a[] limbs by the limbs above them.
	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// shift t[] left by 1 bit (doubling)
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5		// x3 = &n[num] (end-of-np sentinel)
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14	// same carry trick as in mul_mont
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Constant-time select between t[] and rp[]; wipe t[] as we go.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	// num==8 special case: the whole result is still in registers.
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]	// wipe t[]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5

// Montgomery multiplication, 4 limbs per iteration. Reached from
// GFp_bn_mul_mont when num%4==0 (and num%8!=0), or from __bn_sqr8x_mont
// when ap!=bp. Same arguments as GFp_bn_mul_mont:
// x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
// The t[0]*n0 values are stashed on the stack during the reduction pass
// and replayed while processing the upper limbs ("tail").
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!	// frame: fp/lr + callee-saved x19-x28
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num limbs of t[] ...
	lsl	x5,x5,#3		// num *= 8; byte count from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num] (end-of-ap sentinel)
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// x19-x22: 4-limb accumulator window
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// x28 cycles 0,8,16,24 (mod 32): b[i] index
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10	// same carry trick as in mul_mont
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: done already

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 carries the top-most overflow
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Constant-time select between t[] and rp[]; wipe t[] as we go.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// num==4 special case: the whole result is still in registers.
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe t[]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1428.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1429.align	2
1430.align	4
1431#endif
1432#endif  // !OPENSSL_NO_ASM
1433.section	.note.GNU-stack,"",%progbits
1434