// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>

.text
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
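// .Lpoly is the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 stored as
// four little-endian 64-bit limbs; .Lone_mont is 1 in Montgomery form,
// i.e. 2^256 mod p; .Lone is the plain integer 1.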
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

// void	GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
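// In effect: res = a * b * 2^-256 mod p, i.e. Montgomery multiplication
// modulo the P-256 prime, with fully reduced four-limb inputs and output.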
.globl	GFp_nistz256_mul_mont
.hidden	GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,%function
.align	4
GFp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,.Lpoly+8
	ldr	x13,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

// void	GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
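// In effect: res = a * a * 2^-256 mod p, a Montgomery squaring; it shares
// the reduction logic of the multiplication above but needs only one input.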
.globl	GFp_nistz256_sqr_mont
.hidden	GFp_nistz256_sqr_mont
.type	GFp_nistz256_sqr_mont,%function
.align	4
GFp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,.Lpoly+8
	ldr	x13,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont

// void	GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
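// In effect: res = (a + b) mod p for fully reduced inputs a and b.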
.globl	GFp_nistz256_add
.hidden	GFp_nistz256_add
.type	GFp_nistz256_add,%function
.align	4
GFp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x8,x9,[x2]
	ldp	x16,x17,[x1,#16]
	ldp	x10,x11,[x2,#16]
	ldr	x12,.Lpoly+8
	ldr	x13,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_add,.-GFp_nistz256_add

// void	GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
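// In effect: res = -a mod p, computed as 0 - a through
// __ecp_nistz256_sub_from below.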
.globl	GFp_nistz256_neg
.hidden	GFp_nistz256_neg
.type	GFp_nistz256_neg,%function
.align	4
GFp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	ldr	x12,.Lpoly+8
	ldr	x13,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_neg,.-GFp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
// into x4-x7, and b[0] into x3.
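// Each of the four iterations below multiplies the accumulator by one limb
// of b and then performs a 64-bit Montgomery reduction step.  Because the
// low limb of p is all ones, p == -1 (mod 2^64), so the Montgomery
// multiplier for a step is acc[0] itself, and the step adds acc[0]*p using
//   acc[0]*p = -acc[0] + (acc[0]<<96) + acc[0]*0xffffffff00000001*2^192
// Dropping acc[0] clears the low limb ("omit acc[0]"), acc[0]<<96 is formed
// from the lsl/lsr pair in x8/x9, and the top-limb product is built in
// x11:x10 by the subs/sbc pair (the "*0xffff0001" comments); the accumulator
// is then effectively shifted down by one limb.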
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
// into x4-x7.
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit word of the
	//  result: the cross products above are accumulated, doubled, and then
	//  the squares a[i]*a[i] are added in.
	//
	//  The "can't overflow" comments below mark carries into the high part
	//  of a multiplication result, which can never be all ones and
	//  therefore cannot overflow when the carry is added.
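	//  Concretely, for 64-bit operands the high half of a product is at
	//  most hi((2^64-1)^2) = 2^64-2, so an incoming carry cannot wrap it.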

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded into
// x14-x17 and x8-x11. This is done because it is used in multiple
// contexts, e.g. in multiplication by 2 and 3, as illustrated below.
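// For example, the point-doubling code below realises p256_mul_by_2(r, a)
// by loading a into both x14-x17 and x8-x11 before calling this routine,
// and p256_mul_by_3 by calling it twice in a row.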
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

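// __ecp_nistz256_sub_from computes res = a - b mod p, with a pre-loaded in
// x14-x17 and b loaded from [x2]; when the subtraction borrows, p is added
// back via the conditional selects.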
.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

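// __ecp_nistz256_sub_morf is the operand-swapped variant: res = b - a mod p,
// with a in x14-x17 and b loaded from [x2], which spares callers a reload
// when the minuend already sits in memory.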
.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

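// __ecp_nistz256_div_by_2 computes res = a/2 mod p: if a is odd it first
// adds p (keeping the carry in x1) so that the sum is even, then shifts the
// 257-bit value right by one.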
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
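// GFp_nistz256_point_double doubles a P-256 point given in Jacobian
// coordinates (x1, 96 bytes) into x0.  Following the inline comments, for a
// point (X,Y,Z) it evaluates, in the Montgomery domain:
//   S = 4*X*Y^2, M = 3*(X - Z^2)*(X + Z^2),
//   res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*Y^4, res_z = 2*Y*Z,
// using four 32-byte stack slots (S, M, Zsqr, tmp0) as scratch space.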
.globl	GFp_nistz256_point_double
.hidden	GFp_nistz256_point_double
.type	GFp_nistz256_point_double,%function
.align	5
GFp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	ldr	x12,.Lpoly+8
	mov	x8,x14
	ldr	x13,.Lpoly+24
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_double,.-GFp_nistz256_point_double
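// GFp_nistz256_point_add_affine adds a Jacobian point (x1, 96 bytes) and an
// affine point (x2, 64 bytes), writing the Jacobian sum to x0.  It follows
// the usual mixed-addition formulas (U2 = in2_x*Z1^2, S2 = in2_y*Z1^3,
// H = U2 - in1_x, R = S2 - in1_y, ...), and instead of branching it handles
// inputs at infinity with the constant-time selection at the end of the
// routine.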
.globl	GFp_nistz256_point_add_affine
.hidden	GFp_nistz256_point_add_affine
.type	GFp_nistz256_point_add_affine,%function
.align	5
GFp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	ldr	x12,.Lpoly+8
	ldr	x13,.Lpoly+24

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// !in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// !in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

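	// Constant-time output selection: if in2 was the point at infinity the
	// result is in1; if in1 was at infinity the result is in2, whose Z
	// coordinate is taken from .Lone_mont (the adr of .Lone_mont-64 below
	// makes the in2 "Z" loads at offsets 64/80 pick up 1 in Montgomery
	// form); otherwise it is the sum computed above.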
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// !, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adr	x23,.Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// !, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// !, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits