// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
#include <GFp/arm_arch.h>

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	6
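@ Field elements are 256-bit integers stored least-significant-word
@ first in eight 32-bit words a[0:7].  All arithmetic below is modulo
@ the NIST P-256 prime
@
@	p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
@
@ whose words, least significant first, are
@
@	p[0:7] = { ~0, ~0, ~0, 0, 0, 0, 1, ~0 },
@
@ which is why the reductions below get away with the immediates -1,
@ 0 and 1 and with shifts of a broadcast borrow or carry word.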
.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4		@ a[0:7]+=a[0:7], i.e. double in place
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	GFp_nistz256_add
.hidden	GFp_nistz256_add
.type	GFp_nistz256_add,%function
.align	4
GFp_nistz256_add:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_nistz256_add,.-GFp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]
	adds	r4,r4,r3
	ldr	r3,[r2,#16]
	adcs	r5,r5,r12
	ldr	r12,[r2,#20]
	adcs	r6,r6,r14
	ldr	r14,[r2,#24]
	adcs	r7,r7,r1
	ldr	r1,[r2,#28]
	adcs	r8,r8,r3
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ If a+b >= modulus, subtract the modulus.
	@
	@ Since comparison implies subtraction anyway, we subtract the
	@ modulus unconditionally and add it back if the subtraction
	@ borrowed.
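	@
	@ Concretely, with the 257-bit sum in a[0:7] plus a carry word:
	@
	@	t = a - p			(the subs/sbcs below)
	@	r = t + (borrow ? p : 0)
	@
	@ i.e. r = a when a < p and r = a-p otherwise, with no branch.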

	subs	r4,r4,#-1
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by using the borrow value as a whole or by extracting a single
	@ bit from it.  Follow the r3 register...
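	@
	@ After the sbc above, r3 is 0 if no borrow occurred and
	@ 0xffffffff (-1) if it did, so adding r3 to words 0-2 and 7,
	@ r3>>31 (0 or 1) to word 6 and 0 to words 3-5 adds back exactly
	@ p, or nothing.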

	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ Since multiplication by 3 is performed as 2*n+n, below are
	@ inline copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add;
	@ see the corresponding subroutines for details.
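	@
	@ I.e. 3*a mod p is evaluated as ((a+a) mod p + a) mod p, each
	@ addition followed by its own conditional reduction.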

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4		@ a[0:7]+=a[0:7]
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0

	subs	r4,r4,#-1		@ .Lreduce_by_sub but without stores
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	adcs	r6,r6,r3
	adcs	r7,r7,#0
	adcs	r8,r8,#0
	ldr	r2,[r1,#0]
	adcs	r9,r9,#0
	ldr	r12,[r1,#4]
	adcs	r10,r10,r3,lsr#31
	ldr	r14,[r1,#8]
	adc	r11,r11,r3

	ldr	r3,[r1,#12]
	adds	r4,r4,r2		@ 2*a[0:7]+=a[0:7]
	ldr	r2,[r1,#16]
	adcs	r5,r5,r12
	ldr	r12,[r1,#20]
	adcs	r6,r6,r14
	ldr	r14,[r1,#24]
	adcs	r7,r7,r3
	ldr	r1,[r1,#28]
	adcs	r8,r8,r2
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1
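	@
	@ (p is odd, so a+p is even whenever a is odd; halving the even
	@ value is exact and the result is congruent to a/2 mod p.)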

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	mov	r3,r4,lsl#31		@ place the least significant bit in
					@ the most significant position: an
					@ arithmetic right shift by 31 then
					@ produces -1 or 0, while a logical
					@ right shift produces 1 or 0; this is
					@ how the modulus is conditionally
					@ synthesized in this case...
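					@ e.g. for odd a: r3 = 0x80000000,
					@ so r3,asr#31 = 0xffffffff and
					@ r3,lsr#31 = 1, which together
					@ spell out the words of p; for
					@ even a both shifts give 0.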
	ldr	r7,[r1,#12]
	adds	r4,r4,r3,asr#31
	ldr	r8,[r1,#16]
	adcs	r5,r5,r3,asr#31
	ldr	r9,[r1,#20]
	adcs	r6,r6,r3,asr#31
	ldr	r10,[r1,#24]
	adcs	r7,r7,#0
	ldr	r11,[r1,#28]
	adcs	r8,r8,#0
	mov	r4,r4,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect the flags
	adcs	r9,r9,#0
	orr	r4,r4,r5,lsl#31
	adcs	r10,r10,r3,lsr#31
	mov	r2,#0
	adcs	r11,r11,r3,asr#31
	mov	r5,r5,lsr#1
	adc	r2,r2,#0	@ top-most carry bit from the addition

	orr	r5,r5,r6,lsl#31
	mov	r6,r6,lsr#1
	str	r4,[r0,#0]
	orr	r6,r6,r7,lsl#31
	mov	r7,r7,lsr#1
	str	r5,[r0,#4]
	orr	r7,r7,r8,lsl#31
	mov	r8,r8,lsr#1
	str	r6,[r0,#8]
	orr	r8,r8,r9,lsl#31
	mov	r9,r9,lsr#1
	str	r7,[r0,#12]
	orr	r9,r9,r10,lsl#31
	mov	r10,r10,lsr#1
	str	r8,[r0,#16]
	orr	r10,r10,r11,lsl#31
	mov	r11,r11,lsr#1
	str	r9,[r0,#20]
	orr	r11,r11,r2,lsl#31	@ don't forget the top-most carry bit
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]
	subs	r4,r4,r3
	ldr	r3,[r2,#16]
	sbcs	r5,r5,r12
	ldr	r12,[r2,#20]
	sbcs	r6,r6,r14
	ldr	r14,[r2,#24]
	sbcs	r7,r7,r1
	ldr	r1,[r2,#28]
	sbcs	r8,r8,r3
	sbcs	r9,r9,r12
	sbcs	r10,r10,r14
	sbcs	r11,r11,r1
	sbc	r3,r3,r3		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ If a-b borrowed, add the modulus.
	@
	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by broadcasting the borrow bit to a register, r3, and using that
	@ register as a whole or extracting a single bit from it.
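	@
	@ Here r3 is 0 when no borrow occurred and 0xffffffff otherwise,
	@ so the same word-wise pattern as in .Lreduce_by_sub adds p
	@ exactly when a-b went negative.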

	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	GFp_nistz256_neg
.hidden	GFp_nistz256_neg
.type	GFp_nistz256_neg,%function
.align	4
GFp_nistz256_neg:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_nistz256_neg,.-GFp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	r4,[r1,#0]
	eor	r3,r3,r3
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	subs	r4,r3,r4
	ldr	r7,[r1,#12]
	sbcs	r5,r3,r5
	ldr	r8,[r1,#16]
	sbcs	r6,r3,r6
	ldr	r9,[r1,#20]
	sbcs	r7,r3,r7
	ldr	r10,[r1,#24]
	sbcs	r8,r3,r8
	ldr	r11,[r1,#28]
	sbcs	r9,r3,r9
	sbcs	r10,r3,r10
	sbcs	r11,r3,r11
	sbc	r3,r3,r3

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
@ void	GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	GFp_nistz256_mul_mont
.hidden	GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,%function
.align	4
GFp_nistz256_mul_mont:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

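@ Montgomery multiplication, r = a*b / 2^256 mod p, consuming one word
@ of b per iteration.  Because p = -1 mod 2^32, the per-word Montgomery
@ multiplier -p^-1 mod 2^32 is simply 1, i.e. the low accumulator word
@ r[0] itself, and because of the special form of p,
@
@	r[0]*p = r[0]*2^256 - r[0]*2^224 + r[0]*2^192 + r[0]*2^96 - r[0],
@
@ adding r[0]*p takes only a few additions and subtractions (the
@ "multiplication-less reduction" below).  The -r[0] term cancels the
@ low word exactly, which is then shifted out; the net value added is
@ positive, so the subtraction step can never underflow.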
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0,r1,r2,lr}			@ make a copy of arguments too

	ldr	r2,[r2,#0]			@ b[0]
	ldmia	r1,{r4,r5,r6,r7,r8,r9,r10,r11}

	umull	r3,r14,r4,r2		@ r[0]=a[0]*b[0]
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy a[0-7] to stack, so
						@ that it can be addressed
						@ without spending a register
						@ on the address
	umull	r4,r0,r5,r2		@ r[1]=a[1]*b[0]
	umull	r5,r1,r6,r2
	adds	r4,r4,r14		@ accumulate high part of mult
	umull	r6,r12,r7,r2
	adcs	r5,r5,r0
	umull	r7,r14,r8,r2
	adcs	r6,r6,r1
	umull	r8,r0,r9,r2
	adcs	r7,r7,r12
	umull	r9,r1,r10,r2
	adcs	r8,r8,r14
	umull	r10,r12,r11,r2
	adcs	r9,r9,r0
	adcs	r10,r10,r1
	eor	r14,r14,r14			@ first overflow bit is zero
	adc	r11,r12,#0
	@ multiplication-less reduction 1
	adds	r6,r6,r3		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r7,r7,#0		@ r[4]+=0
	adcs	r8,r8,#0		@ r[5]+=0
	adcs	r9,r9,r3		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r10,r10,#0		@ r[7]+=0
	ldr	r2,[r2,#4*1]			@ load b[i]
	adcs	r11,r11,r3		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r10,r10,r3		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r11,r11,#0		@ r[8]-=0
	umlal	r4,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r3,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r5,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r3,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r3,[sp,#12]			@ a[3], reusing just-offloaded r3
	umlal	r6,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r5,r5,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r7,r14,r3,r2		@ "r[3]"+=a[3]*b[i]
	eor	r3,r3,r3
	adcs	r6,r6,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r8,r3,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r7,r7,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r9,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r8,r8,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r10,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r9,r9,r3
	ldr	r3,[sp,#36]		@ restore overflow bit
	umlal	r11,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r10,r10,r0
	adcs	r11,r11,r1
	adcs	r3,r3,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 2
	adds	r7,r7,r4		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r8,r8,#0		@ r[4]+=0
	adcs	r9,r9,#0		@ r[5]+=0
	adcs	r10,r10,r4		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r11,r11,#0		@ r[7]+=0
	ldr	r2,[r2,#4*2]			@ load b[i]
	adcs	r3,r3,r4		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r11,r11,r4		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r3,r3,#0		@ r[8]-=0
	umlal	r5,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r4,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r6,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r4,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r4,[sp,#12]			@ a[3], reusing just-offloaded r4
	umlal	r7,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r6,r6,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r8,r14,r4,r2		@ "r[3]"+=a[3]*b[i]
	eor	r4,r4,r4
	adcs	r7,r7,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r9,r4,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r8,r8,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r10,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r9,r9,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r11,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r10,r10,r4
	ldr	r4,[sp,#36]		@ restore overflow bit
	umlal	r3,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r11,r11,r0
	adcs	r3,r3,r1
	adcs	r4,r4,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 3
	adds	r8,r8,r5		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r9,r9,#0		@ r[4]+=0
	adcs	r10,r10,#0		@ r[5]+=0
	adcs	r11,r11,r5		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r3,r3,#0		@ r[7]+=0
	ldr	r2,[r2,#4*3]			@ load b[i]
	adcs	r4,r4,r5		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r3,r3,r5		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r4,r4,#0		@ r[8]-=0
	umlal	r6,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r5,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r7,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r5,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r5,[sp,#12]			@ a[3], reusing just-offloaded r5
	umlal	r8,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r7,r7,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r9,r14,r5,r2		@ "r[3]"+=a[3]*b[i]
	eor	r5,r5,r5
	adcs	r8,r8,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r10,r5,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r9,r9,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r11,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r10,r10,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r3,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r11,r11,r5
	ldr	r5,[sp,#36]		@ restore overflow bit
	umlal	r4,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r3,r3,r0
	adcs	r4,r4,r1
	adcs	r5,r5,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 4
	adds	r9,r9,r6		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r10,r10,#0		@ r[4]+=0
	adcs	r11,r11,#0		@ r[5]+=0
	adcs	r3,r3,r6		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r4,r4,#0		@ r[7]+=0
	ldr	r2,[r2,#4*4]			@ load b[i]
	adcs	r5,r5,r6		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r4,r4,r6		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r5,r5,#0		@ r[8]-=0
	umlal	r7,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r6,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r8,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r6,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r6,[sp,#12]			@ a[3], reusing just-offloaded r6
	umlal	r9,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r8,r8,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r10,r14,r6,r2		@ "r[3]"+=a[3]*b[i]
	eor	r6,r6,r6
	adcs	r9,r9,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r11,r6,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r10,r10,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r3,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r11,r11,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r4,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r3,r3,r6
	ldr	r6,[sp,#36]		@ restore overflow bit
	umlal	r5,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r4,r4,r0
	adcs	r5,r5,r1
	adcs	r6,r6,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 5
	adds	r10,r10,r7		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r11,r11,#0		@ r[4]+=0
	adcs	r3,r3,#0		@ r[5]+=0
	adcs	r4,r4,r7		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r5,r5,#0		@ r[7]+=0
	ldr	r2,[r2,#4*5]			@ load b[i]
	adcs	r6,r6,r7		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r5,r5,r7		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r6,r6,#0		@ r[8]-=0
	umlal	r8,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r7,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r9,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r7,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r7,[sp,#12]			@ a[3], reusing just-offloaded r7
	umlal	r10,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r9,r9,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r11,r14,r7,r2		@ "r[3]"+=a[3]*b[i]
	eor	r7,r7,r7
	adcs	r10,r10,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r3,r7,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r11,r11,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r4,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r3,r3,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r5,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r4,r4,r7
	ldr	r7,[sp,#36]		@ restore overflow bit
	umlal	r6,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r5,r5,r0
	adcs	r6,r6,r1
	adcs	r7,r7,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 6
	adds	r11,r11,r8		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r3,r3,#0		@ r[4]+=0
	adcs	r4,r4,#0		@ r[5]+=0
	adcs	r5,r5,r8		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r6,r6,#0		@ r[7]+=0
	ldr	r2,[r2,#4*6]			@ load b[i]
	adcs	r7,r7,r8		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r6,r6,r8		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r7,r7,#0		@ r[8]-=0
	umlal	r9,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r8,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r10,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r8,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r8,[sp,#12]			@ a[3], reusing just-offloaded r8
	umlal	r11,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r10,r10,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r3,r14,r8,r2		@ "r[3]"+=a[3]*b[i]
	eor	r8,r8,r8
	adcs	r11,r11,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r4,r8,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r3,r3,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r5,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r4,r4,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r6,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r5,r5,r8
	ldr	r8,[sp,#36]		@ restore overflow bit
	umlal	r7,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r6,r6,r0
	adcs	r7,r7,r1
	adcs	r8,r8,r12
	adc	r14,r14,#0			@ new overflow bit
	@ multiplication-less reduction 7
	adds	r3,r3,r9		@ r[3]+=r[0]
	ldr	r2,[sp,#40]			@ restore b_ptr
	adcs	r4,r4,#0		@ r[4]+=0
	adcs	r5,r5,#0		@ r[5]+=0
	adcs	r6,r6,r9		@ r[6]+=r[0]
	ldr	r1,[sp,#0]			@ load a[0]
	adcs	r7,r7,#0		@ r[7]+=0
	ldr	r2,[r2,#4*7]			@ load b[i]
	adcs	r8,r8,r9		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0			@ overflow bit
	subs	r7,r7,r9		@ r[7]-=r[0]
	ldr	r12,[sp,#4]			@ a[1]
	sbcs	r8,r8,#0		@ r[8]-=0
	umlal	r10,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r9,r14,#0			@ overflow bit; keep in mind
						@ that the net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	r14,[sp,#8]			@ a[2]
	umlal	r11,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r9,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r9,[sp,#12]			@ a[3], reusing just-offloaded r9
	umlal	r3,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r11,r11,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]			@ a[4]
	umlal	r4,r14,r9,r2		@ "r[3]"+=a[3]*b[i]
	eor	r9,r9,r9
	adcs	r3,r3,r1
	ldr	r1,[sp,#20]			@ a[5]
	umlal	r5,r9,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r4,r4,r12
	ldr	r12,[sp,#24]			@ a[6]
	umlal	r6,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r5,r5,r14
	ldr	r14,[sp,#28]			@ a[7]
	umlal	r7,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r6,r6,r9
	ldr	r9,[sp,#36]		@ restore overflow bit
	umlal	r8,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r7,r7,r0
	adcs	r8,r8,r1
	adcs	r9,r9,r12
	adc	r14,r14,#0			@ new overflow bit
	@ last multiplication-less reduction
	adds	r4,r4,r10
	ldr	r0,[sp,#32]			@ restore r_ptr
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,r10
	adcs	r8,r8,#0
	adcs	r9,r9,r10
	adc	r14,r14,#0
	subs	r8,r8,r10
	sbcs	r9,r9,#0
	sbc	r10,r14,#0			@ overflow bit

	@ The final step is "if result > mod, subtract mod", but we do it
	@ the other way around: subtract the modulus from the result and,
	@ if that borrowed, add the modulus back.

	adds	r11,r11,#1		@ subs	r11,r11,#-1
	adcs	r3,r3,#0		@ sbcs	r3,r3,#-1
	adcs	r4,r4,#0		@ sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	adcs	r9,r9,#0		@ sbcs	r9,r9,#-1
	ldr	lr,[sp,#44]			@ restore lr
	sbc	r10,r10,#0		@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by broadcasting the borrow bit to a register, r10, and using
	@ that register as a whole or extracting a single bit from it.

	adds	r11,r11,r10		@ add modulus or zero
	adcs	r3,r3,r10
	str	r11,[r0,#0]
	adcs	r4,r4,r10
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r10,lsr#31
	str	r7,[r0,#20]
	adc	r9,r9,r10
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
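@ __ecp_nistz256_sub_from computes r = (reg - mem) mod p with the
@ minuend already held in r11,r3-r9, exactly where
@ __ecp_nistz256_mul_mont leaves its result; __ecp_nistz256_sub_morf
@ is the mirrored variant, r = (mem - reg) mod p.  Both leave the
@ result in those same registers as well as storing it to [r0].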
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r11,r10
	ldr	r10,[r2,#16]
	sbcs	r3,r3,r12
	ldr	r12,[r2,#20]
	sbcs	r4,r4,r14
	ldr	r14,[r2,#24]
	sbcs	r5,r5,r1
	ldr	r1,[r2,#28]
	sbcs	r6,r6,r10
	sbcs	r7,r7,r12
	sbcs	r8,r8,r14
	sbcs	r9,r9,r1
	sbc	r2,r2,r2		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r10,r11
	ldr	r10,[r2,#16]
	sbcs	r3,r12,r3
	ldr	r12,[r2,#20]
	sbcs	r4,r14,r4
	ldr	r14,[r2,#24]
	sbcs	r5,r1,r5
	ldr	r1,[r2,#28]
	sbcs	r6,r10,r6
	sbcs	r7,r12,r7
	sbcs	r8,r14,r8
	sbcs	r9,r1,r9
	sbc	r2,r2,r2		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

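@ __ecp_nistz256_add_self doubles the value already held in r11,r3-r9
@ (i.e. p256_mul_by_2 without the loads), again leaving the result in
@ those registers as well as storing it to [r0].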
.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	r11,r11,r11		@ a[0:7]+=a[0:7]
	adcs	r3,r3,r3
	adcs	r4,r4,r4
	adcs	r5,r5,r5
	adcs	r6,r6,r6
	adcs	r7,r7,r7
	adcs	r8,r8,r8
	mov	r2,#0
	adcs	r9,r9,r9
	adc	r2,r2,#0

	@ If a+b >= modulus, subtract the modulus.
	@
	@ Since comparison implies subtraction anyway, we subtract the
	@ modulus unconditionally and add it back if the subtraction
	@ borrowed.

	subs	r11,r11,#-1
	sbcs	r3,r3,#-1
	sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	sbcs	r9,r9,#-1
	sbc	r2,r2,#0

	@ Note that because the modulus has a special form, i.e. consists
	@ of 0xffffffff, 1 and 0 words, we can conditionally synthesize it
	@ by using the borrow value as a whole or by extracting a single
	@ bit from it.  Follow the r2 register...

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

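@ Stack frame layout for point_double: five 32-byte temporaries at
@ sp+0 (S), sp+32 (M), sp+64 (Zsqr), sp+96 (copy of in_x) and sp+128
@ (tmp0), followed by the registers pushed on entry, so [sp,#32*5]
@ holds the result pointer (saved r0) and [sp,#32*5+4] the input
@ pointer (saved r1).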
.globl	GFp_nistz256_point_double
.hidden	GFp_nistz256_point_double
.type	GFp_nistz256_point_double,%function
.align	5
GFp_nistz256_point_double:
	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#96
	ldmia	r1!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy in_x
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}

	add	r0,sp,#0
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	r2,r1,#32
	add	r1,r1,#32
	add	r0,sp,#64
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	r2,[sp,#32*5+4]
	add	r1,r2,#32
	add	r2,r2,#64
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	r0,[sp,#32*5]
	add	r0,r0,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#32
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#64
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	r1,sp,#64
	add	r2,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#128
	add	r0,r0,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	r1,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	r1,sp,#96
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	r0,sp,#128
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#32
	add	r2,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	r2,sp,#128
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	r1,sp,#32
	add	r2,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	r0,[sp,#32*5]
	add	r2,r0,#32
	add	r0,r0,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "also skip over the saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_nistz256_point_double,.-GFp_nistz256_point_double
#endif  // defined(__arm__)
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits