1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#include <GFp/arm_arch.h>
14
15.text
16.arch	armv8-a+crypto
//-----------------------------------------------------------------------
// void GFp_gcm_init_clmul(u128 Htable[], const uint64_t H[2])
// In:    x1 = H (raw hash subkey, one 128-bit block)
// Out:   x0 = Htable; this routine stores three 16-byte entries:
//        Htable[0] = "twisted" H (H <<< 1, reduced with the 0xc2..01
//        constant), Htable[1] = packed Karatsuba pre-processed halves
//        of H and H^2, Htable[2] = twisted H^2.
// Clobbers only caller-saved registers (v0-v3, v16-v22); leaf function,
// no stack use.  Requires ARMv8 Crypto extensions (pmull/pmull2).
//-----------------------------------------------------------------------
17.globl	GFp_gcm_init_clmul
18.hidden	GFp_gcm_init_clmul
19.type	GFp_gcm_init_clmul,%function
20.align	4
21GFp_gcm_init_clmul:
22	AARCH64_VALID_CALL_TARGET
23	ld1	{v17.2d},[x1]		//load input H
24	movi	v19.16b,#0xe1
25	shl	v19.2d,v19.2d,#57		//0xc2.0
26	ext	v3.16b,v17.16b,v17.16b,#8
27	ushr	v18.2d,v19.2d,#63
28	dup	v17.4s,v17.s[1]
29	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
30	ushr	v18.2d,v3.2d,#63
31	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
32	and	v18.16b,v18.16b,v16.16b
33	shl	v3.2d,v3.2d,#1
34	ext	v18.16b,v18.16b,v18.16b,#8
35	and	v16.16b,v16.16b,v17.16b
36	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
37	eor	v20.16b,v3.16b,v16.16b		//twisted H
38	st1	{v20.2d},[x0],#16		//store Htable[0]
39
	//square the twisted H (pmull + Karatsuba + two-phase reduction)
	//to obtain twisted H^2 for the 2x-unrolled ghash loop below
40	//calculate H^2
41	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
42	pmull	v0.1q,v20.1d,v20.1d
43	eor	v16.16b,v16.16b,v20.16b
44	pmull2	v2.1q,v20.2d,v20.2d
45	pmull	v1.1q,v16.1d,v16.1d
46
47	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
48	eor	v18.16b,v0.16b,v2.16b
49	eor	v1.16b,v1.16b,v17.16b
50	eor	v1.16b,v1.16b,v18.16b
51	pmull	v18.1q,v0.1d,v19.1d		//1st phase
52
53	ins	v2.d[0],v1.d[1]
54	ins	v1.d[1],v0.d[0]
55	eor	v0.16b,v1.16b,v18.16b
56
57	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
58	pmull	v0.1q,v0.1d,v19.1d
59	eor	v18.16b,v18.16b,v2.16b
60	eor	v22.16b,v0.16b,v18.16b
61
62	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
63	eor	v17.16b,v17.16b,v22.16b
64	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
65	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
66
67	ret
68.size	GFp_gcm_init_clmul,.-GFp_gcm_init_clmul
//-----------------------------------------------------------------------
// void GFp_gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[])
// In:    x0 = Xi (current hash value, in/out), x1 = Htable as produced
//        by GFp_gcm_init_clmul (twisted H, then packed Karatsuba halves)
// Out:   Xi <- Xi * H in GF(2^128), written back to [x0]
// Single-block multiply: Karatsuba 128x128 carry-less multiply via
// pmull/pmull2 followed by the two-phase reduction with the 0xc2
// constant.  On little-endian (__ARMEB__ undefined) Xi is byte-reversed
// on load and store.  Clobbers only caller-saved registers
// (v0-v3, v16-v21); leaf function, no stack use.
//-----------------------------------------------------------------------
69.globl	GFp_gcm_gmult_clmul
70.hidden	GFp_gcm_gmult_clmul
71.type	GFp_gcm_gmult_clmul,%function
72.align	4
73GFp_gcm_gmult_clmul:
74	AARCH64_VALID_CALL_TARGET
75	ld1	{v17.2d},[x0]		//load Xi
76	movi	v19.16b,#0xe1
77	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
78	shl	v19.2d,v19.2d,#57
79#ifndef __ARMEB__
80	rev64	v17.16b,v17.16b
81#endif
82	ext	v3.16b,v17.16b,v17.16b,#8
83
84	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
85	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
86	pmull2	v2.1q,v20.2d,v3.2d		//H.hXi.hi
87	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
88
89	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
90	eor	v18.16b,v0.16b,v2.16b
91	eor	v1.16b,v1.16b,v17.16b
92	eor	v1.16b,v1.16b,v18.16b
93	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
94
95	ins	v2.d[0],v1.d[1]
96	ins	v1.d[1],v0.d[0]
97	eor	v0.16b,v1.16b,v18.16b
98
99	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
100	pmull	v0.1q,v0.1d,v19.1d
101	eor	v18.16b,v18.16b,v2.16b
102	eor	v0.16b,v0.16b,v18.16b
103
104#ifndef __ARMEB__
105	rev64	v0.16b,v0.16b
106#endif
107	ext	v0.16b,v0.16b,v0.16b,#8
108	st1	{v0.2d},[x0]		//write out Xi
109
110	ret
111.size	GFp_gcm_gmult_clmul,.-GFp_gcm_gmult_clmul
//-----------------------------------------------------------------------
// void GFp_gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[],
//                          const uint8_t *inp, size_t len)
// In:    x0 = Xi (in/out), x1 = Htable (twisted H, packed Karatsuba
//        halves, twisted H^2), x2 = inp, x3 = len in bytes
//        (assumed a multiple of the 16-byte block size — the code only
//        steps x3 by 16/32; TODO confirm caller contract)
// Out:   Xi <- GHASH(Xi, inp[0..len)) written back to [x0]
// The main loop (.Loop_mod2x_v8) is modulo-scheduled and consumes TWO
// 16-byte blocks per iteration using H^2, interleaving the next pair's
// pmulls with the current reduction; a single trailing block is handled
// by .Lodd_tail_v8.  x12 is the load post-increment and is zeroed just
// before the last block(s) so the final loads re-read the last block
// instead of overrunning inp[len] (that extra copy is not processed).
// Clobbers only caller-saved registers (v0-v7, v16-v22, x12); leaf
// function, no stack use.
//-----------------------------------------------------------------------
112.globl	GFp_gcm_ghash_clmul
113.hidden	GFp_gcm_ghash_clmul
114.type	GFp_gcm_ghash_clmul,%function
115.align	4
116GFp_gcm_ghash_clmul:
117	AARCH64_VALID_CALL_TARGET
118	ld1	{v0.2d},[x0]		//load [rotated] Xi
119						//"[rotated]" means that
120						//loaded value would have
121						//to be rotated in order to
122						//make it appear as in
123						//algorithm specification
124	subs	x3,x3,#32		//see if x3 is 32 or larger
125	mov	x12,#16		//x12 is used as post-
126						//increment for input pointer;
127						//as loop is modulo-scheduled
128						//x12 is zeroed just in time
129						//to preclude overstepping
130						//inp[len], which means that
131						//last block[s] are actually
132						//loaded twice, but last
133						//copy is not processed
134	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
135	movi	v19.16b,#0xe1
136	ld1	{v22.2d},[x1]
137	csel	x12,xzr,x12,eq			//is it time to zero x12?
138	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
139	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
140	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
141#ifndef __ARMEB__
142	rev64	v16.16b,v16.16b
143	rev64	v0.16b,v0.16b
144#endif
145	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
146	b.lo	.Lodd_tail_v8		//x3 was less than 32
	//prologue of the modulo-scheduled loop: start H·I[1] early
147	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
148#ifndef __ARMEB__
149	rev64	v17.16b,v17.16b
150#endif
151	ext	v7.16b,v17.16b,v17.16b,#8
152	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
153	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
154	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
155	pmull2	v6.1q,v20.2d,v7.2d
156	b	.Loop_mod2x_v8

158.align	4
	//steady state: (Xi+I[i])·H^2 + I[i+1]·H per iteration, with the
	//next pair's loads/byte-swaps/pmulls interleaved into the
	//current reduction
159.Loop_mod2x_v8:
160	ext	v18.16b,v3.16b,v3.16b,#8
161	subs	x3,x3,#32		//is there more data?
162	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
163	csel	x12,xzr,x12,lo			//is it time to zero x12?

165	pmull	v5.1q,v21.1d,v17.1d
166	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
167	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
168	eor	v0.16b,v0.16b,v4.16b		//accumulate
169	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
170	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

172	eor	v2.16b,v2.16b,v6.16b
173	csel	x12,xzr,x12,eq			//is it time to zero x12?
174	eor	v1.16b,v1.16b,v5.16b

176	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
177	eor	v18.16b,v0.16b,v2.16b
178	eor	v1.16b,v1.16b,v17.16b
179	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
180#ifndef __ARMEB__
181	rev64	v16.16b,v16.16b
182#endif
183	eor	v1.16b,v1.16b,v18.16b
184	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

186#ifndef __ARMEB__
187	rev64	v17.16b,v17.16b
188#endif
189	ins	v2.d[0],v1.d[1]
190	ins	v1.d[1],v0.d[0]
191	ext	v7.16b,v17.16b,v17.16b,#8
192	ext	v3.16b,v16.16b,v16.16b,#8
193	eor	v0.16b,v1.16b,v18.16b
194	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
195	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

197	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
198	pmull	v0.1q,v0.1d,v19.1d
199	eor	v3.16b,v3.16b,v18.16b
200	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
201	eor	v3.16b,v3.16b,v0.16b
202	pmull2	v6.1q,v20.2d,v7.2d
203	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//loop epilogue: undo the speculative accumulation into v3/x3 and
	//fall through to the odd tail (or finish if len was even in blocks)
205	eor	v2.16b,v2.16b,v18.16b
206	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
207	adds	x3,x3,#32		//re-construct x3
208	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
209	b.eq	.Ldone_v8		//is x3 zero?
	//single trailing block: one (Xi+inp)·H multiply-and-reduce
210.Lodd_tail_v8:
211	ext	v18.16b,v0.16b,v0.16b,#8
212	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
213	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

215	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
216	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
217	pmull2	v2.1q,v20.2d,v3.2d		//H.hXi.hi
218	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

220	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
221	eor	v18.16b,v0.16b,v2.16b
222	eor	v1.16b,v1.16b,v17.16b
223	eor	v1.16b,v1.16b,v18.16b
224	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

226	ins	v2.d[0],v1.d[1]
227	ins	v1.d[1],v0.d[0]
228	eor	v0.16b,v1.16b,v18.16b

230	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
231	pmull	v0.1q,v0.1d,v19.1d
232	eor	v18.16b,v18.16b,v2.16b
233	eor	v0.16b,v0.16b,v18.16b

235.Ldone_v8:
236#ifndef __ARMEB__
237	rev64	v0.16b,v0.16b
238#endif
239	ext	v0.16b,v0.16b,v0.16b,#8
240	st1	{v0.2d},[x0]		//write out Xi

242	ret
243.size	GFp_gcm_ghash_clmul,.-GFp_gcm_ghash_clmul
//CRYPTOGAMS attribution string (NUL-terminated):
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
244.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
245.align	2
246.align	2
247#endif
248#endif  // !OPENSSL_NO_ASM
249.section	.note.GNU-stack,"",%progbits
250