// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>

.text

.globl	GFp_gcm_init_neon
.hidden	GFp_gcm_init_neon
.type	GFp_gcm_init_neon,%function
.align	4
GFp_gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
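	// Arguments (per the ld1/st1 below): x0 = Htable output, x1 = the
	// hash key H. Only Htable[0], the "twisted" H, is written.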
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	GFp_gcm_init_neon,.-GFp_gcm_init_neon

.globl	GFp_gcm_gmult_neon
.hidden	GFp_gcm_gmult_neon
.type	GFp_gcm_gmult_neon,%function
.align	4
GFp_gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
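	// x0 = Xi (the 16-byte hash value, updated in place), x1 = Htable
	// (the twisted H stored by GFp_gcm_init_neon). A single block is
	// processed by setting x3 = 16 and branching into the shared
	// .Lgmult_neon body.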
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	.Lgmult_neon
.size	GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon

.globl	GFp_gcm_ghash_neon
.hidden	GFp_gcm_ghash_neon
.type	GFp_gcm_ghash_neon,%function
.align	4
GFp_gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
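	// x0 = Xi (hash value, updated in place), x1 = Htable (twisted H),
	// x2 = input, x3 = input length in bytes (assumed here to be a
	// non-zero multiple of 16, since the loop below consumes 16 bytes
	// per iteration).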
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
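	// Each 64x64-bit carry-less multiply below is assembled from 8-bit
	// polynomial multiplies (pmull on .8b lanes) of byte-rotated copies
	// of the operands (A1/B1, A2/B2, ...); the partial products are
	// realigned later by the ext byte-rotates commented as
	// "t0 = t0 << 8", etc.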
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
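	// Concretely, the zip1/zip2 pairs below gather {t0.lo, t1.lo} and
	// {t0.hi, t1.hi} into v20/v21 (and {t2.lo, t3.lo}, {t2.hi, t3.hi}
	// into v22/v23), so each pair of 64-bit lo/hi steps above becomes a
	// single 128-bit operation; v24 = {k48, k32} and v25 = {k16, 0} hold
	// the masks loaded from .Lmasks.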

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
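	// v3 now holds B.lo ^ B.hi; together with v7 = A.lo ^ A.hi (computed
	// at entry) this forms the middle Karatsuba product
	// (A.lo ^ A.hi) * (B.lo ^ B.hi).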
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
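	// At this point v0 = Xl = A.lo*B.lo, v2 = Xh = A.hi*B.hi and v1
	// holds (A.lo^A.hi)*(B.lo^B.hi). The fix-up below forms the
	// Karatsuba middle term Xm = v1 ^ Xl ^ Xh, folds in {Xl.hi, Xh.lo}
	// (the ext into v16), and inserts the result back into the middle
	// of the 256-bit product.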
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
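	// The left shifts by 57/62/63 here (and the right shifts by 1/2/7
	// accumulated in the 2nd phase) appear to correspond to the low
	// terms x^7 + x^2 + x of the bit-reflected GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1.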
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
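// k0 appears to pad .Lmasks out to 32 bytes so that v24/v25 can be filled
// with the single ld1 {v24.2d, v25.2d} load above.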
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits