// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
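// MemorySanitizer cannot instrument assembly, so the assembly code in this
// file is disabled when MSan is enabled.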
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text

.globl	_GFp_gcm_init_neon
.private_extern	_GFp_gcm_init_neon

.align	4
_GFp_gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
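	// Per the loads and stores below, x0 points at the Htable slot to be
	// written and x1 points at the 16-byte hash key H. The value stored is
	// the "twisted" key: H shifted left by one bit, with the carried-out
	// top bit folded back in via the 0xc2...01 reduction constant built in
	// v16 below.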
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_GFp_gcm_gmult_neon
.private_extern	_GFp_gcm_gmult_neon

.align	4
_GFp_gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
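	// Per the loads below, x0 points at the 16-byte accumulator Xi (updated
	// in place) and x1 points at the twisted key written by gcm_init_neon.
	// A block length of 16 is placed in x3 and the code branches into the
	// shared Lgmult_neon body, so the Loop_neon counter check below runs
	// exactly once for this single-block multiply.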
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon


.globl	_GFp_gcm_ghash_neon
.private_extern	_GFp_gcm_ghash_neon

.align	4
_GFp_gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
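	// Per the loads below, x0 points at the 16-byte accumulator Xi (updated
	// in place), x1 points at the twisted key, x2 points at the input to be
	// hashed and x3 is its length in bytes. Loop_neon consumes 16 bytes per
	// iteration, so x3 is assumed to be a non-zero multiple of 16.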
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
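	// The 128x128-bit carry-less multiply below is done as a Karatsuba
	// multiplication over GF(2): three 64x64-bit products (lo*lo into v0,
	// (lo^hi)*(lo^hi) into v1 using the pre-computed v7, and hi*hi into v2)
	// which are recombined in the "Karatsuba post-processing" step further
	// down. Each 64x64-bit product is in turn built from 8-bit pmull
	// instructions; the 64-bit pmull from the Crypto extensions is not used
	// here.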
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
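	// Here the two 64-bit halves of each product live in the low and high
	// doublewords of one register, so the zip1/zip2 pairs below gather
	// matching halves from two products at a time; each eor/and/eor group
	// then performs the masked folds shown above for two products in
	// parallel. v24 holds {k48, k32} and v25 holds {k16, k0}, as loaded
	// from Lmasks.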

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d
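	// Below, the masked partial products t0, t1, t2 and t3 are shifted left
	// by 8, 16, 24 and 32 bits respectively and folded into the direct
	// product D = A*B, completing one 64x64-bit carry-less multiply built
	// from 8-bit pmull instructions.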

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
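	// Karatsuba post-processing: the middle product (in v1) has the outer
	// products (v0 = lo*lo, v2 = hi*hi) xored back in, then overlaps the
	// high half of Xl and the low half of Xh to form the 256-bit product
	// Xh:Xl.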
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
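	// The product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1, in two phases. In this bit-reflected
	// representation the x, x^2 and x^7 terms appear as shifts by 63, 62
	// and 57, which is where the shift amounts below come from.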
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM