// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Target: AArch64 (A64 NEON) assembly in the Apple Mach-O flavour, as shown by
// the underscore-prefixed symbols, .private_extern and @PAGE/@PAGEOFF
// relocations used below. The file is run through the C preprocessor first.

// When the compiler reports the memory_sanitizer feature, force
// OPENSSL_NO_ASM so that everything guarded by it below is compiled out.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

// Everything from here to the closing #endif at the end of the file is
// assembled only when OPENSSL_NO_ASM is not defined.
#if !defined(OPENSSL_NO_ASM)
// Symbol-prefixing header, pulled in only when BORINGSSL_PREFIX is defined.
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

//-----------------------------------------------------------------------
// _gcm_init_neon
//
// Computes the "twisted H" form of the hash key: H is rotated so its two
// 64-bit halves swap, shifted left by one bit as a 128-bit value, and, if
// a bit was shifted out of the top, XORed with the constant 0xc2....01.
//
// In:    x0 = address that receives the 16-byte result (Htable[0])
//        x1 = address of the 16-byte key H, loaded as two 64-bit lanes
// Out:   16 bytes stored at [x0]
// Clobb: v3, v5, v16, v17, v18, v19 (no general registers, no stack)
//-----------------------------------------------------------------------
.align	4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0 in each 64-bit lane
	ext	v3.16b, v17.16b, v17.16b, #8	// v3 = H with its 64-bit halves swapped
	ushr	v18.2d, v19.2d, #63		// 1 in each lane (top bit of 0xc2.. is set)
	dup	v17.4s, v17.s[1]		// replicate the word holding bit 63 of lane 0
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63		// top bit of each half of v3
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit (all-ones or zero)
	and	v18.16b, v18.16b, v16.16b	// keep only the low lane's carry-out
	shl	v3.2d, v3.2d, #1		// shift each half left by one
	ext	v18.16b, v18.16b, v18.16b, #8	// move that carry up to the high lane
	and	v16.16b, v16.16b, v17.16b	// constant only if a bit left the top
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

//-----------------------------------------------------------------------
// _gcm_gmult_neon
//
// Multiplies the 16-byte value Xi by the twisted H and stores the reduced
// product back over Xi. This entry point only performs the setup; it then
// jumps into the shared multiply/reduce body at Lgmult_neon (located
// inside _gcm_ghash_neon) with the byte count forced to one block.
//
// In:    x0 = address of Xi (16 bytes, read here and rewritten by the
//             shared body before it returns)
//        x1 = address of the 16-byte twisted H
// Clobb: x1 (advanced by 8), x3, x9, flags, v0-v7, v16-v25
//-----------------------------------------------------------------------
.align	4
_gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi straight into the multiplier input
	ld1	{v5.1d}, [x1], #8		// load twisted H, first 8 bytes
	ld1	{v6.1d}, [x1]			// second 8 bytes
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8	// (rev64 + half swap = full 16-byte reversal)
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16			// one block: the shared loop exits after one pass
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

//-----------------------------------------------------------------------
// _gcm_ghash_neon
//
// For each 16-byte block of input: Xi = (Xi ^ block) * twisted H, reduced
// back to 128 bits. The final Xi is stored back to [x0].
//
// In:    x0 = address of Xi (16 bytes, read on entry, written on exit)
//        x1 = address of the 16-byte twisted H
//        x2 = address of the input; 16 bytes are consumed per iteration
//        x3 = input length in bytes. The loop ends only when
//             "subs x3, x3, #16" produces exactly zero, so x3 must be a
//             non-zero multiple of 16.
// Clobb: x1 (advanced by 8), x2 (advanced past the input), x3, x9, flags,
//        v0-v7, v16-v25
//
// Register roles inside the loop:
//   v5  = low 64 bits of twisted H       v6 = high 64 bits of twisted H
//   v7  = v5 ^ v6 (Karatsuba)            v24/v25 = {k48,k32} / {k16,k0}
//   v3  = current block (low half used)  v4 = high half of the block
//   v0  = Xl (low product, then result)  v1 = Xm (middle)   v2 = Xh (high)
//   v16-v23 = temporaries
//
// Lgmult_neon is a second entry into this body, used by _gcm_gmult_neon
// with v3 already holding the byte-swapped Xi and x3 = 16.
//-----------------------------------------------------------------------
.align	4
_gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H, first 8 bytes
	ld1	{v6.1d}, [x1]			// second 8 bytes
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]

	// ---- Xl = v5 * v3: 64x64 carry-less multiply built from 8-bit pmull ----
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b	// hi halves &= {k48, k32}
	and	v23.16b, v23.16b, v25.16b	// hi halves &= {k16, k0}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d		// regroup as t0..t3
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b		// v0 = Xl

	// ---- Xm = v7 * (v3 ^ v4): same sequence, A = v7, result in v1 ----
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Same lo/hi folding with the k48/k32/k16/k0 masks as in the Xl
	// multiplication above.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b		// v1 = Xm (before post-processing)

	// ---- Xh = v6 * v4: same sequence, A = v6, B = v4, result in v2 ----
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Same lo/hi folding with the k48/k32/k16/k0 masks as in the Xl
	// multiplication above.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b		// v2 = Xh

	// ---- Combine the three partial products ----
	ext	v16.16b, v0.16b, v2.16b, #8	// v16 = {Xl.d[1], Xh.d[0]}
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1
	eor	v0.16b, v0.16b, v2.16b
	eor	v0.16b, v0.16b, v18.16b		// v0 = reduced 128-bit Xi

	subs	x3, x3, #16			// one block consumed
	bne	Loop_neon			// repeat until the count is exactly zero

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


// Read-only constants. Both NEON entry points load these 32 bytes into
// v24/v25: v24 = {k48, k32}, v25 = {k16, k0}. kN is a 64-bit mask with the
// bottom N bits set.
.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
#endif  // !OPENSSL_NO_ASM
