// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
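
// Per the comments below, these routines implement GHASH for GCM using only
// baseline NEON, i.e. the 8-bit polynomial multiply (pmull on .8b lanes)
// rather than the Crypto Extensions' 64-bit PMULL. Each 64x64 carry-less
// multiply is assembled from byte-shifted partial products that are masked
// and shifted back into place, and the 128x128 multiply uses one level of
// Karatsuba on the 64-bit halves of Xi and the "twisted" H prepared by
// gcm_init_neon.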

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

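// gcm_init_neon: as the loads and stores below show, x0 points at the Htable
// to fill and x1 at the raw 16-byte hash key H. Only Htable[0] is written;
// it holds the "twisted" form of H that gcm_gmult_neon and gcm_ghash_neon
// read back.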
.align	4
_gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
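	// The sequence above shifts H left by one bit and, keyed on the
	// broadcast carry bit, conditionally folds in the 0xc2...01 constant;
	// the result appears to be H*x reduced modulo the GHASH polynomial,
	// the same "twisted H" representation gcm_init_v8 uses.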
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon
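	// gcm_gmult_neon multiplies the 16-byte Xi at x0 by the twisted H at
	// x1 exactly once: x3 is set to 16 above, so the shared Lgmult_neon
	// path runs for a single iteration and returns through the tail of
	// Loop_neon.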


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
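	// gcm_ghash_neon hashes x3 bytes (a multiple of 16, given the
	// subs/bne at the bottom of the loop) at x2 into the Xi at x0, using
	// the twisted H at x1. v5/v6 hold the two 64-bit halves of H and v7
	// their xor, the extra operand needed for the Karatsuba middle
	// product.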

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
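	// One GHASH step per iteration: Xi = (Xi ^ block) * H in GF(2^128),
	// with the multiplication carried out below starting at Lgmult_neon.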

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
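	// The zip1/zip2 pairs below regroup the four partial products
	// (L, M, N, K) into their low and high 64-bit halves so the
	// mask-and-fold shown in the 32-bit code above can run on two
	// products at a time: the high halves are ANDed with the
	// k48/k32/k16/k0 masks in v24/v25 and xored into the low halves,
	// then v16..v19 are re-interleaved to hold t0..t3 again.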
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
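	// v0 now holds the 128-bit carry-less product of the low halves
	// (A.lo * B.lo): the partial products t0..t3 were rotated back into
	// position (by 8, 16, 24 and 32 bits via the ext instructions) and
	// xored into the unshifted product D.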
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Same divergence from the 32-bit version as described before the
	// first mask-and-fold block above; here the masked values are the
	// partial products of the Karatsuba middle multiplication.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
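	// v1 now holds the Karatsuba middle product
	// (A.lo ^ A.hi) * (B.lo ^ B.hi), built the same way as v0 above.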
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Same divergence from the 32-bit version as described before the
	// first mask-and-fold block above; this time for the product of the
	// high halves.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
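	// v2 now holds the product of the high halves (A.hi * B.hi). With
	// Xl (v0), Xm (v1) and Xh (v2) complete, the Karatsuba terms are
	// recombined below into the 256-bit product Xh|Xl.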
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
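	// First phase of the reduction: the shifts by 57, 62 and 63 multiply
	// the low half by the 0xc2 constant built in gcm_init_neon (the low
	// terms of the bit-reflected GHASH polynomial); the result, together
	// with Xm, is folded into the middle 128 bits of the product.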

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1
	eor	v0.16b, v0.16b, v2.16b
	eor	v0.16b, v0.16b, v18.16b
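	// Second phase of the reduction: per-lane right shifts by 1, 2 and 7
	// bits (the 1 and 6 above compose to 7) are xored together with the
	// high half in v2, completing the fold; the updated Xi is left in v0
	// and byte-swapped back on store below.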

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
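// These are the $k48/$k32/$k16 masks referenced in the comments above, plus
// an all-zero k0. gcm_gmult_neon and gcm_ghash_neon load them pairwise:
// {k48, k32} into v24 and {k16, k0} into v25.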
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM