// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

.align	4
_gcm_init_neon:
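	// Inferred from the code below: x1 points to the 128-bit hash key H
	// and x0 to the Htable output; only Htable[0] (the twisted H) is
	// written.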
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
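	// Computes one GHASH multiplication of Xi by H. Inferred from the
	// code: x0 points to Xi and x1 to the twisted H written by
	// gcm_init_neon; x3 is set to 16 so the shared Lgmult_neon tail runs
	// for exactly one block.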
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
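	// Inferred from the code: x0 points to Xi, x1 to the twisted H, x2 to
	// the input and x3 to the input length in bytes; Loop_neon folds one
	// 16-byte block into Xi per iteration.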
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
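	// Three 64x64 carry-less multiplies follow, one per Karatsuba term:
	// v5*v3 into v0 (low halves), v7*(v3^v4) into v1 (the pre-processed
	// middle term) and v6*v4 into v2 (high halves). Each is built from
	// byte-wise pmulls of shifted operands (A1..A4, B1..B4) as in the
	// 32-bit version.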
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
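	// Concretely, the zip1/zip2 sequence below gathers the low and high
	// 64-bit halves of two registers at a time, applies the eor/and/eor
	// folds from the 32-bit code to both pairs at once, and then unzips
	// the results back into t0..t3.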

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
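	// v0 now holds Xl, the 64x64 product of the two low halves. The same
	// pattern is repeated below for the Karatsuba middle term, using the
	// pre-processed v7 and v3^v4, with the result accumulated in v1.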
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
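	// v1 now holds Xm, the Karatsuba middle product. The last of the
	// three multiplies follows, computing Xh from the high halves v6 and
	// v4 into v2.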
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
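	// v2 now holds Xh. Xl, Xm and Xh are combined below into the 256-bit
	// product Xh|Xl ahead of the reduction.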
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

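	// The 256-bit product Xh|Xl is reduced below modulo the GHASH
	// polynomial (presumably x^128 + x^7 + x^2 + x + 1 in its
	// bit-reflected form; the 57/62/63 shift counts would correspond to
	// the x^7, x^2 and x terms).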
	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
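// These four masks are what gcm_gmult_neon/gcm_ghash_neon load into the
// v24/v25 pair (k48/k32 in v24, k16/k0 in v25) for the paired
// mask-and-fold steps above.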
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM
