1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
18.text
19.arch	armv8-a+crypto
//-----------------------------------------------------------------------
// void gcm_init_v8(u128 Htable[], const uint64_t H[2])
//
// Precompute the GHASH key schedule from the raw hash subkey H.
// ABI (AAPCS64):
//   In:  x0 = Htable, output table (this routine stores 3 entries, 48 B)
//        x1 = H, the 128-bit hash subkey as produced by the block cipher
//   Out: Htable[0] = "twisted" H   (H<<1 mod P, the bit-reflection trick
//                    that lets PMULL work on the reversed polynomial)
//        Htable[1] = packed Karatsuba pre-products (H.lo^H.hi | H^2.lo^H^2.hi)
//        Htable[2] = twisted H^2   (used by the 2x loop in gcm_ghash_v8)
//   Clobbers: v0-v3, v16-v22 and x0 (post-incremented); leaf function,
//        no stack use. NOTE(review): clobber list read off the visible
//        code — confirm against callers before relying on it.
//-----------------------------------------------------------------------
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2 by squaring the twisted H with the same
	//Karatsuba + two-phase reduction used in the multiply routines
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = twisted H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
//-----------------------------------------------------------------------
// void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[])
//
// One GHASH step: Xi = (Xi * H) mod P, where P is the GCM polynomial
// x^128 + x^7 + x^2 + x + 1 (here handled via the 0xc2...01 constant in
// the bit-reflected domain).
// ABI (AAPCS64):
//   In:  x0 = Xi (16-byte in/out accumulator, big-endian byte order per
//             the GCM spec; rev64'd on little-endian builds)
//        x1 = Htable from gcm_init_v8 (only Htable[0..1] are read here)
//   Clobbers: v0-v3, v16-v21; leaf function, no stack or GPR use beyond
//        the argument registers.
// Structure: Karatsuba 3-multiply PMULL, then the standard two-phase
// Montgomery-style reduction by the 0xc2 constant.
//-----------------------------------------------------------------------
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b		//byte-swap Xi on little-endian
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hXi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b		//back to GCM byte order
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
//-----------------------------------------------------------------------
// void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[],
//                   const uint8_t *inp, size_t len)
//
// Hash |len| bytes of input into the GHASH accumulator Xi.
// ABI (AAPCS64):
//   In:  x0 = Xi (16-byte in/out accumulator)
//        x1 = Htable from gcm_init_v8 (twisted H, Karatsuba pack, H^2)
//        x2 = inp, input blocks
//        x3 = len in bytes; NOTE(review): the code only handles whole
//             16-byte blocks — presumably callers guarantee len % 16 == 0
//   Clobbers: v0-v7, v16-v22, x3, x12; leaf function, no stack use.
// Structure: a modulo-scheduled loop that folds two blocks per
// iteration using H^2 ((Xi+I[i])·H^2 + I[i+1]·H), with a one-block
// odd tail. x12 is the input post-increment; it is zeroed one
// iteration early so the final (prefetched) loads re-read the last
// block instead of running past inp[len].
//-----------------------------------------------------------------------
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	//loop prologue: start the H·I[1] product so the first loop
	//iteration already has v4/v6 in flight (software pipelining)
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	//each iteration computes (Xi+I[i])·H^2 + I[i+1]·H while the
	//loads and H·I products for the NEXT pair are interleaved
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//loop epilogue: the early accumulation into v3 above assumed
	//another iteration; undo it to recover Xi for the tail/exit
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	//single trailing block: (Xi+I[last])·H with the same
	//Karatsuba + two-phase reduction as gcm_gmult_v8
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hXi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b		//back to GCM byte order
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//NUL-terminated attribution string:
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
250#endif
251#endif  // !OPENSSL_NO_ASM
252.section	.note.GNU-stack,"",%progbits
253