// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

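// MemorySanitizer cannot see into hand-written assembly, so when building
// with MSan the assembly implementation is disabled by defining
// OPENSSL_NO_ASM below.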
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

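// C prototype, for reference (as this entry point is declared in
// BoringSSL's C code):
//
//   void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
//
// Arguments per AAPCS64: x0 = Htable, x1 = H. Precomputes the
// key-dependent values used by gcm_gmult_v8 and gcm_ghash_v8:
//   Htable[0] = "twisted" H, i.e. H<<1 mod P(x)
//   Htable[1] = packed Karatsuba terms (H.lo^H.hi, H^2.lo^H^2.hi)
//   Htable[2] = twisted H^2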
.align	4
_gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
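	//v19 now holds 0xc200000000000000 in each 64-bit lane: the
	//bit-reflected GHASH reduction constant for
	//P(x) = x^128+x^7+x^2+x+1. The block below computes H<<1 mod P(x),
	//XOR-ing in the polynomial when the shifted-out top bit is set.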
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
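	//Karatsuba over GF(2): three pmulls suffice because the middle term
	//can be recovered as v1^v0^v2 (addition in this field is XOR):
	//  v0 = H.lo*H.lo, v2 = H.hi*H.hi, v1 = (H.lo^H.hi)*(H.lo^H.hi)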
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
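	//reduce the 256-bit product (v2:v0, middle term in v1) modulo P(x):
	//each pmull by the 0xc2... constant folds 64 bits of the high half
	//into the low half, so two such phases complete the reduction,
	//carry-free since all arithmetic is in GF(2)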
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret

.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

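// C prototype, for reference (matching BoringSSL's declaration):
//
//   void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
//
// Multiplies Xi in place by H in GF(2^128): one Karatsuba multiply by the
// twisted H followed by the two-phase reduction. x0 = Xi, x1 = Htable.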
.align	4
_gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

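// C prototype, for reference (matching BoringSSL's declaration):
//
//   void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//                     const uint8_t *inp, size_t len);
//
// Folds len bytes (len a multiple of 16) into the running hash:
// Xi = (Xi ^ block)*H for each 16-byte block. The main loop handles two
// blocks per iteration using H and H^2 with one shared reduction.
// x0 = Xi, x1 = Htable, x2 = inp, x3 = len.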
.align	4
_gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that the
						//loaded value would have to
						//be rotated to appear as in
						//the algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as the post-
						//increment for the input
						//pointer; as the loop is
						//modulo-scheduled, x12 is
						//zeroed just in time to avoid
						//stepping past inp[len]. This
						//means the last block[s] are
						//actually loaded twice, but
						//the last copy is not
						//processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
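						//(eq was set by the subs
						//above when x3 was exactly
						//32: I[1] is then the final
						//block, so its load must not
						//advance x2 past inp[len])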
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8
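	//modulo-scheduled 2x loop: each iteration computes
	//Xi = (Xi^I[i])*H^2 ^ I[i+1]*H with a single reduction, while the
	//multiply of the next I[i+1] by H (v4/v6) is started one iteration
	//ahead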

.align	4
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there were at least 32 more bytes
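	//the loop exits with the last reduction folded into v3 early; the
	//instructions below re-construct the true Xi (v0), input block (v3)
	//and length (x3) before deciding whether a 16-byte tail remains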

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
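//the .byte string above spells "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"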
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM
