#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
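//gcm_init_v8(Htable,H): x0 = output Htable, x1 = input hash key H.
//Stores the "twisted" H, the packed Karatsuba halves and H^2 as
//Htable[0..2] for use by gcm_gmult_v8/gcm_ghash_v8 below.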
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]
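//What the block above computes, sketched in C (hi/lo and the names
//below are illustrative only, not part of this file):
//
//	uint64_t carry = hi >> 63;		//bit shifted out of H
//	hi = (hi << 1) | (lo >> 63);		//128-bit H <<<= 1
//	lo <<= 1;
//	if (carry) { hi ^= 0xc200000000000000; lo ^= 1; }	//fold in 0xc2....01
//
//The conditional xor is done branchlessly above via the sign-extended
//carry mask in v17.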

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
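//Karatsuba trick used throughout this file: a 128x128-bit carry-less
//product takes three pmull/pmull2 ops instead of four, via
//
//	(a1*x^64 + a0)*(b1*x^64 + b0) =
//	    a1*b1*x^128 + a0*b0
//	  + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64
//
//Htable[1] caches H.lo^H.hi and H^2.lo^H^2.hi so the middle product
//costs a single pmull per block later on.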

	ret
.size	gcm_init_v8,.-gcm_init_v8
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
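//gcm_gmult_v8(Xi,Htable): x0 = Xi (in/out), x1 = Htable from
//gcm_init_v8. Computes one GHASH multiplication, Xi = Xi*H.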
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
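//The two pmull-by-v19 steps above perform a two-phase fold of the
//256-bit product back to 128 bits modulo the GHASH polynomial
//x^128 + x^7 + x^2 + x + 1, which in this bit-reflected
//representation is the 0xc2....01 constant kept in v19.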

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
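//gcm_ghash_v8(Xi,Htable,inp,len): x0 = Xi (in/out), x1 = Htable,
//x2 = input pointer, x3 = length in bytes (16-byte blocks).
//Folds len bytes of input into the running GHASH value Xi.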
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//the loaded value would
						//have to be rotated to
						//appear as in the
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as the loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//the last block[s] are actually
						//loaded twice, but the last
						//copy is not processed
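//Rough C analogue of the x12 trick (step/inp are illustrative names,
//not from this file):
//
//	size_t step = 16;
//	if (last_iteration) step = 0;	//the csel instructions below
//	v = load16(inp); inp += step;	//final block re-read, not processed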
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

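//The modulo-2x loop below folds two blocks per iteration using
//
//	Xi' = ((Xi ^ I[i])*H^2) ^ (I[i+1]*H)
//
//so the expensive reduction runs once per pair of blocks rather than
//once per block.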
.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
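//Tail: a single 16-byte block remains, so multiply by H once (the
//same single multiplication as in gcm_gmult_v8) instead of the
//H^2/H pair used by the loop above.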
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
