1#if defined(__aarch64__)
2#include <openssl/arm_arch.h>
3
4.text
5#if !defined(__clang__)
6.arch	armv8-a+crypto
7#endif
//----------------------------------------------------------------------
// void gcm_init_v8(u128 Htable[], const uint64_t H[2])
//
// Precompute the GHASH key schedule from the raw hash key H.
// In:    x0 = Htable (output), x1 = H (16-byte hash key)
// Out:   Htable[0] = "twisted" H (H multiplied by x, reduced mod the
//                    GHASH polynomial, so the 0xc2..01 constant below
//                    can drive the reduction),
//        Htable[1] = packed Karatsuba pre-processed halves for H and H^2,
//        Htable[2] = twisted H^2
// Clobbers: v0-v3, v16-v22, and x0 (post-incremented store pointer).
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0 (reduction constant, high half)
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b		//reduction term, applied only if carry was set
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2 as (twisted H) squared, via Karatsuba pmull
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d		//H.lo·H.lo
	eor	v16.16b,v16.16b,v20.16b		//v16 = H.lo^H.hi
	pmull2	v2.1q,v20.2d,v20.2d		//H.hi·H.hi
	pmull	v1.1q,v16.1d,v16.1d		//(H.lo+H.hi)^2

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//twisted H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
//----------------------------------------------------------------------
// void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[])
//
// Single GHASH multiplication: Xi = (Xi · H) reduced mod the GHASH
// polynomial, using the key schedule produced by gcm_init_v8.
// In:    x0 = Xi (16-byte state, read and written in place),
//        x1 = Htable (uses Htable[0..1]: twisted H and packed Karatsuba)
// Clobbers: v0-v3, v16-v21.
// Byte order: on little-endian (__ARMEB__ undefined) Xi is byte-reversed
// on load and store so the math operates on the bit-reflected form.
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 reduction constant
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
//----------------------------------------------------------------------
// void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[],
//                   const uint8_t *inp, size_t len)
//
// GHASH over a buffer: folds len bytes of input into the running hash
// Xi, i.e. for each 16-byte block I: Xi = (Xi ^ I) · H mod P.
// In:    x0 = Xi (16-byte state, read and written in place),
//        x1 = Htable (twisted H, packed Karatsuba halves, twisted H^2),
//        x2 = inp, x3 = len (presumably a multiple of 16 — the loop
//              consumes whole blocks; confirm against callers)
// Clobbers: v0-v7, v16-v22, x3, x12; x1/x2 advance via post-increment.
// The main loop is modulo-scheduled and processes TWO blocks per
// iteration using H^2 (so (Xi^I[i])·H^2 + I[i+1]·H needs only one
// reduction); a single trailing block falls to .Lodd_tail_v8.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	//invariant: v3 = (Xi ^ I[i]) rotated; v4/v6 = partial H·I[i+1]
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//loop exit: undo the speculative "accumulate early" folds above
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	//single trailing block: one multiplication by H
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//NUL-terminated ASCII attribution string embedded in the object file:
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
232#endif