#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# that AArch64 cannot compute over the upper halves of SIMD registers. In
# 32-bit NEON, the low and high halves of the 128-bit register q0 are
# accessible as 64-bit registers d0 and d1, respectively. In AArch64, dN is the
# lower half of vN. Where the 32-bit version would use the upper half, this
# file must keep halves in separate registers.
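#
# For example, the 32-bit code can XOR the two halves of q0 together with a
# single "veor d0, d0, d1", since d0 and d1 alias the halves of q0. AArch64 has
# no way to name the upper half of v0, so the same operation requires the two
# halves to live in separate registers.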
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
#     vshl.i64 q0, q0, #1
#
# in 64-bit, it would be written:
#
#     shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note that the 8-bit and 64-bit polynomial multipliers in AArch64
# differ only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight pairs of
# 8-bit polynomials and is always available. pmull vR.1q, vA.1d, vB.1d
# multiplies a single pair of 64-bit polynomials and requires the PMULL
# extension. This file emulates the latter with the former, in the clmul64x64
# routine below.

use strict;

my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    my $dir = $1;
    my $xlate;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));

my $code = "";
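
# A pure-Perl reference model, included only as illustration and never called
# by this generator: assuming a perl built with 64-bit integers, it computes
# the same 64x64-bit carry-less (polynomial) multiplication that a native
# "pmull vR.1q, vA.1d, vB.1d" performs, and which clmul64x64 below emulates
# out of 8-bit pmull operations. It returns the 128-bit product as a
# (low, high) pair of 64-bit halves.
sub clmul64x64_ref {
	my ($a, $b) = @_;
	my ($lo, $hi) = (0, 0);
	for my $i (0 .. 63) {
		next unless ($b >> $i) & 1;
		# XOR a copy of $a, shifted up by $i bits, into the 128-bit
		# accumulator. Addition in GF(2) is XOR, so there are no carries.
		$lo ^= ($a << $i) & 0xffffffffffffffff;
		$hi ^= ($a >> (64 - $i)) if $i > 0;
	}
	return ($lo, $hi);
}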

# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
	ext	$r.8b, $b.8b, $b.8b, #1		// B1
	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
	ext	$r.8b, $b.8b, $b.8b, #3		// B3
	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
	//     vand	\$t0#hi, \$t0#hi, \$k48
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
	//
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
	//     vand	\$t1#hi, \$t1#hi, \$k32
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
	//
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
	//     vand	\$t2#hi, \$t2#hi, \$k16
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
	//
	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	\$t3#hi, #0
	//
	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d
	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d
	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d
	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d

	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
	eor	$t0.16b, $t0.16b, $t1.16b
	eor	$t2.16b, $t2.16b, $t3.16b
	eor	$r.16b, $r.16b, $t0.16b
	eor	$r.16b, $r.16b, $t2.16b
___
}

$code .= <<___;
#include <openssl/arm_arch.h>

.text

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
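	// As in gcm_init_v8, the table holds a single "twisted" copy of H: the
	// 64-bit halves are swapped and the value is shifted left by one bit,
	// with the 0xc2...01 constant folded in when the shift carries out.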
	ld1	{$t1.2d}, [x1]			// load H
	movi	$t3.16b, #0xe1
	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
	ext	$INlo.16b, $t1.16b, $t1.16b, #8
	ushr	$t2.2d, $t3.2d, #63
	dup	$t1.4s, $t1.s[1]
	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
	ushr	$t2.2d, $INlo.2d, #63
	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
	and	$t2.16b, $t2.16b, $t0.16b
	shl	$INlo.2d, $INlo.2d, #1
	ext	$t2.16b, $t2.16b, $t2.16b, #8
	and	$t0.16b, $t0.16b, $t1.16b
	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
	st1	{$Hlo.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{$INlo.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

	mov	$len, #16
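	// The length is preset to 16 bytes so that the shared .Lgmult_neon
	// body, reached by the branch below, runs gcm_ghash_neon's subs/bne
	// loop exactly once for this single block.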
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{$Xl.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{$INlo.16b}, [$inp], #16	// load inp
	rev64	$INlo.16b, $INlo.16b		// byteswap inp
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into $INlo and $INhi. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	$INhi.d[0], $INlo.d[1]
___
&clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo
$code .= <<___;
	eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing
___
&clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi
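# The three calls above are the Karatsuba decomposition of the 128x128-bit
# carry-less product: with ^ as GF(2) addition,
#   X*H = Xl*Hl ^ ((Xl^Xh)*(Hl^Hh) ^ Xl*Hl ^ Xh*Hh) << 64 ^ Xh*Hh << 128
# The post-processing below recovers the middle term and merges the three
# 128-bit partial products into a 256-bit result.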
$code .= <<___;
	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
	eor	$Xm.16b, $Xm.16b, $Xh.16b
	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	$Xh.d[0], $Xm.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
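	// GHASH reduces modulo x^128 + x^7 + x^2 + x + 1. In this bit-reflected
	// representation the reduction shows up as the left shifts by 57/62/63
	// in the first phase and the right shifts by 1/2/7 in the second.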
	shl	$t1.2d, $Xl.2d, #57		// 1st phase
	shl	$t2.2d, $Xl.2d, #62
	eor	$t2.16b, $t2.16b, $t1.16b	//
	shl	$t1.2d, $Xl.2d, #63
	eor	$t2.16b, $t2.16b, $t1.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	$t2.16b, $t2.16b, $Xm.16b
	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
	eor	$Xh.16b, $Xh.16b,$Xl.16b
	eor	$Xl.16b, $Xl.16b,$t2.16b	//
	ushr	$t2.2d, $t2.2d, #6
	ushr	$Xl.2d, $Xl.2d, #1		//
	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
	eor	$Xl.16b, $Xl.16b, $t2.16b	//

	subs	$len, $len, #16
	bne	.Loop_neon

	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	st1	{$Xl.16b}, [$Xi]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align  2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT"; # enforce flush