1#! /usr/bin/env perl
2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2015
18#
19# ChaCha20 for x86.
20#
21# Performance in cycles per byte out of large buffer.
22#
23#		1xIALU/gcc	4xSSSE3
24# Pentium	17.5/+80%
25# PIII		14.2/+60%
26# P4		18.6/+84%
27# Core2		9.56/+89%	4.83
28# Westmere	9.50/+45%	3.35
29# Sandy Bridge	10.5/+47%	3.20
30# Haswell	8.15/+50%	2.83
31# Skylake	7.53/+22%	2.75
32# Silvermont	17.4/+36%	8.35
33# Goldmont	13.4/+40%	4.36
34# Sledgehammer	10.2/+54%
35# Bulldozer	13.4/+50%	4.38(*)
36#
# (*)	In upstream OpenSSL, Bulldozer actually executes the 4xXOP code
#	path, which delivers 3.55. That path is not present in this file.
#
# Modified from upstream OpenSSL to remove the XOP code.
40
# Locate this script's directory so that x86asm.pl can be found either next
# to the script or in ../../perlasm relative to it.  When $0 carries no
# directory component the prefix is empty and the relative path is used.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Use three-argument
# open and fail loudly: re-opening STDOUT implicitly closes it first, so a
# failed open would otherwise discard everything emitted below without any
# diagnostic.  The definedness check matters because three-argument open
# with an undefined name would silently open an anonymous temporary file.
$output=pop;
(defined($output) && length($output))
	or die "$0: no output file given\n";
open(STDOUT,">",$output)
	or die "$0: can't open $output for writing: $!\n";

# First argument selects the assembler flavour; a trailing "386" argument is
# passed to asm_init as its second (boolean) parameter.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$xmm=$ymm=1;
$gasver=999;  # enable everything

# Register assignment for the scalar code path.  Rows b, c and d each have
# an alternate register ($b_, $c_, $d_); QUARTERROUND swaps every pair after
# each call so that the next quarter-round's operands are already loaded.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");
57
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
# Move index $idx by $step places inside its own row of four, wrapping.
my $wrap=sub { my ($idx,$step)=@_; ($idx&~3)+(($idx+$step)&3); };
# Per-row (a,b,c,d) distance to the words of the following and of the
# preceding quarter-round.
my @fwd=(1,1,1,1);
my @back=(-1,-1,-1,-1);

	# Emits one scalar quarter-round over state words $ai,$bi,$ci,$di.
	# $i (0..7) is the position of this call in the sequence below, and
	# selects which neighbouring words get spilled and pre-loaded.
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the seams between the even and the odd round the neighbouring
	# quarter-round is not simply "one column over", so the steps are
	# skewed per row.
	if    ($i==0)	{ @back=map(-1-$_,(4,3,2,1)); }
	elsif ($i==3)	{ @fwd =map( 1+$_,(0,1,2,3)); }
	elsif ($i==4)	{ @back=map(-1+$_,(4,3,2,1)); }
	elsif ($i==7)	{ @fwd =map( 1-$_,(0,1,2,3)); }

	my $an=$wrap->($ai,$fwd[0]);		# next
	my $bn=$wrap->($bi,$fwd[1]);
	my $cn=$wrap->($ci,$fwd[2]);
	my $dn=$wrap->($di,$fwd[3]);
	my $bp=$wrap->($bi,$back[1]);		# previous
	my $cp=$wrap->($ci,$back[2]);

	my $swap_c =($ai>0 && $ai<3);	# spill/reload alternate c register
	my $d_moves=($di!=$dn);		# next d word differs from this one

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	if ($swap_c)	{ &mov(&DWP(4*$cp,"esp"),$c_); }
	&rol	($d,16);
	if ($i!=0)	{ &mov(&DWP(4*$bp,"esp"),$b_); }
	&add	($c,$d);
	if ($swap_c)	{ &mov($c_,&DWP(4*$cn,"esp")); }
	&xor	($b,$c);
	if ($d_moves)	{ &mov($d_,&DWP(4*$dn,"esp")); }
	&rol	($b,12);
	if ($i<7)	{ &mov($b_,&DWP(4*$bn,"esp")); }
	if ($i==7)	{ &mov($b_,&DWP(128,"esp")); }	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	if ($d_moves)	{ &mov(&DWP(4*$di,"esp"),$d); }
	if ($di==$dn)	{ &mov($d_,$d); }
	&xor	($b,$c);
	if ($i<7)	{ &add($a,$b_); }		# elsewhere
	&rol	($b,7);

	# Exchange the roles of each register pair for the next call.
	($b,$b_,$c,$c_,$d,$d_)=($b_,$b,$c_,$c,$d_,$d);
}
116
&static_label("ssse3_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

# GFp_ChaCha20_ctr32: scalar (integer-only) code path plus the dispatch to
# the SSSE3 code.
#
# Arguments, as read through wparam() below:
#	wparam(0)	output pointer
#	wparam(1)	input pointer
#	wparam(2)	length in bytes; zero returns immediately
#	wparam(3)	key (eight dwords are read)
#	wparam(4)	counter and nonce (four dwords are read)
#
# Frame obtained with stack_push(33), addressed off esp:
#	  0.. 63	working copy of the 16-word state
#	 64..127	per-block input; only words 4..15 (key, counter,
#			nonce) are stored, the four sigma words are used
#			as immediates instead
#	128		round-loop counter
&function_begin("GFp_ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
	&je	(&label("no_data"));
if ($xmm) {
	# Obtain the address of pic_point in eax, then point ebp at
	# GFp_ia32cap_P and divert to the SSSE3 code only when both tested
	# bits are set.  The SSSE3 entry addresses ssse3_data relative to
	# pic_point through eax, so eax is presumably still the pic_point
	# address at the jmp -- confirm against picmeup in x86asm.pl.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","GFp_ia32cap_P","eax",&label("pic_point"));
	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz	(&label("x86"));
	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz	(&label("x86"));
	&jmp	(&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	# The counter is stored decremented because "entry" below adds one
	# before every block, the first one included.
	&sub	("eax",1);
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

# Reached after each full 64-byte block with $b=input, $a=output and
# $c=remaining length; they are written back to the argument slots because
# all registers are needed for the rounds.
&set_label("outer_loop",16);
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	# Sigma: word 0 stays in $a, words 1..3 go to the working copy.
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	# Words 7 and 15 are copied to the working area; words 4, 8, 9 and 12
	# are left in $b_, $c, $c_ and $d as live inputs to the first
	# quarter-round.
	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

# Ten iterations of eight quarter-rounds each.  The last QUARTERROUND call
# loads the loop counter from 128(esp) into $b_ and then swaps the pair, so
# after it $b holds the counter (hence the dec below) and $b_ holds the
# state word that the loop head adds to $a and moves into $b.
&set_label("loop",16);
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));

	&mov	($b,&wparam(2));		# load len

	# Words 0, 4, 8, 9 (and 12, 14 below) are still in registers; the
	# remaining words are picked up from the working copy in groups.
	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	&cmp	($b,64);
	&jb	(&label("tail"));

	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	# Output word 0 is parked on the stack so that $a can carry the
	# output pointer; it is written out last, just before the pointer
	# is advanced.
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

# Fewer than 64 bytes remain ($b holds that count): finish accumulating the
# whole key-stream block into the working copy at 0..63(esp), then xor it
# with the input one byte at a time.
&set_label("tail");
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
# $c_ is the running byte index, $b_ the input base, $c the output base and
# $b the number of bytes left.
&set_label("tail_loop");
	&movb	("al",&BP(0,$c_,$b_));
	&movb	("dl",&BP(0,"esp",$c_));
	&lea	($c_,&DWP(1,$c_));
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");
	&dec	($b);
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("GFp_ChaCha20_ctr32");
360
361if ($xmm) {
362my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
363my ($out,$inp,$len)=("edi","esi","ecx");
364
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
# Move index $idx by $step places inside its own row of four, wrapping.
my $wrap=sub { my ($idx,$step)=@_; ($idx&~3)+(($idx+$step)&3); };
# Per-row (a,b,c,d) distance to the words of the following and of the
# preceding quarter-round.
my @fwd=(1,1,1,1);
my @back=(-1,-1,-1,-1);

	# Emits one quarter-round over 16-byte state slots $ai,$bi,$ci,$di,
	# which are addressed as 16*index-128 off ebx.  $i (0..7) is the
	# position of this call in the sequence below.
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the seams between the even and the odd round the neighbouring
	# quarter-round is not simply "one column over", so the steps are
	# skewed per row.
	if    ($i==0)	{ @back=map(-1-$_,(4,3,2,1)); }
	elsif ($i==3)	{ @fwd =map( 1+$_,(0,1,2,3)); }
	elsif ($i==4)	{ @back=map(-1+$_,(4,3,2,1)); }
	elsif ($i==7)	{ @fwd =map( 1-$_,(0,1,2,3)); }

	my $an=$wrap->($ai,$fwd[0]);		# next
	my $bn=$wrap->($bi,$fwd[1]);
	my $cn=$wrap->($ci,$fwd[2]);
	my $dn=$wrap->($di,$fwd[3]);
	my $bp=$wrap->($bi,$back[1]);		# previous
	my $cp=$wrap->($ci,$back[2]);

	my $swap_c =($ai>0 && $ai<3);	# spill/reload alternate c register
	my $d_moves=($di!=$dn);		# next d slot differs from this one

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	if ($swap_c)	{ &movdqa(&QWP(16*$cp-128,"ebx"),$xc_); }
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	if ($i!=0)	{ &movdqa(&QWP(16*$bp-128,"ebx"),$xb_); }
	&paddd	($xc,$xd);
	if ($swap_c)	{ &movdqa($xc_,&QWP(16*$cn-128,"ebx")); }
	&pxor	($xb,$xc);
	if ($i<7)	{ &movdqa($xb_,&QWP(16*$bn-128,"ebx")); }
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	&movdqa	($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	if ($d_moves)	{ &movdqa($xd_,&QWP(16*$dn-128,"ebx")); }
	&pxor	($xd,$xa);
	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	if ($d_moves)	{ &movdqa(&QWP(16*$di-128,"ebx"),$xd); }
	if ($di==$dn)	{ &movdqa($xd_,$xd); }
	&pxor	($xb,$xc);
	if ($i<7)	{ &paddd($xa_,$xb_); }		# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	if ($i<7)	{ &pxor($xd_,$xa_); }		# elsewhere
	&por	($xb,$xa);

	# Exchange the roles of each register pair for the next call.
	($xa,$xa_,$xb,$xb_)=($xa_,$xa,$xb_,$xb);
	($xc,$xc_,$xd,$xd_)=($xc_,$xc,$xd_,$xd);
}
430
# _ChaCha20_ssse3: SSSE3 code path, entered through the ssse3_shortcut label
# by the jmp in GFp_ChaCha20_ctr32.  It reads the same five arguments
# (output, input, length, key, counter and nonce) and relies on eax still
# holding the address of pic_point on entry.
#
# Frame layout after the alignment below, addressed off esp:
#	  0..255	working state, 16 slots of 16 bytes (ebx = esp+128)
#	256..511	per-batch key material, same layout (ebp = esp+384)
#	512		caller's esp, restored at "done"
#	516, 520	saved key and counter/nonce pointers
&function_begin("_ChaCha20_ssse3");
&set_label("ssse3_shortcut");
	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce

	# Keep the incoming esp in ebp, carve out the frame, align it to 64
	# bytes and store the old esp where "done" can find it.
	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	# eax: address of pic_point -> address of ssse3_data.
	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {		# even though we encode
							# pshufb manually, we
							# handle only register
							# operands, while this
							# segment uses memory
							# operand...
	# The four-blocks-at-a-time path needs at least 256 bytes; anything
	# shorter goes straight to the single-block code.
	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	# Broadcast every dword of the counter/nonce and of the key into all
	# four lanes of its own slot (pshufd 0x00/0x55/0xaa/0xff), so that
	# each lane works on a different block.  The counter slot gets
	# {0,1,2,3} added and {4,4,4,4} subtracted, because outer_loop adds
	# {4,4,4,4} before every batch, the first one included.
	&movdqu		("xmm7",&QWP(0,"edx"));		# key
	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	 &psubd		("xmm0",&QWP(16*4,"eax"));
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");

	# Both data pointers are biased by 128 to match the 64*k-128
	# displacements used when loading input and storing output below.
	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

# One pass per 256 bytes: refresh the working state (off ebx) from the key
# material (off ebp).  Slots 0, 4, 8 and 9 are not copied, they are loaded
# directly into registers further down.
&set_label("outer_loop",16);
	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	# Live inputs of the first quarter-round.
	&movdqa		($xa, &QWP(16*0-128,"ebp"));
	&movdqa		($xd, "xmm4");
	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
	&movdqa		($xc, &QWP(16*8-128,"ebp"));
	&movdqa		($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

# Ten iterations of eight quarter-rounds each.
&set_label("loop",16);
	&paddd		($xa,$xb_);			# elsewhere
	&movdqa		($xb,$xb_);
	&pxor		($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	# Spill the slots that are still held only in registers.
	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
	&movdqa		($xa3,&QWP(16*3-128,"ebx"));

    # Four passes, one per group of four state slots ($i is the byte offset
    # of the group).  Each pass adds the key material, transposes the 4x4
    # dwords so that every register holds 16 consecutive bytes of a single
    # block, and xors them with the input at 64*block-128.  The pointers
    # advance by 16 after each of the first three passes and by
    # 64*4-16*3 after the last, i.e. by 256 bytes in total.
    for($i=0;$i<256;$i+=64) {
	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa		($xt2,$xa0);		# "de-interlace" data
	&punpckldq	($xa0,$xa1);
	&movdqa		($xt3,$xa2);
	&punpckldq	($xa2,$xa3);
	&punpckhdq	($xt2,$xa1);
	&punpckhdq	($xt3,$xa3);
	&movdqa		($xa1,$xa0);
	&punpcklqdq	($xa0,$xa2);		# "a0"
	&movdqa		($xa3,$xt2);
	&punpcklqdq	($xt2,$xt3);		# "a2"
	&punpckhqdq	($xa1,$xa2);		# "a1"
	&punpckhqdq	($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	# "a2" ended up in $xt2, so the third input vector is loaded into
	# the now free $xa2 and the result is produced in $xt2.
	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu		($xt1,&QWP(64*1-128,$inp));
	&movdqu		($xa2,&QWP(64*2-128,$inp));
	&movdqu		($xt3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor		($xt0,$xa0);
	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor		($xt1,$xa1);
	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor		($xt2,$xa2);
	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor		($xt3,$xa3);
	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu		(&QWP(64*1-128,$out),$xt1);
	&movdqu		(&QWP(64*2-128,$out),$xt2);
	&movdqu		(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	# len is biased by -256, so "no borrow" means that at least one more
	# full batch of four blocks remains.
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);
	&jz		(&label("done"));

	# Hand over to the single-block code: undo the pointer bias and
	# rebuild xmm3 as counter and nonce, taking the counter from lane 0
	# of the last batch plus four and the other three dwords from the
	# caller's block (the mask at 16*7 clears its counter dword).
	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu		("xmm3",&QWP(0,"ebx"));
	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
	&pand		("xmm3",&QWP(16*7,"eax"));
	&por		("xmm3","xmm2");		# counter value
}
636{
637my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
638
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	# A round is two structurally identical halves on rows $a,$b,$c,$d:
	#	a+=b; d^=a; d is rotated via pshufb with the given mask;
	#	c+=d; b^=c; b is rotated left by $left bits using $t as scratch.
	# The first half uses $rot16 and 12 bits, the second $rot24 and 7.
	foreach my $half ([$rot16,12],[$rot24,7]) {
	    my ($mask,$left)=@$half;

	    &paddd	($a,$b);
	    &pxor	($d,$a);
	    &pshufb	($d,$mask);

	    &paddd	($c,$d);
	    &pxor	($b,$c);
	    &movdqa	($t,$b);
	    &psrld	($b,32-$left);
	    &pslld	($t,$left);
	    &por	($b,$t);
	}
}
662
# Single-block code path: handles inputs shorter than 256 bytes as well as
# whatever the four-block path leaves over.  On entry edx points at the
# key, xmm3 ($d) already holds counter and nonce, and eax points at
# ssse3_data.  The block input is kept at 0..63(esp) as four 16-byte rows.
&set_label("1x");
	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
	&movdqu		($b,&QWP(0,"edx"));
	&movdqu		($c,&QWP(16,"edx"));
	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
	&movdqa		($rot16,&QWP(0,"eax"));
	&movdqa		($rot24,&QWP(16,"eax"));
	# NOTE(review): the dword written here is overwritten by the movdqa
	# to 16*3(esp) a few lines below.
	&mov		(&DWP(16*3,"esp"),"ebp");

	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

# Every block after the first: reload the saved rows and add {1,0,0,0} to
# the counter row, which is stored back for the next block.
&set_label("outer1x",16);
	&movdqa		($d,&QWP(16*5,"eax"));		# one
	&movdqa		($a,&QWP(16*0,"esp"));
	&movdqa		($b,&QWP(16*1,"esp"));
	&movdqa		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

# Ten iterations of two rounds.  Between the two rounds the lanes of rows
# b, c and d are rotated with pshufd, and rotated back afterwards with the
# b and d immediates exchanged.
&set_label("loop1x",16);
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	# Add the block input back in to obtain the key stream.
	&paddd		($a,&QWP(16*0,"esp"));
	&paddd		($b,&QWP(16*1,"esp"));
	&paddd		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&movdqu		($t,&QWP(16*0,$inp));
	&movdqu		($t1,&QWP(16*1,$inp));
	&pxor		($a,$t);		# xor with input
	&movdqu		($t,&QWP(16*2,$inp));
	&pxor		($b,$t1);
	&movdqu		($t1,&QWP(16*3,$inp));
	&pxor		($c,$t);
	&pxor		($d,$t1);
	&lea		($inp,&DWP(16*4,$inp));	# inp+=64

	&movdqu		(&QWP(16*0,$out),$a);	# write output
	&movdqu		(&QWP(16*1,$out),$b);
	&movdqu		(&QWP(16*2,$out),$c);
	&movdqu		(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));	# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

# Fewer than 64 bytes remain: park the key stream at 0..63(esp) and xor it
# with the input one byte at a time, using ebp as the byte index and $len
# as the number of bytes left.
&set_label("tail");
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
751}
# Common exit of the SSSE3 code: reload the caller's stack pointer that the
# function entry stored at 512(esp).
&set_label("done");
	&mov		("esp",&DWP(512,"esp"));
&function_end("_ChaCha20_ssse3");

# Constant table, addressed as 16*row off eax by the SSSE3 code.
&align	(64);
&set_label("ssse3_data");
# row 0: pshufb mask used for the 16-bit rotation ("rot16")
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
# row 1: pshufb mask used for the second rotation ("rot8" in the
#	 four-block code, loaded as $rot24 by the single-block code)
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
# row 2: sigma, state words 0..3
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
# row 3: per-lane counter offsets added when the counters are set up
&data_word(0,1,2,3);
# row 4: counter step per four-block batch (subtracted once during setup)
&data_word(4,4,4,4);
# row 5: "one", counter step of the single-block code
&data_word(1,0,0,0);
# row 6: "+four", applied to the counter when leaving the four-block code
&data_word(4,0,0,0);
# row 7: mask that clears the counter dword and keeps the other three
&data_word(0,-1,-1,-1);
&align	(64);
767}
&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (a full disk, for instance) only surface when the
# handle is closed; report the system error so the failure can be diagnosed.
close STDOUT or die "error closing STDOUT: $!";
773