1#if defined(__i386__)
2.file	"chacha-x86.S"
3.text
/*
 * ChaCha20_ctr32: XOR a buffer with a ChaCha20 keystream (i386, AT&T syntax).
 *
 * Stack arguments as used by the code below.  Offsets are those seen after
 * the four register pushes.  The C prototype is not visible in this file, so
 * the roles are inferred from use -- confirm against the declaration:
 *   20(%esp)  destination pointer  (64 bytes written per full block)
 *   24(%esp)  source pointer       (XORed with the keystream)
 *   28(%esp)  byte count           (0 returns immediately)
 *   32(%esp)  pointer to 8 dwords  (copied to state words 4..11)
 *   36(%esp)  pointer to 4 dwords  (copied to state words 12..15; word 12
 *                                   is incremented once per 64-byte block)
 *
 * %ebp, %ebx, %esi and %edi are saved on entry and restored before ret.
 * When both tested bits of OPENSSL_ia32cap_P are set, control transfers to
 * .Lssse3_shortcut (inside ChaCha20_ssse3) with %eax = address of .Lpic_point.
 *
 * Scalar-path frame after "subl $132,%esp":
 *     0.. 63(%esp)     working state words 0..15 (word w at 4*w); reused as
 *                      the keystream buffer for a final partial block
 *    80..111(%esp)     copy of the 8 dwords from the fourth argument
 *   112..127(%esp)     copy of the 4 dwords from the fifth argument
 *   128(%esp)          remaining double-round count
 *   152/156/160(%esp)  the caller's destination / source / length argument
 *                      slots, reused to hold the running values
 */
4.globl	ChaCha20_ctr32
5.hidden	ChaCha20_ctr32
6.type	ChaCha20_ctr32,@function
7.align	16
8ChaCha20_ctr32:
9.L_ChaCha20_ctr32_begin:
/* Save the registers that are restored before returning. */
10	pushl	%ebp
11	pushl	%ebx
12	pushl	%esi
13	pushl	%edi
/* Nothing to do when the byte count (third argument) is zero. */
14	xorl	%eax,%eax
15	cmpl	28(%esp),%eax
16	je	.L000no_data
/* call/pop yields the run-time address of .Lpic_point in %eax, so data can
   be addressed position-independently. */
17	call	.Lpic_point
18.Lpic_point:
19	popl	%eax
20	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
/* Take the SSSE3 path only if bit 24 of the first capability dword and bit 9
   of the second are both set (presumably the FXSR and SSSE3 flags -- confirm
   against the definition of OPENSSL_ia32cap_P). */
21	testl	$16777216,(%ebp)
22	jz	.L001x86
23	testl	$512,4(%ebp)
24	jz	.L001x86
25	jmp	.Lssse3_shortcut
/* ---- Scalar path ---- */
26.L001x86:
27	movl	32(%esp),%esi
28	movl	36(%esp),%edi
29	subl	$132,%esp
/* Copy the 8 dwords at (%esi) to 80..111(%esp). */
30	movl	(%esi),%eax
31	movl	4(%esi),%ebx
32	movl	8(%esi),%ecx
33	movl	12(%esi),%edx
34	movl	%eax,80(%esp)
35	movl	%ebx,84(%esp)
36	movl	%ecx,88(%esp)
37	movl	%edx,92(%esp)
38	movl	16(%esi),%eax
39	movl	20(%esi),%ebx
40	movl	24(%esi),%ecx
41	movl	28(%esi),%edx
42	movl	%eax,96(%esp)
43	movl	%ebx,100(%esp)
44	movl	%ecx,104(%esp)
45	movl	%edx,108(%esp)
/* Copy the 4 dwords at (%edi) to 112..127(%esp). */
46	movl	(%edi),%eax
47	movl	4(%edi),%ebx
48	movl	8(%edi),%ecx
49	movl	12(%edi),%edx
/* Pre-decrement the block counter: .L002entry adds 1 before every block,
   so the first block uses the caller's value unchanged. */
50	subl	$1,%eax
51	movl	%eax,112(%esp)
52	movl	%ebx,116(%esp)
53	movl	%ecx,120(%esp)
54	movl	%edx,124(%esp)
55	jmp	.L002entry
56.align	16
/* Reached after each full block: write back the advanced source pointer,
   destination pointer and remaining length. */
57.L003outer_loop:
58	movl	%ebx,156(%esp)
59	movl	%eax,152(%esp)
60	movl	%ecx,160(%esp)
/* Build the initial state for one block.  Words 0..3 are the four constants
   (the ASCII bytes of "expand 32-byte k" as little-endian dwords).  Word 0
   stays in %eax; the remaining words are split between registers and the
   0..63(%esp) area as the round loop expects. */
61.L002entry:
62	movl	$1634760805,%eax
63	movl	$857760878,4(%esp)
64	movl	$2036477234,8(%esp)
65	movl	$1797285236,12(%esp)
66	movl	84(%esp),%ebx
67	movl	88(%esp),%ebp
68	movl	104(%esp),%ecx
69	movl	108(%esp),%esi
70	movl	116(%esp),%edx
71	movl	120(%esp),%edi
72	movl	%ebx,20(%esp)
73	movl	%ebp,24(%esp)
74	movl	%ecx,40(%esp)
75	movl	%esi,44(%esp)
76	movl	%edx,52(%esp)
77	movl	%edi,56(%esp)
78	movl	92(%esp),%ebx
79	movl	124(%esp),%edi
80	movl	112(%esp),%edx
81	movl	80(%esp),%ebp
82	movl	96(%esp),%ecx
83	movl	100(%esp),%esi
/* Advance the block counter (word 12) and store it back for the final add. */
84	addl	$1,%edx
85	movl	%ebx,28(%esp)
86	movl	%edi,60(%esp)
87	movl	%edx,112(%esp)
/* 10 iterations, each performing eight quarter-rounds (four on columns,
   four on diagonals). */
88	movl	$10,%ebx
89	jmp	.L004loop
90.align	16
/* Loop-top register contents: %eax = word 0, %ebp = word 4, %ecx = word 8,
   %esi = word 9, %edx = word 12, %ebx = iteration count (spilled to
   128(%esp) straight away so %ebx can be used as a temporary).
   Every quarter-round is the add / xor / rotate-left sequence with rotate
   amounts 16, 12, 8 and 7.  The loads and stores for neighbouring
   quarter-rounds are interleaved with the arithmetic.
   Column quarter-rounds: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15). */
91.L004loop:
92	addl	%ebp,%eax
93	movl	%ebx,128(%esp)
94	movl	%ebp,%ebx
95	xorl	%eax,%edx
96	roll	$16,%edx
97	addl	%edx,%ecx
98	xorl	%ecx,%ebx
99	movl	52(%esp),%edi
100	roll	$12,%ebx
101	movl	20(%esp),%ebp
102	addl	%ebx,%eax
103	xorl	%eax,%edx
104	movl	%eax,(%esp)
105	roll	$8,%edx
106	movl	4(%esp),%eax
107	addl	%edx,%ecx
108	movl	%edx,48(%esp)
109	xorl	%ecx,%ebx
110	addl	%ebp,%eax
111	roll	$7,%ebx
112	xorl	%eax,%edi
113	movl	%ecx,32(%esp)
114	roll	$16,%edi
115	movl	%ebx,16(%esp)
116	addl	%edi,%esi
117	movl	40(%esp),%ecx
118	xorl	%esi,%ebp
119	movl	56(%esp),%edx
120	roll	$12,%ebp
121	movl	24(%esp),%ebx
122	addl	%ebp,%eax
123	xorl	%eax,%edi
124	movl	%eax,4(%esp)
125	roll	$8,%edi
126	movl	8(%esp),%eax
127	addl	%edi,%esi
128	movl	%edi,52(%esp)
129	xorl	%esi,%ebp
130	addl	%ebx,%eax
131	roll	$7,%ebp
132	xorl	%eax,%edx
133	movl	%esi,36(%esp)
134	roll	$16,%edx
135	movl	%ebp,20(%esp)
136	addl	%edx,%ecx
137	movl	44(%esp),%esi
138	xorl	%ecx,%ebx
139	movl	60(%esp),%edi
140	roll	$12,%ebx
141	movl	28(%esp),%ebp
142	addl	%ebx,%eax
143	xorl	%eax,%edx
144	movl	%eax,8(%esp)
145	roll	$8,%edx
146	movl	12(%esp),%eax
147	addl	%edx,%ecx
148	movl	%edx,56(%esp)
149	xorl	%ecx,%ebx
150	addl	%ebp,%eax
151	roll	$7,%ebx
152	xorl	%eax,%edi
153	roll	$16,%edi
154	movl	%ebx,24(%esp)
155	addl	%edi,%esi
156	xorl	%esi,%ebp
157	roll	$12,%ebp
158	movl	20(%esp),%ebx
159	addl	%ebp,%eax
160	xorl	%eax,%edi
161	movl	%eax,12(%esp)
162	roll	$8,%edi
/* Word 0 is reloaded here for the diagonal quarter-rounds
   (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14); their first instructions
   are interleaved with the end of the last column quarter-round.  Words 10
   and 11 are still in %ecx / %esi and word 15 is taken from %edi. */
163	movl	(%esp),%eax
164	addl	%edi,%esi
165	movl	%edi,%edx
166	xorl	%esi,%ebp
167	addl	%ebx,%eax
168	roll	$7,%ebp
169	xorl	%eax,%edx
170	roll	$16,%edx
171	movl	%ebp,28(%esp)
172	addl	%edx,%ecx
173	xorl	%ecx,%ebx
174	movl	48(%esp),%edi
175	roll	$12,%ebx
176	movl	24(%esp),%ebp
177	addl	%ebx,%eax
178	xorl	%eax,%edx
179	movl	%eax,(%esp)
180	roll	$8,%edx
181	movl	4(%esp),%eax
182	addl	%edx,%ecx
183	movl	%edx,60(%esp)
184	xorl	%ecx,%ebx
185	addl	%ebp,%eax
186	roll	$7,%ebx
187	xorl	%eax,%edi
188	movl	%ecx,40(%esp)
189	roll	$16,%edi
190	movl	%ebx,20(%esp)
191	addl	%edi,%esi
192	movl	32(%esp),%ecx
193	xorl	%esi,%ebp
194	movl	52(%esp),%edx
195	roll	$12,%ebp
196	movl	28(%esp),%ebx
197	addl	%ebp,%eax
198	xorl	%eax,%edi
199	movl	%eax,4(%esp)
200	roll	$8,%edi
201	movl	8(%esp),%eax
202	addl	%edi,%esi
203	movl	%edi,48(%esp)
204	xorl	%esi,%ebp
205	addl	%ebx,%eax
206	roll	$7,%ebp
207	xorl	%eax,%edx
208	movl	%esi,44(%esp)
209	roll	$16,%edx
210	movl	%ebp,24(%esp)
211	addl	%edx,%ecx
212	movl	36(%esp),%esi
213	xorl	%ecx,%ebx
214	movl	56(%esp),%edi
215	roll	$12,%ebx
216	movl	16(%esp),%ebp
217	addl	%ebx,%eax
218	xorl	%eax,%edx
219	movl	%eax,8(%esp)
220	roll	$8,%edx
221	movl	12(%esp),%eax
222	addl	%edx,%ecx
223	movl	%edx,52(%esp)
224	xorl	%ecx,%ebx
225	addl	%ebp,%eax
226	roll	$7,%ebx
227	xorl	%eax,%edi
228	roll	$16,%edi
229	movl	%ebx,28(%esp)
230	addl	%edi,%esi
231	xorl	%esi,%ebp
232	movl	48(%esp),%edx
233	roll	$12,%ebp
/* Reload the iteration count spilled at the top of the loop. */
234	movl	128(%esp),%ebx
235	addl	%ebp,%eax
236	xorl	%eax,%edi
237	movl	%eax,12(%esp)
238	roll	$8,%edi
239	movl	(%esp),%eax
240	addl	%edi,%esi
241	movl	%edi,56(%esp)
242	xorl	%esi,%ebp
243	roll	$7,%ebp
244	decl	%ebx
245	jnz	.L004loop
/* Rounds done.  Registers hold words 0, 4, 8, 9, 12 and 14 (%eax, %ebp,
   %ecx, %esi, %edx, %edi); add the matching input-state words to form the
   keystream.  %ebx = remaining length decides full block versus tail. */
246	movl	160(%esp),%ebx
247	addl	$1634760805,%eax
248	addl	80(%esp),%ebp
249	addl	96(%esp),%ecx
250	addl	100(%esp),%esi
251	cmpl	$64,%ebx
252	jb	.L005tail
/* ---- Full 64-byte block: %ebx = source, %eax = destination ---- */
253	movl	156(%esp),%ebx
254	addl	112(%esp),%edx
255	addl	120(%esp),%edi
256	xorl	(%ebx),%eax
257	xorl	16(%ebx),%ebp
/* Park output word 0 at (%esp) because %eax is needed for the destination
   pointer; it is written out last (see the store through %ebp below). */
258	movl	%eax,(%esp)
259	movl	152(%esp),%eax
260	xorl	32(%ebx),%ecx
261	xorl	36(%ebx),%esi
262	xorl	48(%ebx),%edx
263	xorl	56(%ebx),%edi
264	movl	%ebp,16(%eax)
265	movl	%ecx,32(%eax)
266	movl	%esi,36(%eax)
267	movl	%edx,48(%eax)
268	movl	%edi,56(%eax)
/* Words 1, 2, 3, 5, 6: reload, add input state, XOR with source, store. */
269	movl	4(%esp),%ebp
270	movl	8(%esp),%ecx
271	movl	12(%esp),%esi
272	movl	20(%esp),%edx
273	movl	24(%esp),%edi
274	addl	$857760878,%ebp
275	addl	$2036477234,%ecx
276	addl	$1797285236,%esi
277	addl	84(%esp),%edx
278	addl	88(%esp),%edi
279	xorl	4(%ebx),%ebp
280	xorl	8(%ebx),%ecx
281	xorl	12(%ebx),%esi
282	xorl	20(%ebx),%edx
283	xorl	24(%ebx),%edi
284	movl	%ebp,4(%eax)
285	movl	%ecx,8(%eax)
286	movl	%esi,12(%eax)
287	movl	%edx,20(%eax)
288	movl	%edi,24(%eax)
/* Words 7, 10, 11, 13, 15. */
289	movl	28(%esp),%ebp
290	movl	40(%esp),%ecx
291	movl	44(%esp),%esi
292	movl	52(%esp),%edx
293	movl	60(%esp),%edi
294	addl	92(%esp),%ebp
295	addl	104(%esp),%ecx
296	addl	108(%esp),%esi
297	addl	116(%esp),%edx
298	addl	124(%esp),%edi
299	xorl	28(%ebx),%ebp
300	xorl	40(%ebx),%ecx
301	xorl	44(%ebx),%esi
302	xorl	52(%ebx),%edx
303	xorl	60(%ebx),%edi
/* Advance the source pointer by one block. */
304	leal	64(%ebx),%ebx
305	movl	%ebp,28(%eax)
306	movl	(%esp),%ebp
307	movl	%ecx,40(%eax)
308	movl	160(%esp),%ecx
309	movl	%esi,44(%eax)
310	movl	%edx,52(%eax)
311	movl	%edi,60(%eax)
312	movl	%ebp,(%eax)
/* Advance the destination pointer and subtract the block from the length;
   loop while bytes remain. */
313	leal	64(%eax),%eax
314	subl	$64,%ecx
315	jnz	.L003outer_loop
316	jmp	.L006done
/* ---- Final partial block (fewer than 64 bytes; %ebx = byte count) ----
   Finish all 16 keystream words into 0..63(%esp), then XOR byte by byte. */
317.L005tail:
318	addl	112(%esp),%edx
319	addl	120(%esp),%edi
320	movl	%eax,(%esp)
321	movl	%ebp,16(%esp)
322	movl	%ecx,32(%esp)
323	movl	%esi,36(%esp)
324	movl	%edx,48(%esp)
325	movl	%edi,56(%esp)
326	movl	4(%esp),%ebp
327	movl	8(%esp),%ecx
328	movl	12(%esp),%esi
329	movl	20(%esp),%edx
330	movl	24(%esp),%edi
331	addl	$857760878,%ebp
332	addl	$2036477234,%ecx
333	addl	$1797285236,%esi
334	addl	84(%esp),%edx
335	addl	88(%esp),%edi
336	movl	%ebp,4(%esp)
337	movl	%ecx,8(%esp)
338	movl	%esi,12(%esp)
339	movl	%edx,20(%esp)
340	movl	%edi,24(%esp)
341	movl	28(%esp),%ebp
342	movl	40(%esp),%ecx
343	movl	44(%esp),%esi
344	movl	52(%esp),%edx
345	movl	60(%esp),%edi
346	addl	92(%esp),%ebp
347	addl	104(%esp),%ecx
348	addl	108(%esp),%esi
349	addl	116(%esp),%edx
350	addl	124(%esp),%edi
351	movl	%ebp,28(%esp)
/* %ebp = source pointer, %ecx = destination pointer, %esi = byte index. */
352	movl	156(%esp),%ebp
353	movl	%ecx,40(%esp)
354	movl	152(%esp),%ecx
355	movl	%esi,44(%esp)
356	xorl	%esi,%esi
357	movl	%edx,52(%esp)
358	movl	%edi,60(%esp)
359	xorl	%eax,%eax
360	xorl	%edx,%edx
/* dst[i] = src[i] ^ keystream[i].  %esi is incremented before the store,
   hence the -1 displacement on the destination. */
361.L007tail_loop:
362	movb	(%esi,%ebp,1),%al
363	movb	(%esp,%esi,1),%dl
364	leal	1(%esi),%esi
365	xorb	%dl,%al
366	movb	%al,-1(%ecx,%esi,1)
367	decl	%ebx
368	jnz	.L007tail_loop
/* Release the scalar frame, then restore the saved registers. */
369.L006done:
370	addl	$132,%esp
371.L000no_data:
372	popl	%edi
373	popl	%esi
374	popl	%ebx
375	popl	%ebp
376	ret
377.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
/*
 * ChaCha20_ssse3: SSE2/SSSE3 version of the keystream XOR.
 *
 * Reads the same five stack arguments at 20..36(%esp) after its four pushes:
 * destination (%edi), source (%esi), byte count (%ecx), pointer to 8 dwords
 * (%edx) and pointer to 4 dwords (%ebx).
 *
 * .Lssse3_shortcut is the jump target used by ChaCha20_ctr32, which arrives
 * with the four registers already pushed and %eax = address of .Lpic_point.
 *
 * NOTE(review): when entered at the ChaCha20_ssse3 symbol itself, no visible
 * instruction sets %eax before the leal below derives the address of
 * .Lssse3_data from it, and there is no zero-length check on this path
 * (ChaCha20_ctr32 performs that check before jumping to the shortcut).
 * Confirm that nothing calls this symbol directly.
 *
 * Frame (%esp is aligned down to 64 bytes):
 *     0..255(%esp)  working state of the 4-block loop, %ebx = %esp+128;
 *                   0..63(%esp) doubles as the single-block input state
 *                   and as the keystream buffer for the final partial block
 *   256..511(%esp)  input state, each word broadcast to four lanes,
 *                   %ebp = %esp+384
 *   512(%esp)       caller's %esp (restored at .L011done)
 *   516(%esp)       saved pointer to the 8 dwords   (4-block path only)
 *   520(%esp)       saved pointer to the 4 dwords   (4-block path only)
 * In both 256-byte areas state word w is at offset 16*w-128 from the base
 * register (%ebx or %ebp).
 */
378.globl	ChaCha20_ssse3
379.hidden	ChaCha20_ssse3
380.type	ChaCha20_ssse3,@function
381.align	16
382ChaCha20_ssse3:
383.L_ChaCha20_ssse3_begin:
384	pushl	%ebp
385	pushl	%ebx
386	pushl	%esi
387	pushl	%edi
388.Lssse3_shortcut:
389	movl	20(%esp),%edi
390	movl	24(%esp),%esi
391	movl	28(%esp),%ecx
392	movl	32(%esp),%edx
393	movl	36(%esp),%ebx
/* Build the aligned frame and remember the original %esp inside it. */
394	movl	%esp,%ebp
395	subl	$524,%esp
396	andl	$-64,%esp
397	movl	%ebp,512(%esp)
/* %eax: address of .Lpic_point -> address of .Lssse3_data. */
398	leal	.Lssse3_data-.Lpic_point(%eax),%eax
/* xmm3 = the 4 dwords of the fifth argument (counter block). */
399	movdqu	(%ebx),%xmm3
/* Fewer than 256 bytes: go straight to the one-block-at-a-time code. */
400	cmpl	$256,%ecx
401	jb	.L0081x
/* ---- Four blocks per iteration ----
   Save the two state pointers for the remainder code and bias the length by
   -256 so the loop can end on the borrow from "subl $256,%ecx". */
402	movl	%edx,516(%esp)
403	movl	%ebx,520(%esp)
404	subl	$256,%ecx
405	leal	384(%esp),%ebp
406	movdqu	(%edx),%xmm7
/* Broadcast each counter-block dword to all four lanes.  Word 12 gets
   {0,1,2,3} added (one counter per block) and {4,4,4,4} subtracted, because
   the outer loop adds {4,4,4,4} back before every batch. */
407	pshufd	$0,%xmm3,%xmm0
408	pshufd	$85,%xmm3,%xmm1
409	pshufd	$170,%xmm3,%xmm2
410	pshufd	$255,%xmm3,%xmm3
411	paddd	48(%eax),%xmm0
412	pshufd	$0,%xmm7,%xmm4
413	pshufd	$85,%xmm7,%xmm5
414	psubd	64(%eax),%xmm0
415	pshufd	$170,%xmm7,%xmm6
416	pshufd	$255,%xmm7,%xmm7
417	movdqa	%xmm0,64(%ebp)
418	movdqa	%xmm1,80(%ebp)
419	movdqa	%xmm2,96(%ebp)
420	movdqa	%xmm3,112(%ebp)
421	movdqu	16(%edx),%xmm3
/* Words 4..7 (first half of the 8-dword argument). */
422	movdqa	%xmm4,-64(%ebp)
423	movdqa	%xmm5,-48(%ebp)
424	movdqa	%xmm6,-32(%ebp)
425	movdqa	%xmm7,-16(%ebp)
/* xmm7 = the four constants; xmm3 = second half of the 8-dword argument. */
426	movdqa	32(%eax),%xmm7
427	leal	128(%esp),%ebx
428	pshufd	$0,%xmm3,%xmm0
429	pshufd	$85,%xmm3,%xmm1
430	pshufd	$170,%xmm3,%xmm2
431	pshufd	$255,%xmm3,%xmm3
432	pshufd	$0,%xmm7,%xmm4
433	pshufd	$85,%xmm7,%xmm5
434	pshufd	$170,%xmm7,%xmm6
435	pshufd	$255,%xmm7,%xmm7
436	movdqa	%xmm0,(%ebp)
437	movdqa	%xmm1,16(%ebp)
438	movdqa	%xmm2,32(%ebp)
439	movdqa	%xmm3,48(%ebp)
440	movdqa	%xmm4,-128(%ebp)
441	movdqa	%xmm5,-112(%ebp)
442	movdqa	%xmm6,-96(%ebp)
443	movdqa	%xmm7,-80(%ebp)
/* Bias source and destination by +128 so the four blocks of a batch are
   reachable with displacements -128, -64, 0 and 64. */
444	leal	128(%esi),%esi
445	leal	128(%edi),%edi
446	jmp	.L009outer_loop
447.align	16
/* Per batch: copy the broadcast input state (%ebp area) into the working
   area (%ebx area).  Words 0, 4, 8, 9 and 12 are not copied; they are loaded
   straight into xmm0, xmm3, xmm4, xmm5 and xmm6 instead. */
448.L009outer_loop:
449	movdqa	-112(%ebp),%xmm1
450	movdqa	-96(%ebp),%xmm2
451	movdqa	-80(%ebp),%xmm3
452	movdqa	-48(%ebp),%xmm5
453	movdqa	-32(%ebp),%xmm6
454	movdqa	-16(%ebp),%xmm7
455	movdqa	%xmm1,-112(%ebx)
456	movdqa	%xmm2,-96(%ebx)
457	movdqa	%xmm3,-80(%ebx)
458	movdqa	%xmm5,-48(%ebx)
459	movdqa	%xmm6,-32(%ebx)
460	movdqa	%xmm7,-16(%ebx)
461	movdqa	32(%ebp),%xmm2
462	movdqa	48(%ebp),%xmm3
463	movdqa	64(%ebp),%xmm4
464	movdqa	80(%ebp),%xmm5
465	movdqa	96(%ebp),%xmm6
466	movdqa	112(%ebp),%xmm7
/* Advance the four per-block counters by 4; the new value is stored both in
   the working copy and back into the input state at 64(%ebp). */
467	paddd	64(%eax),%xmm4
468	movdqa	%xmm2,32(%ebx)
469	movdqa	%xmm3,48(%ebx)
470	movdqa	%xmm4,64(%ebx)
471	movdqa	%xmm5,80(%ebx)
472	movdqa	%xmm6,96(%ebx)
473	movdqa	%xmm7,112(%ebx)
474	movdqa	%xmm4,64(%ebp)
475	movdqa	-128(%ebp),%xmm0
476	movdqa	%xmm4,%xmm6
477	movdqa	-64(%ebp),%xmm3
478	movdqa	(%ebp),%xmm4
479	movdqa	16(%ebp),%xmm5
/* 10 iterations of eight quarter-rounds, as in the scalar code. */
480	movl	$10,%edx
481	nop
482.align	16
/* Each xmm register carries one state word for four blocks.  Rotates by 16
   and 8 use pshufb with the byte-shuffle masks at (%eax) and 16(%eax);
   rotates by 12 and 7 are built from pslld/psrld/por.
   Column quarter-rounds first: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15). */
483.L010loop:
484	paddd	%xmm3,%xmm0
485	movdqa	%xmm3,%xmm2
486	pxor	%xmm0,%xmm6
487	pshufb	(%eax),%xmm6
488	paddd	%xmm6,%xmm4
489	pxor	%xmm4,%xmm2
490	movdqa	-48(%ebx),%xmm3
491	movdqa	%xmm2,%xmm1
492	pslld	$12,%xmm2
493	psrld	$20,%xmm1
494	por	%xmm1,%xmm2
495	movdqa	-112(%ebx),%xmm1
496	paddd	%xmm2,%xmm0
497	movdqa	80(%ebx),%xmm7
498	pxor	%xmm0,%xmm6
499	movdqa	%xmm0,-128(%ebx)
500	pshufb	16(%eax),%xmm6
501	paddd	%xmm6,%xmm4
502	movdqa	%xmm6,64(%ebx)
503	pxor	%xmm4,%xmm2
504	paddd	%xmm3,%xmm1
505	movdqa	%xmm2,%xmm0
506	pslld	$7,%xmm2
507	psrld	$25,%xmm0
508	pxor	%xmm1,%xmm7
509	por	%xmm0,%xmm2
510	movdqa	%xmm4,(%ebx)
511	pshufb	(%eax),%xmm7
512	movdqa	%xmm2,-64(%ebx)
513	paddd	%xmm7,%xmm5
514	movdqa	32(%ebx),%xmm4
515	pxor	%xmm5,%xmm3
516	movdqa	-32(%ebx),%xmm2
517	movdqa	%xmm3,%xmm0
518	pslld	$12,%xmm3
519	psrld	$20,%xmm0
520	por	%xmm0,%xmm3
521	movdqa	-96(%ebx),%xmm0
522	paddd	%xmm3,%xmm1
523	movdqa	96(%ebx),%xmm6
524	pxor	%xmm1,%xmm7
525	movdqa	%xmm1,-112(%ebx)
526	pshufb	16(%eax),%xmm7
527	paddd	%xmm7,%xmm5
528	movdqa	%xmm7,80(%ebx)
529	pxor	%xmm5,%xmm3
530	paddd	%xmm2,%xmm0
531	movdqa	%xmm3,%xmm1
532	pslld	$7,%xmm3
533	psrld	$25,%xmm1
534	pxor	%xmm0,%xmm6
535	por	%xmm1,%xmm3
536	movdqa	%xmm5,16(%ebx)
537	pshufb	(%eax),%xmm6
538	movdqa	%xmm3,-48(%ebx)
539	paddd	%xmm6,%xmm4
540	movdqa	48(%ebx),%xmm5
541	pxor	%xmm4,%xmm2
542	movdqa	-16(%ebx),%xmm3
543	movdqa	%xmm2,%xmm1
544	pslld	$12,%xmm2
545	psrld	$20,%xmm1
546	por	%xmm1,%xmm2
547	movdqa	-80(%ebx),%xmm1
548	paddd	%xmm2,%xmm0
549	movdqa	112(%ebx),%xmm7
550	pxor	%xmm0,%xmm6
551	movdqa	%xmm0,-96(%ebx)
552	pshufb	16(%eax),%xmm6
553	paddd	%xmm6,%xmm4
554	movdqa	%xmm6,96(%ebx)
555	pxor	%xmm4,%xmm2
556	paddd	%xmm3,%xmm1
557	movdqa	%xmm2,%xmm0
558	pslld	$7,%xmm2
559	psrld	$25,%xmm0
560	pxor	%xmm1,%xmm7
561	por	%xmm0,%xmm2
562	pshufb	(%eax),%xmm7
563	movdqa	%xmm2,-32(%ebx)
564	paddd	%xmm7,%xmm5
565	pxor	%xmm5,%xmm3
/* Word 5 is loaded here for the diagonal quarter-rounds
   (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14); their first instructions
   are interleaved with the end of the last column quarter-round.  Words 10
   and 11 stay in xmm4 / xmm5 and word 15 is copied from xmm7 to xmm6. */
566	movdqa	-48(%ebx),%xmm2
567	movdqa	%xmm3,%xmm0
568	pslld	$12,%xmm3
569	psrld	$20,%xmm0
570	por	%xmm0,%xmm3
571	movdqa	-128(%ebx),%xmm0
572	paddd	%xmm3,%xmm1
573	pxor	%xmm1,%xmm7
574	movdqa	%xmm1,-80(%ebx)
575	pshufb	16(%eax),%xmm7
576	paddd	%xmm7,%xmm5
577	movdqa	%xmm7,%xmm6
578	pxor	%xmm5,%xmm3
579	paddd	%xmm2,%xmm0
580	movdqa	%xmm3,%xmm1
581	pslld	$7,%xmm3
582	psrld	$25,%xmm1
583	pxor	%xmm0,%xmm6
584	por	%xmm1,%xmm3
585	pshufb	(%eax),%xmm6
586	movdqa	%xmm3,-16(%ebx)
587	paddd	%xmm6,%xmm4
588	pxor	%xmm4,%xmm2
589	movdqa	-32(%ebx),%xmm3
590	movdqa	%xmm2,%xmm1
591	pslld	$12,%xmm2
592	psrld	$20,%xmm1
593	por	%xmm1,%xmm2
594	movdqa	-112(%ebx),%xmm1
595	paddd	%xmm2,%xmm0
596	movdqa	64(%ebx),%xmm7
597	pxor	%xmm0,%xmm6
598	movdqa	%xmm0,-128(%ebx)
599	pshufb	16(%eax),%xmm6
600	paddd	%xmm6,%xmm4
601	movdqa	%xmm6,112(%ebx)
602	pxor	%xmm4,%xmm2
603	paddd	%xmm3,%xmm1
604	movdqa	%xmm2,%xmm0
605	pslld	$7,%xmm2
606	psrld	$25,%xmm0
607	pxor	%xmm1,%xmm7
608	por	%xmm0,%xmm2
609	movdqa	%xmm4,32(%ebx)
610	pshufb	(%eax),%xmm7
611	movdqa	%xmm2,-48(%ebx)
612	paddd	%xmm7,%xmm5
613	movdqa	(%ebx),%xmm4
614	pxor	%xmm5,%xmm3
615	movdqa	-16(%ebx),%xmm2
616	movdqa	%xmm3,%xmm0
617	pslld	$12,%xmm3
618	psrld	$20,%xmm0
619	por	%xmm0,%xmm3
620	movdqa	-96(%ebx),%xmm0
621	paddd	%xmm3,%xmm1
622	movdqa	80(%ebx),%xmm6
623	pxor	%xmm1,%xmm7
624	movdqa	%xmm1,-112(%ebx)
625	pshufb	16(%eax),%xmm7
626	paddd	%xmm7,%xmm5
627	movdqa	%xmm7,64(%ebx)
628	pxor	%xmm5,%xmm3
629	paddd	%xmm2,%xmm0
630	movdqa	%xmm3,%xmm1
631	pslld	$7,%xmm3
632	psrld	$25,%xmm1
633	pxor	%xmm0,%xmm6
634	por	%xmm1,%xmm3
635	movdqa	%xmm5,48(%ebx)
636	pshufb	(%eax),%xmm6
637	movdqa	%xmm3,-32(%ebx)
638	paddd	%xmm6,%xmm4
639	movdqa	16(%ebx),%xmm5
640	pxor	%xmm4,%xmm2
641	movdqa	-64(%ebx),%xmm3
642	movdqa	%xmm2,%xmm1
643	pslld	$12,%xmm2
644	psrld	$20,%xmm1
645	por	%xmm1,%xmm2
646	movdqa	-80(%ebx),%xmm1
647	paddd	%xmm2,%xmm0
648	movdqa	96(%ebx),%xmm7
649	pxor	%xmm0,%xmm6
650	movdqa	%xmm0,-96(%ebx)
651	pshufb	16(%eax),%xmm6
652	paddd	%xmm6,%xmm4
653	movdqa	%xmm6,80(%ebx)
654	pxor	%xmm4,%xmm2
655	paddd	%xmm3,%xmm1
656	movdqa	%xmm2,%xmm0
657	pslld	$7,%xmm2
658	psrld	$25,%xmm0
659	pxor	%xmm1,%xmm7
660	por	%xmm0,%xmm2
661	pshufb	(%eax),%xmm7
662	movdqa	%xmm2,-16(%ebx)
663	paddd	%xmm7,%xmm5
664	pxor	%xmm5,%xmm3
665	movdqa	%xmm3,%xmm0
666	pslld	$12,%xmm3
667	psrld	$20,%xmm0
668	por	%xmm0,%xmm3
669	movdqa	-128(%ebx),%xmm0
670	paddd	%xmm3,%xmm1
671	movdqa	64(%ebx),%xmm6
672	pxor	%xmm1,%xmm7
673	movdqa	%xmm1,-80(%ebx)
674	pshufb	16(%eax),%xmm7
675	paddd	%xmm7,%xmm5
676	movdqa	%xmm7,96(%ebx)
677	pxor	%xmm5,%xmm3
678	movdqa	%xmm3,%xmm1
679	pslld	$7,%xmm3
680	psrld	$25,%xmm1
681	por	%xmm1,%xmm3
682	decl	%edx
683	jnz	.L010loop
/* Spill the words still held in registers (4, 8, 9, 12, 14) so the whole
   working state is in memory, except word 0 which stays in xmm0. */
684	movdqa	%xmm3,-64(%ebx)
685	movdqa	%xmm4,(%ebx)
686	movdqa	%xmm5,16(%ebx)
687	movdqa	%xmm6,64(%ebx)
688	movdqa	%xmm7,96(%ebx)
/* Output, four state words at a time: add the input state, transpose the
   4x4 dword matrix (punpck*dq / punpck*qdq) so each register holds 16
   consecutive keystream bytes of one block, XOR with the source and store.
   Words 0..3 first. */
689	movdqa	-112(%ebx),%xmm1
690	movdqa	-96(%ebx),%xmm2
691	movdqa	-80(%ebx),%xmm3
692	paddd	-128(%ebp),%xmm0
693	paddd	-112(%ebp),%xmm1
694	paddd	-96(%ebp),%xmm2
695	paddd	-80(%ebp),%xmm3
696	movdqa	%xmm0,%xmm6
697	punpckldq	%xmm1,%xmm0
698	movdqa	%xmm2,%xmm7
699	punpckldq	%xmm3,%xmm2
700	punpckhdq	%xmm1,%xmm6
701	punpckhdq	%xmm3,%xmm7
702	movdqa	%xmm0,%xmm1
703	punpcklqdq	%xmm2,%xmm0
704	movdqa	%xmm6,%xmm3
705	punpcklqdq	%xmm7,%xmm6
706	punpckhqdq	%xmm2,%xmm1
707	punpckhqdq	%xmm7,%xmm3
/* Displacements -128, -64, 0, 64 select the same 16-byte slice in each of
   the four blocks of the batch. */
708	movdqu	-128(%esi),%xmm4
709	movdqu	-64(%esi),%xmm5
710	movdqu	(%esi),%xmm2
711	movdqu	64(%esi),%xmm7
712	leal	16(%esi),%esi
713	pxor	%xmm0,%xmm4
714	movdqa	-64(%ebx),%xmm0
715	pxor	%xmm1,%xmm5
716	movdqa	-48(%ebx),%xmm1
717	pxor	%xmm2,%xmm6
718	movdqa	-32(%ebx),%xmm2
719	pxor	%xmm3,%xmm7
720	movdqa	-16(%ebx),%xmm3
721	movdqu	%xmm4,-128(%edi)
722	movdqu	%xmm5,-64(%edi)
723	movdqu	%xmm6,(%edi)
724	movdqu	%xmm7,64(%edi)
725	leal	16(%edi),%edi
/* Words 4..7. */
726	paddd	-64(%ebp),%xmm0
727	paddd	-48(%ebp),%xmm1
728	paddd	-32(%ebp),%xmm2
729	paddd	-16(%ebp),%xmm3
730	movdqa	%xmm0,%xmm6
731	punpckldq	%xmm1,%xmm0
732	movdqa	%xmm2,%xmm7
733	punpckldq	%xmm3,%xmm2
734	punpckhdq	%xmm1,%xmm6
735	punpckhdq	%xmm3,%xmm7
736	movdqa	%xmm0,%xmm1
737	punpcklqdq	%xmm2,%xmm0
738	movdqa	%xmm6,%xmm3
739	punpcklqdq	%xmm7,%xmm6
740	punpckhqdq	%xmm2,%xmm1
741	punpckhqdq	%xmm7,%xmm3
742	movdqu	-128(%esi),%xmm4
743	movdqu	-64(%esi),%xmm5
744	movdqu	(%esi),%xmm2
745	movdqu	64(%esi),%xmm7
746	leal	16(%esi),%esi
747	pxor	%xmm0,%xmm4
748	movdqa	(%ebx),%xmm0
749	pxor	%xmm1,%xmm5
750	movdqa	16(%ebx),%xmm1
751	pxor	%xmm2,%xmm6
752	movdqa	32(%ebx),%xmm2
753	pxor	%xmm3,%xmm7
754	movdqa	48(%ebx),%xmm3
755	movdqu	%xmm4,-128(%edi)
756	movdqu	%xmm5,-64(%edi)
757	movdqu	%xmm6,(%edi)
758	movdqu	%xmm7,64(%edi)
759	leal	16(%edi),%edi
/* Words 8..11. */
760	paddd	(%ebp),%xmm0
761	paddd	16(%ebp),%xmm1
762	paddd	32(%ebp),%xmm2
763	paddd	48(%ebp),%xmm3
764	movdqa	%xmm0,%xmm6
765	punpckldq	%xmm1,%xmm0
766	movdqa	%xmm2,%xmm7
767	punpckldq	%xmm3,%xmm2
768	punpckhdq	%xmm1,%xmm6
769	punpckhdq	%xmm3,%xmm7
770	movdqa	%xmm0,%xmm1
771	punpcklqdq	%xmm2,%xmm0
772	movdqa	%xmm6,%xmm3
773	punpcklqdq	%xmm7,%xmm6
774	punpckhqdq	%xmm2,%xmm1
775	punpckhqdq	%xmm7,%xmm3
776	movdqu	-128(%esi),%xmm4
777	movdqu	-64(%esi),%xmm5
778	movdqu	(%esi),%xmm2
779	movdqu	64(%esi),%xmm7
780	leal	16(%esi),%esi
781	pxor	%xmm0,%xmm4
782	movdqa	64(%ebx),%xmm0
783	pxor	%xmm1,%xmm5
784	movdqa	80(%ebx),%xmm1
785	pxor	%xmm2,%xmm6
786	movdqa	96(%ebx),%xmm2
787	pxor	%xmm3,%xmm7
788	movdqa	112(%ebx),%xmm3
789	movdqu	%xmm4,-128(%edi)
790	movdqu	%xmm5,-64(%edi)
791	movdqu	%xmm6,(%edi)
792	movdqu	%xmm7,64(%edi)
793	leal	16(%edi),%edi
/* Words 12..15. */
794	paddd	64(%ebp),%xmm0
795	paddd	80(%ebp),%xmm1
796	paddd	96(%ebp),%xmm2
797	paddd	112(%ebp),%xmm3
798	movdqa	%xmm0,%xmm6
799	punpckldq	%xmm1,%xmm0
800	movdqa	%xmm2,%xmm7
801	punpckldq	%xmm3,%xmm2
802	punpckhdq	%xmm1,%xmm6
803	punpckhdq	%xmm3,%xmm7
804	movdqa	%xmm0,%xmm1
805	punpcklqdq	%xmm2,%xmm0
806	movdqa	%xmm6,%xmm3
807	punpcklqdq	%xmm7,%xmm6
808	punpckhqdq	%xmm2,%xmm1
809	punpckhqdq	%xmm7,%xmm3
810	movdqu	-128(%esi),%xmm4
811	movdqu	-64(%esi),%xmm5
812	movdqu	(%esi),%xmm2
813	movdqu	64(%esi),%xmm7
/* 16+16+16+208 = 256: the pointers end up one full batch further on. */
814	leal	208(%esi),%esi
815	pxor	%xmm0,%xmm4
816	pxor	%xmm1,%xmm5
817	pxor	%xmm2,%xmm6
818	pxor	%xmm3,%xmm7
819	movdqu	%xmm4,-128(%edi)
820	movdqu	%xmm5,-64(%edi)
821	movdqu	%xmm6,(%edi)
822	movdqu	%xmm7,64(%edi)
823	leal	208(%edi),%edi
/* Continue while at least 256 bytes remained before this subtraction. */
824	subl	$256,%ecx
825	jnc	.L009outer_loop
/* Undo the length bias; finish if nothing is left. */
826	addl	$256,%ecx
827	jz	.L011done
/* Prepare the one-block code for the remainder: reload the state pointers,
   remove the +128 pointer bias, and rebuild the counter block in xmm3 as
   (lane-0 counter of the last batch + 4) in dword 0 combined with dwords
   1..3 of the caller's counter block. */
828	movl	520(%esp),%ebx
829	leal	-128(%esi),%esi
830	movl	516(%esp),%edx
831	leal	-128(%edi),%edi
832	movd	64(%ebp),%xmm2
833	movdqu	(%ebx),%xmm3
834	paddd	96(%eax),%xmm2
835	pand	112(%eax),%xmm3
836	por	%xmm2,%xmm3
/* ---- One block per iteration ----
   xmm0..xmm3 = the four state rows (constants, two halves of the 8-dword
   argument, counter block); xmm6 / xmm7 = rotate-by-16 / rotate-by-8
   shuffle masks.  The input state is kept at 0..63(%esp). */
837.L0081x:
838	movdqa	32(%eax),%xmm0
839	movdqu	(%edx),%xmm1
840	movdqu	16(%edx),%xmm2
841	movdqa	(%eax),%xmm6
842	movdqa	16(%eax),%xmm7
/* NOTE(review): this dword store is overwritten by the movdqa to 48(%esp)
   four instructions below. */
843	movl	%ebp,48(%esp)
844	movdqa	%xmm0,(%esp)
845	movdqa	%xmm1,16(%esp)
846	movdqa	%xmm2,32(%esp)
847	movdqa	%xmm3,48(%esp)
848	movl	$10,%edx
849	jmp	.L012loop1x
850.align	16
/* Subsequent blocks: reload the input state and add {1,0,0,0} to the
   counter row, storing the new row back. */
851.L013outer1x:
852	movdqa	80(%eax),%xmm3
853	movdqa	(%esp),%xmm0
854	movdqa	16(%esp),%xmm1
855	movdqa	32(%esp),%xmm2
856	paddd	48(%esp),%xmm3
857	movl	$10,%edx
858	movdqa	%xmm3,48(%esp)
859	jmp	.L012loop1x
860.align	16
/* One iteration = quarter-rounds on all four columns at once, a lane
   rotation of rows 1..3 so the diagonals become columns, the same
   quarter-rounds again, and the inverse lane rotation.
   The .byte sequences encode "pshufb %xmm6,%xmm3" (...,222) and
   "pshufb %xmm7,%xmm3" (...,223). */
861.L012loop1x:
862	paddd	%xmm1,%xmm0
863	pxor	%xmm0,%xmm3
864.byte	102,15,56,0,222
865	paddd	%xmm3,%xmm2
866	pxor	%xmm2,%xmm1
867	movdqa	%xmm1,%xmm4
868	psrld	$20,%xmm1
869	pslld	$12,%xmm4
870	por	%xmm4,%xmm1
871	paddd	%xmm1,%xmm0
872	pxor	%xmm0,%xmm3
873.byte	102,15,56,0,223
874	paddd	%xmm3,%xmm2
875	pxor	%xmm2,%xmm1
876	movdqa	%xmm1,%xmm4
877	psrld	$25,%xmm1
878	pslld	$7,%xmm4
879	por	%xmm4,%xmm1
/* Rotate the dword lanes of rows 2, 1 and 3 by two, one and three
   positions respectively. */
880	pshufd	$78,%xmm2,%xmm2
881	pshufd	$57,%xmm1,%xmm1
882	pshufd	$147,%xmm3,%xmm3
883	nop
884	paddd	%xmm1,%xmm0
885	pxor	%xmm0,%xmm3
886.byte	102,15,56,0,222
887	paddd	%xmm3,%xmm2
888	pxor	%xmm2,%xmm1
889	movdqa	%xmm1,%xmm4
890	psrld	$20,%xmm1
891	pslld	$12,%xmm4
892	por	%xmm4,%xmm1
893	paddd	%xmm1,%xmm0
894	pxor	%xmm0,%xmm3
895.byte	102,15,56,0,223
896	paddd	%xmm3,%xmm2
897	pxor	%xmm2,%xmm1
898	movdqa	%xmm1,%xmm4
899	psrld	$25,%xmm1
900	pslld	$7,%xmm4
901	por	%xmm4,%xmm1
/* Inverse lane rotation: rows return to column order. */
902	pshufd	$78,%xmm2,%xmm2
903	pshufd	$147,%xmm1,%xmm1
904	pshufd	$57,%xmm3,%xmm3
905	decl	%edx
906	jnz	.L012loop1x
/* Add the input state to obtain 64 keystream bytes in xmm0..xmm3. */
907	paddd	(%esp),%xmm0
908	paddd	16(%esp),%xmm1
909	paddd	32(%esp),%xmm2
910	paddd	48(%esp),%xmm3
911	cmpl	$64,%ecx
912	jb	.L014tail
/* Full block: XOR 64 source bytes, store, advance, loop while bytes remain. */
913	movdqu	(%esi),%xmm4
914	movdqu	16(%esi),%xmm5
915	pxor	%xmm4,%xmm0
916	movdqu	32(%esi),%xmm4
917	pxor	%xmm5,%xmm1
918	movdqu	48(%esi),%xmm5
919	pxor	%xmm4,%xmm2
920	pxor	%xmm5,%xmm3
921	leal	64(%esi),%esi
922	movdqu	%xmm0,(%edi)
923	movdqu	%xmm1,16(%edi)
924	movdqu	%xmm2,32(%edi)
925	movdqu	%xmm3,48(%edi)
926	leal	64(%edi),%edi
927	subl	$64,%ecx
928	jnz	.L013outer1x
929	jmp	.L011done
/* Partial final block: dump the keystream to 0..63(%esp) and XOR the
   remaining %ecx bytes one at a time (%ebp = byte index).  %eax no longer
   points at the data table after this. */
930.L014tail:
931	movdqa	%xmm0,(%esp)
932	movdqa	%xmm1,16(%esp)
933	movdqa	%xmm2,32(%esp)
934	movdqa	%xmm3,48(%esp)
935	xorl	%eax,%eax
936	xorl	%edx,%edx
937	xorl	%ebp,%ebp
938.L015tail_loop:
939	movb	(%esp,%ebp,1),%al
940	movb	(%esi,%ebp,1),%dl
941	leal	1(%ebp),%ebp
942	xorb	%dl,%al
943	movb	%al,-1(%edi,%ebp,1)
944	decl	%ecx
945	jnz	.L015tail_loop
/* Restore the caller's %esp saved in the frame, then the pushed registers. */
946.L011done:
947	movl	512(%esp),%esp
948	popl	%edi
949	popl	%esi
950	popl	%ebx
951	popl	%ebp
952	ret
953.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
/*
 * Constant table for the SSSE3 code, addressed relative to %eax.
 * Offsets in the comments below are from .Lssse3_data.
 */
954.align	64
955.Lssse3_data:
/* +0: pshufb mask that rotates every dword left by 16 bits. */
956.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
/* +16: pshufb mask that rotates every dword left by 8 bits. */
957.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
/* +32: state words 0..3, the ASCII bytes of "expand 32-byte k". */
958.long	1634760805,857760878,2036477234,1797285236
/* +48: per-lane counter offsets for the four blocks of a batch. */
959.long	0,1,2,3
/* +64: counter step per four-block batch. */
960.long	4,4,4,4
/* +80: counter step per block in the one-block loop. */
961.long	1,0,0,0
/* +96: added to the last batch's lane-0 counter when falling through to
   the one-block code. */
962.long	4,0,0,0
/* +112: mask that clears dword 0 and keeps dwords 1..3 of the counter block. */
963.long	0,-1,-1,-1
964.align	64
/* NUL-terminated ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>" */
965.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
966.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
967.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
968.byte	114,103,62,0
969#endif
970