1#if defined(__x86_64__)
2.text
3.extern	OPENSSL_ia32cap_P
4.hidden OPENSSL_ia32cap_P
5
/*
 * void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 * SysV AMD64: %rdi = Xi (128-bit hash value, big-endian byte order),
 *             %rsi = Htable (precomputed 4-bit multiples of H).
 * Computes Xi = Xi * H in GF(2^128) using the classic 4-bit
 * table-lookup method with the .Lrem_4bit reduction table (%r11).
 * Generated by perlasm (ghash-x86_64.pl); do not hand-edit logic.
 */
6.globl	gcm_gmult_4bit
7.hidden gcm_gmult_4bit
8.type	gcm_gmult_4bit,@function
9.align	16
10gcm_gmult_4bit:
/*
 * All six callee-saved GPRs are pushed and 280 bytes reserved to keep
 * the frame layout identical to gcm_ghash_4bit (shared unwind info);
 * this function itself only clobbers %rbx among the saved registers.
 */
11	pushq	%rbx
12	pushq	%rbp
13	pushq	%r12
14	pushq	%r13
15	pushq	%r14
16	pushq	%r15
17	subq	$280,%rsp
18.Lgmult_prologue:
19
/* Start from the last byte of Xi; %rax/%rbx hold the two 4-bit nibble
 * indices (pre-scaled by 16 via shlb $4 / andb $0xf0) into Htable. */
20	movzbq	15(%rdi),%r8
21	leaq	.Lrem_4bit(%rip),%r11
22	xorq	%rax,%rax
23	xorq	%rbx,%rbx
24	movb	%r8b,%al
25	movb	%r8b,%bl
26	shlb	$4,%al
27	movq	$14,%rcx
28	movq	8(%rsi,%rax,1),%r8
29	movq	(%rsi,%rax,1),%r9
30	andb	$0xf0,%bl
31	movq	%r8,%rdx
32	jmp	.Loop1
33
34.align	16
/*
 * Main loop: one iteration consumes two nibbles (one input byte).
 * %r8:%r9 = 128-bit accumulator; each step shifts it right 4 bits,
 * folds the low nibble through .Lrem_4bit, and XORs in the Htable
 * entry selected by the next nibble. %rcx counts bytes 14..0.
 */
35.Loop1:
36	shrq	$4,%r8
37	andq	$0xf,%rdx
38	movq	%r9,%r10
39	movb	(%rdi,%rcx,1),%al
40	shrq	$4,%r9
41	xorq	8(%rsi,%rbx,1),%r8
42	shlq	$60,%r10
43	xorq	(%rsi,%rbx,1),%r9
44	movb	%al,%bl
45	xorq	(%r11,%rdx,8),%r9
46	movq	%r8,%rdx
47	shlb	$4,%al
48	xorq	%r10,%r8
49	decq	%rcx
50	js	.Lbreak1
51
52	shrq	$4,%r8
53	andq	$0xf,%rdx
54	movq	%r9,%r10
55	shrq	$4,%r9
56	xorq	8(%rsi,%rax,1),%r8
57	shlq	$60,%r10
58	xorq	(%rsi,%rax,1),%r9
59	andb	$0xf0,%bl
60	xorq	(%r11,%rdx,8),%r9
61	movq	%r8,%rdx
62	xorq	%r10,%r8
63	jmp	.Loop1
64
65.align	16
/* Loop exit: process the two nibbles of the final (index 0) byte. */
66.Lbreak1:
67	shrq	$4,%r8
68	andq	$0xf,%rdx
69	movq	%r9,%r10
70	shrq	$4,%r9
71	xorq	8(%rsi,%rax,1),%r8
72	shlq	$60,%r10
73	xorq	(%rsi,%rax,1),%r9
74	andb	$0xf0,%bl
75	xorq	(%r11,%rdx,8),%r9
76	movq	%r8,%rdx
77	xorq	%r10,%r8
78
79	shrq	$4,%r8
80	andq	$0xf,%rdx
81	movq	%r9,%r10
82	shrq	$4,%r9
83	xorq	8(%rsi,%rbx,1),%r8
84	shlq	$60,%r10
85	xorq	(%rsi,%rbx,1),%r9
86	xorq	%r10,%r8
87	xorq	(%r11,%rdx,8),%r9
88
/* Store result back to Xi in big-endian order. */
89	bswapq	%r8
90	bswapq	%r9
91	movq	%r8,8(%rdi)
92	movq	%r9,(%rdi)
93
/* Epilogue: 280 locals + 48 bytes of pushes; only %rbx was modified,
 * so only it is reloaded before the stack pointer is restored. */
94	leaq	280+48(%rsp),%rsi
95	movq	-8(%rsi),%rbx
96	leaq	(%rsi),%rsp
97.Lgmult_epilogue:
/* F3 C3 = "rep ret" — hand-encoded return (legacy AMD branch-predictor
 * workaround emitted by perlasm instead of a plain ret). */
98	.byte	0xf3,0xc3
99.size	gcm_gmult_4bit,.-gcm_gmult_4bit
/*
 * void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
 *                     const u8 *inp, size_t len)
 * SysV AMD64: %rdi = Xi, %rsi = Htable, %rdx = inp (-> %r14),
 *             %rcx = len in bytes, multiple of 16 (-> end pointer %r15).
 * Folds `len` bytes of input into Xi: Xi = (Xi ^ inp[i]) * H per
 * 16-byte block, using an 8-bit-at-a-time variant with the
 * .Lrem_8bit reduction table.  The prologue rebuilds two derived
 * lookup tables on the stack: a nibble table at (%rsp) and a
 * pre-shifted copy of Htable at 16+128(%rsp) (%rbp), both indexed
 * -128..+127 around their base for shorter encodings.
 * Generated by perlasm (ghash-x86_64.pl); do not hand-edit logic.
 */
100.globl	gcm_ghash_4bit
101.hidden gcm_ghash_4bit
102.type	gcm_ghash_4bit,@function
103.align	16
104gcm_ghash_4bit:
105	pushq	%rbx
106	pushq	%rbp
107	pushq	%r12
108	pushq	%r13
109	pushq	%r14
110	pushq	%r15
111	subq	$280,%rsp
112.Lghash_prologue:
113	movq	%rdx,%r14
114	movq	%rcx,%r15
/* Bias Htable pointer by +128 so all 16 entries are reachable with
 * signed 8-bit displacements; undone at line 310 below. */
115	subq	$-128,%rsi
116	leaq	16+128(%rsp),%rbp
117	xorl	%edx,%edx
/*
 * Fully unrolled table-build: for each of the 16 Htable entries,
 * store entry>>4 into the stack table at (%rbp) and the low nibble
 * (shifted left 4) into the byte table at (%rsp).  The loads/stores
 * for consecutive entries are software-pipelined, hence the
 * interleaved pattern repeated 16 times below.
 */
118	movq	0+0-128(%rsi),%r8
119	movq	0+8-128(%rsi),%rax
120	movb	%al,%dl
121	shrq	$4,%rax
122	movq	%r8,%r10
123	shrq	$4,%r8
124	movq	16+0-128(%rsi),%r9
125	shlb	$4,%dl
126	movq	16+8-128(%rsi),%rbx
127	shlq	$60,%r10
128	movb	%dl,0(%rsp)
129	orq	%r10,%rax
130	movb	%bl,%dl
131	shrq	$4,%rbx
132	movq	%r9,%r10
133	shrq	$4,%r9
134	movq	%r8,0(%rbp)
135	movq	32+0-128(%rsi),%r8
136	shlb	$4,%dl
137	movq	%rax,0-128(%rbp)
138	movq	32+8-128(%rsi),%rax
139	shlq	$60,%r10
140	movb	%dl,1(%rsp)
141	orq	%r10,%rbx
142	movb	%al,%dl
143	shrq	$4,%rax
144	movq	%r8,%r10
145	shrq	$4,%r8
146	movq	%r9,8(%rbp)
147	movq	48+0-128(%rsi),%r9
148	shlb	$4,%dl
149	movq	%rbx,8-128(%rbp)
150	movq	48+8-128(%rsi),%rbx
151	shlq	$60,%r10
152	movb	%dl,2(%rsp)
153	orq	%r10,%rax
154	movb	%bl,%dl
155	shrq	$4,%rbx
156	movq	%r9,%r10
157	shrq	$4,%r9
158	movq	%r8,16(%rbp)
159	movq	64+0-128(%rsi),%r8
160	shlb	$4,%dl
161	movq	%rax,16-128(%rbp)
162	movq	64+8-128(%rsi),%rax
163	shlq	$60,%r10
164	movb	%dl,3(%rsp)
165	orq	%r10,%rbx
166	movb	%al,%dl
167	shrq	$4,%rax
168	movq	%r8,%r10
169	shrq	$4,%r8
170	movq	%r9,24(%rbp)
171	movq	80+0-128(%rsi),%r9
172	shlb	$4,%dl
173	movq	%rbx,24-128(%rbp)
174	movq	80+8-128(%rsi),%rbx
175	shlq	$60,%r10
176	movb	%dl,4(%rsp)
177	orq	%r10,%rax
178	movb	%bl,%dl
179	shrq	$4,%rbx
180	movq	%r9,%r10
181	shrq	$4,%r9
182	movq	%r8,32(%rbp)
183	movq	96+0-128(%rsi),%r8
184	shlb	$4,%dl
185	movq	%rax,32-128(%rbp)
186	movq	96+8-128(%rsi),%rax
187	shlq	$60,%r10
188	movb	%dl,5(%rsp)
189	orq	%r10,%rbx
190	movb	%al,%dl
191	shrq	$4,%rax
192	movq	%r8,%r10
193	shrq	$4,%r8
194	movq	%r9,40(%rbp)
195	movq	112+0-128(%rsi),%r9
196	shlb	$4,%dl
197	movq	%rbx,40-128(%rbp)
198	movq	112+8-128(%rsi),%rbx
199	shlq	$60,%r10
200	movb	%dl,6(%rsp)
201	orq	%r10,%rax
202	movb	%bl,%dl
203	shrq	$4,%rbx
204	movq	%r9,%r10
205	shrq	$4,%r9
206	movq	%r8,48(%rbp)
207	movq	128+0-128(%rsi),%r8
208	shlb	$4,%dl
209	movq	%rax,48-128(%rbp)
210	movq	128+8-128(%rsi),%rax
211	shlq	$60,%r10
212	movb	%dl,7(%rsp)
213	orq	%r10,%rbx
214	movb	%al,%dl
215	shrq	$4,%rax
216	movq	%r8,%r10
217	shrq	$4,%r8
218	movq	%r9,56(%rbp)
219	movq	144+0-128(%rsi),%r9
220	shlb	$4,%dl
221	movq	%rbx,56-128(%rbp)
222	movq	144+8-128(%rsi),%rbx
223	shlq	$60,%r10
224	movb	%dl,8(%rsp)
225	orq	%r10,%rax
226	movb	%bl,%dl
227	shrq	$4,%rbx
228	movq	%r9,%r10
229	shrq	$4,%r9
230	movq	%r8,64(%rbp)
231	movq	160+0-128(%rsi),%r8
232	shlb	$4,%dl
233	movq	%rax,64-128(%rbp)
234	movq	160+8-128(%rsi),%rax
235	shlq	$60,%r10
236	movb	%dl,9(%rsp)
237	orq	%r10,%rbx
238	movb	%al,%dl
239	shrq	$4,%rax
240	movq	%r8,%r10
241	shrq	$4,%r8
242	movq	%r9,72(%rbp)
243	movq	176+0-128(%rsi),%r9
244	shlb	$4,%dl
245	movq	%rbx,72-128(%rbp)
246	movq	176+8-128(%rsi),%rbx
247	shlq	$60,%r10
248	movb	%dl,10(%rsp)
249	orq	%r10,%rax
250	movb	%bl,%dl
251	shrq	$4,%rbx
252	movq	%r9,%r10
253	shrq	$4,%r9
254	movq	%r8,80(%rbp)
255	movq	192+0-128(%rsi),%r8
256	shlb	$4,%dl
257	movq	%rax,80-128(%rbp)
258	movq	192+8-128(%rsi),%rax
259	shlq	$60,%r10
260	movb	%dl,11(%rsp)
261	orq	%r10,%rbx
262	movb	%al,%dl
263	shrq	$4,%rax
264	movq	%r8,%r10
265	shrq	$4,%r8
266	movq	%r9,88(%rbp)
267	movq	208+0-128(%rsi),%r9
268	shlb	$4,%dl
269	movq	%rbx,88-128(%rbp)
270	movq	208+8-128(%rsi),%rbx
271	shlq	$60,%r10
272	movb	%dl,12(%rsp)
273	orq	%r10,%rax
274	movb	%bl,%dl
275	shrq	$4,%rbx
276	movq	%r9,%r10
277	shrq	$4,%r9
278	movq	%r8,96(%rbp)
279	movq	224+0-128(%rsi),%r8
280	shlb	$4,%dl
281	movq	%rax,96-128(%rbp)
282	movq	224+8-128(%rsi),%rax
283	shlq	$60,%r10
284	movb	%dl,13(%rsp)
285	orq	%r10,%rbx
286	movb	%al,%dl
287	shrq	$4,%rax
288	movq	%r8,%r10
289	shrq	$4,%r8
290	movq	%r9,104(%rbp)
291	movq	240+0-128(%rsi),%r9
292	shlb	$4,%dl
293	movq	%rbx,104-128(%rbp)
294	movq	240+8-128(%rsi),%rbx
295	shlq	$60,%r10
296	movb	%dl,14(%rsp)
297	orq	%r10,%rax
298	movb	%bl,%dl
299	shrq	$4,%rbx
300	movq	%r9,%r10
301	shrq	$4,%r9
302	movq	%r8,112(%rbp)
303	shlb	$4,%dl
304	movq	%rax,112-128(%rbp)
305	shlq	$60,%r10
306	movb	%dl,15(%rsp)
307	orq	%r10,%rbx
308	movq	%r9,120(%rbp)
309	movq	%rbx,120-128(%rbp)
/* Undo the +128 Htable bias; load current Xi; %r15 = inp + len. */
310	addq	$-128,%rsi
311	movq	8(%rdi),%r8
312	movq	0(%rdi),%r9
313	addq	%r14,%r15
314	leaq	.Lrem_8bit(%rip),%r11
315	jmp	.Louter_loop
316.align	16
/*
 * Outer loop: one fully unrolled iteration per 16-byte input block.
 * Xi ^= block (stored back to (%rdi) so %edx can reload 32-bit
 * chunks of it later), then the product with H is accumulated in
 * %r8:%r9, one byte (two nibbles) per repeated stanza, with 8-bit
 * reductions via movzwq (%r11,%r1x,2) lookups in .Lrem_8bit.
 */
317.Louter_loop:
318	xorq	(%r14),%r9
319	movq	8(%r14),%rdx
320	leaq	16(%r14),%r14
321	xorq	%r8,%rdx
322	movq	%r9,(%rdi)
323	movq	%rdx,8(%rdi)
324	shrq	$32,%rdx
325	xorq	%rax,%rax
326	roll	$8,%edx
327	movb	%dl,%al
328	movzbl	%dl,%ebx
329	shlb	$4,%al
330	shrl	$4,%ebx
331	roll	$8,%edx
332	movq	8(%rsi,%rax,1),%r8
333	movq	(%rsi,%rax,1),%r9
334	movb	%dl,%al
335	movzbl	%dl,%ecx
336	shlb	$4,%al
337	movzbq	(%rsp,%rbx,1),%r12
338	shrl	$4,%ecx
339	xorq	%r8,%r12
340	movq	%r9,%r10
341	shrq	$8,%r8
342	movzbq	%r12b,%r12
343	shrq	$8,%r9
344	xorq	-128(%rbp,%rbx,8),%r8
345	shlq	$56,%r10
346	xorq	(%rbp,%rbx,8),%r9
347	roll	$8,%edx
348	xorq	8(%rsi,%rax,1),%r8
349	xorq	(%rsi,%rax,1),%r9
350	movb	%dl,%al
351	xorq	%r10,%r8
352	movzwq	(%r11,%r12,2),%r12
353	movzbl	%dl,%ebx
354	shlb	$4,%al
355	movzbq	(%rsp,%rcx,1),%r13
356	shrl	$4,%ebx
357	shlq	$48,%r12
358	xorq	%r8,%r13
359	movq	%r9,%r10
360	xorq	%r12,%r9
361	shrq	$8,%r8
362	movzbq	%r13b,%r13
363	shrq	$8,%r9
364	xorq	-128(%rbp,%rcx,8),%r8
365	shlq	$56,%r10
366	xorq	(%rbp,%rcx,8),%r9
367	roll	$8,%edx
368	xorq	8(%rsi,%rax,1),%r8
369	xorq	(%rsi,%rax,1),%r9
370	movb	%dl,%al
371	xorq	%r10,%r8
372	movzwq	(%r11,%r13,2),%r13
373	movzbl	%dl,%ecx
374	shlb	$4,%al
375	movzbq	(%rsp,%rbx,1),%r12
376	shrl	$4,%ecx
377	shlq	$48,%r13
378	xorq	%r8,%r12
379	movq	%r9,%r10
380	xorq	%r13,%r9
381	shrq	$8,%r8
382	movzbq	%r12b,%r12
/* Reload the next 32 bits of Xi^block from the scratch copy at 8(%rdi). */
383	movl	8(%rdi),%edx
384	shrq	$8,%r9
385	xorq	-128(%rbp,%rbx,8),%r8
386	shlq	$56,%r10
387	xorq	(%rbp,%rbx,8),%r9
388	roll	$8,%edx
389	xorq	8(%rsi,%rax,1),%r8
390	xorq	(%rsi,%rax,1),%r9
391	movb	%dl,%al
392	xorq	%r10,%r8
393	movzwq	(%r11,%r12,2),%r12
394	movzbl	%dl,%ebx
395	shlb	$4,%al
396	movzbq	(%rsp,%rcx,1),%r13
397	shrl	$4,%ebx
398	shlq	$48,%r12
399	xorq	%r8,%r13
400	movq	%r9,%r10
401	xorq	%r12,%r9
402	shrq	$8,%r8
403	movzbq	%r13b,%r13
404	shrq	$8,%r9
405	xorq	-128(%rbp,%rcx,8),%r8
406	shlq	$56,%r10
407	xorq	(%rbp,%rcx,8),%r9
408	roll	$8,%edx
409	xorq	8(%rsi,%rax,1),%r8
410	xorq	(%rsi,%rax,1),%r9
411	movb	%dl,%al
412	xorq	%r10,%r8
413	movzwq	(%r11,%r13,2),%r13
414	movzbl	%dl,%ecx
415	shlb	$4,%al
416	movzbq	(%rsp,%rbx,1),%r12
417	shrl	$4,%ecx
418	shlq	$48,%r13
419	xorq	%r8,%r12
420	movq	%r9,%r10
421	xorq	%r13,%r9
422	shrq	$8,%r8
423	movzbq	%r12b,%r12
424	shrq	$8,%r9
425	xorq	-128(%rbp,%rbx,8),%r8
426	shlq	$56,%r10
427	xorq	(%rbp,%rbx,8),%r9
428	roll	$8,%edx
429	xorq	8(%rsi,%rax,1),%r8
430	xorq	(%rsi,%rax,1),%r9
431	movb	%dl,%al
432	xorq	%r10,%r8
433	movzwq	(%r11,%r12,2),%r12
434	movzbl	%dl,%ebx
435	shlb	$4,%al
436	movzbq	(%rsp,%rcx,1),%r13
437	shrl	$4,%ebx
438	shlq	$48,%r12
439	xorq	%r8,%r13
440	movq	%r9,%r10
441	xorq	%r12,%r9
442	shrq	$8,%r8
443	movzbq	%r13b,%r13
444	shrq	$8,%r9
445	xorq	-128(%rbp,%rcx,8),%r8
446	shlq	$56,%r10
447	xorq	(%rbp,%rcx,8),%r9
448	roll	$8,%edx
449	xorq	8(%rsi,%rax,1),%r8
450	xorq	(%rsi,%rax,1),%r9
451	movb	%dl,%al
452	xorq	%r10,%r8
453	movzwq	(%r11,%r13,2),%r13
454	movzbl	%dl,%ecx
455	shlb	$4,%al
456	movzbq	(%rsp,%rbx,1),%r12
457	shrl	$4,%ecx
458	shlq	$48,%r13
459	xorq	%r8,%r12
460	movq	%r9,%r10
461	xorq	%r13,%r9
462	shrq	$8,%r8
463	movzbq	%r12b,%r12
464	movl	4(%rdi),%edx
465	shrq	$8,%r9
466	xorq	-128(%rbp,%rbx,8),%r8
467	shlq	$56,%r10
468	xorq	(%rbp,%rbx,8),%r9
469	roll	$8,%edx
470	xorq	8(%rsi,%rax,1),%r8
471	xorq	(%rsi,%rax,1),%r9
472	movb	%dl,%al
473	xorq	%r10,%r8
474	movzwq	(%r11,%r12,2),%r12
475	movzbl	%dl,%ebx
476	shlb	$4,%al
477	movzbq	(%rsp,%rcx,1),%r13
478	shrl	$4,%ebx
479	shlq	$48,%r12
480	xorq	%r8,%r13
481	movq	%r9,%r10
482	xorq	%r12,%r9
483	shrq	$8,%r8
484	movzbq	%r13b,%r13
485	shrq	$8,%r9
486	xorq	-128(%rbp,%rcx,8),%r8
487	shlq	$56,%r10
488	xorq	(%rbp,%rcx,8),%r9
489	roll	$8,%edx
490	xorq	8(%rsi,%rax,1),%r8
491	xorq	(%rsi,%rax,1),%r9
492	movb	%dl,%al
493	xorq	%r10,%r8
494	movzwq	(%r11,%r13,2),%r13
495	movzbl	%dl,%ecx
496	shlb	$4,%al
497	movzbq	(%rsp,%rbx,1),%r12
498	shrl	$4,%ecx
499	shlq	$48,%r13
500	xorq	%r8,%r12
501	movq	%r9,%r10
502	xorq	%r13,%r9
503	shrq	$8,%r8
504	movzbq	%r12b,%r12
505	shrq	$8,%r9
506	xorq	-128(%rbp,%rbx,8),%r8
507	shlq	$56,%r10
508	xorq	(%rbp,%rbx,8),%r9
509	roll	$8,%edx
510	xorq	8(%rsi,%rax,1),%r8
511	xorq	(%rsi,%rax,1),%r9
512	movb	%dl,%al
513	xorq	%r10,%r8
514	movzwq	(%r11,%r12,2),%r12
515	movzbl	%dl,%ebx
516	shlb	$4,%al
517	movzbq	(%rsp,%rcx,1),%r13
518	shrl	$4,%ebx
519	shlq	$48,%r12
520	xorq	%r8,%r13
521	movq	%r9,%r10
522	xorq	%r12,%r9
523	shrq	$8,%r8
524	movzbq	%r13b,%r13
525	shrq	$8,%r9
526	xorq	-128(%rbp,%rcx,8),%r8
527	shlq	$56,%r10
528	xorq	(%rbp,%rcx,8),%r9
529	roll	$8,%edx
530	xorq	8(%rsi,%rax,1),%r8
531	xorq	(%rsi,%rax,1),%r9
532	movb	%dl,%al
533	xorq	%r10,%r8
534	movzwq	(%r11,%r13,2),%r13
535	movzbl	%dl,%ecx
536	shlb	$4,%al
537	movzbq	(%rsp,%rbx,1),%r12
538	shrl	$4,%ecx
539	shlq	$48,%r13
540	xorq	%r8,%r12
541	movq	%r9,%r10
542	xorq	%r13,%r9
543	shrq	$8,%r8
544	movzbq	%r12b,%r12
545	movl	0(%rdi),%edx
546	shrq	$8,%r9
547	xorq	-128(%rbp,%rbx,8),%r8
548	shlq	$56,%r10
549	xorq	(%rbp,%rbx,8),%r9
550	roll	$8,%edx
551	xorq	8(%rsi,%rax,1),%r8
552	xorq	(%rsi,%rax,1),%r9
553	movb	%dl,%al
554	xorq	%r10,%r8
555	movzwq	(%r11,%r12,2),%r12
556	movzbl	%dl,%ebx
557	shlb	$4,%al
558	movzbq	(%rsp,%rcx,1),%r13
559	shrl	$4,%ebx
560	shlq	$48,%r12
561	xorq	%r8,%r13
562	movq	%r9,%r10
563	xorq	%r12,%r9
564	shrq	$8,%r8
565	movzbq	%r13b,%r13
566	shrq	$8,%r9
567	xorq	-128(%rbp,%rcx,8),%r8
568	shlq	$56,%r10
569	xorq	(%rbp,%rcx,8),%r9
570	roll	$8,%edx
571	xorq	8(%rsi,%rax,1),%r8
572	xorq	(%rsi,%rax,1),%r9
573	movb	%dl,%al
574	xorq	%r10,%r8
575	movzwq	(%r11,%r13,2),%r13
576	movzbl	%dl,%ecx
577	shlb	$4,%al
578	movzbq	(%rsp,%rbx,1),%r12
579	shrl	$4,%ebx
580	shlq	$48,%r13
581	xorq	%r8,%r12
582	movq	%r9,%r10
583	xorq	%r13,%r9
584	shrq	$8,%r8
585	movzbq	%r12b,%r12
586	shrq	$8,%r9
587	xorq	-128(%rbp,%rbx,8),%r8
588	shlq	$56,%r10
589	xorq	(%rbp,%rbx,8),%r9
590	roll	$8,%edx
591	xorq	8(%rsi,%rax,1),%r8
592	xorq	(%rsi,%rax,1),%r9
593	movb	%dl,%al
594	xorq	%r10,%r8
595	movzwq	(%r11,%r12,2),%r12
596	movzbl	%dl,%ebx
597	shlb	$4,%al
598	movzbq	(%rsp,%rcx,1),%r13
599	shrl	$4,%ebx
600	shlq	$48,%r12
601	xorq	%r8,%r13
602	movq	%r9,%r10
603	xorq	%r12,%r9
604	shrq	$8,%r8
605	movzbq	%r13b,%r13
606	shrq	$8,%r9
607	xorq	-128(%rbp,%rcx,8),%r8
608	shlq	$56,%r10
609	xorq	(%rbp,%rcx,8),%r9
610	roll	$8,%edx
611	xorq	8(%rsi,%rax,1),%r8
612	xorq	(%rsi,%rax,1),%r9
613	movb	%dl,%al
614	xorq	%r10,%r8
615	movzwq	(%r11,%r13,2),%r13
616	movzbl	%dl,%ecx
617	shlb	$4,%al
618	movzbq	(%rsp,%rbx,1),%r12
619	andl	$240,%ecx
620	shlq	$48,%r13
621	xorq	%r8,%r12
622	movq	%r9,%r10
623	xorq	%r13,%r9
624	shrq	$8,%r8
625	movzbq	%r12b,%r12
626	movl	-4(%rdi),%edx
627	shrq	$8,%r9
628	xorq	-128(%rbp,%rbx,8),%r8
629	shlq	$56,%r10
630	xorq	(%rbp,%rbx,8),%r9
631	movzwq	(%r11,%r12,2),%r12
632	xorq	8(%rsi,%rax,1),%r8
633	xorq	(%rsi,%rax,1),%r9
634	shlq	$48,%r12
635	xorq	%r10,%r8
636	xorq	%r12,%r9
/* Final half-nibble step and last 4-bit reduction for this block. */
637	movzbq	%r8b,%r13
638	shrq	$4,%r8
639	movq	%r9,%r10
640	shlb	$4,%r13b
641	shrq	$4,%r9
642	xorq	8(%rsi,%rcx,1),%r8
643	movzwq	(%r11,%r13,2),%r13
644	shlq	$60,%r10
645	xorq	(%rsi,%rcx,1),%r9
646	xorq	%r10,%r8
647	shlq	$48,%r13
648	bswapq	%r8
649	xorq	%r13,%r9
650	bswapq	%r9
/* Loop while input pointer %r14 < end pointer %r15 (unsigned). */
651	cmpq	%r15,%r14
652	jb	.Louter_loop
653	movq	%r8,8(%rdi)
654	movq	%r9,(%rdi)
655
/* Epilogue: restore all six callee-saved registers and the stack. */
656	leaq	280+48(%rsp),%rsi
657	movq	-48(%rsi),%r15
658	movq	-40(%rsi),%r14
659	movq	-32(%rsi),%r13
660	movq	-24(%rsi),%r12
661	movq	-16(%rsi),%rbp
662	movq	-8(%rsi),%rbx
663	leaq	0(%rsi),%rsp
664.Lghash_epilogue:
/* F3 C3 = "rep ret". */
665	.byte	0xf3,0xc3
666.size	gcm_ghash_4bit,.-gcm_ghash_4bit
/*
 * void gcm_init_clmul(u128 Htable[], const u64 H[2])
 * SysV AMD64: %rdi = Htable out, %rsi = H (hash subkey).
 * PCLMULQDQ-based table setup: converts H into the bit-reflected
 * form required by the carry-less-multiply code, then stores H and
 * successive powers (computed by squaring/multiplying below) plus
 * pre-XORed Karatsuba halves at 16-byte offsets in Htable, for use
 * by gcm_gmult_clmul/gcm_ghash_clmul.
 * The .byte sequences are hand-encoded SSE4 instructions
 * (102,15,58,68,... = pclmulqdq; 102,15,58,15,... = palignr),
 * emitted this way by perlasm for old-assembler compatibility.
 */
667.globl	gcm_init_clmul
668.hidden gcm_init_clmul
669.type	gcm_init_clmul,@function
670.align	16
671gcm_init_clmul:
672.L_init_clmul:
673	movdqu	(%rsi),%xmm2
/* Swap the two 64-bit halves of H (pshufd $0b01001110). */
674	pshufd	$78,%xmm2,%xmm2
675
676
/* Multiply H by x modulo the GHASH polynomial: shift left by 1 and
 * conditionally fold in .L0x1c2_polynomial when the top bit was set
 * (mask built branchlessly via pcmpgtd against the sign). */
677	pshufd	$255,%xmm2,%xmm4
678	movdqa	%xmm2,%xmm3
679	psllq	$1,%xmm2
680	pxor	%xmm5,%xmm5
681	psrlq	$63,%xmm3
682	pcmpgtd	%xmm4,%xmm5
683	pslldq	$8,%xmm3
684	por	%xmm3,%xmm2
685
686
687	pand	.L0x1c2_polynomial(%rip),%xmm5
688	pxor	%xmm5,%xmm2
689
690
/* First GF(2^128) squaring: Karatsuba 3-multiply (lo, hi, mid)
 * with pclmulqdq, then the two-step reduction below. */
691	pshufd	$78,%xmm2,%xmm6
692	movdqa	%xmm2,%xmm0
693	pxor	%xmm2,%xmm6
694	movdqa	%xmm0,%xmm1
695	pshufd	$78,%xmm0,%xmm3
696	pxor	%xmm0,%xmm3
697.byte	102,15,58,68,194,0
698.byte	102,15,58,68,202,17
699.byte	102,15,58,68,222,0
700	pxor	%xmm0,%xmm3
701	pxor	%xmm1,%xmm3
701
702
/* Fold the middle Karatsuba term into the 256-bit product. */
703	movdqa	%xmm3,%xmm4
704	psrldq	$8,%xmm3
705	pslldq	$8,%xmm4
706	pxor	%xmm3,%xmm1
707	pxor	%xmm4,%xmm0
708
/* Montgomery-style reduction, phase 1 (multiply by x^63+x^62+x^57). */
709	movdqa	%xmm0,%xmm4
710	movdqa	%xmm0,%xmm3
711	psllq	$5,%xmm0
712	pxor	%xmm0,%xmm3
713	psllq	$1,%xmm0
714	pxor	%xmm3,%xmm0
715	psllq	$57,%xmm0
716	movdqa	%xmm0,%xmm3
717	pslldq	$8,%xmm0
718	psrldq	$8,%xmm3
719	pxor	%xmm4,%xmm0
720	pxor	%xmm3,%xmm1
721
722
/* Reduction, phase 2 (shift right by 1, 5, 1 and accumulate). */
723	movdqa	%xmm0,%xmm4
724	psrlq	$1,%xmm0
725	pxor	%xmm4,%xmm1
726	pxor	%xmm0,%xmm4
727	psrlq	$5,%xmm0
728	pxor	%xmm4,%xmm0
729	psrlq	$1,%xmm0
730	pxor	%xmm1,%xmm0
/* Store H, H^2 and their pre-XORed halves (via palignr $8). */
731	pshufd	$78,%xmm2,%xmm3
732	pshufd	$78,%xmm0,%xmm4
733	pxor	%xmm2,%xmm3
734	movdqu	%xmm2,0(%rdi)
735	pxor	%xmm0,%xmm4
736	movdqu	%xmm0,16(%rdi)
737.byte	102,15,58,15,227,8
738	movdqu	%xmm4,32(%rdi)
/* Second multiply-reduce round: same pattern, producing H^3. */
739	movdqa	%xmm0,%xmm1
740	pshufd	$78,%xmm0,%xmm3
741	pxor	%xmm0,%xmm3
742.byte	102,15,58,68,194,0
743.byte	102,15,58,68,202,17
744.byte	102,15,58,68,222,0
745	pxor	%xmm0,%xmm3
746	pxor	%xmm1,%xmm3
747
748	movdqa	%xmm3,%xmm4
749	psrldq	$8,%xmm3
750	pslldq	$8,%xmm4
751	pxor	%xmm3,%xmm1
752	pxor	%xmm4,%xmm0
753
754	movdqa	%xmm0,%xmm4
755	movdqa	%xmm0,%xmm3
756	psllq	$5,%xmm0
757	pxor	%xmm0,%xmm3
758	psllq	$1,%xmm0
759	pxor	%xmm3,%xmm0
760	psllq	$57,%xmm0
761	movdqa	%xmm0,%xmm3
762	pslldq	$8,%xmm0
763	psrldq	$8,%xmm3
764	pxor	%xmm4,%xmm0
765	pxor	%xmm3,%xmm1
766
767
768	movdqa	%xmm0,%xmm4
769	psrlq	$1,%xmm0
770	pxor	%xmm4,%xmm1
771	pxor	%xmm0,%xmm4
772	psrlq	$5,%xmm0
773	pxor	%xmm4,%xmm0
774	psrlq	$1,%xmm0
775	pxor	%xmm1,%xmm0
/* Keep this power in %xmm5, then one more multiply-reduce round. */
776	movdqa	%xmm0,%xmm5
777	movdqa	%xmm0,%xmm1
778	pshufd	$78,%xmm0,%xmm3
779	pxor	%xmm0,%xmm3
780.byte	102,15,58,68,194,0
781.byte	102,15,58,68,202,17
782.byte	102,15,58,68,222,0
783	pxor	%xmm0,%xmm3
784	pxor	%xmm1,%xmm3
785
786	movdqa	%xmm3,%xmm4
787	psrldq	$8,%xmm3
788	pslldq	$8,%xmm4
789	pxor	%xmm3,%xmm1
790	pxor	%xmm4,%xmm0
791
792	movdqa	%xmm0,%xmm4
793	movdqa	%xmm0,%xmm3
794	psllq	$5,%xmm0
795	pxor	%xmm0,%xmm3
796	psllq	$1,%xmm0
797	pxor	%xmm3,%xmm0
798	psllq	$57,%xmm0
799	movdqa	%xmm0,%xmm3
800	pslldq	$8,%xmm0
801	psrldq	$8,%xmm3
802	pxor	%xmm4,%xmm0
803	pxor	%xmm3,%xmm1
804
805
806	movdqa	%xmm0,%xmm4
807	psrlq	$1,%xmm0
808	pxor	%xmm4,%xmm1
809	pxor	%xmm0,%xmm4
810	psrlq	$5,%xmm0
811	pxor	%xmm4,%xmm0
812	psrlq	$1,%xmm0
813	pxor	%xmm1,%xmm0
/* Store the last two powers plus their pre-XORed halves. */
814	pshufd	$78,%xmm5,%xmm3
815	pshufd	$78,%xmm0,%xmm4
816	pxor	%xmm5,%xmm3
817	movdqu	%xmm5,48(%rdi)
818	pxor	%xmm0,%xmm4
819	movdqu	%xmm0,64(%rdi)
820.byte	102,15,58,15,227,8
821	movdqu	%xmm4,80(%rdi)
/* F3 C3 = "rep ret". */
822	.byte	0xf3,0xc3
823.size	gcm_init_clmul,.-gcm_init_clmul
/*
 * void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16])
 * SysV AMD64: %rdi = Xi, %rsi = Htable (from gcm_init_clmul).
 * Single-block GHASH multiply with PCLMULQDQ: byte-swap Xi
 * (pshufb .Lbswap_mask), Karatsuba multiply by H using the
 * pre-XORed halves at 32(%rsi), reduce, byte-swap back, store.
 * .byte 102,15,56,0,197 = pshufb %xmm5,%xmm0;
 * .byte 102,15,58,68,... = pclmulqdq (hand-encoded by perlasm).
 * Also the entry point for gcm_gmult_avx (which jumps here).
 */
824.globl	gcm_gmult_clmul
825.hidden gcm_gmult_clmul
826.type	gcm_gmult_clmul,@function
827.align	16
828gcm_gmult_clmul:
829.L_gmult_clmul:
830	movdqu	(%rdi),%xmm0
831	movdqa	.Lbswap_mask(%rip),%xmm5
832	movdqu	(%rsi),%xmm2
833	movdqu	32(%rsi),%xmm4
834.byte	102,15,56,0,197
/* Karatsuba: lo (imm 0), hi (imm 0x11), mid via pre-XORed halves. */
835	movdqa	%xmm0,%xmm1
836	pshufd	$78,%xmm0,%xmm3
837	pxor	%xmm0,%xmm3
838.byte	102,15,58,68,194,0
839.byte	102,15,58,68,202,17
840.byte	102,15,58,68,220,0
841	pxor	%xmm0,%xmm3
842	pxor	%xmm1,%xmm3
843
/* Fold middle term into the 256-bit product %xmm1:%xmm0. */
844	movdqa	%xmm3,%xmm4
845	psrldq	$8,%xmm3
846	pslldq	$8,%xmm4
847	pxor	%xmm3,%xmm1
848	pxor	%xmm4,%xmm0
849
/* Reduction modulo the GHASH polynomial, phase 1. */
850	movdqa	%xmm0,%xmm4
851	movdqa	%xmm0,%xmm3
852	psllq	$5,%xmm0
853	pxor	%xmm0,%xmm3
854	psllq	$1,%xmm0
855	pxor	%xmm3,%xmm0
856	psllq	$57,%xmm0
857	movdqa	%xmm0,%xmm3
858	pslldq	$8,%xmm0
859	psrldq	$8,%xmm3
860	pxor	%xmm4,%xmm0
861	pxor	%xmm3,%xmm1
862
863
/* Reduction phase 2; result lands in %xmm0. */
864	movdqa	%xmm0,%xmm4
865	psrlq	$1,%xmm0
866	pxor	%xmm4,%xmm1
867	pxor	%xmm0,%xmm4
868	psrlq	$5,%xmm0
869	pxor	%xmm4,%xmm0
870	psrlq	$1,%xmm0
871	pxor	%xmm1,%xmm0
/* Byte-swap back and write Xi. */
872.byte	102,15,56,0,197
873	movdqu	%xmm0,(%rdi)
/* F3 C3 = "rep ret". */
874	.byte	0xf3,0xc3
875.size	gcm_gmult_clmul,.-gcm_gmult_clmul
/*
 * void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
 *                      const u8 *inp, size_t len)
 * SysV AMD64: %rdi = Xi, %rsi = Htable, %rdx = inp, %rcx = len
 * (multiple of 16).  PCLMULQDQ GHASH: when len and the CPU allow,
 * processes 4 blocks per iteration (.Lmod4_loop) using H..H^4 from
 * Htable with deferred reduction; otherwise falls back to a 2-block
 * loop (.Lmod_loop) and 1-block tails (.Leven_tail/.Lodd_tail).
 * Reads OPENSSL_ia32cap_P to gate the 4x path (exact feature bits
 * per the perlasm source — NOTE(review): mask 0x4400000, presumably
 * MOVBE/XSAVE-related heuristic; confirm against ghash-x86_64.pl).
 * .byte runs are hand-encoded pshufb/pclmulqdq/movq instructions.
 */
876.globl	gcm_ghash_clmul
877.hidden gcm_ghash_clmul
878.type	gcm_ghash_clmul,@function
879.align	32
880gcm_ghash_clmul:
881.L_ghash_clmul:
882	movdqa	.Lbswap_mask(%rip),%xmm10
883
884	movdqu	(%rdi),%xmm0
885	movdqu	(%rsi),%xmm2
886	movdqu	32(%rsi),%xmm7
/* pshufb %xmm10,%xmm0: byte-swap Xi into computation order. */
887.byte	102,65,15,56,0,194
888
889	subq	$0x10,%rcx
890	jz	.Lodd_tail
891
892	movdqu	16(%rsi),%xmm6
/* CPU capability word gates the 4-block aggregated path. */
893	movl	OPENSSL_ia32cap_P+4(%rip),%eax
894	cmpq	$0x30,%rcx
895	jb	.Lskip4x
896
897	andl	$71303168,%eax
898	cmpl	$4194304,%eax
899	je	.Lskip4x
900
/* 4x setup: load H^3/H^4, first four byte-swapped input blocks,
 * and start the Karatsuba multiplies for blocks 3 and 4. */
901	subq	$0x30,%rcx
902	movq	$0xA040608020C0E000,%rax
903	movdqu	48(%rsi),%xmm14
904	movdqu	64(%rsi),%xmm15
905
906
907
908
909	movdqu	48(%rdx),%xmm3
910	movdqu	32(%rdx),%xmm11
911.byte	102,65,15,56,0,218
912.byte	102,69,15,56,0,218
913	movdqa	%xmm3,%xmm5
914	pshufd	$78,%xmm3,%xmm4
915	pxor	%xmm3,%xmm4
916.byte	102,15,58,68,218,0
917.byte	102,15,58,68,234,17
918.byte	102,15,58,68,231,0
919
920	movdqa	%xmm11,%xmm13
921	pshufd	$78,%xmm11,%xmm12
922	pxor	%xmm11,%xmm12
923.byte	102,68,15,58,68,222,0
924.byte	102,68,15,58,68,238,17
925.byte	102,68,15,58,68,231,16
926	xorps	%xmm11,%xmm3
927	xorps	%xmm13,%xmm5
928	movups	80(%rsi),%xmm7
929	xorps	%xmm12,%xmm4
930
931	movdqu	16(%rdx),%xmm11
932	movdqu	0(%rdx),%xmm8
933.byte	102,69,15,56,0,218
934.byte	102,69,15,56,0,194
935	movdqa	%xmm11,%xmm13
936	pshufd	$78,%xmm11,%xmm12
/* Fold Xi into the first input block (block 0). */
937	pxor	%xmm8,%xmm0
938	pxor	%xmm11,%xmm12
939.byte	102,69,15,58,68,222,0
940	movdqa	%xmm0,%xmm1
941	pshufd	$78,%xmm0,%xmm8
942	pxor	%xmm0,%xmm8
943.byte	102,69,15,58,68,238,17
944.byte	102,68,15,58,68,231,0
945	xorps	%xmm11,%xmm3
946	xorps	%xmm13,%xmm5
947
948	leaq	64(%rdx),%rdx
949	subq	$0x40,%rcx
950	jc	.Ltail4x
951
952	jmp	.Lmod4_loop
953.align	32
/*
 * 4-blocks-per-iteration main loop.  The multiplies for the next
 * four blocks are interleaved with the reduction of the previous
 * aggregate to hide pclmulqdq latency; reduction constant is held
 * in %xmm9 (loaded from %rax via the hand-encoded movq at 984).
 */
954.Lmod4_loop:
955.byte	102,65,15,58,68,199,0
956	xorps	%xmm12,%xmm4
957	movdqu	48(%rdx),%xmm11
958.byte	102,69,15,56,0,218
959.byte	102,65,15,58,68,207,17
960	xorps	%xmm3,%xmm0
961	movdqu	32(%rdx),%xmm3
962	movdqa	%xmm11,%xmm13
963.byte	102,68,15,58,68,199,16
964	pshufd	$78,%xmm11,%xmm12
965	xorps	%xmm5,%xmm1
966	pxor	%xmm11,%xmm12
967.byte	102,65,15,56,0,218
968	movups	32(%rsi),%xmm7
969	xorps	%xmm4,%xmm8
970.byte	102,68,15,58,68,218,0
971	pshufd	$78,%xmm3,%xmm4
972
973	pxor	%xmm0,%xmm8
974	movdqa	%xmm3,%xmm5
975	pxor	%xmm1,%xmm8
976	pxor	%xmm3,%xmm4
977	movdqa	%xmm8,%xmm9
978.byte	102,68,15,58,68,234,17
979	pslldq	$8,%xmm8
980	psrldq	$8,%xmm9
981	pxor	%xmm8,%xmm0
982	movdqa	.L7_mask(%rip),%xmm8
983	pxor	%xmm9,%xmm1
/* movq %rax,%xmm9 (66 4C 0F 6E C8): load reduction constant. */
984.byte	102,76,15,110,200
985
986	pand	%xmm0,%xmm8
987.byte	102,69,15,56,0,200
988	pxor	%xmm0,%xmm9
989.byte	102,68,15,58,68,231,0
990	psllq	$57,%xmm9
991	movdqa	%xmm9,%xmm8
992	pslldq	$8,%xmm9
993.byte	102,15,58,68,222,0
994	psrldq	$8,%xmm8
995	pxor	%xmm9,%xmm0
996	pxor	%xmm8,%xmm1
997	movdqu	0(%rdx),%xmm8
998
999	movdqa	%xmm0,%xmm9
1000	psrlq	$1,%xmm0
1001.byte	102,15,58,68,238,17
1002	xorps	%xmm11,%xmm3
1003	movdqu	16(%rdx),%xmm11
1004.byte	102,69,15,56,0,218
1005.byte	102,15,58,68,231,16
1006	xorps	%xmm13,%xmm5
1007	movups	80(%rsi),%xmm7
1008.byte	102,69,15,56,0,194
1009	pxor	%xmm9,%xmm1
1010	pxor	%xmm0,%xmm9
1011	psrlq	$5,%xmm0
1012
1013	movdqa	%xmm11,%xmm13
1014	pxor	%xmm12,%xmm4
1015	pshufd	$78,%xmm11,%xmm12
1016	pxor	%xmm9,%xmm0
1017	pxor	%xmm8,%xmm1
1018	pxor	%xmm11,%xmm12
1019.byte	102,69,15,58,68,222,0
1020	psrlq	$1,%xmm0
1021	pxor	%xmm1,%xmm0
1022	movdqa	%xmm0,%xmm1
1023.byte	102,69,15,58,68,238,17
1024	xorps	%xmm11,%xmm3
1025	pshufd	$78,%xmm0,%xmm8
1026	pxor	%xmm0,%xmm8
1027
1028.byte	102,68,15,58,68,231,0
1029	xorps	%xmm13,%xmm5
1030
1031	leaq	64(%rdx),%rdx
1032	subq	$0x40,%rcx
1033	jnc	.Lmod4_loop
1034
/* Finish the in-flight 4x aggregate, then reduce once. */
1035.Ltail4x:
1036.byte	102,65,15,58,68,199,0
1037.byte	102,65,15,58,68,207,17
1038.byte	102,68,15,58,68,199,16
1039	xorps	%xmm12,%xmm4
1040	xorps	%xmm3,%xmm0
1041	xorps	%xmm5,%xmm1
1042	pxor	%xmm0,%xmm1
1043	pxor	%xmm4,%xmm8
1044
1045	pxor	%xmm1,%xmm8
1046	pxor	%xmm0,%xmm1
1047
1048	movdqa	%xmm8,%xmm9
1049	psrldq	$8,%xmm8
1050	pslldq	$8,%xmm9
1051	pxor	%xmm8,%xmm1
1052	pxor	%xmm9,%xmm0
1053
1054	movdqa	%xmm0,%xmm4
1055	movdqa	%xmm0,%xmm3
1056	psllq	$5,%xmm0
1057	pxor	%xmm0,%xmm3
1058	psllq	$1,%xmm0
1059	pxor	%xmm3,%xmm0
1060	psllq	$57,%xmm0
1061	movdqa	%xmm0,%xmm3
1062	pslldq	$8,%xmm0
1063	psrldq	$8,%xmm3
1064	pxor	%xmm4,%xmm0
1065	pxor	%xmm3,%xmm1
1066
1067
1068	movdqa	%xmm0,%xmm4
1069	psrlq	$1,%xmm0
1070	pxor	%xmm4,%xmm1
1071	pxor	%xmm0,%xmm4
1072	psrlq	$5,%xmm0
1073	pxor	%xmm4,%xmm0
1074	psrlq	$1,%xmm0
1075	pxor	%xmm1,%xmm0
1076	addq	$0x40,%rcx
1077	jz	.Ldone
1078	movdqu	32(%rsi),%xmm7
1079	subq	$0x10,%rcx
1080	jz	.Lodd_tail
1081.Lskip4x:
1082
1083
1084
1085
1086
/* 2-block path: Xi ^= block0; multiply block1 by H in parallel. */
1087	movdqu	(%rdx),%xmm8
1088	movdqu	16(%rdx),%xmm3
1089.byte	102,69,15,56,0,194
1090.byte	102,65,15,56,0,218
1091	pxor	%xmm8,%xmm0
1092
1093	movdqa	%xmm3,%xmm5
1094	pshufd	$78,%xmm3,%xmm4
1095	pxor	%xmm3,%xmm4
1096.byte	102,15,58,68,218,0
1097.byte	102,15,58,68,234,17
1098.byte	102,15,58,68,231,0
1099
1100	leaq	32(%rdx),%rdx
1101	nop
1102	subq	$0x20,%rcx
1103	jbe	.Leven_tail
1104	nop
1105	jmp	.Lmod_loop
1106
1107.align	32
/*
 * 2-blocks-per-iteration loop: multiply (Xi ^ prev) by H^2 and the
 * fresh block by H, with the reduction interleaved between the
 * pclmulqdq issues of the next pair.
 */
1108.Lmod_loop:
1109	movdqa	%xmm0,%xmm1
1110	movdqa	%xmm4,%xmm8
1111	pshufd	$78,%xmm0,%xmm4
1112	pxor	%xmm0,%xmm4
1113
1114.byte	102,15,58,68,198,0
1115.byte	102,15,58,68,206,17
1116.byte	102,15,58,68,231,16
1117
1118	pxor	%xmm3,%xmm0
1119	pxor	%xmm5,%xmm1
1120	movdqu	(%rdx),%xmm9
1121	pxor	%xmm0,%xmm8
1122.byte	102,69,15,56,0,202
1123	movdqu	16(%rdx),%xmm3
1124
1125	pxor	%xmm1,%xmm8
1126	pxor	%xmm9,%xmm1
1127	pxor	%xmm8,%xmm4
1128.byte	102,65,15,56,0,218
1129	movdqa	%xmm4,%xmm8
1130	psrldq	$8,%xmm8
1131	pslldq	$8,%xmm4
1132	pxor	%xmm8,%xmm1
1133	pxor	%xmm4,%xmm0
1134
1135	movdqa	%xmm3,%xmm5
1136
1137	movdqa	%xmm0,%xmm9
1138	movdqa	%xmm0,%xmm8
1139	psllq	$5,%xmm0
1140	pxor	%xmm0,%xmm8
1141.byte	102,15,58,68,218,0
1142	psllq	$1,%xmm0
1143	pxor	%xmm8,%xmm0
1144	psllq	$57,%xmm0
1145	movdqa	%xmm0,%xmm8
1146	pslldq	$8,%xmm0
1147	psrldq	$8,%xmm8
1148	pxor	%xmm9,%xmm0
1149	pshufd	$78,%xmm5,%xmm4
1150	pxor	%xmm8,%xmm1
1151	pxor	%xmm5,%xmm4
1152
1153	movdqa	%xmm0,%xmm9
1154	psrlq	$1,%xmm0
1155.byte	102,15,58,68,234,17
1156	pxor	%xmm9,%xmm1
1157	pxor	%xmm0,%xmm9
1158	psrlq	$5,%xmm0
1159	pxor	%xmm9,%xmm0
1160	leaq	32(%rdx),%rdx
1161	psrlq	$1,%xmm0
1162.byte	102,15,58,68,231,0
1163	pxor	%xmm1,%xmm0
1164
1165	subq	$0x20,%rcx
1166	ja	.Lmod_loop
1167
/* Final even pair: complete the last 2-block multiply and reduce. */
1168.Leven_tail:
1169	movdqa	%xmm0,%xmm1
1170	movdqa	%xmm4,%xmm8
1171	pshufd	$78,%xmm0,%xmm4
1172	pxor	%xmm0,%xmm4
1173
1174.byte	102,15,58,68,198,0
1175.byte	102,15,58,68,206,17
1176.byte	102,15,58,68,231,16
1177
1178	pxor	%xmm3,%xmm0
1179	pxor	%xmm5,%xmm1
1180	pxor	%xmm0,%xmm8
1181	pxor	%xmm1,%xmm8
1182	pxor	%xmm8,%xmm4
1183	movdqa	%xmm4,%xmm8
1184	psrldq	$8,%xmm8
1185	pslldq	$8,%xmm4
1186	pxor	%xmm8,%xmm1
1187	pxor	%xmm4,%xmm0
1188
1189	movdqa	%xmm0,%xmm4
1190	movdqa	%xmm0,%xmm3
1191	psllq	$5,%xmm0
1192	pxor	%xmm0,%xmm3
1193	psllq	$1,%xmm0
1194	pxor	%xmm3,%xmm0
1195	psllq	$57,%xmm0
1196	movdqa	%xmm0,%xmm3
1197	pslldq	$8,%xmm0
1198	psrldq	$8,%xmm3
1199	pxor	%xmm4,%xmm0
1200	pxor	%xmm3,%xmm1
1201
1202
1203	movdqa	%xmm0,%xmm4
1204	psrlq	$1,%xmm0
1205	pxor	%xmm4,%xmm1
1206	pxor	%xmm0,%xmm4
1207	psrlq	$5,%xmm0
1208	pxor	%xmm4,%xmm0
1209	psrlq	$1,%xmm0
1210	pxor	%xmm1,%xmm0
1211	testq	%rcx,%rcx
1212	jnz	.Ldone
1213
/* Single trailing block: one multiply by H plus full reduction. */
1214.Lodd_tail:
1215	movdqu	(%rdx),%xmm8
1216.byte	102,69,15,56,0,194
1217	pxor	%xmm8,%xmm0
1218	movdqa	%xmm0,%xmm1
1219	pshufd	$78,%xmm0,%xmm3
1220	pxor	%xmm0,%xmm3
1221.byte	102,15,58,68,194,0
1222.byte	102,15,58,68,202,17
1223.byte	102,15,58,68,223,0
1224	pxor	%xmm0,%xmm3
1225	pxor	%xmm1,%xmm3
1226
1227	movdqa	%xmm3,%xmm4
1228	psrldq	$8,%xmm3
1229	pslldq	$8,%xmm4
1230	pxor	%xmm3,%xmm1
1231	pxor	%xmm4,%xmm0
1232
1233	movdqa	%xmm0,%xmm4
1234	movdqa	%xmm0,%xmm3
1235	psllq	$5,%xmm0
1236	pxor	%xmm0,%xmm3
1237	psllq	$1,%xmm0
1238	pxor	%xmm3,%xmm0
1239	psllq	$57,%xmm0
1240	movdqa	%xmm0,%xmm3
1241	pslldq	$8,%xmm0
1242	psrldq	$8,%xmm3
1243	pxor	%xmm4,%xmm0
1244	pxor	%xmm3,%xmm1
1245
1246
1247	movdqa	%xmm0,%xmm4
1248	psrlq	$1,%xmm0
1249	pxor	%xmm4,%xmm1
1250	pxor	%xmm0,%xmm4
1251	psrlq	$5,%xmm0
1252	pxor	%xmm4,%xmm0
1253	psrlq	$1,%xmm0
1254	pxor	%xmm1,%xmm0
1255.Ldone:
/* Byte-swap back and store the updated Xi. */
1256.byte	102,65,15,56,0,194
1257	movdqu	%xmm0,(%rdi)
/* F3 C3 = "rep ret". */
1258	.byte	0xf3,0xc3
1259.size	gcm_ghash_clmul,.-gcm_ghash_clmul
/*
 * void gcm_init_avx(u128 Htable[], const u64 H[2])
 * SysV AMD64: %rdi = Htable out, %rsi = H.
 * AVX (VEX-encoded) table setup: converts H as in gcm_init_clmul,
 * then the .Linit_loop_avx loop (4 iterations, %r10 counter) emits
 * successive powers of H two at a time, each with a pre-XORed
 * Karatsuba helper, at 48-byte stride in Htable for the 8x-wide
 * gcm_ghash_avx path.  vzeroupper on entry/exit avoids SSE/AVX
 * transition penalties per the SysV ABI convention.
 */
1260.globl	gcm_init_avx
1261.hidden gcm_init_avx
1262.type	gcm_init_avx,@function
1263.align	32
1264gcm_init_avx:
1265	vzeroupper
1266
1267	vmovdqu	(%rsi),%xmm2
/* Swap the two 64-bit halves of H. */
1268	vpshufd	$78,%xmm2,%xmm2
1269
1270
/* H *= x mod the GHASH polynomial (branchless conditional fold). */
1271	vpshufd	$255,%xmm2,%xmm4
1272	vpsrlq	$63,%xmm2,%xmm3
1273	vpsllq	$1,%xmm2,%xmm2
1274	vpxor	%xmm5,%xmm5,%xmm5
1275	vpcmpgtd	%xmm4,%xmm5,%xmm5
1276	vpslldq	$8,%xmm3,%xmm3
1277	vpor	%xmm3,%xmm2,%xmm2
1278
1279
1280	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1281	vpxor	%xmm5,%xmm2,%xmm2
1282
/* %xmm2 = H, %xmm6 = its pre-XORed halves; loop runs 4 times. */
1283	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1284	vmovdqa	%xmm2,%xmm0
1285	vpxor	%xmm2,%xmm6,%xmm6
1286	movq	$4,%r10
1287	jmp	.Linit_start_avx
1288.align	32
/*
 * Loop body part 1: store the previous iteration's combined helper,
 * then multiply the running power by H (Karatsuba + reduction).
 */
1289.Linit_loop_avx:
1290	vpalignr	$8,%xmm3,%xmm4,%xmm5
1291	vmovdqu	%xmm5,-16(%rdi)
1292	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1293	vpxor	%xmm0,%xmm3,%xmm3
1294	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1295	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1296	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1297	vpxor	%xmm0,%xmm1,%xmm4
1298	vpxor	%xmm4,%xmm3,%xmm3
1299
1300	vpslldq	$8,%xmm3,%xmm4
1301	vpsrldq	$8,%xmm3,%xmm3
1302	vpxor	%xmm4,%xmm0,%xmm0
1303	vpxor	%xmm3,%xmm1,%xmm1
1304	vpsllq	$57,%xmm0,%xmm3
1305	vpsllq	$62,%xmm0,%xmm4
1306	vpxor	%xmm3,%xmm4,%xmm4
1307	vpsllq	$63,%xmm0,%xmm3
1308	vpxor	%xmm3,%xmm4,%xmm4
1309	vpslldq	$8,%xmm4,%xmm3
1310	vpsrldq	$8,%xmm4,%xmm4
1311	vpxor	%xmm3,%xmm0,%xmm0
1312	vpxor	%xmm4,%xmm1,%xmm1
1313
1314	vpsrlq	$1,%xmm0,%xmm4
1315	vpxor	%xmm0,%xmm1,%xmm1
1316	vpxor	%xmm4,%xmm0,%xmm0
1317	vpsrlq	$5,%xmm4,%xmm4
1318	vpxor	%xmm4,%xmm0,%xmm0
1319	vpsrlq	$1,%xmm0,%xmm0
1320	vpxor	%xmm1,%xmm0,%xmm0
/* Loop body part 2: save current power in %xmm5, square/multiply
 * again, then store both powers (and halves) to Htable. */
1321.Linit_start_avx:
1322	vmovdqa	%xmm0,%xmm5
1323	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1324	vpxor	%xmm0,%xmm3,%xmm3
1325	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1326	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1327	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1328	vpxor	%xmm0,%xmm1,%xmm4
1329	vpxor	%xmm4,%xmm3,%xmm3
1330
1331	vpslldq	$8,%xmm3,%xmm4
1332	vpsrldq	$8,%xmm3,%xmm3
1333	vpxor	%xmm4,%xmm0,%xmm0
1334	vpxor	%xmm3,%xmm1,%xmm1
1335	vpsllq	$57,%xmm0,%xmm3
1336	vpsllq	$62,%xmm0,%xmm4
1337	vpxor	%xmm3,%xmm4,%xmm4
1338	vpsllq	$63,%xmm0,%xmm3
1339	vpxor	%xmm3,%xmm4,%xmm4
1340	vpslldq	$8,%xmm4,%xmm3
1341	vpsrldq	$8,%xmm4,%xmm4
1342	vpxor	%xmm3,%xmm0,%xmm0
1343	vpxor	%xmm4,%xmm1,%xmm1
1344
1345	vpsrlq	$1,%xmm0,%xmm4
1346	vpxor	%xmm0,%xmm1,%xmm1
1347	vpxor	%xmm4,%xmm0,%xmm0
1348	vpsrlq	$5,%xmm4,%xmm4
1349	vpxor	%xmm4,%xmm0,%xmm0
1350	vpsrlq	$1,%xmm0,%xmm0
1351	vpxor	%xmm1,%xmm0,%xmm0
1352	vpshufd	$78,%xmm5,%xmm3
1353	vpshufd	$78,%xmm0,%xmm4
1354	vpxor	%xmm5,%xmm3,%xmm3
1355	vmovdqu	%xmm5,0(%rdi)
1356	vpxor	%xmm0,%xmm4,%xmm4
1357	vmovdqu	%xmm0,16(%rdi)
1358	leaq	48(%rdi),%rdi
1359	subq	$1,%r10
1360	jnz	.Linit_loop_avx
1361
/* Flush the final combined helper slot. */
1362	vpalignr	$8,%xmm4,%xmm3,%xmm5
1363	vmovdqu	%xmm5,-16(%rdi)
1364
1365	vzeroupper
/* F3 C3 = "rep ret". */
1366	.byte	0xf3,0xc3
1367.size	gcm_init_avx,.-gcm_init_avx
/*
 * void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16])
 * Alias entry point: single-block gmult has no AVX-specific win,
 * so this tail-jumps straight to the CLMUL implementation
 * (.L_gmult_clmul) with the same argument registers.
 */
1368.globl	gcm_gmult_avx
1369.hidden gcm_gmult_avx
1370.type	gcm_gmult_avx,@function
1371.align	32
1372gcm_gmult_avx:
1373	jmp	.L_gmult_clmul
1374.size	gcm_gmult_avx,.-gcm_gmult_avx
1375.globl	gcm_ghash_avx
1376.hidden gcm_ghash_avx
1377.type	gcm_ghash_avx,@function
1378.align	32
1379gcm_ghash_avx:
1380	vzeroupper
1381
1382	vmovdqu	(%rdi),%xmm10
1383	leaq	.L0x1c2_polynomial(%rip),%r10
1384	leaq	64(%rsi),%rsi
1385	vmovdqu	.Lbswap_mask(%rip),%xmm13
1386	vpshufb	%xmm13,%xmm10,%xmm10
1387	cmpq	$0x80,%rcx
1388	jb	.Lshort_avx
1389	subq	$0x80,%rcx
1390
1391	vmovdqu	112(%rdx),%xmm14
1392	vmovdqu	0-64(%rsi),%xmm6
1393	vpshufb	%xmm13,%xmm14,%xmm14
1394	vmovdqu	32-64(%rsi),%xmm7
1395
1396	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1397	vmovdqu	96(%rdx),%xmm15
1398	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1399	vpxor	%xmm14,%xmm9,%xmm9
1400	vpshufb	%xmm13,%xmm15,%xmm15
1401	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1402	vmovdqu	16-64(%rsi),%xmm6
1403	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1404	vmovdqu	80(%rdx),%xmm14
1405	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1406	vpxor	%xmm15,%xmm8,%xmm8
1407
1408	vpshufb	%xmm13,%xmm14,%xmm14
1409	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1410	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1411	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1412	vmovdqu	48-64(%rsi),%xmm6
1413	vpxor	%xmm14,%xmm9,%xmm9
1414	vmovdqu	64(%rdx),%xmm15
1415	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1416	vmovdqu	80-64(%rsi),%xmm7
1417
1418	vpshufb	%xmm13,%xmm15,%xmm15
1419	vpxor	%xmm0,%xmm3,%xmm3
1420	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1421	vpxor	%xmm1,%xmm4,%xmm4
1422	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1423	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1424	vmovdqu	64-64(%rsi),%xmm6
1425	vpxor	%xmm2,%xmm5,%xmm5
1426	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1427	vpxor	%xmm15,%xmm8,%xmm8
1428
1429	vmovdqu	48(%rdx),%xmm14
1430	vpxor	%xmm3,%xmm0,%xmm0
1431	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1432	vpxor	%xmm4,%xmm1,%xmm1
1433	vpshufb	%xmm13,%xmm14,%xmm14
1434	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1435	vmovdqu	96-64(%rsi),%xmm6
1436	vpxor	%xmm5,%xmm2,%xmm2
1437	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1438	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1439	vmovdqu	128-64(%rsi),%xmm7
1440	vpxor	%xmm14,%xmm9,%xmm9
1441
1442	vmovdqu	32(%rdx),%xmm15
1443	vpxor	%xmm0,%xmm3,%xmm3
1444	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1445	vpxor	%xmm1,%xmm4,%xmm4
1446	vpshufb	%xmm13,%xmm15,%xmm15
1447	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1448	vmovdqu	112-64(%rsi),%xmm6
1449	vpxor	%xmm2,%xmm5,%xmm5
1450	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1451	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1452	vpxor	%xmm15,%xmm8,%xmm8
1453
1454	vmovdqu	16(%rdx),%xmm14
1455	vpxor	%xmm3,%xmm0,%xmm0
1456	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1457	vpxor	%xmm4,%xmm1,%xmm1
1458	vpshufb	%xmm13,%xmm14,%xmm14
1459	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1460	vmovdqu	144-64(%rsi),%xmm6
1461	vpxor	%xmm5,%xmm2,%xmm2
1462	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1463	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1464	vmovdqu	176-64(%rsi),%xmm7
1465	vpxor	%xmm14,%xmm9,%xmm9
1466
1467	vmovdqu	(%rdx),%xmm15
1468	vpxor	%xmm0,%xmm3,%xmm3
1469	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1470	vpxor	%xmm1,%xmm4,%xmm4
1471	vpshufb	%xmm13,%xmm15,%xmm15
1472	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1473	vmovdqu	160-64(%rsi),%xmm6
1474	vpxor	%xmm2,%xmm5,%xmm5
1475	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1476
1477	leaq	128(%rdx),%rdx
1478	cmpq	$0x80,%rcx
1479	jb	.Ltail_avx
1480
1481	vpxor	%xmm10,%xmm15,%xmm15
1482	subq	$0x80,%rcx
1483	jmp	.Loop8x_avx
1484
.align	32
	/*
	 * Steady-state loop: per iteration, multiply eight fresh input
	 * blocks by the eight key powers while simultaneously reducing the
	 * previous iteration's 256-bit product (%xmm10/%xmm11/%xmm12)
	 * modulo the GHASH polynomial.  The reduction is interleaved with
	 * the multiplies to hide vpclmulqdq latency:
	 *   - the Karatsuba recombination (vpslldq/vpsrldq of %xmm12)
	 *     happens around the 80(%rdx) block,
	 *   - the two reduction steps (vpclmulqdq $0x10,(%r10) +
	 *     vpalignr $8) around the 48(%rdx) and 16(%rdx) blocks,
	 *   - the fully reduced Xi is folded into the last block, (%rdx),
	 *     just before looping.
	 * vxorps is used interchangeably with vpxor (same XOR effect).
	 */
.Loop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	/* Karatsuba fix-up of last iteration's product:
	 * mid ^= lo ^ hi, then split mid across lo/hi halves. */
	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	/* First reduction step: multiply low half by the polynomial
	 * constant at (%r10) and rotate. */
	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	/* Second reduction step. */
	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	/* Fold the freshly reduced Xi into the last (lowest-address)
	 * block of this batch, then advance and loop while >= 0x80
	 * bytes remain. */
	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	.Loop8x_avx

	/* Xi is already folded into %xmm15; skip the tail's XOR. */
	addq	$0x80,%rcx
	jmp	.Ltail_no_xor_avx
1604
.align	32
	/*
	 * Short path: 1..7 remaining 16-byte blocks (%rcx < 0x80).
	 * Blocks are walked backwards from the end of the input
	 * (-16(%rdx,%rcx,1) downwards) so that block i is paired with key
	 * power H^i from the table at %rsi; after each block, fall through
	 * to .Ltail_avx as soon as the count hits zero.  The vpsrldq of
	 * %xmm7 selects the high half of the packed Karatsuba constant for
	 * odd-indexed powers.
	 */
.Lshort_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	/* Seed the accumulators with the previous partial products. */
	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	/* Seventh (last possible) short block; H^7's Karatsuba constant
	 * is a 64-bit load (vmovq) from the end of the table. */
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	.Ltail_avx
1708
.align	32
	/*
	 * Tail: multiply the final block (Xi folded in at .Ltail_avx;
	 * .Ltail_no_xor_avx is entered when the 8x loop already folded it),
	 * recombine the Karatsuba halves, and reduce the 256-bit product
	 * to 128 bits.
	 */
.Ltail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	/* Reduction constant -- presumably .L0x1c2_polynomial via %r10;
	 * confirm at the function head (outside this chunk). */
	vmovdqu	(%r10),%xmm12

	/* Karatsuba recombination: lo=%xmm10, hi=%xmm11,
	 * mid ^= lo ^ hi, then split mid across the two halves. */
	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	/* Two-step reduction modulo the GHASH polynomial. */
	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	/* More input left (1..7 blocks)? Loop back to the short path. */
	cmpq	$0,%rcx
	jne	.Lshort_avx

	/* Byte-swap Xi back to memory order, store, and clear the upper
	 * YMM state before returning to SSE/C code. */
	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3
.size	gcm_ghash_avx,.-gcm_ghash_avx
.align	64
	/* Constant pool for the GHASH implementations above. */

	/* Mask for vpshufb: reverses the 16 bytes of an XMM register
	 * (GHASH is defined on bit-reflected/big-endian data). */
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
	/* The GHASH reduction polynomial constant 0xc2...01
	 * (0x1C2 = x^7+x^2+x+1 terms of x^128+x^7+x^2+x+1). */
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
	/* Masks used by code earlier in the file; 450 = 0x1C2. */
.L7_mask:
.long	7,0,7,0
.L7_mask_poly:
.long	7,0,450,0
.align	64
	/* Remainder table for the 4-bit method (gcm_gmult_4bit /
	 * gcm_ghash_4bit, indexed via (%r11,%rdx,8)): 16 little-endian
	 * 64-bit entries holding multiples of 0x1C2 shifted into the top
	 * bits (e.g. 471859200 = 0x1C200000). */
.type	.Lrem_4bit,@object
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
	/* Remainder table for the 8-bit method: 256 16-bit entries,
	 * entry i = i * 0x1C2 in GF(2) (carry-less) arithmetic. */
.type	.Lrem_8bit,@object
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

	/* ASCII credit string:
	 * "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>\0" */
.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64
1805#endif
1806