1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
# When the compiler provides __has_feature and reports memory_sanitizer,
# OPENSSL_NO_ASM gets defined here (unless it already is). The assembly below
# is wrapped in a !defined(OPENSSL_NO_ASM) test, so it is compiled out then.
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14.text
15.extern	OPENSSL_ia32cap_P
16.hidden OPENSSL_ia32cap_P
17
# gcm_gmult_4bit: table-driven update of one 16-byte block, 4 bits per step.
#   In:   %rdi  16-byte block; bytes 15 down to 0 are consumed and the block
#               is overwritten with the result
#         %rsi  table of 16-byte entries addressed as (nibble << 4)(%rsi)
#   Reads .Lrem_4bit as an array of 8-byte entries, indexed by the 4 bits that
#   are shifted out of the accumulator at each step.
#   Accumulator: %r9 (upper qword) : %r8 (lower qword).
#   Writes %rax, %rbx, %rcx, %rdx, %rsi, %r8-%r11 and flags. %rbx is saved and
#   reloaded; %rbp and %r12-%r15 are pushed but never written here.
#   NOTE(review): presumably the GHASH multiply Xi = Xi * H with %rsi pointing
#   at the precomputed H table -- confirm against the C prototype.
18.globl	gcm_gmult_4bit
19.hidden gcm_gmult_4bit
20.type	gcm_gmult_4bit,@function
21.align	16
22gcm_gmult_4bit:
23.cfi_startproc
24	pushq	%rbx
25.cfi_adjust_cfa_offset	8
26.cfi_offset	%rbx,-16
27	pushq	%rbp
28.cfi_adjust_cfa_offset	8
29.cfi_offset	%rbp,-24
30	pushq	%r12
31.cfi_adjust_cfa_offset	8
32.cfi_offset	%r12,-32
33	pushq	%r13
34.cfi_adjust_cfa_offset	8
35.cfi_offset	%r13,-40
36	pushq	%r14
37.cfi_adjust_cfa_offset	8
38.cfi_offset	%r14,-48
39	pushq	%r15
40.cfi_adjust_cfa_offset	8
41.cfi_offset	%r15,-56
42	subq	$280,%rsp	# reserved, but never accessed in this function
43.cfi_adjust_cfa_offset	280
44.Lgmult_prologue:
45
46	movzbq	15(%rdi),%r8	# start with the last byte of the block
47	leaq	.Lrem_4bit(%rip),%r11
48	xorq	%rax,%rax
49	xorq	%rbx,%rbx
50	movb	%r8b,%al
51	movb	%r8b,%bl
52	shlb	$4,%al	# %rax = low nibble * 16 (table offset)
53	movq	$14,%rcx	# index of the next byte to fetch
54	movq	8(%rsi,%rax,1),%r8
55	movq	(%rsi,%rax,1),%r9
56	andb	$0xf0,%bl	# %rbx = high nibble * 16 (table offset)
57	movq	%r8,%rdx
58	jmp	.Loop1
59
# Each pass handles the pending high nibble (%rbx) of the current byte and the
# low nibble (%rax) of the next one. A nibble step shifts %r9:%r8 right by 4,
# XORs in the table entry, and XORs .Lrem_4bit[bits shifted out] into %r9.
60.align	16
61.Loop1:
62	shrq	$4,%r8
63	andq	$0xf,%rdx	# the 4 bits just shifted out of %r8
64	movq	%r9,%r10
65	movb	(%rdi,%rcx,1),%al	# fetch the next byte of the block
66	shrq	$4,%r9
67	xorq	8(%rsi,%rbx,1),%r8
68	shlq	$60,%r10	# low 4 bits of %r9 move to the top of %r8
69	xorq	(%rsi,%rbx,1),%r9
70	movb	%al,%bl
71	xorq	(%r11,%rdx,8),%r9
72	movq	%r8,%rdx
73	shlb	$4,%al
74	xorq	%r10,%r8
75	decq	%rcx
76	js	.Lbreak1	# taken once byte 0 has been fetched
77
78	shrq	$4,%r8
79	andq	$0xf,%rdx
80	movq	%r9,%r10
81	shrq	$4,%r9
82	xorq	8(%rsi,%rax,1),%r8
83	shlq	$60,%r10
84	xorq	(%rsi,%rax,1),%r9
85	andb	$0xf0,%bl
86	xorq	(%r11,%rdx,8),%r9
87	movq	%r8,%rdx
88	xorq	%r10,%r8
89	jmp	.Loop1
90
# Final two nibble steps, for the low and then the high nibble of byte 0.
91.align	16
92.Lbreak1:
93	shrq	$4,%r8
94	andq	$0xf,%rdx
95	movq	%r9,%r10
96	shrq	$4,%r9
97	xorq	8(%rsi,%rax,1),%r8
98	shlq	$60,%r10
99	xorq	(%rsi,%rax,1),%r9
100	andb	$0xf0,%bl
101	xorq	(%r11,%rdx,8),%r9
102	movq	%r8,%rdx
103	xorq	%r10,%r8
104
105	shrq	$4,%r8
106	andq	$0xf,%rdx
107	movq	%r9,%r10
108	shrq	$4,%r9
109	xorq	8(%rsi,%rbx,1),%r8
110	shlq	$60,%r10
111	xorq	(%rsi,%rbx,1),%r9
112	xorq	%r10,%r8
113	xorq	(%r11,%rdx,8),%r9
114
# Byte-reverse both halves and store the result back over the input block.
115	bswapq	%r8
116	bswapq	%r9
117	movq	%r8,8(%rdi)
118	movq	%r9,(%rdi)
119
120	leaq	280+48(%rsp),%rsi	# %rsi = address of the return-address slot
121.cfi_def_cfa	%rsi,8
122	movq	-8(%rsi),%rbx	# the only callee-saved register modified above
123.cfi_restore	%rbx
124	leaq	(%rsi),%rsp	# drop the locals and the six pushed registers
125.cfi_def_cfa_register	%rsp
126.Lgmult_epilogue:
127	.byte	0xf3,0xc3	# rep ret
128.cfi_endproc
129.size	gcm_gmult_4bit,.-gcm_gmult_4bit
# gcm_ghash_4bit: table-driven update of a 16-byte state over a run of input
# blocks, one input byte per unrolled step.
#   In:   %rdi  16-byte state, read on entry and rewritten for every block
#         %rsi  table of 16-byte entries addressed as (nibble << 4)(%rsi)
#         %rdx  input pointer (kept in %r14)
#         %rcx  input length in bytes (%r15 = %r14 + %rcx is the end pointer)
#   The loop body runs before the end-pointer test, so one 16-byte block is
#   always consumed.
#   NOTE(review): presumably the length must be a nonzero multiple of 16 --
#   confirm against the callers.
#   Reads .Lrem_8bit as an array of 16-bit entries.
#   Stack locals (280 bytes reserved, 272 used), built from the %rsi table:
#     i(%rsp), i = 0..15          low byte of entry i, shifted left by 4
#     -128(%rbp,i,8)              lower qword of (entry i >> 4)
#     (%rbp,i,8)                  upper qword of (entry i >> 4)
#   with %rbp = %rsp + 144. The qword at offset 0 of a table entry is the
#   upper half; the qword at offset 8 is the lower half.
#   Accumulator: %r9 (upper qword) : %r8 (lower qword).
#   All six pushed callee-saved registers are reloaded in the epilogue.
130.globl	gcm_ghash_4bit
131.hidden gcm_ghash_4bit
132.type	gcm_ghash_4bit,@function
133.align	16
134gcm_ghash_4bit:
135.cfi_startproc
136	pushq	%rbx
137.cfi_adjust_cfa_offset	8
138.cfi_offset	%rbx,-16
139	pushq	%rbp
140.cfi_adjust_cfa_offset	8
141.cfi_offset	%rbp,-24
142	pushq	%r12
143.cfi_adjust_cfa_offset	8
144.cfi_offset	%r12,-32
145	pushq	%r13
146.cfi_adjust_cfa_offset	8
147.cfi_offset	%r13,-40
148	pushq	%r14
149.cfi_adjust_cfa_offset	8
150.cfi_offset	%r14,-48
151	pushq	%r15
152.cfi_adjust_cfa_offset	8
153.cfi_offset	%r15,-56
154	subq	$280,%rsp
155.cfi_adjust_cfa_offset	280
156.Lghash_prologue:
157	movq	%rdx,%r14	# %r14 = input pointer
158	movq	%rcx,%r15	# %r15 = length, turned into an end pointer below
# Bias %rsi by +128 while the stack tables are built (undone further down).
# NOTE(review): presumably so that all 256 table bytes are reachable with
# 8-bit displacements.
159	subq	$-128,%rsi
160	leaq	16+128(%rsp),%rbp
161	xorl	%edx,%edx
# The unrolled sequence below interleaves the work for consecutive entries.
# Per entry: the low byte of the lower qword, shifted left by 4, goes to the
# byte table; the 128-bit entry shifted right by 4 goes to the qword tables.
162	movq	0+0-128(%rsi),%r8
163	movq	0+8-128(%rsi),%rax
164	movb	%al,%dl
165	shrq	$4,%rax
166	movq	%r8,%r10
167	shrq	$4,%r8
168	movq	16+0-128(%rsi),%r9
169	shlb	$4,%dl
170	movq	16+8-128(%rsi),%rbx
171	shlq	$60,%r10	# low 4 bits of the upper qword ...
172	movb	%dl,0(%rsp)
173	orq	%r10,%rax	# ... become the top 4 bits of the lower qword
174	movb	%bl,%dl
175	shrq	$4,%rbx
176	movq	%r9,%r10
177	shrq	$4,%r9
178	movq	%r8,0(%rbp)
179	movq	32+0-128(%rsi),%r8
180	shlb	$4,%dl
181	movq	%rax,0-128(%rbp)
182	movq	32+8-128(%rsi),%rax
183	shlq	$60,%r10
184	movb	%dl,1(%rsp)
185	orq	%r10,%rbx
186	movb	%al,%dl
187	shrq	$4,%rax
188	movq	%r8,%r10
189	shrq	$4,%r8
190	movq	%r9,8(%rbp)
191	movq	48+0-128(%rsi),%r9
192	shlb	$4,%dl
193	movq	%rbx,8-128(%rbp)
194	movq	48+8-128(%rsi),%rbx
195	shlq	$60,%r10
196	movb	%dl,2(%rsp)
197	orq	%r10,%rax
198	movb	%bl,%dl
199	shrq	$4,%rbx
200	movq	%r9,%r10
201	shrq	$4,%r9
202	movq	%r8,16(%rbp)
203	movq	64+0-128(%rsi),%r8
204	shlb	$4,%dl
205	movq	%rax,16-128(%rbp)
206	movq	64+8-128(%rsi),%rax
207	shlq	$60,%r10
208	movb	%dl,3(%rsp)
209	orq	%r10,%rbx
210	movb	%al,%dl
211	shrq	$4,%rax
212	movq	%r8,%r10
213	shrq	$4,%r8
214	movq	%r9,24(%rbp)
215	movq	80+0-128(%rsi),%r9
216	shlb	$4,%dl
217	movq	%rbx,24-128(%rbp)
218	movq	80+8-128(%rsi),%rbx
219	shlq	$60,%r10
220	movb	%dl,4(%rsp)
221	orq	%r10,%rax
222	movb	%bl,%dl
223	shrq	$4,%rbx
224	movq	%r9,%r10
225	shrq	$4,%r9
226	movq	%r8,32(%rbp)
227	movq	96+0-128(%rsi),%r8
228	shlb	$4,%dl
229	movq	%rax,32-128(%rbp)
230	movq	96+8-128(%rsi),%rax
231	shlq	$60,%r10
232	movb	%dl,5(%rsp)
233	orq	%r10,%rbx
234	movb	%al,%dl
235	shrq	$4,%rax
236	movq	%r8,%r10
237	shrq	$4,%r8
238	movq	%r9,40(%rbp)
239	movq	112+0-128(%rsi),%r9
240	shlb	$4,%dl
241	movq	%rbx,40-128(%rbp)
242	movq	112+8-128(%rsi),%rbx
243	shlq	$60,%r10
244	movb	%dl,6(%rsp)
245	orq	%r10,%rax
246	movb	%bl,%dl
247	shrq	$4,%rbx
248	movq	%r9,%r10
249	shrq	$4,%r9
250	movq	%r8,48(%rbp)
251	movq	128+0-128(%rsi),%r8
252	shlb	$4,%dl
253	movq	%rax,48-128(%rbp)
254	movq	128+8-128(%rsi),%rax
255	shlq	$60,%r10
256	movb	%dl,7(%rsp)
257	orq	%r10,%rbx
258	movb	%al,%dl
259	shrq	$4,%rax
260	movq	%r8,%r10
261	shrq	$4,%r8
262	movq	%r9,56(%rbp)
263	movq	144+0-128(%rsi),%r9
264	shlb	$4,%dl
265	movq	%rbx,56-128(%rbp)
266	movq	144+8-128(%rsi),%rbx
267	shlq	$60,%r10
268	movb	%dl,8(%rsp)
269	orq	%r10,%rax
270	movb	%bl,%dl
271	shrq	$4,%rbx
272	movq	%r9,%r10
273	shrq	$4,%r9
274	movq	%r8,64(%rbp)
275	movq	160+0-128(%rsi),%r8
276	shlb	$4,%dl
277	movq	%rax,64-128(%rbp)
278	movq	160+8-128(%rsi),%rax
279	shlq	$60,%r10
280	movb	%dl,9(%rsp)
281	orq	%r10,%rbx
282	movb	%al,%dl
283	shrq	$4,%rax
284	movq	%r8,%r10
285	shrq	$4,%r8
286	movq	%r9,72(%rbp)
287	movq	176+0-128(%rsi),%r9
288	shlb	$4,%dl
289	movq	%rbx,72-128(%rbp)
290	movq	176+8-128(%rsi),%rbx
291	shlq	$60,%r10
292	movb	%dl,10(%rsp)
293	orq	%r10,%rax
294	movb	%bl,%dl
295	shrq	$4,%rbx
296	movq	%r9,%r10
297	shrq	$4,%r9
298	movq	%r8,80(%rbp)
299	movq	192+0-128(%rsi),%r8
300	shlb	$4,%dl
301	movq	%rax,80-128(%rbp)
302	movq	192+8-128(%rsi),%rax
303	shlq	$60,%r10
304	movb	%dl,11(%rsp)
305	orq	%r10,%rbx
306	movb	%al,%dl
307	shrq	$4,%rax
308	movq	%r8,%r10
309	shrq	$4,%r8
310	movq	%r9,88(%rbp)
311	movq	208+0-128(%rsi),%r9
312	shlb	$4,%dl
313	movq	%rbx,88-128(%rbp)
314	movq	208+8-128(%rsi),%rbx
315	shlq	$60,%r10
316	movb	%dl,12(%rsp)
317	orq	%r10,%rax
318	movb	%bl,%dl
319	shrq	$4,%rbx
320	movq	%r9,%r10
321	shrq	$4,%r9
322	movq	%r8,96(%rbp)
323	movq	224+0-128(%rsi),%r8
324	shlb	$4,%dl
325	movq	%rax,96-128(%rbp)
326	movq	224+8-128(%rsi),%rax
327	shlq	$60,%r10
328	movb	%dl,13(%rsp)
329	orq	%r10,%rbx
330	movb	%al,%dl
331	shrq	$4,%rax
332	movq	%r8,%r10
333	shrq	$4,%r8
334	movq	%r9,104(%rbp)
335	movq	240+0-128(%rsi),%r9
336	shlb	$4,%dl
337	movq	%rbx,104-128(%rbp)
338	movq	240+8-128(%rsi),%rbx
339	shlq	$60,%r10
340	movb	%dl,14(%rsp)
341	orq	%r10,%rax
342	movb	%bl,%dl
343	shrq	$4,%rbx
344	movq	%r9,%r10
345	shrq	$4,%r9
346	movq	%r8,112(%rbp)
347	shlb	$4,%dl
348	movq	%rax,112-128(%rbp)
349	shlq	$60,%r10
350	movb	%dl,15(%rsp)
351	orq	%r10,%rbx
352	movq	%r9,120(%rbp)
353	movq	%rbx,120-128(%rbp)
354	addq	$-128,%rsi	# undo the +128 bias applied above
355	movq	8(%rdi),%r8
356	movq	0(%rdi),%r9
357	addq	%r14,%r15	# %r15 = end of input
358	leaq	.Lrem_8bit(%rip),%r11
359	jmp	.Louter_loop
# One iteration per 16-byte input block. The state XOR input is written back
# to (%rdi) and then re-read 4 bytes at a time; roll $8 brings the bytes into
# %dl from byte 15 down to byte 0. In each unrolled step %rax is the low
# nibble * 16 (offset into the %rsi table) and %rbx or %rcx is the high nibble
# (index into the stack tables). A step shifts %r9:%r8 right by 8 bits, XORs
# in both table entries, and XORs a .Lrem_8bit entry (shifted left by 48)
# into %r9.
360.align	16
361.Louter_loop:
362	xorq	(%r14),%r9
363	movq	8(%r14),%rdx
364	leaq	16(%r14),%r14
365	xorq	%r8,%rdx
366	movq	%r9,(%rdi)
367	movq	%rdx,8(%rdi)
368	shrq	$32,%rdx	# %edx = bytes 12..15 of the XORed block
369	xorq	%rax,%rax
370	roll	$8,%edx
371	movb	%dl,%al
372	movzbl	%dl,%ebx
373	shlb	$4,%al
374	shrl	$4,%ebx
375	roll	$8,%edx
376	movq	8(%rsi,%rax,1),%r8
377	movq	(%rsi,%rax,1),%r9
378	movb	%dl,%al
379	movzbl	%dl,%ecx
380	shlb	$4,%al
381	movzbq	(%rsp,%rbx,1),%r12
382	shrl	$4,%ecx
383	xorq	%r8,%r12
384	movq	%r9,%r10
385	shrq	$8,%r8
386	movzbq	%r12b,%r12	# index into .Lrem_8bit
387	shrq	$8,%r9
388	xorq	-128(%rbp,%rbx,8),%r8
389	shlq	$56,%r10
390	xorq	(%rbp,%rbx,8),%r9
391	roll	$8,%edx
392	xorq	8(%rsi,%rax,1),%r8
393	xorq	(%rsi,%rax,1),%r9
394	movb	%dl,%al
395	xorq	%r10,%r8
396	movzwq	(%r11,%r12,2),%r12
397	movzbl	%dl,%ebx
398	shlb	$4,%al
399	movzbq	(%rsp,%rcx,1),%r13
400	shrl	$4,%ebx
401	shlq	$48,%r12
402	xorq	%r8,%r13
403	movq	%r9,%r10
404	xorq	%r12,%r9
405	shrq	$8,%r8
406	movzbq	%r13b,%r13
407	shrq	$8,%r9
408	xorq	-128(%rbp,%rcx,8),%r8
409	shlq	$56,%r10
410	xorq	(%rbp,%rcx,8),%r9
411	roll	$8,%edx
412	xorq	8(%rsi,%rax,1),%r8
413	xorq	(%rsi,%rax,1),%r9
414	movb	%dl,%al
415	xorq	%r10,%r8
416	movzwq	(%r11,%r13,2),%r13
417	movzbl	%dl,%ecx
418	shlb	$4,%al
419	movzbq	(%rsp,%rbx,1),%r12
420	shrl	$4,%ecx
421	shlq	$48,%r13
422	xorq	%r8,%r12
423	movq	%r9,%r10
424	xorq	%r13,%r9
425	shrq	$8,%r8
426	movzbq	%r12b,%r12
427	movl	8(%rdi),%edx	# next four bytes: 8..11 of the XORed block
428	shrq	$8,%r9
429	xorq	-128(%rbp,%rbx,8),%r8
430	shlq	$56,%r10
431	xorq	(%rbp,%rbx,8),%r9
432	roll	$8,%edx
433	xorq	8(%rsi,%rax,1),%r8
434	xorq	(%rsi,%rax,1),%r9
435	movb	%dl,%al
436	xorq	%r10,%r8
437	movzwq	(%r11,%r12,2),%r12
438	movzbl	%dl,%ebx
439	shlb	$4,%al
440	movzbq	(%rsp,%rcx,1),%r13
441	shrl	$4,%ebx
442	shlq	$48,%r12
443	xorq	%r8,%r13
444	movq	%r9,%r10
445	xorq	%r12,%r9
446	shrq	$8,%r8
447	movzbq	%r13b,%r13
448	shrq	$8,%r9
449	xorq	-128(%rbp,%rcx,8),%r8
450	shlq	$56,%r10
451	xorq	(%rbp,%rcx,8),%r9
452	roll	$8,%edx
453	xorq	8(%rsi,%rax,1),%r8
454	xorq	(%rsi,%rax,1),%r9
455	movb	%dl,%al
456	xorq	%r10,%r8
457	movzwq	(%r11,%r13,2),%r13
458	movzbl	%dl,%ecx
459	shlb	$4,%al
460	movzbq	(%rsp,%rbx,1),%r12
461	shrl	$4,%ecx
462	shlq	$48,%r13
463	xorq	%r8,%r12
464	movq	%r9,%r10
465	xorq	%r13,%r9
466	shrq	$8,%r8
467	movzbq	%r12b,%r12
468	shrq	$8,%r9
469	xorq	-128(%rbp,%rbx,8),%r8
470	shlq	$56,%r10
471	xorq	(%rbp,%rbx,8),%r9
472	roll	$8,%edx
473	xorq	8(%rsi,%rax,1),%r8
474	xorq	(%rsi,%rax,1),%r9
475	movb	%dl,%al
476	xorq	%r10,%r8
477	movzwq	(%r11,%r12,2),%r12
478	movzbl	%dl,%ebx
479	shlb	$4,%al
480	movzbq	(%rsp,%rcx,1),%r13
481	shrl	$4,%ebx
482	shlq	$48,%r12
483	xorq	%r8,%r13
484	movq	%r9,%r10
485	xorq	%r12,%r9
486	shrq	$8,%r8
487	movzbq	%r13b,%r13
488	shrq	$8,%r9
489	xorq	-128(%rbp,%rcx,8),%r8
490	shlq	$56,%r10
491	xorq	(%rbp,%rcx,8),%r9
492	roll	$8,%edx
493	xorq	8(%rsi,%rax,1),%r8
494	xorq	(%rsi,%rax,1),%r9
495	movb	%dl,%al
496	xorq	%r10,%r8
497	movzwq	(%r11,%r13,2),%r13
498	movzbl	%dl,%ecx
499	shlb	$4,%al
500	movzbq	(%rsp,%rbx,1),%r12
501	shrl	$4,%ecx
502	shlq	$48,%r13
503	xorq	%r8,%r12
504	movq	%r9,%r10
505	xorq	%r13,%r9
506	shrq	$8,%r8
507	movzbq	%r12b,%r12
508	movl	4(%rdi),%edx	# next four bytes: 4..7 of the XORed block
509	shrq	$8,%r9
510	xorq	-128(%rbp,%rbx,8),%r8
511	shlq	$56,%r10
512	xorq	(%rbp,%rbx,8),%r9
513	roll	$8,%edx
514	xorq	8(%rsi,%rax,1),%r8
515	xorq	(%rsi,%rax,1),%r9
516	movb	%dl,%al
517	xorq	%r10,%r8
518	movzwq	(%r11,%r12,2),%r12
519	movzbl	%dl,%ebx
520	shlb	$4,%al
521	movzbq	(%rsp,%rcx,1),%r13
522	shrl	$4,%ebx
523	shlq	$48,%r12
524	xorq	%r8,%r13
525	movq	%r9,%r10
526	xorq	%r12,%r9
527	shrq	$8,%r8
528	movzbq	%r13b,%r13
529	shrq	$8,%r9
530	xorq	-128(%rbp,%rcx,8),%r8
531	shlq	$56,%r10
532	xorq	(%rbp,%rcx,8),%r9
533	roll	$8,%edx
534	xorq	8(%rsi,%rax,1),%r8
535	xorq	(%rsi,%rax,1),%r9
536	movb	%dl,%al
537	xorq	%r10,%r8
538	movzwq	(%r11,%r13,2),%r13
539	movzbl	%dl,%ecx
540	shlb	$4,%al
541	movzbq	(%rsp,%rbx,1),%r12
542	shrl	$4,%ecx
543	shlq	$48,%r13
544	xorq	%r8,%r12
545	movq	%r9,%r10
546	xorq	%r13,%r9
547	shrq	$8,%r8
548	movzbq	%r12b,%r12
549	shrq	$8,%r9
550	xorq	-128(%rbp,%rbx,8),%r8
551	shlq	$56,%r10
552	xorq	(%rbp,%rbx,8),%r9
553	roll	$8,%edx
554	xorq	8(%rsi,%rax,1),%r8
555	xorq	(%rsi,%rax,1),%r9
556	movb	%dl,%al
557	xorq	%r10,%r8
558	movzwq	(%r11,%r12,2),%r12
559	movzbl	%dl,%ebx
560	shlb	$4,%al
561	movzbq	(%rsp,%rcx,1),%r13
562	shrl	$4,%ebx
563	shlq	$48,%r12
564	xorq	%r8,%r13
565	movq	%r9,%r10
566	xorq	%r12,%r9
567	shrq	$8,%r8
568	movzbq	%r13b,%r13
569	shrq	$8,%r9
570	xorq	-128(%rbp,%rcx,8),%r8
571	shlq	$56,%r10
572	xorq	(%rbp,%rcx,8),%r9
573	roll	$8,%edx
574	xorq	8(%rsi,%rax,1),%r8
575	xorq	(%rsi,%rax,1),%r9
576	movb	%dl,%al
577	xorq	%r10,%r8
578	movzwq	(%r11,%r13,2),%r13
579	movzbl	%dl,%ecx
580	shlb	$4,%al
581	movzbq	(%rsp,%rbx,1),%r12
582	shrl	$4,%ecx
583	shlq	$48,%r13
584	xorq	%r8,%r12
585	movq	%r9,%r10
586	xorq	%r13,%r9
587	shrq	$8,%r8
588	movzbq	%r12b,%r12
589	movl	0(%rdi),%edx	# last four bytes: 0..3 of the XORed block
590	shrq	$8,%r9
591	xorq	-128(%rbp,%rbx,8),%r8
592	shlq	$56,%r10
593	xorq	(%rbp,%rbx,8),%r9
594	roll	$8,%edx
595	xorq	8(%rsi,%rax,1),%r8
596	xorq	(%rsi,%rax,1),%r9
597	movb	%dl,%al
598	xorq	%r10,%r8
599	movzwq	(%r11,%r12,2),%r12
600	movzbl	%dl,%ebx
601	shlb	$4,%al
602	movzbq	(%rsp,%rcx,1),%r13
603	shrl	$4,%ebx
604	shlq	$48,%r12
605	xorq	%r8,%r13
606	movq	%r9,%r10
607	xorq	%r12,%r9
608	shrq	$8,%r8
609	movzbq	%r13b,%r13
610	shrq	$8,%r9
611	xorq	-128(%rbp,%rcx,8),%r8
612	shlq	$56,%r10
613	xorq	(%rbp,%rcx,8),%r9
614	roll	$8,%edx
615	xorq	8(%rsi,%rax,1),%r8
616	xorq	(%rsi,%rax,1),%r9
617	movb	%dl,%al
618	xorq	%r10,%r8
619	movzwq	(%r11,%r13,2),%r13
620	movzbl	%dl,%ebx
621	shlb	$4,%al
622	movzbq	(%rsp,%rbx,1),%r12
623	shrl	$4,%ecx
624	shlq	$48,%r13
625	xorq	%r8,%r12
626	movq	%r9,%r10
627	xorq	%r13,%r9
628	shrq	$8,%r8
629	movzbq	%r12b,%r12
630	shrq	$8,%r9
631	xorq	-128(%rbp,%rbx,8),%r8
632	shlq	$56,%r10
633	xorq	(%rbp,%rbx,8),%r9
634	roll	$8,%edx
635	xorq	8(%rsi,%rax,1),%r8
636	xorq	(%rsi,%rax,1),%r9
637	movb	%dl,%al
638	xorq	%r10,%r8
639	movzwq	(%r11,%r12,2),%r12
640	movzbl	%dl,%ebx
641	shlb	$4,%al
642	movzbq	(%rsp,%rcx,1),%r13
643	shrl	$4,%ebx
644	shlq	$48,%r12
645	xorq	%r8,%r13
646	movq	%r9,%r10
647	xorq	%r12,%r9
648	shrq	$8,%r8
649	movzbq	%r13b,%r13
650	shrq	$8,%r9
651	xorq	-128(%rbp,%rcx,8),%r8
652	shlq	$56,%r10
653	xorq	(%rbp,%rcx,8),%r9
654	roll	$8,%edx
655	xorq	8(%rsi,%rax,1),%r8
656	xorq	(%rsi,%rax,1),%r9
657	movb	%dl,%al
658	xorq	%r10,%r8
659	movzwq	(%r11,%r13,2),%r13
660	movzbl	%dl,%ecx
661	shlb	$4,%al
662	movzbq	(%rsp,%rbx,1),%r12
# For byte 0 the high nibble is kept as nibble * 16 (and, not shift), because
# the last step below indexes the %rsi table and shifts by 4 bits only.
663	andl	$240,%ecx
664	shlq	$48,%r13
665	xorq	%r8,%r12
666	movq	%r9,%r10
667	xorq	%r13,%r9
668	shrq	$8,%r8
669	movzbq	%r12b,%r12
# NOTE(review): the load below reads the 4 bytes located just before the state
# block, and %edx is not read again before the next iteration overwrites
# %rdx. It looks like a leftover of the unrolled pattern; confirm in the
# generator that the bytes below (%rdi) are always readable.
670	movl	-4(%rdi),%edx
671	shrq	$8,%r9
672	xorq	-128(%rbp,%rbx,8),%r8
673	shlq	$56,%r10
674	xorq	(%rbp,%rbx,8),%r9
675	movzwq	(%r11,%r12,2),%r12
676	xorq	8(%rsi,%rax,1),%r8
677	xorq	(%rsi,%rax,1),%r9
678	shlq	$48,%r12
679	xorq	%r10,%r8
680	xorq	%r12,%r9
681	movzbq	%r8b,%r13
682	shrq	$4,%r8
683	movq	%r9,%r10
684	shlb	$4,%r13b
685	shrq	$4,%r9
686	xorq	8(%rsi,%rcx,1),%r8
687	movzwq	(%r11,%r13,2),%r13
688	shlq	$60,%r10
689	xorq	(%rsi,%rcx,1),%r9
690	xorq	%r10,%r8
691	shlq	$48,%r13
692	bswapq	%r8
693	xorq	%r13,%r9
694	bswapq	%r9
695	cmpq	%r15,%r14	# unsigned compare of input pointer and end pointer
696	jb	.Louter_loop
697	movq	%r8,8(%rdi)
698	movq	%r9,(%rdi)
699
700	leaq	280+48(%rsp),%rsi	# %rsi = address of the return-address slot
701.cfi_def_cfa	%rsi,8
702	movq	-48(%rsi),%r15
703.cfi_restore	%r15
704	movq	-40(%rsi),%r14
705.cfi_restore	%r14
706	movq	-32(%rsi),%r13
707.cfi_restore	%r13
708	movq	-24(%rsi),%r12
709.cfi_restore	%r12
710	movq	-16(%rsi),%rbp
711.cfi_restore	%rbp
712	movq	-8(%rsi),%rbx
713.cfi_restore	%rbx
714	leaq	0(%rsi),%rsp
715.cfi_def_cfa_register	%rsp
716.Lghash_epilogue:
717	.byte	0xf3,0xc3	# rep ret
718.cfi_endproc
719.size	gcm_ghash_4bit,.-gcm_ghash_4bit
# gcm_init_clmul: build a 96-byte table from a 16-byte input using carry-less
# multiplies (PCLMULQDQ, emitted as raw .byte sequences).
#   In:   %rsi  16-byte input (unaligned load)
#         %rdi  output table, written with unaligned stores:
#                  0  V   = input with qwords swapped, shifted left by one bit,
#                           XORed with .L0x1c2_polynomial when the bit shifted
#                           out of the top was set
#                 16  V*V
#                 32  low qword: V.hi^V.lo, high qword: (V*V).hi^(V*V).lo
#                 48  third power of V
#                 64  fourth power of V
#                 80  same packing as offset 32, for the third/fourth powers
#   Each product is three PCLMULQDQs (Karatsuba: lo*lo, hi*hi and
#   (a.hi^a.lo)*(b.hi^b.lo)) followed by a shift-and-xor sequence that folds
#   the 256-bit result %xmm1:%xmm0 back into %xmm0.
#   Writes %xmm0-%xmm6. No stack use.
#   NOTE(review): presumably V is the GHASH key H and the table is Htable --
#   confirm against the C prototype.
720.globl	gcm_init_clmul
721.hidden gcm_init_clmul
722.type	gcm_init_clmul,@function
723.align	16
724gcm_init_clmul:
725.cfi_startproc
726.L_init_clmul:
727	movdqu	(%rsi),%xmm2
728	pshufd	$78,%xmm2,%xmm2	# swap the two qwords
729
730
# 128-bit shift left by one; %xmm5 becomes all-ones when the top bit was set.
731	pshufd	$255,%xmm2,%xmm4	# broadcast the top dword
732	movdqa	%xmm2,%xmm3
733	psllq	$1,%xmm2
734	pxor	%xmm5,%xmm5
735	psrlq	$63,%xmm3
736	pcmpgtd	%xmm4,%xmm5	# 0 > top dword, i.e. its sign bit is set
737	pslldq	$8,%xmm3	# carry from the low qword into the high qword
738	por	%xmm3,%xmm2
739
740
741	pand	.L0x1c2_polynomial(%rip),%xmm5
742	pxor	%xmm5,%xmm2
743
744
745	pshufd	$78,%xmm2,%xmm6
746	movdqa	%xmm2,%xmm0
747	pxor	%xmm2,%xmm6	# %xmm6 = V.hi^V.lo in both qwords
# %xmm0 = %xmm0 * %xmm2
748	movdqa	%xmm0,%xmm1
749	pshufd	$78,%xmm0,%xmm3
750	pxor	%xmm0,%xmm3
751.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
752.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
753.byte	102,15,58,68,222,0	# pclmulqdq $0x00,%xmm6,%xmm3
754	pxor	%xmm0,%xmm3
755	pxor	%xmm1,%xmm3
756
757	movdqa	%xmm3,%xmm4
758	psrldq	$8,%xmm3
759	pslldq	$8,%xmm4
760	pxor	%xmm3,%xmm1
761	pxor	%xmm4,%xmm0
762
# Fold %xmm1:%xmm0 into %xmm0, first phase.
763	movdqa	%xmm0,%xmm4
764	movdqa	%xmm0,%xmm3
765	psllq	$5,%xmm0
766	pxor	%xmm0,%xmm3
767	psllq	$1,%xmm0
768	pxor	%xmm3,%xmm0
769	psllq	$57,%xmm0
770	movdqa	%xmm0,%xmm3
771	pslldq	$8,%xmm0
772	psrldq	$8,%xmm3
773	pxor	%xmm4,%xmm0
774	pxor	%xmm3,%xmm1
775
776
# Second phase.
777	movdqa	%xmm0,%xmm4
778	psrlq	$1,%xmm0
779	pxor	%xmm4,%xmm1
780	pxor	%xmm0,%xmm4
781	psrlq	$5,%xmm0
782	pxor	%xmm4,%xmm0
783	psrlq	$1,%xmm0
784	pxor	%xmm1,%xmm0
785	pshufd	$78,%xmm2,%xmm3
786	pshufd	$78,%xmm0,%xmm4
787	pxor	%xmm2,%xmm3
788	movdqu	%xmm2,0(%rdi)
789	pxor	%xmm0,%xmm4
790	movdqu	%xmm0,16(%rdi)
791.byte	102,15,58,15,227,8	# palignr $8,%xmm3,%xmm4
792	movdqu	%xmm4,32(%rdi)
# %xmm0 = %xmm0 * %xmm2 (third power)
793	movdqa	%xmm0,%xmm1
794	pshufd	$78,%xmm0,%xmm3
795	pxor	%xmm0,%xmm3
796.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
797.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
798.byte	102,15,58,68,222,0	# pclmulqdq $0x00,%xmm6,%xmm3
799	pxor	%xmm0,%xmm3
800	pxor	%xmm1,%xmm3
801
802	movdqa	%xmm3,%xmm4
803	psrldq	$8,%xmm3
804	pslldq	$8,%xmm4
805	pxor	%xmm3,%xmm1
806	pxor	%xmm4,%xmm0
807
808	movdqa	%xmm0,%xmm4
809	movdqa	%xmm0,%xmm3
810	psllq	$5,%xmm0
811	pxor	%xmm0,%xmm3
812	psllq	$1,%xmm0
813	pxor	%xmm3,%xmm0
814	psllq	$57,%xmm0
815	movdqa	%xmm0,%xmm3
816	pslldq	$8,%xmm0
817	psrldq	$8,%xmm3
818	pxor	%xmm4,%xmm0
819	pxor	%xmm3,%xmm1
820
821
822	movdqa	%xmm0,%xmm4
823	psrlq	$1,%xmm0
824	pxor	%xmm4,%xmm1
825	pxor	%xmm0,%xmm4
826	psrlq	$5,%xmm0
827	pxor	%xmm4,%xmm0
828	psrlq	$1,%xmm0
829	pxor	%xmm1,%xmm0
830	movdqa	%xmm0,%xmm5	# keep the third power for the stores below
# %xmm0 = %xmm0 * %xmm2 (fourth power)
831	movdqa	%xmm0,%xmm1
832	pshufd	$78,%xmm0,%xmm3
833	pxor	%xmm0,%xmm3
834.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
835.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
836.byte	102,15,58,68,222,0	# pclmulqdq $0x00,%xmm6,%xmm3
837	pxor	%xmm0,%xmm3
838	pxor	%xmm1,%xmm3
839
840	movdqa	%xmm3,%xmm4
841	psrldq	$8,%xmm3
842	pslldq	$8,%xmm4
843	pxor	%xmm3,%xmm1
844	pxor	%xmm4,%xmm0
845
846	movdqa	%xmm0,%xmm4
847	movdqa	%xmm0,%xmm3
848	psllq	$5,%xmm0
849	pxor	%xmm0,%xmm3
850	psllq	$1,%xmm0
851	pxor	%xmm3,%xmm0
852	psllq	$57,%xmm0
853	movdqa	%xmm0,%xmm3
854	pslldq	$8,%xmm0
855	psrldq	$8,%xmm3
856	pxor	%xmm4,%xmm0
857	pxor	%xmm3,%xmm1
858
859
860	movdqa	%xmm0,%xmm4
861	psrlq	$1,%xmm0
862	pxor	%xmm4,%xmm1
863	pxor	%xmm0,%xmm4
864	psrlq	$5,%xmm0
865	pxor	%xmm4,%xmm0
866	psrlq	$1,%xmm0
867	pxor	%xmm1,%xmm0
868	pshufd	$78,%xmm5,%xmm3
869	pshufd	$78,%xmm0,%xmm4
870	pxor	%xmm5,%xmm3
871	movdqu	%xmm5,48(%rdi)
872	pxor	%xmm0,%xmm4
873	movdqu	%xmm0,64(%rdi)
874.byte	102,15,58,15,227,8	# palignr $8,%xmm3,%xmm4
875	movdqu	%xmm4,80(%rdi)
876	.byte	0xf3,0xc3	# rep ret
877.cfi_endproc
878.size	gcm_init_clmul,.-gcm_init_clmul
# gcm_gmult_clmul: multiply one 16-byte block by the first table entry using
# PCLMULQDQ (emitted as raw .byte sequences).
#   In:   %rdi  16-byte block, read and overwritten (unaligned access)
#         %rsi  table; the entries at offsets 0 and 32 are read
#   The block is shuffled with the mask at .Lbswap_mask (defined outside this
#   chunk) on the way in and on the way out. The mask is loaded with movdqa,
#   so it has to be 16-byte aligned.
#   Writes %xmm0-%xmm5. No stack use.
#   The label .L_gmult_clmul is also the target of the jump in gcm_gmult_avx.
879.globl	gcm_gmult_clmul
880.hidden gcm_gmult_clmul
881.type	gcm_gmult_clmul,@function
882.align	16
883gcm_gmult_clmul:
884.cfi_startproc
885.L_gmult_clmul:
886	movdqu	(%rdi),%xmm0
887	movdqa	.Lbswap_mask(%rip),%xmm5
888	movdqu	(%rsi),%xmm2
889	movdqu	32(%rsi),%xmm4
890.byte	102,15,56,0,197	# pshufb %xmm5,%xmm0
# Three-multiply (Karatsuba) product %xmm1:%xmm0 = %xmm0 * %xmm2. The low
# qword of %xmm4 supplies the hi^lo term of the table entry.
891	movdqa	%xmm0,%xmm1
892	pshufd	$78,%xmm0,%xmm3
893	pxor	%xmm0,%xmm3
894.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
895.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
896.byte	102,15,58,68,220,0	# pclmulqdq $0x00,%xmm4,%xmm3
897	pxor	%xmm0,%xmm3
898	pxor	%xmm1,%xmm3
899
900	movdqa	%xmm3,%xmm4
901	psrldq	$8,%xmm3
902	pslldq	$8,%xmm4
903	pxor	%xmm3,%xmm1
904	pxor	%xmm4,%xmm0
905
# Fold %xmm1:%xmm0 into %xmm0, first phase.
906	movdqa	%xmm0,%xmm4
907	movdqa	%xmm0,%xmm3
908	psllq	$5,%xmm0
909	pxor	%xmm0,%xmm3
910	psllq	$1,%xmm0
911	pxor	%xmm3,%xmm0
912	psllq	$57,%xmm0
913	movdqa	%xmm0,%xmm3
914	pslldq	$8,%xmm0
915	psrldq	$8,%xmm3
916	pxor	%xmm4,%xmm0
917	pxor	%xmm3,%xmm1
918
919
# Second phase.
920	movdqa	%xmm0,%xmm4
921	psrlq	$1,%xmm0
922	pxor	%xmm4,%xmm1
923	pxor	%xmm0,%xmm4
924	psrlq	$5,%xmm0
925	pxor	%xmm4,%xmm0
926	psrlq	$1,%xmm0
927	pxor	%xmm1,%xmm0
928.byte	102,15,56,0,197	# pshufb %xmm5,%xmm0
929	movdqu	%xmm0,(%rdi)
930	.byte	0xf3,0xc3	# rep ret
931.cfi_endproc
932.size	gcm_gmult_clmul,.-gcm_gmult_clmul
# gcm_ghash_clmul: fold a run of 16-byte input blocks into a 16-byte state
# using PCLMULQDQ and PSHUFB (both emitted as raw .byte sequences).
#   In:   %rdi  16-byte state, read on entry and written on exit
#         %rsi  table; entries at offsets 0, 16, 32, 48, 64 and 80 are read
#         %rdx  input pointer
#         %rcx  input length in bytes
#   NOTE(review): presumably the length must be a nonzero multiple of 16 --
#   confirm against the callers.
#   Structure:
#     length == 16            -> .Lodd_tail (single block)
#     4-blocks-at-a-time path -> .Lmod4_loop / .Ltail4x, taken when at least
#                                64 bytes are available and the capability
#                                test below does not veto it
#     2-blocks-at-a-time path -> .Lskip4x / .Lmod_loop / .Leven_tail
#     a remaining single block is handled by .Lodd_tail
#   %xmm10 holds the shuffle mask loaded from .Lbswap_mask (defined outside
#   this chunk); every input block and the state go through pshufb with it.
#   Writes %rax, %rcx, %rdx, flags and %xmm0-%xmm15. No stack use.
933.globl	gcm_ghash_clmul
934.hidden gcm_ghash_clmul
935.type	gcm_ghash_clmul,@function
936.align	32
937gcm_ghash_clmul:
938.cfi_startproc
939.L_ghash_clmul:
940	movdqa	.Lbswap_mask(%rip),%xmm10
941
942	movdqu	(%rdi),%xmm0
943	movdqu	(%rsi),%xmm2
944	movdqu	32(%rsi),%xmm7
945.byte	102,65,15,56,0,194	# pshufb %xmm10,%xmm0
946
947	subq	$0x10,%rcx
948	jz	.Lodd_tail	# exactly one block
949
950	movdqu	16(%rsi),%xmm6
951	leaq	OPENSSL_ia32cap_P(%rip),%rax
952	movl	4(%rax),%eax
953	cmpq	$0x30,%rcx
954	jb	.Lskip4x	# fewer than 64 bytes in total
955
# Skip the 4x path when, of bits 22 and 26 of the second capability word,
# only bit 22 is set.
# NOTE(review): presumably CPUID.1:ECX MOVBE (bit 22) and XSAVE (bit 26);
# confirm the layout of OPENSSL_ia32cap_P.
956	andl	$71303168,%eax
957	cmpl	$4194304,%eax
958	je	.Lskip4x
959
960	subq	$0x30,%rcx
961	movq	$0xA040608020C0E000,%rax	# moved into %xmm9 inside .Lmod4_loop
962	movdqu	48(%rsi),%xmm14
963	movdqu	64(%rsi),%xmm15
964
965
966
967
# Start the partial products for the first group of four blocks.
968	movdqu	48(%rdx),%xmm3
969	movdqu	32(%rdx),%xmm11
970.byte	102,65,15,56,0,218	# pshufb %xmm10,%xmm3
971.byte	102,69,15,56,0,218	# pshufb %xmm10,%xmm11
972	movdqa	%xmm3,%xmm5
973	pshufd	$78,%xmm3,%xmm4
974	pxor	%xmm3,%xmm4
975.byte	102,15,58,68,218,0	# pclmulqdq $0x00,%xmm2,%xmm3
976.byte	102,15,58,68,234,17	# pclmulqdq $0x11,%xmm2,%xmm5
977.byte	102,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm4
978
979	movdqa	%xmm11,%xmm13
980	pshufd	$78,%xmm11,%xmm12
981	pxor	%xmm11,%xmm12
982.byte	102,68,15,58,68,222,0	# pclmulqdq $0x00,%xmm6,%xmm11
983.byte	102,68,15,58,68,238,17	# pclmulqdq $0x11,%xmm6,%xmm13
984.byte	102,68,15,58,68,231,16	# pclmulqdq $0x10,%xmm7,%xmm12
985	xorps	%xmm11,%xmm3
986	xorps	%xmm13,%xmm5
987	movups	80(%rsi),%xmm7
988	xorps	%xmm12,%xmm4
989
990	movdqu	16(%rdx),%xmm11
991	movdqu	0(%rdx),%xmm8
992.byte	102,69,15,56,0,218	# pshufb %xmm10,%xmm11
993.byte	102,69,15,56,0,194	# pshufb %xmm10,%xmm8
994	movdqa	%xmm11,%xmm13
995	pshufd	$78,%xmm11,%xmm12
996	pxor	%xmm8,%xmm0	# state ^= first block of the group
997	pxor	%xmm11,%xmm12
998.byte	102,69,15,58,68,222,0	# pclmulqdq $0x00,%xmm14,%xmm11
999	movdqa	%xmm0,%xmm1
1000	pshufd	$78,%xmm0,%xmm8
1001	pxor	%xmm0,%xmm8
1002.byte	102,69,15,58,68,238,17	# pclmulqdq $0x11,%xmm14,%xmm13
1003.byte	102,68,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm12
1004	xorps	%xmm11,%xmm3
1005	xorps	%xmm13,%xmm5
1006
1007	leaq	64(%rdx),%rdx
1008	subq	$0x40,%rcx
1009	jc	.Ltail4x	# fewer than 64 further bytes: finish this group
1010
1011	jmp	.Lmod4_loop
# Main 4x loop: finishes the pending group while starting the products for
# the next four input blocks.
1012.align	32
1013.Lmod4_loop:
1014.byte	102,65,15,58,68,199,0	# pclmulqdq $0x00,%xmm15,%xmm0
1015	xorps	%xmm12,%xmm4
1016	movdqu	48(%rdx),%xmm11
1017.byte	102,69,15,56,0,218	# pshufb %xmm10,%xmm11
1018.byte	102,65,15,58,68,207,17	# pclmulqdq $0x11,%xmm15,%xmm1
1019	xorps	%xmm3,%xmm0
1020	movdqu	32(%rdx),%xmm3
1021	movdqa	%xmm11,%xmm13
1022.byte	102,68,15,58,68,199,16	# pclmulqdq $0x10,%xmm7,%xmm8
1023	pshufd	$78,%xmm11,%xmm12
1024	xorps	%xmm5,%xmm1
1025	pxor	%xmm11,%xmm12
1026.byte	102,65,15,56,0,218	# pshufb %xmm10,%xmm3
1027	movups	32(%rsi),%xmm7
1028	xorps	%xmm4,%xmm8
1029.byte	102,68,15,58,68,218,0	# pclmulqdq $0x00,%xmm2,%xmm11
1030	pshufd	$78,%xmm3,%xmm4
1031
1032	pxor	%xmm0,%xmm8
1033	movdqa	%xmm3,%xmm5
1034	pxor	%xmm1,%xmm8
1035	pxor	%xmm3,%xmm4
1036	movdqa	%xmm8,%xmm9
1037.byte	102,68,15,58,68,234,17	# pclmulqdq $0x11,%xmm2,%xmm13
1038	pslldq	$8,%xmm8
1039	psrldq	$8,%xmm9
1040	pxor	%xmm8,%xmm0
1041	movdqa	.L7_mask(%rip),%xmm8
1042	pxor	%xmm9,%xmm1
1043.byte	102,76,15,110,200	# movq %rax,%xmm9
1044
1045	pand	%xmm0,%xmm8
1046.byte	102,69,15,56,0,200	# pshufb %xmm8,%xmm9
1047	pxor	%xmm0,%xmm9
1048.byte	102,68,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm12
1049	psllq	$57,%xmm9
1050	movdqa	%xmm9,%xmm8
1051	pslldq	$8,%xmm9
1052.byte	102,15,58,68,222,0	# pclmulqdq $0x00,%xmm6,%xmm3
1053	psrldq	$8,%xmm8
1054	pxor	%xmm9,%xmm0
1055	pxor	%xmm8,%xmm1
1056	movdqu	0(%rdx),%xmm8
1057
1058	movdqa	%xmm0,%xmm9
1059	psrlq	$1,%xmm0
1060.byte	102,15,58,68,238,17	# pclmulqdq $0x11,%xmm6,%xmm5
1061	xorps	%xmm11,%xmm3
1062	movdqu	16(%rdx),%xmm11
1063.byte	102,69,15,56,0,218	# pshufb %xmm10,%xmm11
1064.byte	102,15,58,68,231,16	# pclmulqdq $0x10,%xmm7,%xmm4
1065	xorps	%xmm13,%xmm5
1066	movups	80(%rsi),%xmm7
1067.byte	102,69,15,56,0,194	# pshufb %xmm10,%xmm8
1068	pxor	%xmm9,%xmm1
1069	pxor	%xmm0,%xmm9
1070	psrlq	$5,%xmm0
1071
1072	movdqa	%xmm11,%xmm13
1073	pxor	%xmm12,%xmm4
1074	pshufd	$78,%xmm11,%xmm12
1075	pxor	%xmm9,%xmm0
1076	pxor	%xmm8,%xmm1
1077	pxor	%xmm11,%xmm12
1078.byte	102,69,15,58,68,222,0	# pclmulqdq $0x00,%xmm14,%xmm11
1079	psrlq	$1,%xmm0
1080	pxor	%xmm1,%xmm0
1081	movdqa	%xmm0,%xmm1
1082.byte	102,69,15,58,68,238,17	# pclmulqdq $0x11,%xmm14,%xmm13
1083	xorps	%xmm11,%xmm3
1084	pshufd	$78,%xmm0,%xmm8
1085	pxor	%xmm0,%xmm8
1086
1087.byte	102,68,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm12
1088	xorps	%xmm13,%xmm5
1089
1090	leaq	64(%rdx),%rdx
1091	subq	$0x40,%rcx
1092	jnc	.Lmod4_loop
1093
# Finish the last group of four, then fold %xmm1:%xmm0 into %xmm0.
1094.Ltail4x:
1095.byte	102,65,15,58,68,199,0	# pclmulqdq $0x00,%xmm15,%xmm0
1096.byte	102,65,15,58,68,207,17	# pclmulqdq $0x11,%xmm15,%xmm1
1097.byte	102,68,15,58,68,199,16	# pclmulqdq $0x10,%xmm7,%xmm8
1098	xorps	%xmm12,%xmm4
1099	xorps	%xmm3,%xmm0
1100	xorps	%xmm5,%xmm1
1101	pxor	%xmm0,%xmm1
1102	pxor	%xmm4,%xmm8
1103
1104	pxor	%xmm1,%xmm8
1105	pxor	%xmm0,%xmm1
1106
1107	movdqa	%xmm8,%xmm9
1108	psrldq	$8,%xmm8
1109	pslldq	$8,%xmm9
1110	pxor	%xmm8,%xmm1
1111	pxor	%xmm9,%xmm0
1112
1113	movdqa	%xmm0,%xmm4
1114	movdqa	%xmm0,%xmm3
1115	psllq	$5,%xmm0
1116	pxor	%xmm0,%xmm3
1117	psllq	$1,%xmm0
1118	pxor	%xmm3,%xmm0
1119	psllq	$57,%xmm0
1120	movdqa	%xmm0,%xmm3
1121	pslldq	$8,%xmm0
1122	psrldq	$8,%xmm3
1123	pxor	%xmm4,%xmm0
1124	pxor	%xmm3,%xmm1
1125
1126
1127	movdqa	%xmm0,%xmm4
1128	psrlq	$1,%xmm0
1129	pxor	%xmm4,%xmm1
1130	pxor	%xmm0,%xmm4
1131	psrlq	$5,%xmm0
1132	pxor	%xmm4,%xmm0
1133	psrlq	$1,%xmm0
1134	pxor	%xmm1,%xmm0
1135	addq	$0x40,%rcx	# undo the last subtraction: bytes still to process
1136	jz	.Ldone
1137	movdqu	32(%rsi),%xmm7	# %xmm7 was replaced by the entry at offset 80
1138	subq	$0x10,%rcx
1139	jz	.Lodd_tail
1140.Lskip4x:
1141
1142
1143
1144
1145
# 2x path: load two blocks, XOR the first into the state and start the
# products for the second one.
1146	movdqu	(%rdx),%xmm8
1147	movdqu	16(%rdx),%xmm3
1148.byte	102,69,15,56,0,194	# pshufb %xmm10,%xmm8
1149.byte	102,65,15,56,0,218	# pshufb %xmm10,%xmm3
1150	pxor	%xmm8,%xmm0
1151
1152	movdqa	%xmm3,%xmm5
1153	pshufd	$78,%xmm3,%xmm4
1154	pxor	%xmm3,%xmm4
1155.byte	102,15,58,68,218,0	# pclmulqdq $0x00,%xmm2,%xmm3
1156.byte	102,15,58,68,234,17	# pclmulqdq $0x11,%xmm2,%xmm5
1157.byte	102,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm4
1158
1159	leaq	32(%rdx),%rdx
1160	nop
1161	subq	$0x20,%rcx
1162	jbe	.Leven_tail	# no further full pair after this one
1163	nop
1164	jmp	.Lmod_loop
1165
1166.align	32
1167.Lmod_loop:
1168	movdqa	%xmm0,%xmm1
1169	movdqa	%xmm4,%xmm8
1170	pshufd	$78,%xmm0,%xmm4
1171	pxor	%xmm0,%xmm4
1172
1173.byte	102,15,58,68,198,0	# pclmulqdq $0x00,%xmm6,%xmm0
1174.byte	102,15,58,68,206,17	# pclmulqdq $0x11,%xmm6,%xmm1
1175.byte	102,15,58,68,231,16	# pclmulqdq $0x10,%xmm7,%xmm4
1176
1177	pxor	%xmm3,%xmm0
1178	pxor	%xmm5,%xmm1
1179	movdqu	(%rdx),%xmm9
1180	pxor	%xmm0,%xmm8
1181.byte	102,69,15,56,0,202	# pshufb %xmm10,%xmm9
1182	movdqu	16(%rdx),%xmm3
1183
1184	pxor	%xmm1,%xmm8
1185	pxor	%xmm9,%xmm1
1186	pxor	%xmm8,%xmm4
1187.byte	102,65,15,56,0,218	# pshufb %xmm10,%xmm3
1188	movdqa	%xmm4,%xmm8
1189	psrldq	$8,%xmm8
1190	pslldq	$8,%xmm4
1191	pxor	%xmm8,%xmm1
1192	pxor	%xmm4,%xmm0
1193
1194	movdqa	%xmm3,%xmm5
1195
1196	movdqa	%xmm0,%xmm9
1197	movdqa	%xmm0,%xmm8
1198	psllq	$5,%xmm0
1199	pxor	%xmm0,%xmm8
1200.byte	102,15,58,68,218,0	# pclmulqdq $0x00,%xmm2,%xmm3
1201	psllq	$1,%xmm0
1202	pxor	%xmm8,%xmm0
1203	psllq	$57,%xmm0
1204	movdqa	%xmm0,%xmm8
1205	pslldq	$8,%xmm0
1206	psrldq	$8,%xmm8
1207	pxor	%xmm9,%xmm0
1208	pshufd	$78,%xmm5,%xmm4
1209	pxor	%xmm8,%xmm1
1210	pxor	%xmm5,%xmm4
1211
1212	movdqa	%xmm0,%xmm9
1213	psrlq	$1,%xmm0
1214.byte	102,15,58,68,234,17	# pclmulqdq $0x11,%xmm2,%xmm5
1215	pxor	%xmm9,%xmm1
1216	pxor	%xmm0,%xmm9
1217	psrlq	$5,%xmm0
1218	pxor	%xmm9,%xmm0
1219	leaq	32(%rdx),%rdx
1220	psrlq	$1,%xmm0
1221.byte	102,15,58,68,231,0	# pclmulqdq $0x00,%xmm7,%xmm4
1222	pxor	%xmm1,%xmm0
1223
1224	subq	$0x20,%rcx
1225	ja	.Lmod_loop
1226
# Finish the pending pair and fold %xmm1:%xmm0 into %xmm0.
1227.Leven_tail:
1228	movdqa	%xmm0,%xmm1
1229	movdqa	%xmm4,%xmm8
1230	pshufd	$78,%xmm0,%xmm4
1231	pxor	%xmm0,%xmm4
1232
1233.byte	102,15,58,68,198,0	# pclmulqdq $0x00,%xmm6,%xmm0
1234.byte	102,15,58,68,206,17	# pclmulqdq $0x11,%xmm6,%xmm1
1235.byte	102,15,58,68,231,16	# pclmulqdq $0x10,%xmm7,%xmm4
1236
1237	pxor	%xmm3,%xmm0
1238	pxor	%xmm5,%xmm1
1239	pxor	%xmm0,%xmm8
1240	pxor	%xmm1,%xmm8
1241	pxor	%xmm8,%xmm4
1242	movdqa	%xmm4,%xmm8
1243	psrldq	$8,%xmm8
1244	pslldq	$8,%xmm4
1245	pxor	%xmm8,%xmm1
1246	pxor	%xmm4,%xmm0
1247
1248	movdqa	%xmm0,%xmm4
1249	movdqa	%xmm0,%xmm3
1250	psllq	$5,%xmm0
1251	pxor	%xmm0,%xmm3
1252	psllq	$1,%xmm0
1253	pxor	%xmm3,%xmm0
1254	psllq	$57,%xmm0
1255	movdqa	%xmm0,%xmm3
1256	pslldq	$8,%xmm0
1257	psrldq	$8,%xmm3
1258	pxor	%xmm4,%xmm0
1259	pxor	%xmm3,%xmm1
1260
1261
1262	movdqa	%xmm0,%xmm4
1263	psrlq	$1,%xmm0
1264	pxor	%xmm4,%xmm1
1265	pxor	%xmm0,%xmm4
1266	psrlq	$5,%xmm0
1267	pxor	%xmm4,%xmm0
1268	psrlq	$1,%xmm0
1269	pxor	%xmm1,%xmm0
# %rcx is zero here when exactly one more block is left (the entry code
# subtracted 16 up front); any other value means the input is exhausted.
1270	testq	%rcx,%rcx
1271	jnz	.Ldone
1272
# Single trailing block: state = (state ^ block) * first table entry.
1273.Lodd_tail:
1274	movdqu	(%rdx),%xmm8
1275.byte	102,69,15,56,0,194	# pshufb %xmm10,%xmm8
1276	pxor	%xmm8,%xmm0
1277	movdqa	%xmm0,%xmm1
1278	pshufd	$78,%xmm0,%xmm3
1279	pxor	%xmm0,%xmm3
1280.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
1281.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
1282.byte	102,15,58,68,223,0	# pclmulqdq $0x00,%xmm7,%xmm3
1283	pxor	%xmm0,%xmm3
1284	pxor	%xmm1,%xmm3
1285
1286	movdqa	%xmm3,%xmm4
1287	psrldq	$8,%xmm3
1288	pslldq	$8,%xmm4
1289	pxor	%xmm3,%xmm1
1290	pxor	%xmm4,%xmm0
1291
1292	movdqa	%xmm0,%xmm4
1293	movdqa	%xmm0,%xmm3
1294	psllq	$5,%xmm0
1295	pxor	%xmm0,%xmm3
1296	psllq	$1,%xmm0
1297	pxor	%xmm3,%xmm0
1298	psllq	$57,%xmm0
1299	movdqa	%xmm0,%xmm3
1300	pslldq	$8,%xmm0
1301	psrldq	$8,%xmm3
1302	pxor	%xmm4,%xmm0
1303	pxor	%xmm3,%xmm1
1304
1305
1306	movdqa	%xmm0,%xmm4
1307	psrlq	$1,%xmm0
1308	pxor	%xmm4,%xmm1
1309	pxor	%xmm0,%xmm4
1310	psrlq	$5,%xmm0
1311	pxor	%xmm4,%xmm0
1312	psrlq	$1,%xmm0
1313	pxor	%xmm1,%xmm0
1314.Ldone:
1315.byte	102,65,15,56,0,194	# pshufb %xmm10,%xmm0
1316	movdqu	%xmm0,(%rdi)
1317	.byte	0xf3,0xc3	# rep ret
1318.cfi_endproc
1319.size	gcm_ghash_clmul,.-gcm_ghash_clmul
# gcm_init_avx: AVX build of a table of successive powers of a 16-byte value.
#   In:   %rsi  16-byte input (unaligned load)
#         %rdi  output table; %rdi is advanced by 48 per round
#   V is derived from the input as in gcm_init_clmul: qwords swapped, shifted
#   left by one bit, XORed with .L0x1c2_polynomial when the bit shifted out of
#   the top was set.
#   Four rounds (%r10 counts down from 4) each write a 48-byte group:
#       +0   odd power of V   (first, third, fifth, seventh)
#       +16  the next power   (second, fourth, sixth, eighth)
#       +32  vpalignr packing of the hi^lo terms of those two powers
#   for 192 bytes in total. The +32 slot of a group is stored at the top of
#   the following round, through -16(%rdi) after %rdi was advanced; the last
#   one is stored after the loop.
#   Writes %r10, %rdi, flags and %xmm0-%xmm6. vzeroupper is executed on entry
#   and before returning.
1320.globl	gcm_init_avx
1321.hidden gcm_init_avx
1322.type	gcm_init_avx,@function
1323.align	32
1324gcm_init_avx:
1325.cfi_startproc
1326	vzeroupper
1327
1328	vmovdqu	(%rsi),%xmm2
1329	vpshufd	$78,%xmm2,%xmm2	# swap the two qwords
1330
1331
# 128-bit shift left by one; %xmm5 becomes all-ones when the top bit was set.
1332	vpshufd	$255,%xmm2,%xmm4
1333	vpsrlq	$63,%xmm2,%xmm3
1334	vpsllq	$1,%xmm2,%xmm2
1335	vpxor	%xmm5,%xmm5,%xmm5
1336	vpcmpgtd	%xmm4,%xmm5,%xmm5
1337	vpslldq	$8,%xmm3,%xmm3
1338	vpor	%xmm3,%xmm2,%xmm2
1339
1340
1341	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1342	vpxor	%xmm5,%xmm2,%xmm2
1343
1344	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1345	vmovdqa	%xmm2,%xmm0
1346	vpxor	%xmm2,%xmm6,%xmm6	# low qword of %xmm6 = V.hi^V.lo
1347	movq	$4,%r10	# number of 48-byte groups to write
1348	jmp	.Linit_start_avx
1349.align	32
1350.Linit_loop_avx:
# Store the +32 slot of the previous group, then advance to the next odd
# power: %xmm0 = %xmm0 * %xmm2.
1351	vpalignr	$8,%xmm3,%xmm4,%xmm5
1352	vmovdqu	%xmm5,-16(%rdi)
1353	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1354	vpxor	%xmm0,%xmm3,%xmm3
1355	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1356	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1357	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1358	vpxor	%xmm0,%xmm1,%xmm4
1359	vpxor	%xmm4,%xmm3,%xmm3
1360
1361	vpslldq	$8,%xmm3,%xmm4
1362	vpsrldq	$8,%xmm3,%xmm3
1363	vpxor	%xmm4,%xmm0,%xmm0
1364	vpxor	%xmm3,%xmm1,%xmm1
1365	vpsllq	$57,%xmm0,%xmm3
1366	vpsllq	$62,%xmm0,%xmm4
1367	vpxor	%xmm3,%xmm4,%xmm4
1368	vpsllq	$63,%xmm0,%xmm3
1369	vpxor	%xmm3,%xmm4,%xmm4
1370	vpslldq	$8,%xmm4,%xmm3
1371	vpsrldq	$8,%xmm4,%xmm4
1372	vpxor	%xmm3,%xmm0,%xmm0
1373	vpxor	%xmm4,%xmm1,%xmm1
1374
1375	vpsrlq	$1,%xmm0,%xmm4
1376	vpxor	%xmm0,%xmm1,%xmm1
1377	vpxor	%xmm4,%xmm0,%xmm0
1378	vpsrlq	$5,%xmm4,%xmm4
1379	vpxor	%xmm4,%xmm0,%xmm0
1380	vpsrlq	$1,%xmm0,%xmm0
1381	vpxor	%xmm1,%xmm0,%xmm0
1382.Linit_start_avx:
# Keep the current power in %xmm5 and compute the next one in %xmm0.
1383	vmovdqa	%xmm0,%xmm5
1384	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1385	vpxor	%xmm0,%xmm3,%xmm3
1386	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1387	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1388	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1389	vpxor	%xmm0,%xmm1,%xmm4
1390	vpxor	%xmm4,%xmm3,%xmm3
1391
1392	vpslldq	$8,%xmm3,%xmm4
1393	vpsrldq	$8,%xmm3,%xmm3
1394	vpxor	%xmm4,%xmm0,%xmm0
1395	vpxor	%xmm3,%xmm1,%xmm1
1396	vpsllq	$57,%xmm0,%xmm3
1397	vpsllq	$62,%xmm0,%xmm4
1398	vpxor	%xmm3,%xmm4,%xmm4
1399	vpsllq	$63,%xmm0,%xmm3
1400	vpxor	%xmm3,%xmm4,%xmm4
1401	vpslldq	$8,%xmm4,%xmm3
1402	vpsrldq	$8,%xmm4,%xmm4
1403	vpxor	%xmm3,%xmm0,%xmm0
1404	vpxor	%xmm4,%xmm1,%xmm1
1405
1406	vpsrlq	$1,%xmm0,%xmm4
1407	vpxor	%xmm0,%xmm1,%xmm1
1408	vpxor	%xmm4,%xmm0,%xmm0
1409	vpsrlq	$5,%xmm4,%xmm4
1410	vpxor	%xmm4,%xmm0,%xmm0
1411	vpsrlq	$1,%xmm0,%xmm0
1412	vpxor	%xmm1,%xmm0,%xmm0
1413	vpshufd	$78,%xmm5,%xmm3
1414	vpshufd	$78,%xmm0,%xmm4
1415	vpxor	%xmm5,%xmm3,%xmm3	# hi^lo of the odd power, in both qwords
1416	vmovdqu	%xmm5,0(%rdi)
1417	vpxor	%xmm0,%xmm4,%xmm4	# hi^lo of the even power, in both qwords
1418	vmovdqu	%xmm0,16(%rdi)
1419	leaq	48(%rdi),%rdi
1420	subq	$1,%r10
1421	jnz	.Linit_loop_avx
1422
# The source operands are swapped relative to the vpalignr inside the loop, so
# the last +32 slot holds its two qwords in the opposite order.
1423	vpalignr	$8,%xmm4,%xmm3,%xmm5
1424	vmovdqu	%xmm5,-16(%rdi)
1425
1426	vzeroupper
1427	.byte	0xf3,0xc3	# rep ret
1428.cfi_endproc
1429.size	gcm_init_avx,.-gcm_init_avx
# gcm_gmult_avx: no AVX-specific code; tail-jumps to .L_gmult_clmul, the body
# of gcm_gmult_clmul, with the incoming argument registers untouched. The
# return from that body goes straight back to the caller of gcm_gmult_avx.
1430.globl	gcm_gmult_avx
1431.hidden gcm_gmult_avx
1432.type	gcm_gmult_avx,@function
1433.align	32
1434gcm_gmult_avx:
1435.cfi_startproc
1436	jmp	.L_gmult_clmul
1437.cfi_endproc
1438.size	gcm_gmult_avx,.-gcm_gmult_avx
# Symbol gcm_ghash_avx: AVX / carry-less-multiply GHASH update over a buffer.
#
# Registers read on entry (System V AMD64 argument order):
#   Arg 1 (%rdi)  pointer to the hash state, a 16-byte value that is loaded
#                 at entry and stored back at exit
#   Arg 2 (%rsi)  pointer to a precomputed table; it is biased by +64 right
#                 away, so every table access below is spelled N-64(%rsi)
#                 where N is the offset from the pointer the caller passed
#   Arg 3 (%rdx)  pointer to the input bytes
#   Arg 4 (%rcx)  input length in bytes
# NOTE(review): presumably the C prototype is (Xi, Htable, inp, len) and len
# is a non-zero multiple of 16 -- confirm against the callers. A length of
# zero would make the short path read 16 bytes in front of the buffer.
#
# Working registers:
#   r10           address of .L0x1c2_polynomial (reduction constant)
#   xmm13         byte-reversal shuffle mask (.Lbswap_mask)
#   xmm10         running hash value, byte-reversed relative to memory
#   xmm14, xmm15  alternating byte-reversed input blocks
#   xmm8, xmm9    (high qword XOR low qword) of the matching input block,
#                 held in the low qword with the high qword zero
#   xmm6          table entry used for the low*low and high*high products
#   xmm7          table entry holding two packed qwords for the middle
#                 products; selector 0x00 picks its low qword, 0x10 its high
#   xmm0 / xmm3   accumulated low*low products    (selector 0x00)
#   xmm1 / xmm4   accumulated high*high products  (selector 0x11)
#   xmm2 / xmm5   accumulated middle products
#   xmm11, xmm12  high half and scratch during reduction
#
# Table offsets 0, 16, 48, 64, 96, 112, 144, 160 are loaded into xmm6 and
# offsets 32, 80, 128, 176 into xmm7. Within a group, the block at the
# highest address is paired with offset 0 and each earlier block with the
# next xmm6 offset.
# NOTE(review): presumably these are successive powers of the hash key laid
# out by gcm_init_avx -- confirm there.
#
# Only rdi, rsi, rdx, rcx, r10 and xmm0-xmm15 are used; there is no stack use
# and no call.
1439.globl	gcm_ghash_avx
1440.hidden gcm_ghash_avx
1441.type	gcm_ghash_avx,@function
1442.align	32
1443gcm_ghash_avx:
1444.cfi_startproc
1445	vzeroupper
1446
# Load the state, set up constants, and byte-reverse the state into xmm10.
# Inputs shorter than 0x80 (128) bytes go straight to the short path.
1447	vmovdqu	(%rdi),%xmm10
1448	leaq	.L0x1c2_polynomial(%rip),%r10
1449	leaq	64(%rsi),%rsi
1450	vmovdqu	.Lbswap_mask(%rip),%xmm13
1451	vpshufb	%xmm13,%xmm10,%xmm10
1452	cmpq	$0x80,%rcx
1453	jb	.Lshort_avx
1454	subq	$0x80,%rcx
1455
# First group of eight 16-byte blocks. Blocks are consumed from offset 112
# down to offset 0; each is byte-reversed and multiplied by its table entry.
# The products of the first seven blocks are summed into the three
# accumulator pairs. The block at offset 0 is only loaded into xmm15 here;
# its multiplication is done either by the main loop or by the tail.
1456	vmovdqu	112(%rdx),%xmm14
1457	vmovdqu	0-64(%rsi),%xmm6
1458	vpshufb	%xmm13,%xmm14,%xmm14
1459	vmovdqu	32-64(%rsi),%xmm7
1460
1461	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1462	vmovdqu	96(%rdx),%xmm15
1463	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1464	vpxor	%xmm14,%xmm9,%xmm9
1465	vpshufb	%xmm13,%xmm15,%xmm15
1466	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1467	vmovdqu	16-64(%rsi),%xmm6
1468	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1469	vmovdqu	80(%rdx),%xmm14
1470	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1471	vpxor	%xmm15,%xmm8,%xmm8
1472
1473	vpshufb	%xmm13,%xmm14,%xmm14
1474	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1475	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1476	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1477	vmovdqu	48-64(%rsi),%xmm6
1478	vpxor	%xmm14,%xmm9,%xmm9
1479	vmovdqu	64(%rdx),%xmm15
1480	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1481	vmovdqu	80-64(%rsi),%xmm7
1482
1483	vpshufb	%xmm13,%xmm15,%xmm15
1484	vpxor	%xmm0,%xmm3,%xmm3
1485	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1486	vpxor	%xmm1,%xmm4,%xmm4
1487	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1488	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1489	vmovdqu	64-64(%rsi),%xmm6
1490	vpxor	%xmm2,%xmm5,%xmm5
1491	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1492	vpxor	%xmm15,%xmm8,%xmm8
1493
1494	vmovdqu	48(%rdx),%xmm14
1495	vpxor	%xmm3,%xmm0,%xmm0
1496	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1497	vpxor	%xmm4,%xmm1,%xmm1
1498	vpshufb	%xmm13,%xmm14,%xmm14
1499	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1500	vmovdqu	96-64(%rsi),%xmm6
1501	vpxor	%xmm5,%xmm2,%xmm2
1502	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1503	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1504	vmovdqu	128-64(%rsi),%xmm7
1505	vpxor	%xmm14,%xmm9,%xmm9
1506
1507	vmovdqu	32(%rdx),%xmm15
1508	vpxor	%xmm0,%xmm3,%xmm3
1509	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1510	vpxor	%xmm1,%xmm4,%xmm4
1511	vpshufb	%xmm13,%xmm15,%xmm15
1512	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1513	vmovdqu	112-64(%rsi),%xmm6
1514	vpxor	%xmm2,%xmm5,%xmm5
1515	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1516	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1517	vpxor	%xmm15,%xmm8,%xmm8
1518
1519	vmovdqu	16(%rdx),%xmm14
1520	vpxor	%xmm3,%xmm0,%xmm0
1521	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1522	vpxor	%xmm4,%xmm1,%xmm1
1523	vpshufb	%xmm13,%xmm14,%xmm14
1524	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1525	vmovdqu	144-64(%rsi),%xmm6
1526	vpxor	%xmm5,%xmm2,%xmm2
1527	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1528	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1529	vmovdqu	176-64(%rsi),%xmm7
1530	vpxor	%xmm14,%xmm9,%xmm9
1531
1532	vmovdqu	(%rdx),%xmm15
1533	vpxor	%xmm0,%xmm3,%xmm3
1534	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1535	vpxor	%xmm1,%xmm4,%xmm4
1536	vpshufb	%xmm13,%xmm15,%xmm15
1537	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1538	vmovdqu	160-64(%rsi),%xmm6
1539	vpxor	%xmm2,%xmm5,%xmm5
1540	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1541
# Advance past the group. With fewer than 128 bytes left, finish in the tail,
# which XORs the running hash into xmm15 itself. Otherwise fold the running
# hash into xmm15 here and enter the pipelined main loop.
1542	leaq	128(%rdx),%rdx
1543	cmpq	$0x80,%rcx
1544	jb	.Ltail_avx
1545
1546	vpxor	%xmm10,%xmm15,%xmm15
1547	subq	$0x80,%rcx
1548	jmp	.Loop8x_avx
1549
# Main loop, one iteration per 128 bytes. Each iteration interleaves two jobs:
#   (a) finish the previous group: multiply its pending first block (xmm15,
#       which already has the running hash folded in), total the low, high
#       and middle sums into xmm10, xmm11, xmm12, fold the middle term into
#       both halves, and reduce using the constant at (%r10);
#   (b) load, byte-reverse and multiply blocks 112..16 of the next group into
#       fresh partial sums in xmm0-xmm5.
# The reduced result of (a) is XORed into the next group block at offset 0
# at the bottom of the loop, so xmm15 again carries the running hash.
# The vxorps instructions are bitwise XORs, the same operation as vpxor.
1550.align	32
1551.Loop8x_avx:
1552	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1553	vmovdqu	112(%rdx),%xmm14
1554	vpxor	%xmm0,%xmm3,%xmm3
1555	vpxor	%xmm15,%xmm8,%xmm8
1556	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1557	vpshufb	%xmm13,%xmm14,%xmm14
1558	vpxor	%xmm1,%xmm4,%xmm4
1559	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1560	vmovdqu	0-64(%rsi),%xmm6
1561	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1562	vpxor	%xmm2,%xmm5,%xmm5
1563	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1564	vmovdqu	32-64(%rsi),%xmm7
1565	vpxor	%xmm14,%xmm9,%xmm9
1566
# The instructions writing xmm10, xmm11, xmm12 below complete the previous
# group totals (low, high, middle).
1567	vmovdqu	96(%rdx),%xmm15
1568	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1569	vpxor	%xmm3,%xmm10,%xmm10
1570	vpshufb	%xmm13,%xmm15,%xmm15
1571	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1572	vxorps	%xmm4,%xmm11,%xmm11
1573	vmovdqu	16-64(%rsi),%xmm6
1574	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1575	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1576	vpxor	%xmm5,%xmm12,%xmm12
1577	vxorps	%xmm15,%xmm8,%xmm8
1578
# Middle term fix-up: xmm12 ^= low ^ high, then its low 8 bytes are shifted
# into the top of xmm10 and its high 8 bytes into the bottom of xmm11.
# Register xmm9 is borrowed as scratch for the shifted value before it is
# reloaded for the next block.
1579	vmovdqu	80(%rdx),%xmm14
1580	vpxor	%xmm10,%xmm12,%xmm12
1581	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1582	vpxor	%xmm11,%xmm12,%xmm12
1583	vpslldq	$8,%xmm12,%xmm9
1584	vpxor	%xmm0,%xmm3,%xmm3
1585	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1586	vpsrldq	$8,%xmm12,%xmm12
1587	vpxor	%xmm9,%xmm10,%xmm10
1588	vmovdqu	48-64(%rsi),%xmm6
1589	vpshufb	%xmm13,%xmm14,%xmm14
1590	vxorps	%xmm12,%xmm11,%xmm11
1591	vpxor	%xmm1,%xmm4,%xmm4
1592	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1593	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1594	vmovdqu	80-64(%rsi),%xmm7
1595	vpxor	%xmm14,%xmm9,%xmm9
1596	vpxor	%xmm2,%xmm5,%xmm5
1597
# First reduction step starts here: xmm12 = xmm10 with its qwords swapped
# (vpalignr by 8 of a register with itself).
1598	vmovdqu	64(%rdx),%xmm15
1599	vpalignr	$8,%xmm10,%xmm10,%xmm12
1600	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1601	vpshufb	%xmm13,%xmm15,%xmm15
1602	vpxor	%xmm3,%xmm0,%xmm0
1603	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1604	vmovdqu	64-64(%rsi),%xmm6
1605	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1606	vpxor	%xmm4,%xmm1,%xmm1
1607	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1608	vxorps	%xmm15,%xmm8,%xmm8
1609	vpxor	%xmm5,%xmm2,%xmm2
1610
# Selector 0x10 with a memory operand: low qword of xmm10 times the high
# qword of the constant at (%r10).
1611	vmovdqu	48(%rdx),%xmm14
1612	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1613	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1614	vpshufb	%xmm13,%xmm14,%xmm14
1615	vpxor	%xmm0,%xmm3,%xmm3
1616	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1617	vmovdqu	96-64(%rsi),%xmm6
1618	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1619	vpxor	%xmm1,%xmm4,%xmm4
1620	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1621	vmovdqu	128-64(%rsi),%xmm7
1622	vpxor	%xmm14,%xmm9,%xmm9
1623	vpxor	%xmm2,%xmm5,%xmm5
1624
# The final vxorps of this group of lines completes the first reduction step.
1625	vmovdqu	32(%rdx),%xmm15
1626	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1627	vpshufb	%xmm13,%xmm15,%xmm15
1628	vpxor	%xmm3,%xmm0,%xmm0
1629	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1630	vmovdqu	112-64(%rsi),%xmm6
1631	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1632	vpxor	%xmm4,%xmm1,%xmm1
1633	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1634	vpxor	%xmm15,%xmm8,%xmm8
1635	vpxor	%xmm5,%xmm2,%xmm2
1636	vxorps	%xmm12,%xmm10,%xmm10
1637
# Second reduction step: same swap-and-multiply pattern; the high half
# (xmm11) is merged into xmm12 here.
1638	vmovdqu	16(%rdx),%xmm14
1639	vpalignr	$8,%xmm10,%xmm10,%xmm12
1640	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1641	vpshufb	%xmm13,%xmm14,%xmm14
1642	vpxor	%xmm0,%xmm3,%xmm3
1643	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1644	vmovdqu	144-64(%rsi),%xmm6
1645	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1646	vxorps	%xmm11,%xmm12,%xmm12
1647	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1648	vpxor	%xmm1,%xmm4,%xmm4
1649	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1650	vmovdqu	176-64(%rsi),%xmm7
1651	vpxor	%xmm14,%xmm9,%xmm9
1652	vpxor	%xmm2,%xmm5,%xmm5
1653
# Load the block at offset 0 of this group and fold the reduced previous
# result (xmm12 ^ xmm10) into it; it becomes the pending block for the next
# iteration or for the tail.
1654	vmovdqu	(%rdx),%xmm15
1655	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1656	vpshufb	%xmm13,%xmm15,%xmm15
1657	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1658	vmovdqu	160-64(%rsi),%xmm6
1659	vpxor	%xmm12,%xmm15,%xmm15
1660	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1661	vpxor	%xmm10,%xmm15,%xmm15
1662
# Loop while the subtraction does not borrow, i.e. at least another 128
# bytes were available.
1663	leaq	128(%rdx),%rdx
1664	subq	$0x80,%rcx
1665	jnc	.Loop8x_avx
1666
# Undo the last subtraction so rcx holds the leftover byte count, then
# finish the pending group. The running hash is already folded into xmm15,
# hence the entry point that skips the XOR.
1667	addq	$0x80,%rcx
1668	jmp	.Ltail_no_xor_avx
1669
# Short path: entered with rcx below 0x80, either directly from the top or
# from the tail when bytes remain. The pointer rdx is moved to the END of the
# remaining input and blocks are read backwards at -16, -32, ... -112, so the
# last block pairs with table offset 0 and earlier blocks with later table
# entries. The steps are unrolled; each subtracts 16 from rcx and leaves for
# the tail once the count reaches zero. The block loaded most recently is
# left in xmm15 for the tail to multiply (with the running hash XORed in).
1670.align	32
1671.Lshort_avx:
1672	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1673	leaq	(%rdx,%rcx,1),%rdx
1674	vmovdqu	0-64(%rsi),%xmm6
1675	vmovdqu	32-64(%rsi),%xmm7
1676	vpshufb	%xmm13,%xmm14,%xmm15
1677
# Copy xmm0-xmm2 into xmm3-xmm5. The next instructions that touch each pair
# compute xmm3 ^= xmm0 (and likewise for the other two), which yields zero,
# so the accumulators start empty whatever xmm0-xmm2 held on entry.
1678	vmovdqa	%xmm0,%xmm3
1679	vmovdqa	%xmm1,%xmm4
1680	vmovdqa	%xmm2,%xmm5
1681	subq	$0x10,%rcx
1682	jz	.Ltail_avx
1683
# Unrolled step: multiply the current block, load the previous one. The
# vpsrldq by 8 moves the high qword of xmm7 into its low qword so that the
# next step can keep using selector 0x00 for the middle product.
1684	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1685	vpxor	%xmm0,%xmm3,%xmm3
1686	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1687	vpxor	%xmm15,%xmm8,%xmm8
1688	vmovdqu	-32(%rdx),%xmm14
1689	vpxor	%xmm1,%xmm4,%xmm4
1690	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1691	vmovdqu	16-64(%rsi),%xmm6
1692	vpshufb	%xmm13,%xmm14,%xmm15
1693	vpxor	%xmm2,%xmm5,%xmm5
1694	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1695	vpsrldq	$8,%xmm7,%xmm7
1696	subq	$0x10,%rcx
1697	jz	.Ltail_avx
1698
# Same step; xmm7 is reloaded with the next packed entry instead of shifted.
1699	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1700	vpxor	%xmm0,%xmm3,%xmm3
1701	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1702	vpxor	%xmm15,%xmm8,%xmm8
1703	vmovdqu	-48(%rdx),%xmm14
1704	vpxor	%xmm1,%xmm4,%xmm4
1705	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1706	vmovdqu	48-64(%rsi),%xmm6
1707	vpshufb	%xmm13,%xmm14,%xmm15
1708	vpxor	%xmm2,%xmm5,%xmm5
1709	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1710	vmovdqu	80-64(%rsi),%xmm7
1711	subq	$0x10,%rcx
1712	jz	.Ltail_avx
1713
1714	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1715	vpxor	%xmm0,%xmm3,%xmm3
1716	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1717	vpxor	%xmm15,%xmm8,%xmm8
1718	vmovdqu	-64(%rdx),%xmm14
1719	vpxor	%xmm1,%xmm4,%xmm4
1720	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1721	vmovdqu	64-64(%rsi),%xmm6
1722	vpshufb	%xmm13,%xmm14,%xmm15
1723	vpxor	%xmm2,%xmm5,%xmm5
1724	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1725	vpsrldq	$8,%xmm7,%xmm7
1726	subq	$0x10,%rcx
1727	jz	.Ltail_avx
1728
1729	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1730	vpxor	%xmm0,%xmm3,%xmm3
1731	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1732	vpxor	%xmm15,%xmm8,%xmm8
1733	vmovdqu	-80(%rdx),%xmm14
1734	vpxor	%xmm1,%xmm4,%xmm4
1735	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1736	vmovdqu	96-64(%rsi),%xmm6
1737	vpshufb	%xmm13,%xmm14,%xmm15
1738	vpxor	%xmm2,%xmm5,%xmm5
1739	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1740	vmovdqu	128-64(%rsi),%xmm7
1741	subq	$0x10,%rcx
1742	jz	.Ltail_avx
1743
1744	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1745	vpxor	%xmm0,%xmm3,%xmm3
1746	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1747	vpxor	%xmm15,%xmm8,%xmm8
1748	vmovdqu	-96(%rdx),%xmm14
1749	vpxor	%xmm1,%xmm4,%xmm4
1750	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1751	vmovdqu	112-64(%rsi),%xmm6
1752	vpshufb	%xmm13,%xmm14,%xmm15
1753	vpxor	%xmm2,%xmm5,%xmm5
1754	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1755	vpsrldq	$8,%xmm7,%xmm7
1756	subq	$0x10,%rcx
1757	jz	.Ltail_avx
1758
# Last unrolled step (seventh block from the end). The vmovq loads only the
# 8 bytes at table offset 184, i.e. the upper qword of the entry at 176,
# into the low qword of xmm7. The jump to the tail is unconditional.
# NOTE(review): rcx is expected to be zero after this final subtraction
# (length a multiple of 16, below 128) -- confirm with callers.
1759	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1760	vpxor	%xmm0,%xmm3,%xmm3
1761	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1762	vpxor	%xmm15,%xmm8,%xmm8
1763	vmovdqu	-112(%rdx),%xmm14
1764	vpxor	%xmm1,%xmm4,%xmm4
1765	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1766	vmovdqu	144-64(%rsi),%xmm6
1767	vpshufb	%xmm13,%xmm14,%xmm15
1768	vpxor	%xmm2,%xmm5,%xmm5
1769	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1770	vmovq	184-64(%rsi),%xmm7
1771	subq	$0x10,%rcx
1772	jmp	.Ltail_avx
1773
# Tail: fold the running hash into the pending block (skipped when entering
# at .Ltail_no_xor_avx), do the last three multiplications, and merge all
# partial sums.
1774.align	32
1775.Ltail_avx:
1776	vpxor	%xmm10,%xmm15,%xmm15
1777.Ltail_no_xor_avx:
1778	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1779	vpxor	%xmm0,%xmm3,%xmm3
1780	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1781	vpxor	%xmm15,%xmm8,%xmm8
1782	vpxor	%xmm1,%xmm4,%xmm4
1783	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1784	vpxor	%xmm2,%xmm5,%xmm5
1785	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1786
1787	vmovdqu	(%r10),%xmm12
1788
# Totals: xmm10 = low sum, xmm11 = high sum, xmm5 = middle sum.
1789	vpxor	%xmm0,%xmm3,%xmm10
1790	vpxor	%xmm1,%xmm4,%xmm11
1791	vpxor	%xmm2,%xmm5,%xmm5
1792
# Middle fix-up (xmm5 ^= low ^ high), then split it across the two halves so
# that xmm11:xmm10 holds the full 256-bit product sum.
1793	vpxor	%xmm10,%xmm5,%xmm5
1794	vpxor	%xmm11,%xmm5,%xmm5
1795	vpslldq	$8,%xmm5,%xmm9
1796	vpsrldq	$8,%xmm5,%xmm5
1797	vpxor	%xmm9,%xmm10,%xmm10
1798	vpxor	%xmm5,%xmm11,%xmm11
1799
# Reduction, two identical steps: multiply the low qword of xmm10 by the
# high qword of the constant in xmm12, swap the qwords of xmm10, XOR.
1800	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1801	vpalignr	$8,%xmm10,%xmm10,%xmm10
1802	vpxor	%xmm9,%xmm10,%xmm10
1803
# The second step also merges the high half; xmm10 is the new running hash.
1804	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1805	vpalignr	$8,%xmm10,%xmm10,%xmm10
1806	vpxor	%xmm11,%xmm10,%xmm10
1807	vpxor	%xmm9,%xmm10,%xmm10
1808
# Leftover bytes (fewer than 128) are handled by the short path, which
# returns here when done.
1809	cmpq	$0,%rcx
1810	jne	.Lshort_avx
1811
# Byte-reverse the result back to memory order, store it to the state, clear
# the upper vector halves, and return. The bytes 0xf3,0xc3 encode a RET with
# a REP prefix.
1812	vpshufb	%xmm13,%xmm10,%xmm10
1813	vmovdqu	%xmm10,(%rdi)
1814	vzeroupper
1815	.byte	0xf3,0xc3
1816.cfi_endproc
1817.size	gcm_ghash_avx,.-gcm_ghash_avx
# Vector constants, starting on a 64-byte boundary.
1818.align	64
# Shuffle-control bytes 15 down to 0. Used as a vpshufb mask, it reverses the
# byte order of a 16-byte vector; gcm_ghash_avx keeps it in xmm13.
1819.Lbswap_mask:
1820.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
# Sixteen bytes: low qword = 1, high qword = 0xc2 in its top byte.
# gcm_ghash_avx addresses it through r10 and multiplies by its high qword
# (vpclmulqdq selector 0x10) in its reduction steps.
1821.L0x1c2_polynomial:
1822.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
# Two qwords, each equal to 7. No user is visible in this section of the file.
1823.L7_mask:
1824.long	7,0,7,0
# Two qwords: 7 and 450 (0x1c2). No user is visible in this section either.
1825.L7_mask_poly:
1826.long	7,0,450,0
# Table .Lrem_4bit: sixteen 8-byte entries on a 64-byte boundary, written as
# pairs of .long values (low dword always 0, high dword carries the value;
# entry 1 has high dword 471859200 = 0x1C200000).
# gcm_gmult_4bit loads this address into r11 and XORs in the entry selected
# by a value masked to 4 bits and scaled by 8.
1827.align	64
1828.type	.Lrem_4bit,@object
1829.Lrem_4bit:
1830.long	0,0,0,471859200,0,943718400,0,610271232
1831.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1832.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1833.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
# Table .Lrem_8bit: a lookup table of 16-bit values emitted with .value,
# eight per line over 32 lines (256 entries in total). Entry 0 is 0x0000 and
# entry 1 is 0x01C2. It follows .Lrem_4bit directly with no alignment
# directive in between. No code that reads it is visible in this section of
# the file.
# NOTE(review): presumably the byte-indexed counterpart of .Lrem_4bit --
# confirm against whichever routine references it.
1834.type	.Lrem_8bit,@object
1835.Lrem_8bit:
1836.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1837.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1838.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1839.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1840.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1841.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1842.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1843.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1844.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1845.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1846.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1847.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1848.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1849.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1850.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1851.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1852.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1853.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1854.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1855.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1856.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1857.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1858.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1859.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1860.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1861.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1862.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1863.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1864.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1865.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1866.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1867.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1868
# Identification string embedded as raw bytes, NUL-terminated. Decoded as
# ASCII it reads: GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>
# The trailing .align pads the section out to a 64-byte boundary.
1869.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1870.align	64
1871#endif
1872