1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "cache.h"
18
/* Symbol name given to the routine defined in this file.  The #ifndef
   guard lets an earlier definition of MEMSET take precedence.  */
#ifndef MEMSET
# define MEMSET		android_memset32
#endif

/* L(label) pastes a `.L' prefix onto LABEL, giving an assembler-local
   label name.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) expands to `.p2align n', i.e. alignment to a 2^n-byte
   boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Default spelling of the directive that opens a CFI region.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

/* Default spelling of the directive that closes a CFI region.  */
#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif
38
/* ENTRY(name): begin a function.  Marks NAME as a function-typed global
   symbol, aligns it to a 16-byte boundary (.p2align 4), emits the label
   itself and opens the CFI region.  Must be paired with END(name).  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif
47
/* END(name): finish a function opened with ENTRY(name).  Closes the CFI
   region and records the symbol size as the distance from NAME to the
   current location.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
53
/* A jump-table entry: the offset of target label I relative to the
   table base B.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table holding
   offsets relative to the start of the table itself.  INDEX is a
   register that contains the index into the jump table.  SCALE is the
   scale applied to INDEX, i.e. the size in bytes of one table entry.

   Each entry is loaded as a sign-extended 32-bit value (movslq) and
   added to the table address to form the branch target.  The macro
   overwrites both %r11 (table address) and INDEX (final target).  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	lea    TABLE(%rip), %r11;						\
	movslq (%r11, INDEX, SCALE), INDEX;				\
	lea    (%r11, INDEX), INDEX;					\
	jmp    *INDEX
64
65	.section .text.sse2,"ax",@progbits
66	ALIGN (4)
67ENTRY (MEMSET)	// Address in rdi
68	shr    $2, %rdx			// Count in rdx
69	movl   %esi, %ecx		// Pattern in ecx
70
71	cmp    $16, %rdx
72	jae    L(16dbwordsormore)
73
74L(write_less16dbwords):
75	lea    (%rdi, %rdx, 4), %rdi
76	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)
77
78	.pushsection .rodata.sse2,"a",@progbits
79	ALIGN (2)
80L(table_less16dbwords):
81	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
82	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
83	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
84	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
85	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
86	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
87	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
88	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
89	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
90	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
91	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
92	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
93	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
94	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
95	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
96	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
97	.popsection
98
99	ALIGN (4)
100L(write_15dbwords):
101	movl   %ecx, -60(%rdi)
102L(write_14dbwords):
103	movl   %ecx, -56(%rdi)
104L(write_13dbwords):
105	movl   %ecx, -52(%rdi)
106L(write_12dbwords):
107	movl   %ecx, -48(%rdi)
108L(write_11dbwords):
109	movl   %ecx, -44(%rdi)
110L(write_10dbwords):
111	movl   %ecx, -40(%rdi)
112L(write_9dbwords):
113	movl   %ecx, -36(%rdi)
114L(write_8dbwords):
115	movl   %ecx, -32(%rdi)
116L(write_7dbwords):
117	movl   %ecx, -28(%rdi)
118L(write_6dbwords):
119	movl   %ecx, -24(%rdi)
120L(write_5dbwords):
121	movl   %ecx, -20(%rdi)
122L(write_4dbwords):
123	movl   %ecx, -16(%rdi)
124L(write_3dbwords):
125	movl   %ecx, -12(%rdi)
126L(write_2dbwords):
127	movl   %ecx, -8(%rdi)
128L(write_1dbwords):
129	movl   %ecx, -4(%rdi)
130L(write_0dbwords):
131	ret
132
133	ALIGN (4)
134L(16dbwordsormore):
135	test   $3, %edi
136	jz     L(aligned4bytes)
137	mov    %ecx, (%rdi)
138	mov    %ecx, -4(%rdi, %rdx, 4)
139	sub    $1, %rdx
140	rol    $24, %ecx
141	add    $1, %rdi
142	test   $3, %edi
143	jz     L(aligned4bytes)
144	ror    $8, %ecx
145	add    $1, %rdi
146	test   $3, %edi
147	jz     L(aligned4bytes)
148	ror    $8, %ecx
149	add    $1, %rdi
150L(aligned4bytes):
151	shl    $2, %rdx
152
153	/* Fill xmm0 with the pattern.  */
154	movd   %ecx, %xmm0
155	pshufd $0, %xmm0, %xmm0
156
157	testl  $0xf, %edi
158	jz     L(aligned_16)
159/* RDX > 32 and RDI is not 16 byte aligned.  */
160	movdqu %xmm0, (%rdi)
161	mov    %rdi, %rsi
162	and    $-16, %rdi
163	add    $16, %rdi
164	sub    %rdi, %rsi
165	add    %rsi, %rdx
166
167	ALIGN (4)
168L(aligned_16):
169	cmp    $128, %rdx
170	jge    L(128bytesormore)
171
172L(aligned_16_less128bytes):
173	add    %rdx, %rdi
174	shr    $2, %rdx
175	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
176
177	ALIGN (4)
178L(128bytesormore):
179	cmp    $SHARED_CACHE_SIZE, %rdx
180	jg     L(128bytesormore_nt)
181
182L(128bytesormore_normal):
183	sub    $128, %rdx
184	movdqa %xmm0, (%rdi)
185	movdqa %xmm0, 0x10(%rdi)
186	movdqa %xmm0, 0x20(%rdi)
187	movdqa %xmm0, 0x30(%rdi)
188	movdqa %xmm0, 0x40(%rdi)
189	movdqa %xmm0, 0x50(%rdi)
190	movdqa %xmm0, 0x60(%rdi)
191	movdqa %xmm0, 0x70(%rdi)
192	lea    128(%rdi), %rdi
193	cmp    $128, %rdx
194	jl     L(128bytesless_normal)
195
196	sub    $128, %rdx
197	movdqa %xmm0, (%rdi)
198	movdqa %xmm0, 0x10(%rdi)
199	movdqa %xmm0, 0x20(%rdi)
200	movdqa %xmm0, 0x30(%rdi)
201	movdqa %xmm0, 0x40(%rdi)
202	movdqa %xmm0, 0x50(%rdi)
203	movdqa %xmm0, 0x60(%rdi)
204	movdqa %xmm0, 0x70(%rdi)
205	lea    128(%rdi), %rdi
206	cmp    $128, %rdx
207	jl     L(128bytesless_normal)
208
209	sub    $128, %rdx
210	movdqa %xmm0, (%rdi)
211	movdqa %xmm0, 0x10(%rdi)
212	movdqa %xmm0, 0x20(%rdi)
213	movdqa %xmm0, 0x30(%rdi)
214	movdqa %xmm0, 0x40(%rdi)
215	movdqa %xmm0, 0x50(%rdi)
216	movdqa %xmm0, 0x60(%rdi)
217	movdqa %xmm0, 0x70(%rdi)
218	lea    128(%rdi), %rdi
219	cmp    $128, %rdx
220	jl     L(128bytesless_normal)
221
222	sub    $128, %rdx
223	movdqa %xmm0, (%rdi)
224	movdqa %xmm0, 0x10(%rdi)
225	movdqa %xmm0, 0x20(%rdi)
226	movdqa %xmm0, 0x30(%rdi)
227	movdqa %xmm0, 0x40(%rdi)
228	movdqa %xmm0, 0x50(%rdi)
229	movdqa %xmm0, 0x60(%rdi)
230	movdqa %xmm0, 0x70(%rdi)
231	lea    128(%rdi), %rdi
232	cmp    $128, %rdx
233	jge    L(128bytesormore_normal)
234
235L(128bytesless_normal):
236	add    %rdx, %rdi
237	shr    $2, %rdx
238	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
239
240	ALIGN (4)
241L(128bytesormore_nt):
242	sub    $128, %rdx
243	movntdq %xmm0, (%rdi)
244	movntdq %xmm0, 0x10(%rdi)
245	movntdq %xmm0, 0x20(%rdi)
246	movntdq %xmm0, 0x30(%rdi)
247	movntdq %xmm0, 0x40(%rdi)
248	movntdq %xmm0, 0x50(%rdi)
249	movntdq %xmm0, 0x60(%rdi)
250	movntdq %xmm0, 0x70(%rdi)
251	lea    128(%rdi), %rdi
252	cmp    $128, %rdx
253	jge    L(128bytesormore_nt)
254
255	sfence
256	add    %rdx, %rdi
257	shr    $2, %rdx
258	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
259
260	.pushsection .rodata.sse2,"a",@progbits
261	ALIGN (2)
262L(table_16_128bytes):
263	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
264	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
265	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
266	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
267	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
268	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
269	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
270	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
271	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
272	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
273	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
274	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
275	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
276	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
277	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
278	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
279	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
280	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
281	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
282	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
283	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
284	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
285	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
286	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
287	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
288	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
289	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
290	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
291	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
292	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
293	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
294	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
295	.popsection
296
297	ALIGN (4)
298L(aligned_16_112bytes):
299	movdqa	%xmm0, -112(%rdi)
300L(aligned_16_96bytes):
301	movdqa	%xmm0, -96(%rdi)
302L(aligned_16_80bytes):
303	movdqa	%xmm0, -80(%rdi)
304L(aligned_16_64bytes):
305	movdqa	%xmm0, -64(%rdi)
306L(aligned_16_48bytes):
307	movdqa	%xmm0, -48(%rdi)
308L(aligned_16_32bytes):
309	movdqa	%xmm0, -32(%rdi)
310L(aligned_16_16bytes):
311	movdqa	%xmm0, -16(%rdi)
312L(aligned_16_0bytes):
313	ret
314
315	ALIGN (4)
316L(aligned_16_116bytes):
317	movdqa	%xmm0, -116(%rdi)
318L(aligned_16_100bytes):
319	movdqa	%xmm0, -100(%rdi)
320L(aligned_16_84bytes):
321	movdqa	%xmm0, -84(%rdi)
322L(aligned_16_68bytes):
323	movdqa	%xmm0, -68(%rdi)
324L(aligned_16_52bytes):
325	movdqa	%xmm0, -52(%rdi)
326L(aligned_16_36bytes):
327	movdqa	%xmm0, -36(%rdi)
328L(aligned_16_20bytes):
329	movdqa	%xmm0, -20(%rdi)
330L(aligned_16_4bytes):
331	movl	%ecx, -4(%rdi)
332	ret
333
334	ALIGN (4)
335L(aligned_16_120bytes):
336	movdqa	%xmm0, -120(%rdi)
337L(aligned_16_104bytes):
338	movdqa	%xmm0, -104(%rdi)
339L(aligned_16_88bytes):
340	movdqa	%xmm0, -88(%rdi)
341L(aligned_16_72bytes):
342	movdqa	%xmm0, -72(%rdi)
343L(aligned_16_56bytes):
344	movdqa	%xmm0, -56(%rdi)
345L(aligned_16_40bytes):
346	movdqa	%xmm0, -40(%rdi)
347L(aligned_16_24bytes):
348	movdqa	%xmm0, -24(%rdi)
349L(aligned_16_8bytes):
350	movq	%xmm0, -8(%rdi)
351	ret
352
353	ALIGN (4)
354L(aligned_16_124bytes):
355	movdqa	%xmm0, -124(%rdi)
356L(aligned_16_108bytes):
357	movdqa	%xmm0, -108(%rdi)
358L(aligned_16_92bytes):
359	movdqa	%xmm0, -92(%rdi)
360L(aligned_16_76bytes):
361	movdqa	%xmm0, -76(%rdi)
362L(aligned_16_60bytes):
363	movdqa	%xmm0, -60(%rdi)
364L(aligned_16_44bytes):
365	movdqa	%xmm0, -44(%rdi)
366L(aligned_16_28bytes):
367	movdqa	%xmm0, -28(%rdi)
368L(aligned_16_12bytes):
369	movq	%xmm0, -12(%rdi)
370	movl	%ecx, -4(%rdi)
371	ret
372
373END (MEMSET)
374