/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Syntax: GNU as, AT&T operand order, run through the C preprocessor.  */

#include "cache.h"

#ifndef MEMSET
# define MEMSET		android_memset32
#endif

/* Assembler-local label (the .L prefix keeps it out of the symbol table).  */
#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* A jump table entry: offset of target I relative to the table base B.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
   Clobbers %r11 and INDEX.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	lea	TABLE(%rip), %r11;			\
	movslq	(%r11, INDEX, SCALE), INDEX;		\
	lea	(%r11, INDEX), INDEX;			\
	jmp	*INDEX

/*
 * MEMSET (android_memset32 unless overridden): fill a buffer with a
 * repeating 32-bit pattern.
 *
 * In:       rdi = destination address
 *           esi = 32-bit pattern
 *           rdx = size in bytes; it is shifted right by 2 on entry, so only
 *                 whole dwords are written
 * Out:      nothing is placed in rax
 * Clobbers: rcx, rdx, rsi, rdi, r11, xmm0, flags
 * NOTE(review): the register roles match the System V AMD64 argument order
 * for a (dst, value, size) prototype -- confirm against the C declaration.
 *
 * Strategy:
 *   - Fewer than 16 dwords: jump into a fall-through chain of 4-byte stores
 *     addressed backwards from the end of the buffer.
 *   - Otherwise: if the destination is not 4-byte aligned, store the first
 *     and last dword unaligned, step rdi up to a 4-byte boundary and rotate
 *     the pattern to match.  Then broadcast the pattern into xmm0, reach
 *     16-byte alignment with one unaligned 16-byte store, write 128-byte
 *     blocks (non-temporal stores when more than SHARED_CACHE_SIZE bytes
 *     remain), and finish the last <128 bytes through a second jump table.
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)	// Address in rdi
	shr	$2, %rdx	// Count in rdx
	movl	%esi, %ecx	// Pattern in ecx

	cmp	$16, %rdx
	jae	L(16dbwordsormore)

L(write_less16dbwords):
	/* rdi = end of buffer; the store chain below uses negative offsets.  */
	lea	(%rdi, %rdx, 4), %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* Fall-through chain: entering at write_Ndbwords stores the last N
	   dwords before the end pointer in rdi.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%ecx, -60(%rdi)
L(write_14dbwords):
	movl	%ecx, -56(%rdi)
L(write_13dbwords):
	movl	%ecx, -52(%rdi)
L(write_12dbwords):
	movl	%ecx, -48(%rdi)
L(write_11dbwords):
	movl	%ecx, -44(%rdi)
L(write_10dbwords):
	movl	%ecx, -40(%rdi)
L(write_9dbwords):
	movl	%ecx, -36(%rdi)
L(write_8dbwords):
	movl	%ecx, -32(%rdi)
L(write_7dbwords):
	movl	%ecx, -28(%rdi)
L(write_6dbwords):
	movl	%ecx, -24(%rdi)
L(write_5dbwords):
	movl	%ecx, -20(%rdi)
L(write_4dbwords):
	movl	%ecx, -16(%rdi)
L(write_3dbwords):
	movl	%ecx, -12(%rdi)
L(write_2dbwords):
	movl	%ecx, -8(%rdi)
L(write_1dbwords):
	movl	%ecx, -4(%rdi)
L(write_0dbwords):
	ret

	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edi
	jz	L(aligned4bytes)
	/* Destination is not 4-byte aligned: write the first and the last
	   dword unaligned, then cover what lies between them with one dword
	   fewer, starting from the next 4-byte boundary.  */
	mov	%ecx, (%rdi)
	mov	%ecx, -4(%rdi, %rdx, 4)
	sub	$1, %rdx
	/* Each byte rdi advances rotates the pattern right by 8 bits (rol 24
	   is the same rotation as ror 8) so that aligned stores continue the
	   byte sequence started by the unaligned head store.  */
	rol	$24, %ecx
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
L(aligned4bytes):
	/* From here on rdx is a byte count again.  */
	shl	$2, %rdx

	/* Fill xmm0 with the pattern.  */
	movd	%ecx, %xmm0
	pshufd	$0, %xmm0, %xmm0

	testl	$0xf, %edi
	jz	L(aligned_16)
/* RDX > 32 and RDI is not 16 byte aligned.  */
	/* One unaligned 16-byte store covers the gap up to the next 16-byte
	   boundary; then round rdi up and shrink rdx by the bytes skipped.  */
	movdqu	%xmm0, (%rdi)
	mov	%rdi, %rsi
	and	$-16, %rdi
	add	$16, %rdi
	sub	%rdi, %rsi	/* rsi = old rdi - new rdi (negative)  */
	add	%rsi, %rdx

	/* The branches on rdx below use unsigned condition codes (jae/ja/jb):
	   rdx is a byte count, and the entry check above already treats the
	   count as unsigned.  */
	ALIGN (4)
L(aligned_16):
	cmp	$128, %rdx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	/* rdi = end of buffer, rdx = remaining dwords (0..31) = table index.  */
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	ALIGN (4)
L(128bytesormore):
	/* More than SHARED_CACHE_SIZE bytes left: use non-temporal stores.  */
	cmp	$SHARED_CACHE_SIZE, %rdx
	ja	L(128bytesormore_nt)

	/* Main loop, unrolled four times: each step writes 128 bytes with
	   aligned stores.  rdx >= 128 on entry to every step.  */
L(128bytesormore_normal):
	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Fewer than 128 bytes left: finish through the jump table.  */
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	ALIGN (4)
L(128bytesormore_nt):
	sub	$128, %rdx
	movntdq	%xmm0, (%rdi)
	movntdq	%xmm0, 0x10(%rdi)
	movntdq	%xmm0, 0x20(%rdi)
	movntdq	%xmm0, 0x30(%rdi)
	movntdq	%xmm0, 0x40(%rdi)
	movntdq	%xmm0, 0x50(%rdi)
	movntdq	%xmm0, 0x60(%rdi)
	movntdq	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_nt)

	/* Order the non-temporal stores before the ordinary tail stores.  */
	sfence
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Indexed by the number of remaining dwords (0..31).  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

	/* Tail chains.  rdi is the end pointer and the tail starts on a
	   16-byte boundary, so in each chain every movdqa target (end minus
	   remaining bytes, stepping by 16) is 16-byte aligned.  The four
	   chains differ in the final 0, 4, 8 or 12 bytes.  */

	/* Remaining bytes are a multiple of 16.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%rdi)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%rdi)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%rdi)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%rdi)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%rdi)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%rdi)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%rdi)
L(aligned_16_0bytes):
	ret

	/* Remaining bytes = 16k + 4: last dword written with movl.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%rdi)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%rdi)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%rdi)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%rdi)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%rdi)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%rdi)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%rdi)
L(aligned_16_4bytes):
	movl	%ecx, -4(%rdi)
	ret

	/* Remaining bytes = 16k + 8: last 8 bytes written with movq.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%rdi)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%rdi)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%rdi)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%rdi)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%rdi)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%rdi)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%rdi)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%rdi)
	ret

	/* Remaining bytes = 16k + 12: movq for 8 bytes, movl for the last 4.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%rdi)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%rdi)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%rdi)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%rdi)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%rdi)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%rdi)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%rdi)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%rdi)
	movl	%ecx, -4(%rdi)
	ret

END (MEMSET)