/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Target: i386, GNU as (AT&T syntax), preprocessed with cpp.
 * All arguments are read from the stack; EBX is preserved for the caller.
 */

#include "cache.h"

#ifndef MEMSET
# define MEMSET		android_memset32
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Unwind-info bookkeeping that accompanies a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments, relative to ESP after ENTRANCE.
   SETRTNVAL loads the return value (the destination pointer) into EAX;
   it is empty in the USE_AS_BZERO32 build.  */
#ifdef USE_AS_BZERO32
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define DWDS		DEST+4
# define LEN		DWDS+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#if (defined SHARED || defined __PIC__)
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* After an early return the unwind state of the code that follows is
   still "EBX pushed", so re-declare it.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the table index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
	/* We first load PC into EBX.  */			\
	call	__x86.get_pc_thunk.bx;				\
	/* Get the address of the jump table.  */		\
	add	$(TABLE - .), %ebx;				\
	/* Add the table-relative offset of entry ECX.  */	\
	add	(%ebx,%ecx,4), %ebx;				\
	/* EBX is now the absolute address of the entry.  */	\
	jmp	*%ebx

	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
	.globl	__x86.get_pc_thunk.bx
	.hidden	__x86.get_pc_thunk.bx
	ALIGN (4)
	.type	__x86.get_pc_thunk.bx,@function
/* Return the caller's return address (its PC after the call) in EBX.  */
__x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the table index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
	jmp	*TABLE(,%ecx,4)
#endif

/* L(128bytesormore) pushes EBX a second time whenever the shared-cache
   threshold is an immediate (SHARED_CACHE_SIZE) or the build is non-PIC.
   In the remaining case (PIC, threshold read through the GOT) EBX was
   already saved by ENTRANCE and nothing extra is pushed.  Every pop that
   undoes that push must be keyed off this same condition, otherwise the
   stack is unbalanced when only one of SHARED_CACHE_SIZE / DATA_CACHE_SIZE
   is defined.  */
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
# define EBX_SAVED_FOR_CACHE_SIZE	1
#else
# define EBX_SAVED_FOR_CACHE_SIZE	0
#endif

/*
 * MEMSET (android_memset32 unless overridden): fill memory with a 32-bit
 * pattern using SSE2 stores.
 *
 * Stack arguments:
 *   DEST - destination pointer
 *   DWDS - 32-bit fill value (not present when USE_AS_BZERO32 is defined;
 *          the fill value is then zero)
 *   LEN  - length in bytes; it is shifted right by 2 on entry, so only
 *          whole 32-bit words are written
 * NOTE(review): presumably the C prototype is
 *   void android_memset32(uint32_t *dst, uint32_t value, size_t size);
 * confirm against the declaring header.
 *
 * Returns: EAX = DEST (nothing is loaded in the USE_AS_BZERO32 build).
 *
 * Register roles:
 *   EAX  - fill pattern; rotated when DEST is not 4-byte aligned so that
 *          aligned stores continue the byte sequence
 *   ECX  - remaining count: dwords on the short path and when indexing a
 *          jump table, bytes inside the 16-byte-aligned loops
 *   EDX  - write pointer; before a jump-table dispatch it is advanced to
 *          the END of the region and the tails store at negative offsets
 *   EBX  - cache-size threshold / PIC scratch; restored before returning
 *   XMM0 - fill pattern replicated into all four dword lanes
 *
 * Clobbers: EAX, ECX, EDX, XMM0, flags.
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)
	ENTRANCE

	movl	LEN(%esp), %ecx
	shr	$2, %ecx			/* ecx = number of dwords  */
#ifdef USE_AS_BZERO32
	xor	%eax, %eax
#else
	mov	DWDS(%esp), %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$16, %ecx
	jae	L(16dbwordsormore)

/* Fewer than 16 dwords: point EDX at the end of the buffer and jump into
   the chain of scalar stores below at the entry matching ECX.  */
L(write_less16dbwords):
	lea	(%edx, %ecx, 4), %edx
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	ALIGN (4)
L(write_15dbwords):
	movl	%eax, -60(%edx)
L(write_14dbwords):
	movl	%eax, -56(%edx)
L(write_13dbwords):
	movl	%eax, -52(%edx)
L(write_12dbwords):
	movl	%eax, -48(%edx)
L(write_11dbwords):
	movl	%eax, -44(%edx)
L(write_10dbwords):
	movl	%eax, -40(%edx)
L(write_9dbwords):
	movl	%eax, -36(%edx)
L(write_8dbwords):
	movl	%eax, -32(%edx)
L(write_7dbwords):
	movl	%eax, -28(%edx)
L(write_6dbwords):
	movl	%eax, -24(%edx)
L(write_5dbwords):
	movl	%eax, -20(%edx)
L(write_4dbwords):
	movl	%eax, -16(%edx)
L(write_3dbwords):
	movl	%eax, -12(%edx)
L(write_2dbwords):
	movl	%eax, -8(%edx)
L(write_1dbwords):
	movl	%eax, -4(%edx)
L(write_0dbwords):
	SETRTNVAL
	RETURN

/* 16 dwords or more.  If EDX is not 4-byte aligned, store the first and
   the last dword unaligned, drop one dword from the count, then step EDX
   up to the next 4-byte boundary one byte at a time while rotating the
   pattern so that aligned stores keep the same byte sequence.  */
	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edx
	jz	L(aligned4bytes)
	mov	%eax, (%edx)
	mov	%eax, -4(%edx, %ecx, 4)
	sub	$1, %ecx
	rol	$24, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
L(aligned4bytes):
	shl	$2, %ecx			/* ecx = remaining bytes  */

#ifdef USE_AS_BZERO32
	pxor	%xmm0, %xmm0
#else
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0		/* broadcast pattern to 4 lanes  */
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX > 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Cover the unaligned head with one unaligned 16-byte store, then
	   round EDX up to the next 16-byte boundary and shrink ECX by the
	   number of bytes skipped (EAX is used as scratch and reloaded).  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax			/* eax = -(bytes skipped)  */
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	add	%ecx, %edx			/* edx = end of region  */
	shr	$2, %ecx			/* ecx = jump-table index  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* At least 128 bytes, EDX 16-byte aligned.  Pick a strategy by comparing
   the byte count with the shared and the data cache sizes.  */
	ALIGN (4)
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* Not taking the non-temporal path: undo the extra push (if one
	   was made) before EBX is reused.  RESTORE_EBX_STATE re-declares
	   the "pushed" unwind state for L(128bytesormore_nt_start), which
	   is reached with the push still in effect.  */
#if EBX_SAVED_FOR_CACHE_SIZE
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
#else
# define RESTORE_EBX_STATE
#endif

#ifdef DATA_CACHE_SIZE
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# if (defined SHARED || defined __PIC__)
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	jae	L(128bytes_L2_normal)
	/* Pre-bias ECX by -128 so that the `sub' at the top of each
	   128-byte chunk sets CF exactly when fewer than 128 bytes remain
	   after that chunk.  `movdqa' and `lea' leave the flags intact.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	lea	128(%ecx), %ecx			/* undo the bias: bytes left  */
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Byte count is at least the data cache size but below the shared cache
   size: 128-byte chunks with software prefetch ahead of the stores.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Byte count is at least the shared cache size (EBX).  Write the first
   EBX bytes, rounded down to a multiple of 128, with regular stores, then
   the rest in 128-byte chunks with non-temporal stores.  */
	RESTORE_EBX_STATE
L(128bytesormore_nt_start):
	/* ecx = bytes that remain once the cached loop below is done:
	   len - ebx + (ebx mod 128).  EAX is scratch and reloaded.  */
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)

	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
L(shared_cache_loop_end):
	/* Undo the extra push made in L(128bytesormore), if any.  */
#if EBX_SAVED_FOR_CACHE_SIZE
	POP (%ebx)
#endif
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Tail dispatch for 0..124 remaining bytes (multiples of 4), indexed by
   the remaining dword count.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

/* Four fall-through chains, one per (remaining bytes mod 16).  EDX points
   at the end of the region; each chain issues aligned 16-byte stores at
   negative offsets and finishes with a 0-, 4-, 8- or 12-byte store.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

END (MEMSET)