/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * MEMSET for 32-bit x86 using SSE2 stores.  GNU as, AT&T syntax, run
 * through the C preprocessor.
 *
 * Arguments are read from the stack relative to ESP:
 *   DEST(%esp)  destination pointer
 *   CHR(%esp)   fill byte (not present when USE_AS_BZERO is defined)
 *   LEN(%esp)   number of bytes to fill
 * Return: EAX = DEST (SETRTNVAL); nothing is loaded for USE_AS_BZERO.
 *
 * Register roles inside the routine:
 *   ECX   bytes remaining
 *   EDX   current destination (end-of-buffer pointer once a jump-table
 *         tail is entered)
 *   EAX   fill byte replicated into all four bytes (zero for bzero)
 *   XMM0  fill byte replicated into all sixteen bytes
 *   EBX   scratch: cache-size threshold / PIC base; always saved before
 *         use and restored before returning.
 *
 * SHARED_CACHE_SIZE and DATA_CACHE_SIZE are not defined in this file;
 * presumably they come from cache.h -- verify against that header.
 */

#include "cache.h"

#ifndef MEMSET
# define MEMSET memset
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Unwind bookkeeping that accompanies a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)						\
	cfi_adjust_cfa_offset (4);				\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)						\
	cfi_adjust_cfa_offset (-4);				\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#ifdef USE_AS_BZERO
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define CHR		DEST+4
# define LEN		CHR+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#if (defined SHARED || defined __PIC__)
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* Code follows a RETURN textually, so re-establish the "EBX pushed"
   unwind state after the ret.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the table index; it is also
   added to EDX before the jump.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
	call	__x86.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
	add	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
	add	(%ebx,%ecx,4), %ebx;				\
	add	%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
	jmp	*%ebx

	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
	.globl	__x86.get_pc_thunk.bx
	.hidden	__x86.get_pc_thunk.bx
	ALIGN (4)
	.type	__x86.get_pc_thunk.bx,@function
/* Returns the caller's return address (the address following the call)
   in EBX.  */
__x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the table index; it is also added to EDX
   before the jump.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
	add	%ecx, %edx;					\
	jmp	*TABLE(,%ecx,4)
#endif

	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)
	ENTRANCE

	movl	LEN(%esp), %ecx
	/* Zero length: nothing to store.  */
	test	%ecx, %ecx
	jnz	L(1byteormore)
	SETRTNVAL
	RETURN

L(1byteormore):
#ifdef USE_AS_BZERO
	xor	%eax, %eax
#else
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$1, %ecx
	je	L(1byte)
	cmp	$16, %ecx
	jae	L(16bytesormore)

	/* 2..15 bytes: cover the range with stores anchored at the start
	   and at the end of the buffer; the stores may overlap.  */
	cmp	$4, %ecx
	jb	L(4bytesless)
	movl	%eax, (%edx)
	movl	%eax, -4(%edx, %ecx)
	cmp	$8, %ecx
	jb	L(8bytesless)
	movl	%eax, 4(%edx)
	movl	%eax, -8(%edx, %ecx)
L(8bytesless):
	SETRTNVAL
	RETURN

	/* 2 or 3 bytes.  */
L(4bytesless):
	movw	%ax, (%edx)
	movw	%ax, -2(%edx, %ecx)
	SETRTNVAL
	RETURN

L(1byte):
	movb	%al, (%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(16bytesormore):
	/* Replicate the pattern into all of XMM0.  */
#ifdef USE_AS_BZERO
	pxor	%xmm0, %xmm0
#else
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif

	cmp	$64, %ecx
	ja	L(64bytesmore)
	/* 16..64 bytes: unaligned 16-byte stores from both ends.  */
	movdqu	%xmm0, (%edx)
	movdqu	%xmm0, -16(%edx, %ecx)
	cmp	$32, %ecx
	jbe	L(32bytesless)
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm0, -32(%edx, %ecx)
L(32bytesless):
	SETRTNVAL
	RETURN

L(64bytesmore):
	testl	$0xf, %edx
	jz	L(aligned_16)
L(not_aligned_16):
	/* Store one unaligned vector at the start, then round EDX up to
	   the next 16-byte boundary and shrink ECX by the bytes skipped.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax		/* EAX = old EDX - new EDX (<= 0).  */
	add	%eax, %ecx
	movd	%xmm0, %eax		/* Reload the pattern into EAX.  */

	ALIGN (4)
L(aligned_16):
	/* EDX is 16-byte aligned from here on.  */
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
L(128bytesormore):
	/* EBX = shared-cache threshold.  In the PIC fallback EBX is used
	   without a push here because ENTRANCE already saved it; every
	   other variant pushes EBX first.  */
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	/* Load the value of the variable (memory operand, no '$').  */
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* Undo the PUSH above.  It must be skipped when no PUSH was done
	   (PIC build without SHARED_CACHE_SIZE); otherwise the entry-saved
	   EBX would be popped here and the return address popped on exit.  */
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	POP (%ebx)
#endif

	/* EBX = data-cache threshold, same push discipline as above.  */
#ifdef DATA_CACHE_SIZE
	PUSH (%ebx)
	mov	$DATA_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	call	__x86.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_data_cache_size, %ebx
# endif
#endif

	cmp	%ebx, %ecx
	jae	L(128bytes_L2_normal)
	/* Pre-bias ECX by 128 so the borrow from each "sub $128" in the
	   loop tells whether another full 128-byte block remains.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	/* lea and the SSE stores leave the flags from "sub" intact.  */
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Remove the bias: ECX = bytes still to store (0..127).  */
	lea	128(%ecx), %ecx
#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	POP (%ebx)
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
L(128bytes_L2_normal):
	/* Length >= data-cache threshold: same 128-byte blocks, with
	   non-temporal prefetch hints ahead of the store pointer.  */
	prefetchnta	0x380(%edx)
	prefetchnta	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	POP (%ebx)
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* This label is reached only by the jae above, while the EBX
	   pushed for the shared-cache threshold (if any) is still on the
	   stack.  Make the unwind state match; no instruction is emitted.  */
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	CFI_PUSH (%ebx)
#endif
L(128bytesormore_nt_start):
	/* EBX = shared-cache threshold, ECX = bytes beyond the threshold.
	   NOTE(review): the first loop stops once EBX < 0x80 and whatever
	   is left in EBX is not added back to ECX, so this looks like it
	   expects the threshold to be a multiple of 0x80 -- confirm.  */
	sub	%ebx, %ecx
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetchnta	0x3c0(%edx)
	prefetchnta	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN (4)
L(128bytesormore_nt):
	/* Remaining full blocks are written with non-temporal stores.  */
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	/* Fence after the non-temporal stores.  */
	sfence
L(shared_cache_loop_end):
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
	POP (%ebx)
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))


	/* Jump table indexed by the number of bytes left (0..127).  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection

	/* Tail stores.  On entry EDX points one past the last byte to
	   write and the entry label names the number of bytes left.  Each
	   ladder falls through 16-byte stores at negative offsets from
	   EDX, then finishes the sub-16-byte remainder from XMM0/EAX.  */

	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN_END

END (MEMSET)