// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "adler32memcpy.h"

// We are using (a modified form of) the adler-32 checksum algorithm instead
// of CRC since adler-32 is faster than CRC.
// (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of adler is slightly modified: instead of treating the data in
// units of bytes, 32-bit data is taken as a unit and two 64-bit
// checksums are done (we could have one checksum but two checksums
// make the code run faster).

// Adler-32 implementation:
//   Data is treated as 1-byte numbers and
//   there are two 16-bit numbers a and b.
//   Initialize a with 1 and b with 0.
//   for each data unit 'd'
//      a += d
//      b += a
//   checksum = (b << 16) + a
//   This sum should never overflow.
//
// Adler-64+64 implementation:
// (applied in this code)
//   Data is treated as 32-bit numbers and the whole data is separated into
//   two streams, hence the two pairs of checksum terms a1, b1 and a2, b2.
//   Initialize a1 and a2 with 1, b1 and b2 with 0
//      add first dataunit to a1
//      add a1 to b1
//      add second dataunit to a1
//      add a1 to b1
//      add third dataunit to a2
//      add a2 to b2
//      add fourth dataunit to a2
//      add a2 to b2
//      ...
//      repeat the sequence back for next 4 dataunits
//
// variable A = XMM6 and variable B = XMM7.
52 // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7) 53 54 // Assumptions 55 // 1. size_in_bytes is a multiple of 16. 56 // 2. srcmem and dstmem are 16 byte aligned. 57 // 3. size_in_bytes is less than 2^19 bytes. 58 59 // Assumption 3 ensures that there is no overflow when numbers are being 60 // added (we can remove this assumption by doing modulus with a prime 61 // number when it is just about to overflow but that would be a very costly 62 // exercise) 63 64 // Returns true if the checksums are equal. 65 bool AdlerChecksum::Equals(const AdlerChecksum &other) const { 66 return ( (a1_ == other.a1_) && (a2_ == other.a2_) && 67 (b1_ == other.b1_) && (b2_ == other.b2_) ); 68 } 69 70 // Returns string representation of the Adler checksum. 71 string AdlerChecksum::ToHexString() const { 72 char buffer[128]; 73 snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_); 74 return string(buffer); 75 } 76 77 // Sets components of the Adler checksum. 78 void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) { 79 a1_ = a1; 80 a2_ = a2; 81 b1_ = b1; 82 b2_ = b2; 83 } 84 85 // Calculates Adler checksum for supplied data. 86 bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes, 87 AdlerChecksum *checksum) { 88 // Use this data wrapper to access memory with 64bit read/write. 89 datacast_t data; 90 unsigned int count = size_in_bytes / sizeof(data); 91 92 if (count > (1U) << 19) { 93 // Size is too large, must be strictly less than 512 KB. 94 return false; 95 } 96 97 uint64 a1 = 1; 98 uint64 a2 = 1; 99 uint64 b1 = 0; 100 uint64 b2 = 0; 101 102 unsigned int i = 0; 103 while (i < count) { 104 // Process 64 bits at a time. 
105 data.l64 = data64[i]; 106 a1 = a1 + data.l32.l; 107 b1 = b1 + a1; 108 a1 = a1 + data.l32.h; 109 b1 = b1 + a1; 110 i++; 111 112 data.l64 = data64[i]; 113 a2 = a2 + data.l32.l; 114 b2 = b2 + a2; 115 a2 = a2 + data.l32.h; 116 b2 = b2 + a2; 117 i++; 118 } 119 checksum->Set(a1, a2, b1, b2); 120 return true; 121 } 122 123 // C implementation of Adler memory copy. 124 bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64, 125 unsigned int size_in_bytes, AdlerChecksum *checksum) { 126 // Use this data wrapper to access memory with 64bit read/write. 127 datacast_t data; 128 unsigned int count = size_in_bytes / sizeof(data); 129 130 if (count > ((1U) << 19)) { 131 // Size is too large, must be strictly less than 512 KB. 132 return false; 133 } 134 135 uint64 a1 = 1; 136 uint64 a2 = 1; 137 uint64 b1 = 0; 138 uint64 b2 = 0; 139 140 unsigned int i = 0; 141 while (i < count) { 142 // Process 64 bits at a time. 143 data.l64 = srcmem64[i]; 144 a1 = a1 + data.l32.l; 145 b1 = b1 + a1; 146 a1 = a1 + data.l32.h; 147 b1 = b1 + a1; 148 dstmem64[i] = data.l64; 149 i++; 150 151 data.l64 = srcmem64[i]; 152 a2 = a2 + data.l32.l; 153 b2 = b2 + a2; 154 a2 = a2 + data.l32.h; 155 b2 = b2 + a2; 156 dstmem64[i] = data.l64; 157 i++; 158 } 159 checksum->Set(a1, a2, b1, b2); 160 return true; 161 } 162 163 // C implementation of Adler memory copy with some float point ops, 164 // attempting to warm up the CPU. 165 bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64, 166 unsigned int size_in_bytes, AdlerChecksum *checksum) { 167 // Use this data wrapper to access memory with 64bit read/write. 168 datacast_t data; 169 unsigned int count = size_in_bytes / sizeof(data); 170 171 if (count > ((1U) << 19)) { 172 // Size is too large, must be strictly less than 512 KB. 
173 return false; 174 } 175 176 uint64 a1 = 1; 177 uint64 a2 = 1; 178 uint64 b1 = 0; 179 uint64 b2 = 0; 180 181 double a = 2.0 * static_cast<double>(srcmem64[0]); 182 double b = 5.0 * static_cast<double>(srcmem64[0]); 183 double c = 7.0 * static_cast<double>(srcmem64[0]); 184 double d = 9.0 * static_cast<double>(srcmem64[0]); 185 186 unsigned int i = 0; 187 while (i < count) { 188 // Process 64 bits at a time. 189 data.l64 = srcmem64[i]; 190 a1 = a1 + data.l32.l; 191 b1 = b1 + a1; 192 a1 = a1 + data.l32.h; 193 b1 = b1 + a1; 194 dstmem64[i] = data.l64; 195 i++; 196 197 // Warm cpu up. 198 a = a * b; 199 b = b + c; 200 201 data.l64 = srcmem64[i]; 202 a2 = a2 + data.l32.l; 203 b2 = b2 + a2; 204 a2 = a2 + data.l32.h; 205 b2 = b2 + a2; 206 dstmem64[i] = data.l64; 207 i++; 208 209 // Warm cpu up. 210 c = c * d; 211 d = d + d; 212 } 213 214 // Warm cpu up. 215 d = a + b + c + d; 216 if (d == 1.0) { 217 // Reference the result so that it can't be discarded by the compiler. 218 printf("Log: This will probably never happen.\n"); 219 } 220 221 checksum->Set(a1, a2, b1, b2); 222 return true; 223 } 224 225 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy. 226 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, 227 unsigned int size_in_bytes, AdlerChecksum *checksum) { 228 // Use assembly implementation where supported. 229 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) 230 231 // Pull a bit of tricky preprocessing to make the inline asm both 232 // 32 bit and 64 bit. 233 #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86... 234 #define rAX "%%eax" 235 #define rCX "%%ecx" 236 #define rDX "%%edx" 237 #define rBX "%%ebx" 238 #define rSP "%%esp" 239 #define rBP "%%ebp" 240 #define rSI "%%esi" 241 #define rDI "%%edi" 242 #endif 243 244 #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros. 
245 #define rAX "%%rax" 246 #define rCX "%%rcx" 247 #define rDX "%%rdx" 248 #define rBX "%%rbx" 249 #define rSP "%%rsp" 250 #define rBP "%%rbp" 251 #define rSI "%%rsi" 252 #define rDI "%%rdi" 253 #endif 254 255 // Elements 0 to 3 are used for holding checksum terms a1, a2, 256 // b1, b2 respectively. These elements are filled by asm code. 257 // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing 258 // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are 259 // setting word index 0 and word index 2 to zero). 260 // Element 6 and 7 are used for setting a1 and a2 to 1. 261 volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = 262 {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; 263 264 if ((size_in_bytes >> 19) > 0) { 265 // Size is too large. Must be less than 2^19 bytes = 512 KB. 266 return false; 267 } 268 269 // Number of 32-bit words which are not added to a1/a2 in the main loop. 270 uint32 remaining_words = (size_in_bytes % 48) / 4; 271 272 // Since we are moving 48 bytes at a time number of iterations = total size/48 273 // is value of counter. 274 uint32 num_of_48_byte_units = size_in_bytes / 48; 275 276 asm volatile ( 277 // Source address is in ESI (extended source index) 278 // destination is in EDI (extended destination index) 279 // and counter is already in ECX (extended counter 280 // index). 281 "cmp $0, " rCX ";" // Compare counter to zero. 282 "jz END;" 283 284 // XMM6 is initialized with 1 and XMM7 with 0. 285 "prefetchnta 0(" rSI ");" 286 "prefetchnta 64(" rSI ");" 287 "movdqu 48(" rAX "), %%xmm6;" 288 "xorps %%xmm7, %%xmm7;" 289 290 // Start of the loop which copies 48 bytes from source to dst each time. 291 "TOP:\n" 292 293 // Make 6 moves each of 16 bytes from srcmem to XMM registers. 
294 // We are using 2 words out of 4 words in each XMM register, 295 // word index 0 and word index 2 296 "movdqa 0(" rSI "), %%xmm0;" 297 "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here. 298 "movdqa 16(" rSI "), %%xmm2;" 299 "movdqu 20(" rSI "), %%xmm3;" 300 "movdqa 32(" rSI "), %%xmm4;" 301 "movdqu 36(" rSI "), %%xmm5;" 302 303 // Move 3 * 16 bytes from XMM registers to dstmem. 304 // Note: this copy must be performed before pinsrw instructions since 305 // they will modify the XMM registers. 306 "movntdq %%xmm0, 0(" rDI ");" 307 "movntdq %%xmm2, 16(" rDI ");" 308 "movntdq %%xmm4, 32(" rDI ");" 309 310 // Sets the word[1] and word[3] of XMM0 to XMM5 to zero. 311 "andps 32(" rAX "), %%xmm0;" 312 "andps 32(" rAX "), %%xmm1;" 313 "andps 32(" rAX "), %%xmm2;" 314 "andps 32(" rAX "), %%xmm3;" 315 "andps 32(" rAX "), %%xmm4;" 316 "andps 32(" rAX "), %%xmm5;" 317 318 // Add XMM0 to XMM6 and then add XMM6 to XMM7. 319 // Repeat this for XMM1, ..., XMM5. 320 // Overflow(for XMM7) can occur only if there are more 321 // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so 322 // if size_in_bytes > 2^19 than overflow occurs. 323 "paddq %%xmm0, %%xmm6;" 324 "paddq %%xmm6, %%xmm7;" 325 "paddq %%xmm1, %%xmm6;" 326 "paddq %%xmm6, %%xmm7;" 327 "paddq %%xmm2, %%xmm6;" 328 "paddq %%xmm6, %%xmm7;" 329 "paddq %%xmm3, %%xmm6;" 330 "paddq %%xmm6, %%xmm7;" 331 "paddq %%xmm4, %%xmm6;" 332 "paddq %%xmm6, %%xmm7;" 333 "paddq %%xmm5, %%xmm6;" 334 "paddq %%xmm6, %%xmm7;" 335 336 // Increment ESI and EDI by 48 bytes and decrement counter by 1. 337 "add $48, " rSI ";" 338 "add $48, " rDI ";" 339 "prefetchnta 0(" rSI ");" 340 "prefetchnta 64(" rSI ");" 341 "dec " rCX ";" 342 "jnz TOP;" 343 344 // Now only remaining_words 32-bit words are left. 345 // make a loop, add first two words to a1 and next two to a2 (just like 346 // above loop, the only extra thing we are doing is rechecking 347 // rDX (=remaining_words) everytime we add a number to a1/a2. 
348 "REM_IS_STILL_NOT_ZERO:\n" 349 // Unless remaining_words becomes less than 4 words(16 bytes) 350 // there is not much issue and remaining_words will always 351 // be a multiple of four by assumption. 352 "cmp $4, " rDX ";" 353 // In case for some weird reasons if remaining_words becomes 354 // less than 4 but not zero then also break the code and go off to END. 355 "jl END;" 356 // Otherwise just go on and copy data in chunks of 4-words at a time till 357 // whole data (<48 bytes) is copied. 358 "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. 359 360 "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without 361 "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary. 362 363 "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination. 364 "andps 32(" rAX "), %%xmm0;" 365 "andps 32(" rAX "), %%xmm1;" 366 "paddq %%xmm0, %%xmm6;" 367 "paddq %%xmm6, %%xmm7;" 368 "paddq %%xmm1, %%xmm6;" 369 "paddq %%xmm6, %%xmm7;" 370 "add $16, " rSI ";" 371 "add $16, " rDI ";" 372 "sub $4, " rDX ";" 373 // Decrement %rDX by 4 since %rDX is number of 32-bit 374 // words left after considering all 48-byte units. 375 "jmp REM_IS_STILL_NOT_ZERO;" 376 377 "END:\n" 378 // Report checksum values A and B (both right now are two concatenated 379 // 64 bit numbers and have to be converted to 64 bit numbers) 380 // seems like Adler128 (since size of each part is 4 byte rather than 381 // 1 byte). 382 "movdqa %%xmm6, 0(" rAX ");" 383 "movdqa %%xmm7, 16(" rAX ");" 384 "sfence;" 385 386 // No output registers. 387 : 388 // Input registers. 389 : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr), 390 "c" (num_of_48_byte_units), "d" (remaining_words) 391 ); // asm. 
392 393 if (checksum != NULL) { 394 checksum->Set(checksum_arr[0], checksum_arr[1], 395 checksum_arr[2], checksum_arr[3]); 396 } 397 398 // Everything went fine, so return true (this does not mean 399 // that there is no problem with memory this just mean that data was copied 400 // from src to dst and checksum was calculated successfully). 401 return true; 402 #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__) 403 // Elements 0 to 3 are used for holding checksum terms a1, a2, 404 // b1, b2 respectively. These elements are filled by asm code. 405 // Checksum is seeded with the null checksum. 406 volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = 407 {1, 1, 0, 0}; 408 409 if ((size_in_bytes >> 19) > 0) { 410 // Size is too large. Must be less than 2^19 bytes = 512 KB. 411 return false; 412 } 413 414 // Since we are moving 64 bytes at a time number of iterations = total size/64 415 uint32 blocks = size_in_bytes / 64; 416 417 uint64 *dst = dstmem64; 418 uint64 *src = srcmem64; 419 420 #define src_r "r3" 421 #define dst_r "r4" 422 #define blocks_r "r5" 423 #define crc_r "r6" 424 425 asm volatile ( 426 "mov " src_r ", %[src]; \n" 427 "mov " dst_r ", %[dst]; \n" 428 "mov " crc_r ", %[crc]; \n" 429 "mov " blocks_r ", %[blocks]; \n" 430 431 // Loop over block count. 432 "cmp " blocks_r ", #0; \n" // Compare counter to zero. 433 "ble END; \n" 434 435 436 // Preload upcoming cacheline. 437 "pld [" src_r ", #0x0]; \n" 438 "pld [" src_r ", #0x20]; \n" 439 440 // Init checksum 441 "vldm " crc_r ", {q0}; \n" 442 "vmov.i32 q1, #0; \n" 443 444 // Start of the loop which copies 48 bytes from source to dst each time. 445 "TOP: \n" 446 447 // Make 3 moves each of 16 bytes from srcmem to qX registers. 448 // We are using 2 words out of 4 words in each qX register, 449 // word index 0 and word index 2. We'll swizzle them in a bit. 450 // Copy it. 451 "vldm " src_r "!, {q8, q9, q10, q11}; \n" 452 "vstm " dst_r "!, {q8, q9, q10, q11}; \n" 453 454 // Arrange it. 
455 "vmov.i64 q12, #0; \n" 456 "vmov.i64 q13, #0; \n" 457 "vmov.i64 q14, #0; \n" 458 "vmov.i64 q15, #0; \n" 459 // This exchenges words 1,3 in the filled registers with 460 // words 0,2 in the empty registers. 461 "vtrn.32 q8, q12; \n" 462 "vtrn.32 q9, q13; \n" 463 "vtrn.32 q10, q14; \n" 464 "vtrn.32 q11, q15; \n" 465 466 // Sum into q0, then into q1. 467 // Repeat this for q8 - q13. 468 // Overflow can occur only if there are more 469 // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so 470 // if size_in_bytes > 2^19 than overflow occurs. 471 "vadd.i64 q0, q0, q8; \n" 472 "vadd.i64 q1, q1, q0; \n" 473 "vadd.i64 q0, q0, q12; \n" 474 "vadd.i64 q1, q1, q0; \n" 475 "vadd.i64 q0, q0, q9; \n" 476 "vadd.i64 q1, q1, q0; \n" 477 "vadd.i64 q0, q0, q13; \n" 478 "vadd.i64 q1, q1, q0; \n" 479 480 "vadd.i64 q0, q0, q10; \n" 481 "vadd.i64 q1, q1, q0; \n" 482 "vadd.i64 q0, q0, q14; \n" 483 "vadd.i64 q1, q1, q0; \n" 484 "vadd.i64 q0, q0, q11; \n" 485 "vadd.i64 q1, q1, q0; \n" 486 "vadd.i64 q0, q0, q15; \n" 487 "vadd.i64 q1, q1, q0; \n" 488 489 // Increment counter and loop. 490 "sub " blocks_r ", " blocks_r ", #1; \n" 491 "cmp " blocks_r ", #0; \n" // Compare counter to zero. 492 "bgt TOP; \n" 493 494 495 "END:\n" 496 // Report checksum values A and B (both right now are two concatenated 497 // 64 bit numbers and have to be converted to 64 bit numbers) 498 // seems like Adler128 (since size of each part is 4 byte rather than 499 // 1 byte). 500 "vstm " crc_r ", {q0, q1}; \n" 501 502 // Output registers. 503 : 504 // Input registers. 505 : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr) 506 : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15" 507 ); // asm. 
508 509 if (checksum != NULL) { 510 checksum->Set(checksum_arr[0], checksum_arr[1], 511 checksum_arr[2], checksum_arr[3]); 512 } 513 514 // Everything went fine, so return true (this does not mean 515 // that there is no problem with memory this just mean that data was copied 516 // from src to dst and checksum was calculated successfully). 517 return true; 518 #else 519 #warning "No vector copy defined for this architecture." 520 // Fall back to C implementation for anything else. 521 return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); 522 #endif 523 } 524