/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };

    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);
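    /*
     * The four 128-bit accumulators x1..x4 have now been folded down into
     * x1. Each fold step multiplies the two 64-bit halves of the running
     * remainder by constants derived from x^N mod P (via PCLMULQDQ) and
     * XORs the products into the data further along, deferring the final
     * reduction modulo the CRC polynomial until the end of the buffer, as
     * described in the Intel paper cited above.
     */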
    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to
 * #define those builtins directly. If this #define list grows too much (or
 * we depend on an intrinsic that isn't a trivial wrapper), we may have to
 * find a better way to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__clang__) && !defined(__GNUC__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
    uint32_t c = (uint32_t) ~crc;

    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}

#endif
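
/*
 * Usage sketch (illustrative only, not part of the library's API): callers
 * are expected to reach these routines through a generic crc32() wrapper.
 * The SSE4.2+PCLMUL routine requires a length that is at least 64 and a
 * multiple of 16, and it operates on the pre-inverted CRC register, so a
 * caller would mask the length down, apply zlib's usual ~crc pre/post
 * conditioning, and finish any tail bytes with a scalar implementation.
 * The ARMv8 routine performs the inversion itself and accepts any length.
 * crc32_scalar() is a hypothetical stand-in for whatever table-driven
 * fallback the surrounding library provides.
 *
 *   uint32_t crc32_dispatch(uint32_t crc, const unsigned char *buf,
 *                           z_size_t len)
 *   {
 *   #if defined(CRC32_SIMD_SSE42_PCLMUL)
 *       if (len >= 64) {
 *           z_size_t chunk = len & ~(z_size_t)15;   // multiple of 16, >= 64
 *           crc = ~crc32_sse42_simd_(buf, chunk, ~crc);
 *           buf += chunk;
 *           len -= chunk;
 *       }
 *       return crc32_scalar(crc, buf, len);         // hypothetical scalar tail
 *   #elif defined(CRC32_ARMV8_CRC32)
 *       return armv8_crc32_little(crc, buf, len);
 *   #else
 *       return crc32_scalar(crc, buf, len);         // hypothetical fallback
 *   #endif
 *   }
 */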