1 /* crc32_simd.c
2  *
3  * Copyright 2017 The Chromium Authors. All rights reserved.
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the Chromium source repository LICENSE file.
6  */
7 
#include "crc32_simd.h"

#include <string.h>
9 
10 #if defined(CRC32_SIMD_SSE42_PCLMUL)
11 
12 /*
13  * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
14  * length must be at least 64, and a multiple of 16. Based on:
15  *
16  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
17  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
18  */
19 
20 #include <emmintrin.h>
21 #include <smmintrin.h>
22 #include <wmmintrin.h>
23 
24 uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
25     const unsigned char *buf,
26     z_size_t len,
27     uint32_t crc)
28 {
29     /*
30      * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
31      * the CRC32+Barrett polynomials given at the end of the paper.
32      */
33     static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
34     static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
35     static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
36     static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
37 
38     __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
39 
40     /*
41      * There's at least one block of 64.
42      */
43     x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
44     x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
45     x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
46     x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
47 
48     x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
49 
50     x0 = _mm_load_si128((__m128i *)k1k2);
51 
52     buf += 64;
53     len -= 64;
54 
55     /*
56      * Parallel fold blocks of 64, if any.
57      */
58     while (len >= 64)
59     {
60         x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
61         x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
62         x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
63         x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
64 
65         x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
66         x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
67         x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
68         x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
69 
70         y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
71         y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
72         y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
73         y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
74 
75         x1 = _mm_xor_si128(x1, x5);
76         x2 = _mm_xor_si128(x2, x6);
77         x3 = _mm_xor_si128(x3, x7);
78         x4 = _mm_xor_si128(x4, x8);
79 
80         x1 = _mm_xor_si128(x1, y5);
81         x2 = _mm_xor_si128(x2, y6);
82         x3 = _mm_xor_si128(x3, y7);
83         x4 = _mm_xor_si128(x4, y8);
84 
85         buf += 64;
86         len -= 64;
87     }
88 
89     /*
90      * Fold into 128-bits.
91      */
92     x0 = _mm_load_si128((__m128i *)k3k4);
93 
94     x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
95     x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
96     x1 = _mm_xor_si128(x1, x2);
97     x1 = _mm_xor_si128(x1, x5);
98 
99     x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
100     x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
101     x1 = _mm_xor_si128(x1, x3);
102     x1 = _mm_xor_si128(x1, x5);
103 
104     x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
105     x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
106     x1 = _mm_xor_si128(x1, x4);
107     x1 = _mm_xor_si128(x1, x5);
108 
109     /*
110      * Single fold blocks of 16, if any.
111      */
112     while (len >= 16)
113     {
114         x2 = _mm_loadu_si128((__m128i *)buf);
115 
116         x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
117         x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
118         x1 = _mm_xor_si128(x1, x2);
119         x1 = _mm_xor_si128(x1, x5);
120 
121         buf += 16;
122         len -= 16;
123     }
124 
125     /*
126      * Fold 128-bits to 64-bits.
127      */
128     x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
129     x3 = _mm_setr_epi32(~0, 0, ~0, 0);
130     x1 = _mm_srli_si128(x1, 8);
131     x1 = _mm_xor_si128(x1, x2);
132 
133     x0 = _mm_loadl_epi64((__m128i*)k5k0);
134 
135     x2 = _mm_srli_si128(x1, 4);
136     x1 = _mm_and_si128(x1, x3);
137     x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
138     x1 = _mm_xor_si128(x1, x2);
139 
140     /*
141      * Barret reduce to 32-bits.
142      */
143     x0 = _mm_load_si128((__m128i*)poly);
144 
145     x2 = _mm_and_si128(x1, x3);
146     x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
147     x2 = _mm_and_si128(x2, x3);
148     x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
149     x1 = _mm_xor_si128(x1, x2);
150 
151     /*
152      * Return the crc32.
153      */
154     return _mm_extract_epi32(x1, 1);
155 }
156 
157 #elif defined(CRC32_ARMV8_CRC32)
158 
/* CRC32 checksums using the ARMv8-a CRC32 extension instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */
163 
#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementation of
 * those are just wrappers around compiler builtins, it's simplest to #define
 * those builtins directly. If this #define list grows too much (or we depend on
 * an intrinsic that isn't a trivial wrapper), we may have to find a better way
 * to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

/* Function-level attribute that enables the CRC extension for just the
 * function it decorates (see the note on function-level targets above).
 */
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
/* Expands to nothing: no per-function attribute is needed on this path. */
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__clang__) && !defined(__GNUC__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif
201 
202 TARGET_ARMV8_WITH_CRC
203 uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
204                                           const unsigned char *buf,
205                                           z_size_t len)
206 {
207     uint32_t c = (uint32_t) ~crc;
208 
209     while (len && ((uintptr_t)buf & 7)) {
210         c = __crc32b(c, *buf++);
211         --len;
212     }
213 
214     const uint64_t *buf8 = (const uint64_t *)buf;
215 
216     while (len >= 64) {
217         c = __crc32d(c, *buf8++);
218         c = __crc32d(c, *buf8++);
219         c = __crc32d(c, *buf8++);
220         c = __crc32d(c, *buf8++);
221 
222         c = __crc32d(c, *buf8++);
223         c = __crc32d(c, *buf8++);
224         c = __crc32d(c, *buf8++);
225         c = __crc32d(c, *buf8++);
226         len -= 64;
227     }
228 
229     while (len >= 8) {
230         c = __crc32d(c, *buf8++);
231         len -= 8;
232     }
233 
234     buf = (const unsigned char *)buf8;
235 
236     while (len--) {
237         c = __crc32b(c, *buf++);
238     }
239 
240     return ~c;
241 }
242 
243 #endif
244