1 /*
2  * Copyright (c) 2013 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Chris Wilson <chris@chris-wilson.co.uk>
25  *
26  */
27 
28 #include "config.h"
29 
30 #ifdef HAVE_CPUID_H
31 #include <cpuid.h>
32 #else
33 #define __get_cpuid_max(x, y) 0
34 #define __cpuid(level, a, b, c, d) a = b = c = d = 0
35 #define __cpuid_count(level, count, a, b, c, d) a = b = c = d = 0
36 #endif
37 
38 #include "igt_x86.h"
39 #include "igt_aux.h"
40 
41 #include <stdint.h>
42 #include <stdio.h>
43 #include <string.h>
44 
45 /**
46  * SECTION:igt_x86
47  * @short_description: x86 helper library
48  * @title: x86
49  * @include: igt_x86.h
50  */
51 
/* Leaf numbers at which the basic and the extended CPUID ranges start. */
#define BASIC_CPUID 0x0
#define EXTENDED_CPUID 0x80000000

/*
 * Fallback definitions of the CPUID feature bits, used only when the
 * toolchain's <cpuid.h> did not already provide them. They are grouped
 * by the register igt_x86_features() tests them against.
 */

/* CPUID leaf 1, EDX */
#ifndef bit_MMX
#define bit_MMX		(1 << 23)
#endif

#ifndef bit_SSE
#define bit_SSE		(1 << 25)
#endif

#ifndef bit_SSE2
#define bit_SSE2	(1 << 26)
#endif

/* CPUID leaf 1, ECX */
#ifndef bit_SSE3
#define bit_SSE3	(1 << 0)
#endif

#ifndef bit_SSSE3
#define bit_SSSE3	(1 << 9)
#endif

#ifndef bit_SSE4_1
#define bit_SSE4_1	(1 << 19)
#endif

#ifndef bit_SSE4_2
#define bit_SSE4_2	(1 << 20)
#endif

#ifndef bit_OSXSAVE
#define bit_OSXSAVE	(1 << 27)
#endif

#ifndef bit_AVX
#define bit_AVX		(1 << 28)
#endif

#ifndef bit_F16C
#define bit_F16C	(1 << 29)
#endif

/* CPUID leaf 7 (sub-leaf 0), EBX */
#ifndef bit_AVX2
#define bit_AVX2	(1 << 5)
#endif

/*
 * Execute XGETBV for extended control register @xcr, storing the low
 * half of the result (EAX) in @lo and the high half (EDX) in @hi.
 */
#define xgetbv(xcr, lo, hi) \
	__asm__ ("xgetbv" : "=a"(lo), "=d"(hi) : "c" (xcr))

/*
 * Private flag used inside igt_x86_features(): set when XGETBV(0) reports
 * bits 1 and 2 (mask 0x6) both set, i.e. the OS saves XMM and YMM state.
 */
#define has_YMM 0x1
103 
104 #if defined(__x86_64__) || defined(__i386__)
igt_x86_features(void)105 unsigned igt_x86_features(void)
106 {
107 	unsigned max = __get_cpuid_max(BASIC_CPUID, 0);
108 	unsigned eax, ebx, ecx, edx;
109 	unsigned features = 0;
110 	unsigned extra = 0;
111 
112 	if (max >= 1) {
113 		__cpuid(1, eax, ebx, ecx, edx);
114 
115 		if (ecx & bit_SSE3)
116 			features |= SSE3;
117 
118 		if (ecx & bit_SSSE3)
119 			features |= SSSE3;
120 
121 		if (ecx & bit_SSE4_1)
122 			features |= SSE4_1;
123 
124 		if (ecx & bit_SSE4_2)
125 			features |= SSE4_2;
126 
127 		if (ecx & bit_OSXSAVE) {
128 			unsigned int bv_eax, bv_ecx;
129 			xgetbv(0, bv_eax, bv_ecx);
130 			if ((bv_eax & 6) == 6)
131 				extra |= has_YMM;
132 		}
133 
134 		if ((extra & has_YMM) && (ecx & bit_AVX))
135 			features |= AVX;
136 
137 		if (edx & bit_MMX)
138 			features |= MMX;
139 
140 		if (edx & bit_SSE)
141 			features |= SSE;
142 
143 		if (edx & bit_SSE2)
144 			features |= SSE2;
145 
146 		if (ecx & bit_F16C)
147 			features |= F16C;
148 	}
149 
150 	if (max >= 7) {
151 		__cpuid_count(7, 0, eax, ebx, ecx, edx);
152 
153 		if ((extra & has_YMM) && (ebx & bit_AVX2))
154 			features |= AVX2;
155 	}
156 
157 	return features;
158 }
159 
igt_x86_features_to_string(unsigned features,char * line)160 char *igt_x86_features_to_string(unsigned features, char *line)
161 {
162 	char *ret = line;
163 
164 #ifdef __x86_64__
165 	line += sprintf(line, "x86-64");
166 #else
167 	line += sprintf(line, "x86");
168 #endif
169 
170 	if (features & SSE2)
171 		line += sprintf(line, ", sse2");
172 	if (features & SSE3)
173 		line += sprintf(line, ", sse3");
174 	if (features & SSSE3)
175 		line += sprintf(line, ", ssse3");
176 	if (features & SSE4_1)
177 		line += sprintf(line, ", sse4.1");
178 	if (features & SSE4_2)
179 		line += sprintf(line, ", sse4.2");
180 	if (features & AVX)
181 		line += sprintf(line, ", avx");
182 	if (features & AVX2)
183 		line += sprintf(line, ", avx2");
184 	if (features & F16C)
185 		line += sprintf(line, ", f16c");
186 
187 	(void)line;
188 
189 	return ret;
190 }
191 #endif
192 
193 #if defined(__x86_64__) && !defined(__clang__)
194 #pragma GCC push_options
195 #pragma GCC target("sse4.1")
196 #pragma GCC diagnostic ignored "-Wpointer-arith"
197 
198 #include <smmintrin.h>
/*
 * Copy @len bytes from @src to @dst, reading the source exclusively with
 * 16-byte-aligned SSE4.1 streaming loads.
 *
 * The copy proceeds in four phases:
 *   1. a misaligned head of @src is fetched through a bounce buffer so
 *      that every later load is 16-byte aligned;
 *   2. 64-byte chunks, stored with aligned or unaligned stores depending
 *      on the alignment of the destination at that point;
 *   3. remaining whole 16-byte units;
 *   4. a sub-16-byte tail, again through the bounce buffer.
 *
 * Phases 1 and 4 load the complete aligned 16-byte unit that contains the
 * requested bytes, so the source may be read slightly outside
 * [@src, @src + @len).
 */
static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
{
	char bounce[16];
	char *out = dst;
	const char *in = src;
	unsigned long head;

	/* Flush the internal buffer of potential stale gfx data */
	_mm_mfence();

	head = (uintptr_t)in & 15;
	if (head) {
		__m128i *unit = (__m128i *)((uintptr_t)in & ~15);
		unsigned long chunk = min(len, 16 - head);

		_mm_storeu_si128((__m128i *)bounce,
				 _mm_stream_load_si128(unit));

		memcpy(out, bounce + head, chunk);

		out += chunk;
		in += chunk;
		len -= chunk;
	}

	/* We assume we are doing bulk transfers, so prefer aligned moves */
	if (((uintptr_t)out & 15) == 0) {
		for (; len >= 64; in += 64, out += 64, len -= 64) {
			__m128i *from = (__m128i *)in;
			__m128i *to = (__m128i *)out;
			__m128i r0, r1, r2, r3;

			r0 = _mm_stream_load_si128(from + 0);
			r1 = _mm_stream_load_si128(from + 1);
			r2 = _mm_stream_load_si128(from + 2);
			r3 = _mm_stream_load_si128(from + 3);

			_mm_store_si128(to + 0, r0);
			_mm_store_si128(to + 1, r1);
			_mm_store_si128(to + 2, r2);
			_mm_store_si128(to + 3, r3);
		}
	} else {
		for (; len >= 64; in += 64, out += 64, len -= 64) {
			__m128i *from = (__m128i *)in;
			__m128i *to = (__m128i *)out;
			__m128i r0, r1, r2, r3;

			r0 = _mm_stream_load_si128(from + 0);
			r1 = _mm_stream_load_si128(from + 1);
			r2 = _mm_stream_load_si128(from + 2);
			r3 = _mm_stream_load_si128(from + 3);

			_mm_storeu_si128(to + 0, r0);
			_mm_storeu_si128(to + 1, r1);
			_mm_storeu_si128(to + 2, r2);
			_mm_storeu_si128(to + 3, r3);
		}
	}

	/* Whole 16-byte units that did not fill a 64-byte chunk */
	for (; len >= 16; in += 16, out += 16, len -= 16)
		_mm_storeu_si128((__m128i *)out,
				 _mm_stream_load_si128((__m128i *)in));

	/* Partial tail: load the full unit, hand out only @len bytes of it */
	if (len) {
		_mm_storeu_si128((__m128i *)bounce,
				 _mm_stream_load_si128((__m128i *)in));
		memcpy(out, bounce, len);
	}
}
279 
280 #pragma GCC pop_options
281 
/*
 * Generic implementation of igt_memcpy_from_wc(): a plain memcpy() of
 * @len bytes from @src to @dst, selected by the resolver when SSE4.1 is
 * not reported.
 */
static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	(void)memcpy(dst, src, len);
}
286 
resolve_memcpy_from_wc(void)287 static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
288 {
289 	if (igt_x86_features() & SSE4_1)
290 		return memcpy_from_wc_sse41;
291 
292 	return memcpy_from_wc;
293 }
294 
295 void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
296 	__attribute__((ifunc("resolve_memcpy_from_wc")));
297 
298 #else
/**
 * igt_memcpy_from_wc:
 * @dst: destination buffer
 * @src: source buffer
 * @len: number of bytes to copy
 *
 * Fallback used when the SSE4.1 ifunc variant is not compiled in (the
 * build is not x86-64, or the compiler is clang): copies @len bytes from
 * @src to @dst with a plain memcpy().
 */
void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	(void)memcpy(dst, src, len);
}
303 #endif
304