1 /*
2 * Copyright (C) 2019 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <stdio.h>
18 #include <xmmintrin.h>
19
20 #include <algorithm>
21 #include <iterator>
22 #include <memory>
23 #include <optional>
24 #include <string>
25 #include <tuple>
26 #include <type_traits>
27 #include <vector>
28
29 #include "berberis/base/checks.h"
30 #include "berberis/base/config.h"
31 #include "berberis/intrinsics/common_to_x86/intrinsics_bindings.h"
32 #include "berberis/intrinsics/intrinsics_args.h"
33 #include "berberis/intrinsics/intrinsics_float.h"
34 #include "berberis/intrinsics/macro_assembler.h"
35 #include "berberis/intrinsics/simd_register.h"
36 #include "berberis/intrinsics/type_traits.h"
37
38 #include "text_assembler.h"
39
40 namespace berberis {
41
namespace constants_pool {

// Note: in the 32-bit world kBerberisMacroAssemblerConstantsRelocated is the same as the
// original, unrelocated version. In the 64-bit world it is a copy placed in the first 2GiB.
//
// The builder itself may be built as a 64-bit binary, so the two must never be mixed.
//
// Note: CHECK_*_LAYOUT tests in macro_assembler_common_x86.cc make sure that offsets
// produced by a 64-bit builder are usable in 32-bit libberberis.so.

extern const int32_t kBerberisMacroAssemblerConstantsRelocated;

// Returns |address| expressed relative to kBerberisMacroAssemblerConstantsRelocated.
int32_t GetOffset(int32_t address) {
  const int32_t pool_base = kBerberisMacroAssemblerConstantsRelocated;
  return address - pool_base;
}

}  // namespace constants_pool
59
// Forward declarations of the generator helpers that are defined further down in this file.
// They are declared up front so GenerateFunctionHeader/GenerateFunctionBody can call them
// before their definitions. Every helper is a template over AsmCallInfo, the type that is
// queried for the intrinsic name, bindings and argument lists.

// Prints declarations of the "outN" variables.
template <typename AsmCallInfo>
void GenerateOutputVariables(FILE* out, int indent);
// Prints declarations of the "tmpN" variables.
template <typename AsmCallInfo>
void GenerateTemporaries(FILE* out, int indent);
// Prints declarations (and initialization) of "_shadow" variables.
template <typename AsmCallInfo>
void GenerateInShadows(FILE* out, int indent);
// Fills |register_numbers| with operand numbers.
template <typename AsmCallInfo>
void AssignRegisterNumbers(int* register_numbers);
// Runs the macro instruction on the text assembler; returns a tuple of two "need ..." flags.
template <typename AsmCallInfo>
auto CallTextAssembler(FILE* out, int indent, int* register_numbers);
// Prints the output operands line of the asm statement.
template <typename AsmCallInfo>
void GenerateAssemblerOuts(FILE* out, int indent);
// Prints the input operands line of the asm statement.
template <typename AsmCallInfo>
void GenerateAssemblerIns(FILE* out,
                          int indent,
                          int* register_numbers,
                          bool need_gpr_macroassembler_scratch,
                          bool need_gpr_macroassembler_constants);
// Prints copies from "_shadow" variables back to the "outN" variables.
template <typename AsmCallInfo>
void GenerateOutShadows(FILE* out, int indent);
// Prints |prefix|, the comma-separated |elements| and |suffix|, wrapping long lists.
template <typename AsmCallInfo>
void GenerateElementsList(FILE* out,
                          int indent,
                          const std::string& prefix,
                          const std::string& suffix,
                          const std::vector<std::string>& elements);
// Compile-time predicates: does the binding |arg| need an input/output shadow variable?
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedInputShadow(Arg arg);
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedOutputShadow(Arg arg);
90
91 template <typename AsmCallInfo>
GenerateFunctionHeader(FILE * out,int indent)92 void GenerateFunctionHeader(FILE* out, int indent) {
93 if (strchr(AsmCallInfo::kIntrinsic, '<')) {
94 fprintf(out, "template <>\n");
95 }
96 std::string prefix;
97 if constexpr (std::tuple_size_v<typename AsmCallInfo::OutputArguments> == 0) {
98 prefix = "inline void " + std::string(AsmCallInfo::kIntrinsic) + "(";
99 } else {
100 const char* prefix_of_prefix = "inline std::tuple<";
101 for (const char* type_name : AsmCallInfo::OutputArgumentsTypeNames) {
102 prefix += prefix_of_prefix + std::string(type_name);
103 prefix_of_prefix = ", ";
104 }
105 prefix += "> " + std::string(AsmCallInfo::kIntrinsic) + "(";
106 }
107 std::vector<std::string> ins;
108 for (const char* type_name : AsmCallInfo::InputArgumentsTypeNames) {
109 ins.push_back(std::string(type_name) + " in" + std::to_string(ins.size()));
110 }
111 GenerateElementsList<AsmCallInfo>(out, indent, prefix, ") {", ins);
112 fprintf(out,
113 " [[maybe_unused]] alignas(berberis::config::kScratchAreaAlign)"
114 " uint8_t scratch[berberis::config::kScratchAreaSize];\n");
115 fprintf(out,
116 " [[maybe_unused]] auto& scratch2 ="
117 " scratch[berberis::config::kScratchAreaSlotSize];\n");
118 }
119
120 template <typename AsmCallInfo>
GenerateFunctionBody(FILE * out,int indent)121 void GenerateFunctionBody(FILE* out, int indent) {
122 // Declare out variables.
123 GenerateOutputVariables<AsmCallInfo>(out, indent);
124 // Declare temporary variables.
125 GenerateTemporaries<AsmCallInfo>(out, indent);
126 // We need "shadow variables" for ins of types: Float32, Float64 and SIMD128Register.
127 // This is because assembler does not accept these arguments for XMMRegisters and
128 // we couldn't use "float"/"double" function arguments because if ABI issues.
129 GenerateInShadows<AsmCallInfo>(out, indent);
130 // Even if we don't pass any registers we need to allocate at least one element.
131 int register_numbers[std::tuple_size_v<typename AsmCallInfo::Bindings> == 0
132 ? 1
133 : std::tuple_size_v<typename AsmCallInfo::Bindings>];
134 // Assign numbers to registers - we need to pass them to assembler and then, later,
135 // to Generator of Input Variable line.
136 AssignRegisterNumbers<AsmCallInfo>(register_numbers);
137 // Print opening line for asm call.
138 if constexpr (AsmCallInfo::kSideEffects) {
139 fprintf(out, "%*s__asm__ __volatile__(\n", indent, "");
140 } else {
141 fprintf(out, "%*s__asm__(\n", indent, "");
142 }
143 // Call text assembler to produce the body of an asm call.
144 auto [need_gpr_macroassembler_scratch, need_gpr_macroassembler_constants] =
145 CallTextAssembler<AsmCallInfo>(out, indent, register_numbers);
146 // Assembler instruction outs.
147 GenerateAssemblerOuts<AsmCallInfo>(out, indent);
148 // Assembler instruction ins.
149 GenerateAssemblerIns<AsmCallInfo>(out,
150 indent,
151 register_numbers,
152 need_gpr_macroassembler_scratch,
153 need_gpr_macroassembler_constants);
154 // Close asm call.
155 fprintf(out, "%*s);\n", indent, "");
156 // Generate copies from shadows to outputs.
157 GenerateOutShadows<AsmCallInfo>(out, indent);
158 // Return value from function.
159 if constexpr (std::tuple_size_v<typename AsmCallInfo::OutputArguments> > 0) {
160 std::vector<std::string> outs;
161 for (std::size_t id = 0; id < std::tuple_size_v<typename AsmCallInfo::OutputArguments>; ++id) {
162 outs.push_back("out" + std::to_string(id));
163 }
164 GenerateElementsList<AsmCallInfo>(out, indent, "return {", "};", outs);
165 }
166 }
167
// Prints one declaration "<type> outN;" for every output argument type name listed in
// AsmCallInfo::OutputArgumentsTypeNames; N starts at 0 and follows the list order.
//
// out:    stream receiving the generated text.
// indent: indentation (in columns) of every declaration.
template <typename AsmCallInfo>
void GenerateOutputVariables(FILE* out, int indent) {
  std::size_t id = 0;
  for (const char* type_name : AsmCallInfo::OutputArgumentsTypeNames) {
    // |id| is a std::size_t, so the matching conversion is %zu (%zd expects the signed type).
    fprintf(out, "%*s%s out%zu;\n", indent, "", type_name, id++);
  }
}
175
176 template <typename AsmCallInfo>
GenerateTemporaries(FILE * out,int indent)177 void GenerateTemporaries(FILE* out, int indent) {
178 std::size_t id = 0;
179 AsmCallInfo::ProcessBindings([out, &id, indent](auto arg) {
180 using RegisterClass = typename decltype(arg)::RegisterClass;
181 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
182 if constexpr (!HaveInput(arg.arg_info) && !HaveOutput(arg.arg_info)) {
183 static_assert(
184 std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Def> ||
185 std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::DefEarlyClobber>);
186 fprintf(out,
187 "%*s%s tmp%zd;\n",
188 indent,
189 "",
190 TypeTraits<typename RegisterClass::Type>::kName,
191 id++);
192 }
193 }
194 });
195 }
196
197 template <typename AsmCallInfo>
GenerateInShadows(FILE * out,int indent)198 void GenerateInShadows(FILE* out, int indent) {
199 AsmCallInfo::ProcessBindings([out, indent](auto arg) {
200 using RegisterClass = typename decltype(arg)::RegisterClass;
201 if constexpr (RegisterClass::kAsRegister == 'm') {
202 // Only temporary memory scratch area is supported.
203 static_assert(!HaveInput(arg.arg_info) && !HaveOutput(arg.arg_info));
204 } else if constexpr (RegisterClass::kAsRegister == 'r') {
205 // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
206 if constexpr (NeedInputShadow<AsmCallInfo>(arg)) {
207 fprintf(out, "%2$*1$suint32_t in%3$d_shadow = in%3$d;\n", indent, "", arg.arg_info.from);
208 }
209 if constexpr (NeedOutputShadow<AsmCallInfo>(arg)) {
210 fprintf(out, "%*suint32_t out%d_shadow;\n", indent, "", arg.arg_info.to);
211 }
212 } else if constexpr (RegisterClass::kAsRegister == 'x') {
213 if constexpr (HaveInput(arg.arg_info)) {
214 using Type = std::tuple_element_t<arg.arg_info.from, typename AsmCallInfo::InputArguments>;
215 const char* type_name = TypeTraits<Type>::kName;
216 const char* xmm_type_name;
217 const char* expanded = "";
218 // Types allowed for 'x' restriction are float, double and __m128/__m128i/__m128d
219 // First two work for {,u}int32_t and {,u}int64_t, but small integer types must be expanded.
220 if constexpr (std::is_integral_v<Type> && sizeof(Type) < sizeof(int32_t)) {
221 fprintf(
222 out, "%2$*1$suint32_t in%3$d_expanded = in%3$d;\n", indent, "", arg.arg_info.from);
223 type_name = TypeTraits<uint32_t>::kName;
224 xmm_type_name =
225 TypeTraits<typename TypeTraits<typename TypeTraits<uint32_t>::Float>::Raw>::kName;
226 expanded = "_expanded";
227 } else if constexpr (std::is_integral_v<Type>) {
228 // {,u}int32_t and {,u}int64_t have to be converted to float/double.
229 xmm_type_name =
230 TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
231 } else {
232 // Float32/Float64 can not be used, we need to use raw float/double.
233 xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
234 }
235 fprintf(out, "%*s%s in%d_shadow;\n", indent, "", xmm_type_name, arg.arg_info.from);
236 fprintf(out,
237 "%*sstatic_assert(sizeof(%s) == sizeof(%s));\n",
238 indent,
239 "",
240 type_name,
241 xmm_type_name);
242 // Note: it's not safe to use bit_cast here till we have std::bit_cast from C++20.
243 // If optimizer wouldn't be enabled (e.g. if code is compiled with -O0) then bit_cast
244 // would use %st on 32-bit platform which destroys NaNs.
245 fprintf(out,
246 "%2$*1$smemcpy(&in%3$d_shadow, &in%3$d%4$s, sizeof(%5$s));\n",
247 indent,
248 "",
249 arg.arg_info.from,
250 expanded,
251 xmm_type_name);
252 }
253 if constexpr (HaveOutput(arg.arg_info)) {
254 using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
255 const char* xmm_type_name;
256 // {,u}int32_t and {,u}int64_t have to be converted to float/double.
257 if constexpr (std::is_integral_v<Type>) {
258 xmm_type_name =
259 TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
260 } else {
261 // Float32/Float64 can not be used, we need to use raw float/double.
262 xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
263 }
264 fprintf(out, "%*s%s out%d_shadow;\n", indent, "", xmm_type_name, arg.arg_info.to);
265 }
266 }
267 });
268 }
269
270 template <typename AsmCallInfo>
AssignRegisterNumbers(int * register_numbers)271 void AssignRegisterNumbers(int* register_numbers) {
272 // Assign number for output (and temporary) arguments.
273 std::size_t id = 0;
274 int arg_counter = 0;
275 AsmCallInfo::ProcessBindings([&id, &arg_counter, ®ister_numbers](auto arg) {
276 using RegisterClass = typename decltype(arg)::RegisterClass;
277 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
278 if constexpr (!std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
279 register_numbers[arg_counter] = id++;
280 }
281 ++arg_counter;
282 }
283 });
284 // Assign numbers for input arguments.
285 arg_counter = 0;
286 AsmCallInfo::ProcessBindings([&id, &arg_counter, ®ister_numbers](auto arg) {
287 using RegisterClass = typename decltype(arg)::RegisterClass;
288 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
289 if constexpr (std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
290 register_numbers[arg_counter] = id++;
291 }
292 ++arg_counter;
293 }
294 });
295 }
296
297 template <typename AsmCallInfo>
CallTextAssembler(FILE * out,int indent,int * register_numbers)298 auto CallTextAssembler(FILE* out, int indent, int* register_numbers) {
299 MacroAssembler<TextAssembler> as(indent, out);
300 int arg_counter = 0;
301 AsmCallInfo::ProcessBindings([&arg_counter, &as, register_numbers](auto arg) {
302 using RegisterClass = typename decltype(arg)::RegisterClass;
303 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
304 if constexpr (RegisterClass::kAsRegister != 'm') {
305 if constexpr (RegisterClass::kIsImplicitReg) {
306 if constexpr (RegisterClass::kAsRegister == 'a') {
307 as.gpr_a = TextAssembler::Register(register_numbers[arg_counter]);
308 } else if constexpr (RegisterClass::kAsRegister == 'c') {
309 as.gpr_c = TextAssembler::Register(register_numbers[arg_counter]);
310 } else {
311 static_assert(RegisterClass::kAsRegister == 'd');
312 as.gpr_d = TextAssembler::Register(register_numbers[arg_counter]);
313 }
314 }
315 }
316 ++arg_counter;
317 }
318 });
319 as.gpr_macroassembler_constants = TextAssembler::Register(arg_counter);
320 arg_counter = 0;
321 int scratch_counter = 0;
322 std::apply(AsmCallInfo::kMacroInstruction,
323 std::tuple_cat(
324 std::tuple<MacroAssembler<TextAssembler>&>{as},
325 AsmCallInfo::MakeTuplefromBindings(
326 [&as, &arg_counter, &scratch_counter, register_numbers](auto arg) {
327 using RegisterClass = typename decltype(arg)::RegisterClass;
328 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
329 if constexpr (RegisterClass::kAsRegister == 'm') {
330 if (scratch_counter == 0) {
331 as.gpr_macroassembler_scratch = TextAssembler::Register(arg_counter++);
332 } else if (scratch_counter == 1) {
333 as.gpr_macroassembler_scratch2 =
334 TextAssembler::Register(arg_counter++);
335 } else {
336 FATAL("Only two scratch registers are supported for now");
337 }
338 // Note: as.gpr_scratch in combination with offset is treated by text
339 // assembler specially. We rely on offset set here to be the same as
340 // scratch2 address in scratch buffer.
341 return std::tuple{TextAssembler::Operand{
342 .base = as.gpr_scratch,
343 .disp = static_cast<int32_t>(config::kScratchAreaSlotSize *
344 scratch_counter++)}};
345 } else if constexpr (RegisterClass::kIsImplicitReg) {
346 ++arg_counter;
347 return std::tuple{};
348 } else {
349 return std::tuple{register_numbers[arg_counter++]};
350 }
351 } else {
352 return std::tuple{};
353 }
354 })));
355 // Verify CPU vendor and SSE restrictions.
356 bool expect_avx = false;
357 bool expect_bmi = false;
358 bool expect_fma = false;
359 bool expect_fma4 = false;
360 bool expect_lzcnt = false;
361 bool expect_popcnt = false;
362 bool expect_sse3 = false;
363 bool expect_ssse3 = false;
364 bool expect_sse4_1 = false;
365 bool expect_sse4_2 = false;
366 switch (AsmCallInfo::kCPUIDRestriction) {
367 case intrinsics::bindings::kHasBMI:
368 expect_bmi = true;
369 break;
370 case intrinsics::bindings::kHasLZCNT:
371 expect_lzcnt = true;
372 break;
373 case intrinsics::bindings::kHasPOPCNT:
374 expect_popcnt = true;
375 break;
376 case intrinsics::bindings::kHasFMA:
377 case intrinsics::bindings::kHasFMA4:
378 if (AsmCallInfo::kCPUIDRestriction == intrinsics::bindings::kHasFMA) {
379 expect_fma = true;
380 } else {
381 expect_fma4 = true;
382 }
383 [[fallthrough]];
384 case intrinsics::bindings::kHasAVX:
385 expect_avx = true;
386 [[fallthrough]];
387 case intrinsics::bindings::kHasSSE4_2:
388 expect_sse4_2 = true;
389 [[fallthrough]];
390 case intrinsics::bindings::kHasSSE4_1:
391 expect_sse4_1 = true;
392 [[fallthrough]];
393 case intrinsics::bindings::kHasSSSE3:
394 expect_ssse3 = true;
395 [[fallthrough]];
396 case intrinsics::bindings::kHasSSE3:
397 expect_sse3 = true;
398 [[fallthrough]];
399 case intrinsics::bindings::kIsAuthenticAMD:
400 case intrinsics::bindings::kNoCPUIDRestriction:; // Do nothing - make compiler happy.
401 }
402 CHECK_EQ(expect_avx, as.need_avx);
403 CHECK_EQ(expect_bmi, as.need_bmi);
404 CHECK_EQ(expect_fma, as.need_fma);
405 CHECK_EQ(expect_fma4, as.need_fma4);
406 CHECK_EQ(expect_lzcnt, as.need_lzcnt);
407 CHECK_EQ(expect_popcnt, as.need_popcnt);
408 CHECK_EQ(expect_sse3, as.need_sse3);
409 CHECK_EQ(expect_ssse3, as.need_ssse3);
410 CHECK_EQ(expect_sse4_1, as.need_sse4_1);
411 CHECK_EQ(expect_sse4_2, as.need_sse4_2);
412 return std::tuple{as.need_gpr_macroassembler_scratch(), as.need_gpr_macroassembler_constants()};
413 }
414
415 template <typename AsmCallInfo>
GenerateAssemblerOuts(FILE * out,int indent)416 void GenerateAssemblerOuts(FILE* out, int indent) {
417 std::vector<std::string> outs;
418 int tmp_id = 0;
419 AsmCallInfo::ProcessBindings([&outs, &tmp_id](auto arg) {
420 using RegisterClass = typename decltype(arg)::RegisterClass;
421 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS> &&
422 !std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
423 std::string out = "\"=";
424 if constexpr (std::is_same_v<typename decltype(arg)::Usage,
425 intrinsics::bindings::DefEarlyClobber>) {
426 out += "&";
427 }
428 out += RegisterClass::kAsRegister;
429 if constexpr (HaveOutput(arg.arg_info)) {
430 bool need_shadow = NeedOutputShadow<AsmCallInfo>(arg);
431 out += "\"(out" + std::to_string(arg.arg_info.to) + (need_shadow ? "_shadow)" : ")");
432 } else if constexpr (HaveInput(arg.arg_info)) {
433 bool need_shadow = NeedInputShadow<AsmCallInfo>(arg);
434 out += "\"(in" + std::to_string(arg.arg_info.from) + (need_shadow ? "_shadow)" : ")");
435 } else {
436 out += "\"(tmp" + std::to_string(tmp_id++) + ")";
437 }
438 outs.push_back(out);
439 }
440 });
441 GenerateElementsList<AsmCallInfo>(out, indent, " : ", "", outs);
442 }
443
444 template <typename AsmCallInfo>
GenerateAssemblerIns(FILE * out,int indent,int * register_numbers,bool need_gpr_macroassembler_scratch,bool need_gpr_macroassembler_constants)445 void GenerateAssemblerIns(FILE* out,
446 int indent,
447 int* register_numbers,
448 bool need_gpr_macroassembler_scratch,
449 bool need_gpr_macroassembler_constants) {
450 std::vector<std::string> ins;
451 AsmCallInfo::ProcessBindings([&ins](auto arg) {
452 using RegisterClass = typename decltype(arg)::RegisterClass;
453 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS> &&
454 std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
455 ins.push_back("\"" + std::string(1, RegisterClass::kAsRegister) + "\"(in" +
456 std::to_string(arg.arg_info.from) +
457 (NeedInputShadow<AsmCallInfo>(arg) ? "_shadow)" : ")"));
458 }
459 });
460 if (need_gpr_macroassembler_scratch) {
461 ins.push_back("\"m\"(scratch), \"m\"(scratch2)");
462 }
463 if (need_gpr_macroassembler_constants) {
464 ins.push_back(
465 "\"m\"(*reinterpret_cast<const char*>(&constants_pool::kBerberisMacroAssemblerConstants))");
466 }
467 int arg_counter = 0;
468 AsmCallInfo::ProcessBindings([&ins, &arg_counter, register_numbers](auto arg) {
469 using RegisterClass = typename decltype(arg)::RegisterClass;
470 if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
471 if constexpr (HaveInput(arg.arg_info) &&
472 !std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
473 ins.push_back("\"" + std::to_string(register_numbers[arg_counter]) + "\"(in" +
474 std::to_string(arg.arg_info.from) +
475 (NeedInputShadow<AsmCallInfo>(arg) ? "_shadow)" : ")"));
476 }
477 ++arg_counter;
478 }
479 });
480 GenerateElementsList<AsmCallInfo>(out, indent, " : ", "", ins);
481 }
482
483 template <typename AsmCallInfo>
GenerateOutShadows(FILE * out,int indent)484 void GenerateOutShadows(FILE* out, int indent) {
485 AsmCallInfo::ProcessBindings([out, indent](auto arg) {
486 using RegisterClass = typename decltype(arg)::RegisterClass;
487 if constexpr (RegisterClass::kAsRegister == 'r') {
488 // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
489 if constexpr (HaveOutput(arg.arg_info)) {
490 using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
491 if constexpr (sizeof(Type) == sizeof(uint8_t)) {
492 fprintf(out, "%2$*1$sout%3$d = out%3$d_shadow;\n", indent, "", arg.arg_info.to);
493 }
494 }
495 } else if constexpr (RegisterClass::kAsRegister == 'x') {
496 if constexpr (HaveOutput(arg.arg_info)) {
497 using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
498 const char* type_name = TypeTraits<Type>::kName;
499 const char* xmm_type_name;
500 // {,u}int32_t and {,u}int64_t have to be converted to float/double.
501 if constexpr (std::is_integral_v<Type>) {
502 xmm_type_name =
503 TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
504 } else {
505 // Float32/Float64 can not be used, we need to use raw float/double.
506 xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
507 }
508 fprintf(out,
509 "%*sstatic_assert(sizeof(%s) == sizeof(%s));\n",
510 indent,
511 "",
512 type_name,
513 xmm_type_name);
514 // Note: it's not safe to use bit_cast here till we have std::bit_cast from C++20.
515 // If optimizer wouldn't be enabled (e.g. if code is compiled with -O0) then bit_cast
516 // would use %st on 32-bit platform which destroys NaNs.
517 fprintf(out,
518 "%2$*1$smemcpy(&out%3$d, &out%3$d_shadow, sizeof(%4$s));\n",
519 indent,
520 "",
521 arg.arg_info.to,
522 xmm_type_name);
523 }
524 }
525 });
526 }
527
// Prints |prefix|, then |elements| separated by commas, then |suffix| and a newline.
//
// The list stays on one line when the combined length of prefix, suffix and all elements
// (each counted with two extra characters for its separator) is at most 102. Otherwise
// every element after the first one starts a new line and is aligned with the first one.
//
// out:      stream receiving the generated text.
// indent:   indentation (in columns) of the first line.
// prefix:   text printed before the first element.
// suffix:   text printed after the last element.
// elements: list items, may be empty.
template <typename AsmCallInfo>
void GenerateElementsList(FILE* out,
                          int indent,
                          const std::string& prefix,
                          const std::string& suffix,
                          const std::vector<std::string>& elements) {
  // An empty list degenerates to prefix immediately followed by suffix.
  if (elements.empty()) {
    fprintf(out, "%*s%s%s\n", indent, "", prefix.c_str(), suffix.c_str());
    return;
  }

  std::size_t total_length = prefix.length() + suffix.length();
  for (const std::string& item : elements) {
    total_length += item.length() + 2;
  }
  const bool fits_on_one_line = total_length <= 102;
  // Wrapped elements are aligned with the column right after the prefix.
  const int continuation_indent = static_cast<int>(prefix.length()) + indent;

  fprintf(out, "%*s%s%s", indent, "", prefix.c_str(), elements.front().c_str());
  for (std::size_t index = 1; index < elements.size(); ++index) {
    if (fits_on_one_line) {
      fprintf(out, ", %s", elements[index].c_str());
    } else {
      fprintf(out, ",\n%*s%s", continuation_indent, "", elements[index].c_str());
    }
  }
  fprintf(out, "%s\n", suffix.c_str());
}
555
// Tells whether the input of the binding |arg| must be passed to the asm statement through
// a "_shadow" variable. That is the case for every 'x' class binding and for 'r' class
// bindings whose input argument is one byte wide.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedInputShadow(Arg arg) {
  using RegisterClass = typename Arg::RegisterClass;
  // Without shadow clang silently converts 'r' restriction into 'q' restriction which is
  // wrong: if %ah or %bh is picked the result would be incorrect.
  // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
  if constexpr (RegisterClass::kAsRegister == 'r' && HaveInput(arg.arg_info)) {
    // Only 8-bit registers are special because some 16-bit registers include two of them
    // (%al/%ah, %cl/%ch, %dl/%dh, %bl/%bh).
    // A mix of 16-bit and 64-bit registers doesn't trigger the bug in Clang.
    using InputType =
        std::tuple_element_t<arg.arg_info.from, typename AsmCallInfo::InputArguments>;
    return sizeof(InputType) == sizeof(uint8_t);
  } else {
    return RegisterClass::kAsRegister == 'x';
  }
}
576
// Tells whether the output of the binding |arg| must be received from the asm statement
// through a "_shadow" variable. That is the case for every 'x' class binding and for 'r'
// class bindings whose output argument is one byte wide.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedOutputShadow(Arg arg) {
  using RegisterClass = typename Arg::RegisterClass;
  // Without shadow clang silently converts 'r' restriction into 'q' restriction which is
  // wrong: if %ah or %bh is picked the result would be incorrect.
  // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
  if constexpr (RegisterClass::kAsRegister == 'r' && HaveOutput(arg.arg_info)) {
    // Only 8-bit registers are special because some 16-bit registers include two of them
    // (%al/%ah, %cl/%ch, %dl/%dh, %bl/%bh).
    // A mix of 16-bit and 64-bit registers doesn't trigger the bug in Clang.
    using OutputType =
        std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
    return sizeof(OutputType) == sizeof(uint8_t);
  } else {
    return RegisterClass::kAsRegister == 'x';
  }
}
597
598 #include "text_asm_intrinsics_process_bindings-inl.h"
599
GenerateTextAsmIntrinsics(FILE * out)600 void GenerateTextAsmIntrinsics(FILE* out) {
601 intrinsics::bindings::CPUIDRestriction cpuid_restriction =
602 intrinsics::bindings::kNoCPUIDRestriction;
603 bool if_opened = false;
604 std::string running_name;
605 ProcessAllBindings<TextAssemblerX86<TextAssembler>,
606 TextAssembler,
607 MacroAssembler<TextAssembler>::MacroAssemblers>(
608 [&running_name, &if_opened, &cpuid_restriction, out](auto&& asm_call_generator) {
609 using AsmCallInfo = std::decay_t<decltype(asm_call_generator)>;
610 std::string full_name = std::string(asm_call_generator.kIntrinsic,
611 std::strlen(asm_call_generator.kIntrinsic) - 1) +
612 ", kUseCppImplementation>";
613 if (size_t arguments_count = std::tuple_size_v<typename AsmCallInfo::InputArguments>) {
614 full_name += "(in0";
615 for (size_t i = 1; i < arguments_count; ++i) {
616 full_name += ", in" + std::to_string(i);
617 }
618 full_name += ")";
619 } else {
620 full_name += "()";
621 }
622 if (full_name != running_name) {
623 if (if_opened) {
624 if (cpuid_restriction != intrinsics::bindings::kNoCPUIDRestriction) {
625 fprintf(out, " } else {\n return %s;\n", running_name.c_str());
626 cpuid_restriction = intrinsics::bindings::kNoCPUIDRestriction;
627 }
628 if_opened = false;
629 fprintf(out, " }\n");
630 }
631 // Final line of function.
632 if (!running_name.empty()) {
633 fprintf(out, "};\n\n");
634 }
635 GenerateFunctionHeader<AsmCallInfo>(out, 0);
636 running_name = full_name;
637 }
638 if (asm_call_generator.kCPUIDRestriction != cpuid_restriction) {
639 if (asm_call_generator.kCPUIDRestriction == intrinsics::bindings::kNoCPUIDRestriction) {
640 fprintf(out, " } else {\n");
641 } else {
642 if (if_opened) {
643 fprintf(out, " } else if (");
644 } else {
645 fprintf(out, " if (");
646 if_opened = true;
647 }
648 switch (asm_call_generator.kCPUIDRestriction) {
649 default:
650 // Unsupported CPUID value.
651 CHECK(false);
652 case intrinsics::bindings::kIsAuthenticAMD:
653 fprintf(out, "host_platform::kIsAuthenticAMD");
654 break;
655 case intrinsics::bindings::kHasAVX:
656 fprintf(out, "host_platform::kHasAVX");
657 break;
658 case intrinsics::bindings::kHasBMI:
659 fprintf(out, "host_platform::kHasBMI");
660 break;
661 case intrinsics::bindings::kHasFMA:
662 fprintf(out, "host_platform::kHasFMA");
663 break;
664 case intrinsics::bindings::kHasFMA4:
665 fprintf(out, "host_platform::kHasFMA4");
666 break;
667 case intrinsics::bindings::kHasLZCNT:
668 fprintf(out, "host_platform::kHasLZCNT");
669 break;
670 case intrinsics::bindings::kHasPOPCNT:
671 fprintf(out, "host_platform::kHasPOPCNT");
672 break;
673 case intrinsics::bindings::kHasSSE3:
674 fprintf(out, "host_platform::kHasSSE3");
675 break;
676 case intrinsics::bindings::kHasSSSE3:
677 fprintf(out, "host_platform::kHasSSSE3");
678 break;
679 case intrinsics::bindings::kHasSSE4_1:
680 fprintf(out, "host_platform::kHasSSE4_1");
681 break;
682 case intrinsics::bindings::kHasSSE4_2:
683 fprintf(out, "host_platform::kHasSSE4_2");
684 break;
685 case intrinsics::bindings::kNoCPUIDRestriction:; // Do nothing - make compiler happy.
686 }
687 fprintf(out, ") {\n");
688 }
689 cpuid_restriction = asm_call_generator.kCPUIDRestriction;
690 }
691 GenerateFunctionBody<AsmCallInfo>(out, 2 + 2 * if_opened);
692 });
693 if (if_opened) {
694 fprintf(out, " }\n");
695 }
696 // Final line of function.
697 fprintf(out, "};\n\n");
698 }
699
700 } // namespace berberis
701
main(int argc,char * argv[])702 int main(int argc, char* argv[]) {
703 FILE* out = argc > 1 ? fopen(argv[1], "w") : stdout;
704 fprintf(out,
705 R"STRING(
706 // This file automatically generated by make_intrinsics.cc
707 // DO NOT EDIT!
708
709 #ifndef %2$s_%3$s_INTRINSICS_INTRINSICS_H_
710 #define %2$s_%3$s_INTRINSICS_INTRINSICS_H_
711
712 #include <xmmintrin.h>
713
714 #include "berberis/base/config.h"
715 #include "berberis/runtime_primitives/platform.h"
716 #include "%3$s/intrinsics/%1$s/intrinsics.h"
717 #include "%3$s/intrinsics/vector_intrinsics.h"
718
719 namespace berberis::constants_pool {
720
721 struct MacroAssemblerConstants;
722
723 extern const MacroAssemblerConstants kBerberisMacroAssemblerConstants
724 __attribute__((visibility("hidden")));
725
726 } // namespace berberis::constants_pool
727
728 namespace %3$s {
729
730 namespace constants_pool {
731
732 %4$s
733
734 } // namespace constants_pool
735
736 namespace intrinsics {
737 )STRING",
738 berberis::TextAssembler::kArchName,
739 berberis::TextAssembler::kArchGuard,
740 berberis::TextAssembler::kNamespaceName,
741 strcmp(berberis::TextAssembler::kNamespaceName, "berberis")
742 ? "using berberis::constants_pool::kBerberisMacroAssemblerConstants;"
743 : "");
744
745 berberis::GenerateTextAsmIntrinsics(out);
746 berberis::MakeExtraGuestFunctions(out);
747
748 fprintf(out,
749 R"STRING(
750 } // namespace intrinsics
751
752 } // namespace %2$s
753
754 #endif /* %1$s_%2$s_INTRINSICS_INTRINSICS_H_ */
755 )STRING",
756 berberis::TextAssembler::kArchGuard,
757 berberis::TextAssembler::kNamespaceName);
758
759 fclose(out);
760 return 0;
761 }
762