1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

#include <algorithm>
#include <cstring>
#include <iterator>
#include <memory>
#include <optional>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>
28 
29 #include "berberis/base/checks.h"
30 #include "berberis/base/config.h"
31 #include "berberis/intrinsics/common_to_x86/intrinsics_bindings.h"
32 #include "berberis/intrinsics/intrinsics_args.h"
33 #include "berberis/intrinsics/intrinsics_float.h"
34 #include "berberis/intrinsics/macro_assembler.h"
35 #include "berberis/intrinsics/simd_register.h"
36 #include "berberis/intrinsics/type_traits.h"
37 
38 #include "text_assembler.h"
39 
40 namespace berberis {
41 
42 namespace constants_pool {
43 
// Note: kBerberisMacroAssemblerConstantsRelocated is the same as the original,
// unrelocated version in the 32-bit world.  But in the 64-bit world it is a copy
// in the first 2GiB.
//
// Our builder could be built as a 64-bit binary, thus we must not mix them.
//
// Note: we have CHECK_*_LAYOUT tests in macro_assembler_common_x86.cc to make sure
// offsets produced by a 64-bit builder are usable in the 32-bit libberberis.so.
51 
52 extern const int32_t kBerberisMacroAssemblerConstantsRelocated;
53 
GetOffset(int32_t address)54 int32_t GetOffset(int32_t address) {
55   return address - constants_pool::kBerberisMacroAssemblerConstantsRelocated;
56 }
57 
58 }  // namespace constants_pool
59 
// Forward declarations of the generator helpers.  They are defined further down in
// this file, after GenerateFunctionHeader and GenerateFunctionBody, which call them.

// Declares the outN variables of the generated function.
template <typename AsmCallInfo>
void GenerateOutputVariables(FILE* out, int indent);
// Declares the tmpN variables of the generated function.
template <typename AsmCallInfo>
void GenerateTemporaries(FILE* out, int indent);
// Declares (and, for inputs, fills) the inN_shadow / outN_shadow variables.
template <typename AsmCallInfo>
void GenerateInShadows(FILE* out, int indent);
// Fills |register_numbers| with the asm operand number of every non-FLAGS binding.
template <typename AsmCallInfo>
void AssignRegisterNumbers(int* register_numbers);
// Runs the macro instruction through the text assembler; returns a tuple of
// {scratch needed, constants needed}.
template <typename AsmCallInfo>
auto CallTextAssembler(FILE* out, int indent, int* register_numbers);
// Prints the output-operand list of the asm statement.
template <typename AsmCallInfo>
void GenerateAssemblerOuts(FILE* out, int indent);
// Prints the input-operand list of the asm statement.
template <typename AsmCallInfo>
void GenerateAssemblerIns(FILE* out,
                          int indent,
                          int* register_numbers,
                          bool need_gpr_macroassembler_scratch,
                          bool need_gpr_macroassembler_constants);
// Copies outN_shadow variables back into outN.
template <typename AsmCallInfo>
void GenerateOutShadows(FILE* out, int indent);
// Prints |prefix|, the comma-separated |elements| and |suffix|, wrapping long lists.
template <typename AsmCallInfo>
void GenerateElementsList(FILE* out,
                          int indent,
                          const std::string& prefix,
                          const std::string& suffix,
                          const std::vector<std::string>& elements);
// Tells whether the input attached to |arg| has to go through an inN_shadow variable.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedInputShadow(Arg arg);
// Tells whether the output attached to |arg| has to go through an outN_shadow variable.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedOutputShadow(Arg arg);
90 
91 template <typename AsmCallInfo>
GenerateFunctionHeader(FILE * out,int indent)92 void GenerateFunctionHeader(FILE* out, int indent) {
93   if (strchr(AsmCallInfo::kIntrinsic, '<')) {
94     fprintf(out, "template <>\n");
95   }
96   std::string prefix;
97   if constexpr (std::tuple_size_v<typename AsmCallInfo::OutputArguments> == 0) {
98     prefix = "inline void " + std::string(AsmCallInfo::kIntrinsic) + "(";
99   } else {
100     const char* prefix_of_prefix = "inline std::tuple<";
101     for (const char* type_name : AsmCallInfo::OutputArgumentsTypeNames) {
102       prefix += prefix_of_prefix + std::string(type_name);
103       prefix_of_prefix = ", ";
104     }
105     prefix += "> " + std::string(AsmCallInfo::kIntrinsic) + "(";
106   }
107   std::vector<std::string> ins;
108   for (const char* type_name : AsmCallInfo::InputArgumentsTypeNames) {
109     ins.push_back(std::string(type_name) + " in" + std::to_string(ins.size()));
110   }
111   GenerateElementsList<AsmCallInfo>(out, indent, prefix, ") {", ins);
112   fprintf(out,
113           "  [[maybe_unused]]  alignas(berberis::config::kScratchAreaAlign)"
114           " uint8_t scratch[berberis::config::kScratchAreaSize];\n");
115   fprintf(out,
116           "  [[maybe_unused]] auto& scratch2 ="
117           " scratch[berberis::config::kScratchAreaSlotSize];\n");
118 }
119 
120 template <typename AsmCallInfo>
GenerateFunctionBody(FILE * out,int indent)121 void GenerateFunctionBody(FILE* out, int indent) {
122   // Declare out variables.
123   GenerateOutputVariables<AsmCallInfo>(out, indent);
124   // Declare temporary variables.
125   GenerateTemporaries<AsmCallInfo>(out, indent);
126   // We need "shadow variables" for ins of types: Float32, Float64 and SIMD128Register.
127   // This is because assembler does not accept these arguments for XMMRegisters and
128   // we couldn't use "float"/"double" function arguments because if ABI issues.
129   GenerateInShadows<AsmCallInfo>(out, indent);
130   // Even if we don't pass any registers we need to allocate at least one element.
131   int register_numbers[std::tuple_size_v<typename AsmCallInfo::Bindings> == 0
132                            ? 1
133                            : std::tuple_size_v<typename AsmCallInfo::Bindings>];
134   // Assign numbers to registers - we need to pass them to assembler and then, later,
135   // to Generator of Input Variable line.
136   AssignRegisterNumbers<AsmCallInfo>(register_numbers);
137   // Print opening line for asm call.
138   if constexpr (AsmCallInfo::kSideEffects) {
139     fprintf(out, "%*s__asm__ __volatile__(\n", indent, "");
140   } else {
141     fprintf(out, "%*s__asm__(\n", indent, "");
142   }
143   // Call text assembler to produce the body of an asm call.
144   auto [need_gpr_macroassembler_scratch, need_gpr_macroassembler_constants] =
145       CallTextAssembler<AsmCallInfo>(out, indent, register_numbers);
146   // Assembler instruction outs.
147   GenerateAssemblerOuts<AsmCallInfo>(out, indent);
148   // Assembler instruction ins.
149   GenerateAssemblerIns<AsmCallInfo>(out,
150                                     indent,
151                                     register_numbers,
152                                     need_gpr_macroassembler_scratch,
153                                     need_gpr_macroassembler_constants);
154   // Close asm call.
155   fprintf(out, "%*s);\n", indent, "");
156   // Generate copies from shadows to outputs.
157   GenerateOutShadows<AsmCallInfo>(out, indent);
158   // Return value from function.
159   if constexpr (std::tuple_size_v<typename AsmCallInfo::OutputArguments> > 0) {
160     std::vector<std::string> outs;
161     for (std::size_t id = 0; id < std::tuple_size_v<typename AsmCallInfo::OutputArguments>; ++id) {
162       outs.push_back("out" + std::to_string(id));
163     }
164     GenerateElementsList<AsmCallInfo>(out, indent, "return {", "};", outs);
165   }
166 }
167 
// Declares one local per output argument in the generated function:
// "<type name> out<N>;", with N counting up from zero in the order of
// AsmCallInfo::OutputArgumentsTypeNames.
template <typename AsmCallInfo>
void GenerateOutputVariables(FILE* out, int indent) {
  std::size_t id = 0;
  for (const char* type_name : AsmCallInfo::OutputArgumentsTypeNames) {
    // |id| is std::size_t, which is unsigned: the matching conversion is %zu
    // (%zd is reserved for the signed counterpart).
    fprintf(out, "%*s%s out%zu;\n", indent, "", type_name, id++);
  }
}
175 
176 template <typename AsmCallInfo>
GenerateTemporaries(FILE * out,int indent)177 void GenerateTemporaries(FILE* out, int indent) {
178   std::size_t id = 0;
179   AsmCallInfo::ProcessBindings([out, &id, indent](auto arg) {
180     using RegisterClass = typename decltype(arg)::RegisterClass;
181     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
182       if constexpr (!HaveInput(arg.arg_info) && !HaveOutput(arg.arg_info)) {
183         static_assert(
184             std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Def> ||
185             std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::DefEarlyClobber>);
186         fprintf(out,
187                 "%*s%s tmp%zd;\n",
188                 indent,
189                 "",
190                 TypeTraits<typename RegisterClass::Type>::kName,
191                 id++);
192       }
193     }
194   });
195 }
196 
197 template <typename AsmCallInfo>
GenerateInShadows(FILE * out,int indent)198 void GenerateInShadows(FILE* out, int indent) {
199   AsmCallInfo::ProcessBindings([out, indent](auto arg) {
200     using RegisterClass = typename decltype(arg)::RegisterClass;
201     if constexpr (RegisterClass::kAsRegister == 'm') {
202       // Only temporary memory scratch area is supported.
203       static_assert(!HaveInput(arg.arg_info) && !HaveOutput(arg.arg_info));
204     } else if constexpr (RegisterClass::kAsRegister == 'r') {
205       // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
206       if constexpr (NeedInputShadow<AsmCallInfo>(arg)) {
207         fprintf(out, "%2$*1$suint32_t in%3$d_shadow = in%3$d;\n", indent, "", arg.arg_info.from);
208       }
209       if constexpr (NeedOutputShadow<AsmCallInfo>(arg)) {
210         fprintf(out, "%*suint32_t out%d_shadow;\n", indent, "", arg.arg_info.to);
211       }
212     } else if constexpr (RegisterClass::kAsRegister == 'x') {
213       if constexpr (HaveInput(arg.arg_info)) {
214         using Type = std::tuple_element_t<arg.arg_info.from, typename AsmCallInfo::InputArguments>;
215         const char* type_name = TypeTraits<Type>::kName;
216         const char* xmm_type_name;
217         const char* expanded = "";
218         // Types allowed for 'x' restriction are float, double and __m128/__m128i/__m128d
219         // First two work for {,u}int32_t and {,u}int64_t, but small integer types must be expanded.
220         if constexpr (std::is_integral_v<Type> && sizeof(Type) < sizeof(int32_t)) {
221           fprintf(
222               out, "%2$*1$suint32_t in%3$d_expanded = in%3$d;\n", indent, "", arg.arg_info.from);
223           type_name = TypeTraits<uint32_t>::kName;
224           xmm_type_name =
225               TypeTraits<typename TypeTraits<typename TypeTraits<uint32_t>::Float>::Raw>::kName;
226           expanded = "_expanded";
227         } else if constexpr (std::is_integral_v<Type>) {
228           // {,u}int32_t and {,u}int64_t have to be converted to float/double.
229           xmm_type_name =
230               TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
231         } else {
232           // Float32/Float64 can not be used, we need to use raw float/double.
233           xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
234         }
235         fprintf(out, "%*s%s in%d_shadow;\n", indent, "", xmm_type_name, arg.arg_info.from);
236         fprintf(out,
237                 "%*sstatic_assert(sizeof(%s) == sizeof(%s));\n",
238                 indent,
239                 "",
240                 type_name,
241                 xmm_type_name);
242         // Note: it's not safe to use bit_cast here till we have std::bit_cast from C++20.
243         // If optimizer wouldn't be enabled (e.g. if code is compiled with -O0) then bit_cast
244         // would use %st on 32-bit platform which destroys NaNs.
245         fprintf(out,
246                 "%2$*1$smemcpy(&in%3$d_shadow, &in%3$d%4$s, sizeof(%5$s));\n",
247                 indent,
248                 "",
249                 arg.arg_info.from,
250                 expanded,
251                 xmm_type_name);
252       }
253       if constexpr (HaveOutput(arg.arg_info)) {
254         using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
255         const char* xmm_type_name;
256         // {,u}int32_t and {,u}int64_t have to be converted to float/double.
257         if constexpr (std::is_integral_v<Type>) {
258           xmm_type_name =
259               TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
260         } else {
261           // Float32/Float64 can not be used, we need to use raw float/double.
262           xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
263         }
264         fprintf(out, "%*s%s out%d_shadow;\n", indent, "", xmm_type_name, arg.arg_info.to);
265       }
266     }
267   });
268 }
269 
270 template <typename AsmCallInfo>
AssignRegisterNumbers(int * register_numbers)271 void AssignRegisterNumbers(int* register_numbers) {
272   // Assign number for output (and temporary) arguments.
273   std::size_t id = 0;
274   int arg_counter = 0;
275   AsmCallInfo::ProcessBindings([&id, &arg_counter, &register_numbers](auto arg) {
276     using RegisterClass = typename decltype(arg)::RegisterClass;
277     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
278       if constexpr (!std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
279         register_numbers[arg_counter] = id++;
280       }
281       ++arg_counter;
282     }
283   });
284   // Assign numbers for input arguments.
285   arg_counter = 0;
286   AsmCallInfo::ProcessBindings([&id, &arg_counter, &register_numbers](auto arg) {
287     using RegisterClass = typename decltype(arg)::RegisterClass;
288     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
289       if constexpr (std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
290         register_numbers[arg_counter] = id++;
291       }
292       ++arg_counter;
293     }
294   });
295 }
296 
// Invokes AsmCallInfo::kMacroInstruction on a MacroAssembler<TextAssembler> constructed
// from (indent, out), passing it one argument per binding as described below.
//
// Returns std::tuple{as.need_gpr_macroassembler_scratch(),
//                    as.need_gpr_macroassembler_constants()}.
//
// CHECK-fails when the need_* flags reported by the assembler differ from the set of
// features implied by AsmCallInfo::kCPUIDRestriction.
template <typename AsmCallInfo>
auto CallTextAssembler(FILE* out, int indent, int* register_numbers) {
  MacroAssembler<TextAssembler> as(indent, out);
  // First pass: implicit registers ('a', 'c', 'd') are not passed as arguments; the
  // assembler learns their operand numbers through gpr_a/gpr_c/gpr_d instead.
  // |arg_counter| indexes |register_numbers| and counts non-FLAGS bindings only.
  int arg_counter = 0;
  AsmCallInfo::ProcessBindings([&arg_counter, &as, register_numbers](auto arg) {
    using RegisterClass = typename decltype(arg)::RegisterClass;
    if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
      if constexpr (RegisterClass::kAsRegister != 'm') {
        if constexpr (RegisterClass::kIsImplicitReg) {
          if constexpr (RegisterClass::kAsRegister == 'a') {
            as.gpr_a = TextAssembler::Register(register_numbers[arg_counter]);
          } else if constexpr (RegisterClass::kAsRegister == 'c') {
            as.gpr_c = TextAssembler::Register(register_numbers[arg_counter]);
          } else {
            static_assert(RegisterClass::kAsRegister == 'd');
            as.gpr_d = TextAssembler::Register(register_numbers[arg_counter]);
          }
        }
      }
      ++arg_counter;
    }
  });
  // At this point |arg_counter| equals the number of non-FLAGS bindings; that number is
  // used for the constants operand.
  as.gpr_macroassembler_constants = TextAssembler::Register(arg_counter);
  // Second pass: build the argument tuple for the macro instruction.  Each binding
  // contributes zero or one element:
  //   FLAGS and implicit registers - nothing;
  //   'm' (scratch memory)         - an Operand based on as.gpr_scratch;
  //   everything else              - its number from |register_numbers|.
  arg_counter = 0;
  int scratch_counter = 0;
  std::apply(AsmCallInfo::kMacroInstruction,
             std::tuple_cat(
                 std::tuple<MacroAssembler<TextAssembler>&>{as},
                 AsmCallInfo::MakeTuplefromBindings(
                     [&as, &arg_counter, &scratch_counter, register_numbers](auto arg) {
                       using RegisterClass = typename decltype(arg)::RegisterClass;
                       if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
                         if constexpr (RegisterClass::kAsRegister == 'm') {
                           // The first 'm' binding sets the scratch register, the second
                           // one sets scratch2; a third one is a fatal error.
                           if (scratch_counter == 0) {
                             as.gpr_macroassembler_scratch = TextAssembler::Register(arg_counter++);
                           } else if (scratch_counter == 1) {
                             as.gpr_macroassembler_scratch2 =
                                 TextAssembler::Register(arg_counter++);
                           } else {
                             FATAL("Only two scratch registers are supported for now");
                           }
                           // Note: as.gpr_scratch in combination with offset is treated by text
                           // assembler specially.  We rely on offset set here to be the same as
                           // scratch2 address in scratch buffer.
                           return std::tuple{TextAssembler::Operand{
                               .base = as.gpr_scratch,
                               .disp = static_cast<int32_t>(config::kScratchAreaSlotSize *
                                                            scratch_counter++)}};
                         } else if constexpr (RegisterClass::kIsImplicitReg) {
                           // Implicit registers were handed over in the first pass; only
                           // keep |arg_counter| in step.
                           ++arg_counter;
                           return std::tuple{};
                         } else {
                           return std::tuple{register_numbers[arg_counter++]};
                         }
                       } else {
                         return std::tuple{};
                       }
                     })));
  // Verify CPU vendor and SSE restrictions.
  bool expect_avx = false;
  bool expect_bmi = false;
  bool expect_fma = false;
  bool expect_fma4 = false;
  bool expect_lzcnt = false;
  bool expect_popcnt = false;
  bool expect_sse3 = false;
  bool expect_ssse3 = false;
  bool expect_sse4_1 = false;
  bool expect_sse4_2 = false;
  // BMI, LZCNT and POPCNT stand alone.  The remaining cases form a chain through
  // [[fallthrough]]: FMA/FMA4 -> AVX -> SSE4.2 -> SSE4.1 -> SSSE3 -> SSE3, so a
  // restriction also sets the expectation for everything after it in the chain.
  switch (AsmCallInfo::kCPUIDRestriction) {
    case intrinsics::bindings::kHasBMI:
      expect_bmi = true;
      break;
    case intrinsics::bindings::kHasLZCNT:
      expect_lzcnt = true;
      break;
    case intrinsics::bindings::kHasPOPCNT:
      expect_popcnt = true;
      break;
    case intrinsics::bindings::kHasFMA:
    case intrinsics::bindings::kHasFMA4:
      if (AsmCallInfo::kCPUIDRestriction == intrinsics::bindings::kHasFMA) {
        expect_fma = true;
      } else {
        expect_fma4 = true;
      }
      [[fallthrough]];
    case intrinsics::bindings::kHasAVX:
      expect_avx = true;
      [[fallthrough]];
    case intrinsics::bindings::kHasSSE4_2:
      expect_sse4_2 = true;
      [[fallthrough]];
    case intrinsics::bindings::kHasSSE4_1:
      expect_sse4_1 = true;
      [[fallthrough]];
    case intrinsics::bindings::kHasSSSE3:
      expect_ssse3 = true;
      [[fallthrough]];
    case intrinsics::bindings::kHasSSE3:
      expect_sse3 = true;
      [[fallthrough]];
    case intrinsics::bindings::kIsAuthenticAMD:
    case intrinsics::bindings::kNoCPUIDRestriction:;  // Do nothing - make compiler happy.
  }
  // Each expectation must match exactly what the assembler reported.
  CHECK_EQ(expect_avx, as.need_avx);
  CHECK_EQ(expect_bmi, as.need_bmi);
  CHECK_EQ(expect_fma, as.need_fma);
  CHECK_EQ(expect_fma4, as.need_fma4);
  CHECK_EQ(expect_lzcnt, as.need_lzcnt);
  CHECK_EQ(expect_popcnt, as.need_popcnt);
  CHECK_EQ(expect_sse3, as.need_sse3);
  CHECK_EQ(expect_ssse3, as.need_ssse3);
  CHECK_EQ(expect_sse4_1, as.need_sse4_1);
  CHECK_EQ(expect_sse4_2, as.need_sse4_2);
  return std::tuple{as.need_gpr_macroassembler_scratch(), as.need_gpr_macroassembler_constants()};
}
414 
415 template <typename AsmCallInfo>
GenerateAssemblerOuts(FILE * out,int indent)416 void GenerateAssemblerOuts(FILE* out, int indent) {
417   std::vector<std::string> outs;
418   int tmp_id = 0;
419   AsmCallInfo::ProcessBindings([&outs, &tmp_id](auto arg) {
420     using RegisterClass = typename decltype(arg)::RegisterClass;
421     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS> &&
422                   !std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
423       std::string out = "\"=";
424       if constexpr (std::is_same_v<typename decltype(arg)::Usage,
425                                    intrinsics::bindings::DefEarlyClobber>) {
426         out += "&";
427       }
428       out += RegisterClass::kAsRegister;
429       if constexpr (HaveOutput(arg.arg_info)) {
430         bool need_shadow = NeedOutputShadow<AsmCallInfo>(arg);
431         out += "\"(out" + std::to_string(arg.arg_info.to) + (need_shadow ? "_shadow)" : ")");
432       } else if constexpr (HaveInput(arg.arg_info)) {
433         bool need_shadow = NeedInputShadow<AsmCallInfo>(arg);
434         out += "\"(in" + std::to_string(arg.arg_info.from) + (need_shadow ? "_shadow)" : ")");
435       } else {
436         out += "\"(tmp" + std::to_string(tmp_id++) + ")";
437       }
438       outs.push_back(out);
439     }
440   });
441   GenerateElementsList<AsmCallInfo>(out, indent, "  : ", "", outs);
442 }
443 
444 template <typename AsmCallInfo>
GenerateAssemblerIns(FILE * out,int indent,int * register_numbers,bool need_gpr_macroassembler_scratch,bool need_gpr_macroassembler_constants)445 void GenerateAssemblerIns(FILE* out,
446                           int indent,
447                           int* register_numbers,
448                           bool need_gpr_macroassembler_scratch,
449                           bool need_gpr_macroassembler_constants) {
450   std::vector<std::string> ins;
451   AsmCallInfo::ProcessBindings([&ins](auto arg) {
452     using RegisterClass = typename decltype(arg)::RegisterClass;
453     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS> &&
454                   std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
455       ins.push_back("\"" + std::string(1, RegisterClass::kAsRegister) + "\"(in" +
456                     std::to_string(arg.arg_info.from) +
457                     (NeedInputShadow<AsmCallInfo>(arg) ? "_shadow)" : ")"));
458     }
459   });
460   if (need_gpr_macroassembler_scratch) {
461     ins.push_back("\"m\"(scratch), \"m\"(scratch2)");
462   }
463   if (need_gpr_macroassembler_constants) {
464     ins.push_back(
465         "\"m\"(*reinterpret_cast<const char*>(&constants_pool::kBerberisMacroAssemblerConstants))");
466   }
467   int arg_counter = 0;
468   AsmCallInfo::ProcessBindings([&ins, &arg_counter, register_numbers](auto arg) {
469     using RegisterClass = typename decltype(arg)::RegisterClass;
470     if constexpr (!std::is_same_v<RegisterClass, intrinsics::bindings::FLAGS>) {
471       if constexpr (HaveInput(arg.arg_info) &&
472                     !std::is_same_v<typename decltype(arg)::Usage, intrinsics::bindings::Use>) {
473         ins.push_back("\"" + std::to_string(register_numbers[arg_counter]) + "\"(in" +
474                       std::to_string(arg.arg_info.from) +
475                       (NeedInputShadow<AsmCallInfo>(arg) ? "_shadow)" : ")"));
476       }
477       ++arg_counter;
478     }
479   });
480   GenerateElementsList<AsmCallInfo>(out, indent, "  : ", "", ins);
481 }
482 
483 template <typename AsmCallInfo>
GenerateOutShadows(FILE * out,int indent)484 void GenerateOutShadows(FILE* out, int indent) {
485   AsmCallInfo::ProcessBindings([out, indent](auto arg) {
486     using RegisterClass = typename decltype(arg)::RegisterClass;
487     if constexpr (RegisterClass::kAsRegister == 'r') {
488       // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
489       if constexpr (HaveOutput(arg.arg_info)) {
490         using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
491         if constexpr (sizeof(Type) == sizeof(uint8_t)) {
492           fprintf(out, "%2$*1$sout%3$d = out%3$d_shadow;\n", indent, "", arg.arg_info.to);
493         }
494       }
495     } else if constexpr (RegisterClass::kAsRegister == 'x') {
496       if constexpr (HaveOutput(arg.arg_info)) {
497         using Type = std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
498         const char* type_name = TypeTraits<Type>::kName;
499         const char* xmm_type_name;
500         // {,u}int32_t and {,u}int64_t have to be converted to float/double.
501         if constexpr (std::is_integral_v<Type>) {
502           xmm_type_name =
503               TypeTraits<typename TypeTraits<typename TypeTraits<Type>::Float>::Raw>::kName;
504         } else {
505           // Float32/Float64 can not be used, we need to use raw float/double.
506           xmm_type_name = TypeTraits<typename TypeTraits<Type>::Raw>::kName;
507         }
508         fprintf(out,
509                 "%*sstatic_assert(sizeof(%s) == sizeof(%s));\n",
510                 indent,
511                 "",
512                 type_name,
513                 xmm_type_name);
514         // Note: it's not safe to use bit_cast here till we have std::bit_cast from C++20.
515         // If optimizer wouldn't be enabled (e.g. if code is compiled with -O0) then bit_cast
516         // would use %st on 32-bit platform which destroys NaNs.
517         fprintf(out,
518                 "%2$*1$smemcpy(&out%3$d, &out%3$d_shadow, sizeof(%4$s));\n",
519                 indent,
520                 "",
521                 arg.arg_info.to,
522                 xmm_type_name);
523       }
524     }
525   });
526 }
527 
// Prints |prefix|, the comma-separated |elements| and |suffix| as one logical list,
// indented by |indent| spaces and terminated by a newline.
//
// When the combined length of prefix, suffix and all elements (each counted with two
// extra characters for its separator) is at most 102, everything goes on one line.
// Otherwise each element after the first starts a new line that is aligned with the
// first element, i.e. indented by |indent| plus the length of |prefix|.
template <typename AsmCallInfo>
void GenerateElementsList(FILE* out,
                          int indent,
                          const std::string& prefix,
                          const std::string& suffix,
                          const std::vector<std::string>& elements) {
  if (elements.empty()) {
    fprintf(out, "%*s%s%s\n", indent, "", prefix.c_str(), suffix.c_str());
    return;
  }

  std::size_t total_length = prefix.length() + suffix.length();
  for (const std::string& element : elements) {
    total_length += element.length() + 2;
  }
  const bool fits_on_one_line = total_length <= 102;
  const int continuation_indent = static_cast<int>(prefix.length()) + indent;

  // The first element always follows the prefix directly.
  fprintf(out, "%*s%s%s", indent, "", prefix.c_str(), elements.front().c_str());
  for (auto it = std::next(elements.begin()); it != elements.end(); ++it) {
    if (fits_on_one_line) {
      fprintf(out, ", %s", it->c_str());
    } else {
      fprintf(out, ",\n%*s%s", continuation_indent, "", it->c_str());
    }
  }
  fprintf(out, "%s\n", suffix.c_str());
}
555 
// Tells whether the input attached to |arg| must be passed to the asm statement through
// an inN_shadow variable: always for 'x' bindings, and for 'r' bindings whose input type
// is one byte wide.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedInputShadow(Arg arg) {
  using RegisterClass = typename Arg::RegisterClass;
  constexpr bool kIsGeneralRegister = RegisterClass::kAsRegister == 'r';
  constexpr bool kIsXmmRegister = RegisterClass::kAsRegister == 'x';
  if constexpr (kIsXmmRegister) {
    return true;
  } else if constexpr (kIsGeneralRegister && HaveInput(arg.arg_info)) {
    // Without shadow clang silently converts 'r' restriction into 'q' restriction which
    // is wrong: if %ah or %bh is picked we would produce incorrect result here.
    // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
    //
    // Only 8-bit registers are special because some 16-bit registers include two of them
    // (%al/%ah, %cl/%ch, %dl/%dh, %bl/%bh).
    // Mix of 16-bit and 64-bit registers doesn't trigger bug in Clang.
    using InputType =
        std::tuple_element_t<arg.arg_info.from, typename AsmCallInfo::InputArguments>;
    return sizeof(InputType) == sizeof(uint8_t);
  } else {
    return false;
  }
}
576 
// Tells whether the output attached to |arg| must be received from the asm statement
// through an outN_shadow variable: always for 'x' bindings, and for 'r' bindings whose
// output type is one byte wide.
template <typename AsmCallInfo, typename Arg>
constexpr bool NeedOutputShadow(Arg arg) {
  using RegisterClass = typename Arg::RegisterClass;
  constexpr bool kIsGeneralRegister = RegisterClass::kAsRegister == 'r';
  constexpr bool kIsXmmRegister = RegisterClass::kAsRegister == 'x';
  if constexpr (kIsXmmRegister) {
    return true;
  } else if constexpr (kIsGeneralRegister && HaveOutput(arg.arg_info)) {
    // Without shadow clang silently converts 'r' restriction into 'q' restriction which
    // is wrong: if %ah or %bh is picked we would produce incorrect result here.
    // TODO(b/138439904): remove when clang handling of 'r' constraint would be fixed.
    //
    // Only 8-bit registers are special because some 16-bit registers include two of them
    // (%al/%ah, %cl/%ch, %dl/%dh, %bl/%bh).
    // Mix of 16-bit and 64-bit registers doesn't trigger bug in Clang.
    using OutputType =
        std::tuple_element_t<arg.arg_info.to, typename AsmCallInfo::OutputArguments>;
    return sizeof(OutputType) == sizeof(uint8_t);
  } else {
    return false;
  }
}
597 
598 #include "text_asm_intrinsics_process_bindings-inl.h"
599 
GenerateTextAsmIntrinsics(FILE * out)600 void GenerateTextAsmIntrinsics(FILE* out) {
601   intrinsics::bindings::CPUIDRestriction cpuid_restriction =
602       intrinsics::bindings::kNoCPUIDRestriction;
603   bool if_opened = false;
604   std::string running_name;
605   ProcessAllBindings<TextAssemblerX86<TextAssembler>,
606                      TextAssembler,
607                      MacroAssembler<TextAssembler>::MacroAssemblers>(
608       [&running_name, &if_opened, &cpuid_restriction, out](auto&& asm_call_generator) {
609         using AsmCallInfo = std::decay_t<decltype(asm_call_generator)>;
610         std::string full_name = std::string(asm_call_generator.kIntrinsic,
611                                             std::strlen(asm_call_generator.kIntrinsic) - 1) +
612                                 ", kUseCppImplementation>";
613         if (size_t arguments_count = std::tuple_size_v<typename AsmCallInfo::InputArguments>) {
614           full_name += "(in0";
615           for (size_t i = 1; i < arguments_count; ++i) {
616             full_name += ", in" + std::to_string(i);
617           }
618           full_name += ")";
619         } else {
620           full_name += "()";
621         }
622         if (full_name != running_name) {
623           if (if_opened) {
624             if (cpuid_restriction != intrinsics::bindings::kNoCPUIDRestriction) {
625               fprintf(out, "  } else {\n    return %s;\n", running_name.c_str());
626               cpuid_restriction = intrinsics::bindings::kNoCPUIDRestriction;
627             }
628             if_opened = false;
629             fprintf(out, "  }\n");
630           }
631           // Final line of function.
632           if (!running_name.empty()) {
633             fprintf(out, "};\n\n");
634           }
635           GenerateFunctionHeader<AsmCallInfo>(out, 0);
636           running_name = full_name;
637         }
638         if (asm_call_generator.kCPUIDRestriction != cpuid_restriction) {
639           if (asm_call_generator.kCPUIDRestriction == intrinsics::bindings::kNoCPUIDRestriction) {
640             fprintf(out, "  } else {\n");
641           } else {
642             if (if_opened) {
643               fprintf(out, "  } else if (");
644             } else {
645               fprintf(out, "  if (");
646               if_opened = true;
647             }
648             switch (asm_call_generator.kCPUIDRestriction) {
649               default:
650                 // Unsupported CPUID value.
651                 CHECK(false);
652               case intrinsics::bindings::kIsAuthenticAMD:
653                 fprintf(out, "host_platform::kIsAuthenticAMD");
654                 break;
655               case intrinsics::bindings::kHasAVX:
656                 fprintf(out, "host_platform::kHasAVX");
657                 break;
658               case intrinsics::bindings::kHasBMI:
659                 fprintf(out, "host_platform::kHasBMI");
660                 break;
661               case intrinsics::bindings::kHasFMA:
662                 fprintf(out, "host_platform::kHasFMA");
663                 break;
664               case intrinsics::bindings::kHasFMA4:
665                 fprintf(out, "host_platform::kHasFMA4");
666                 break;
667               case intrinsics::bindings::kHasLZCNT:
668                 fprintf(out, "host_platform::kHasLZCNT");
669                 break;
670               case intrinsics::bindings::kHasPOPCNT:
671                 fprintf(out, "host_platform::kHasPOPCNT");
672                 break;
673               case intrinsics::bindings::kHasSSE3:
674                 fprintf(out, "host_platform::kHasSSE3");
675                 break;
676               case intrinsics::bindings::kHasSSSE3:
677                 fprintf(out, "host_platform::kHasSSSE3");
678                 break;
679               case intrinsics::bindings::kHasSSE4_1:
680                 fprintf(out, "host_platform::kHasSSE4_1");
681                 break;
682               case intrinsics::bindings::kHasSSE4_2:
683                 fprintf(out, "host_platform::kHasSSE4_2");
684                 break;
685               case intrinsics::bindings::kNoCPUIDRestriction:;  // Do nothing - make compiler happy.
686             }
687             fprintf(out, ") {\n");
688           }
689           cpuid_restriction = asm_call_generator.kCPUIDRestriction;
690         }
691         GenerateFunctionBody<AsmCallInfo>(out, 2 + 2 * if_opened);
692       });
693   if (if_opened) {
694     fprintf(out, "  }\n");
695   }
696   // Final line of function.
697   fprintf(out, "};\n\n");
698 }
699 
700 }  // namespace berberis
701 
main(int argc,char * argv[])702 int main(int argc, char* argv[]) {
703   FILE* out = argc > 1 ? fopen(argv[1], "w") : stdout;
704   fprintf(out,
705           R"STRING(
706 // This file automatically generated by make_intrinsics.cc
707 // DO NOT EDIT!
708 
709 #ifndef %2$s_%3$s_INTRINSICS_INTRINSICS_H_
710 #define %2$s_%3$s_INTRINSICS_INTRINSICS_H_
711 
712 #include <xmmintrin.h>
713 
714 #include "berberis/base/config.h"
715 #include "berberis/runtime_primitives/platform.h"
716 #include "%3$s/intrinsics/%1$s/intrinsics.h"
717 #include "%3$s/intrinsics/vector_intrinsics.h"
718 
719 namespace berberis::constants_pool {
720 
721 struct MacroAssemblerConstants;
722 
723 extern const MacroAssemblerConstants kBerberisMacroAssemblerConstants
724     __attribute__((visibility("hidden")));
725 
726 }  // namespace berberis::constants_pool
727 
728 namespace %3$s {
729 
730 namespace constants_pool {
731 
732 %4$s
733 
734 }  // namespace constants_pool
735 
736 namespace intrinsics {
737 )STRING",
738           berberis::TextAssembler::kArchName,
739           berberis::TextAssembler::kArchGuard,
740           berberis::TextAssembler::kNamespaceName,
741           strcmp(berberis::TextAssembler::kNamespaceName, "berberis")
742               ? "using berberis::constants_pool::kBerberisMacroAssemblerConstants;"
743               : "");
744 
745   berberis::GenerateTextAsmIntrinsics(out);
746   berberis::MakeExtraGuestFunctions(out);
747 
748   fprintf(out,
749           R"STRING(
750 }  // namespace intrinsics
751 
752 }  // namespace %2$s
753 
754 #endif /* %1$s_%2$s_INTRINSICS_INTRINSICS_H_ */
755 )STRING",
756           berberis::TextAssembler::kArchGuard,
757           berberis::TextAssembler::kNamespaceName);
758 
759   fclose(out);
760   return 0;
761 }
762