/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_
#define RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_

#include "berberis/base/bit_util.h"
#include "berberis/intrinsics/macro_assembler.h"

namespace berberis {

namespace {
// Exceptions are at position 0 in both the x87 status word and MXCSR,
// but the rounding mode field is at a different position in each.
constexpr int8_t kX87RmPosition = 10;
constexpr int8_t kMxcsrRmPosition = 13;
// Masks to clean the exception and rm fields.
constexpr int8_t kX87MxcsrExceptionBits = 0b11'1101;  // No denormals: RISC-V doesn't have them.
constexpr int16_t kX87RoundingBits = 0b11 << kX87RmPosition;
constexpr int16_t kMxcsrRoundingBits = 0b11 << kMxcsrRmPosition;
// Because the rounding mode is only two bits on x86 we can compress the table that converts from
// the RISC-V rounding mode to the x87/SSE rounding mode into one integer.
// Each element of the table is two bits:
//   FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO.
// Note: we never convert from the x86 rounding mode to the RISC-V rounding mode because there are
// more rounding modes on RISC-V, which means we have to keep these in the emulated CPU state.
constexpr int32_t kRiscVRoundingModes = 0b1110'0111'00;
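// For example: RISC-V rounding mode rm occupies bits [2*rm+1:2*rm] of the table above, so
// rm == 0 (RNE) yields 0b00 (x86 “to nearest”), rm == 1 (RTZ) yields 0b11 (“toward zero”),
// rm == 2 (RDN) yields 0b01 (“down”), rm == 3 (RUP) yields 0b10 (“up”), and rm == 4 (RMM,
// which has no exact x86 counterpart) also yields 0b11. The macro-instructions below use the
// table pre-shifted to kX87RmPosition or kMxcsrRmPosition and then shift it right by 2*rm to
// extract the needed field.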

}  // namespace

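// Canonicalize a NaN value: RISC-V requires operations to produce the canonical NaN, while x86
// hardware propagates NaN payloads. Cmpords builds an all-ones mask for ordered (non-NaN)
// elements; the mask then selects src where it is set and the canonical NaN from the constants
// pool where it is not.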
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNan(XMMRegister result, XMMRegister src) {
  Pmov(result, src);
  Cmpords<FloatType>(result, src);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNanAVX(XMMRegister result, XMMRegister src) {
  Vcmpords<FloatType>(result, src, src);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Vpor(result, result, src);
}

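// Feq (and Fle/Flt below) rely on the x86 scalar compares producing an all-ones low element on
// success and all-zeroes on failure; NaN operands compare false, matching RISC-V semantics. The
// mask is moved into a GPR and reduced to 0 or 1 with an And.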
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeq(Register result, XMMRegister src1, XMMRegister src2) {
  Cmpeqs<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeqAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmpeqs<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

// Note: the result is returned in %rax, which is an implicit argument of this macro-instruction.
// The explicit argument is a temporary memory slot needed because Stmxcsr can only store to
// memory.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeGetExceptionsTranslate(const Operand& mxcsr_scratch) {
  // Store the x87 status word in AX.
  Fnstsw();
  // Store MXCSR in the scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Merge the x87 status word and MXCSR.
  Or<uint32_t>(gpr_a, mxcsr_scratch);
  // Leave only exceptions.
  And<uint32_t>(gpr_a, kX87MxcsrExceptionBits);
  // Convert exception bits.
  Expand<uint64_t, uint8_t>(gpr_a,
                            {.index = gpr_a,
                             .scale = Assembler::kTimesOne,
                             .disp = constants_pool::kX87ToRiscVExceptions});
}

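// The fenv_scratch slot used below must fit the 28-byte x87 environment stored by Fnstenv plus
// four bytes for MXCSR placed right behind it (32 bytes total). Within that environment the
// control word lives at offset 0 and the status word at offset 4, which is why the operands
// below are formed at fenv_scratch.disp + 4 and fenv_scratch.disp + 28.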
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundImmTranslate(
    const Operand& fenv_scratch,
    int8_t exceptions_and_rm) {
  int8_t exceptions = exceptions_and_rm & 0b1'1111;
  int8_t rm = static_cast<uint8_t>(exceptions_and_rm) >> 5;
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to change the
  // status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  if (x87_exceptions) {
    // If exceptions are not zero then put exceptions into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
  }
  if (rm) {
    // If rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(fenv_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
  }
  if (exceptions_and_rm) {
    // If exceptions or rounding mode are not zero then convert the RISC-V rounding mode and
    // store it, together with the exceptions, in MXCSR.
    Or<uint32_t>(mxcsr,
                 x87_exceptions | (((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) &
                                   kMxcsrRoundingBits));
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundTranslate(Register exceptions,
                                                                      const Operand& fenv_scratch,
                                                                      Register scratch_register) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to change the
  // status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(scratch_register,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, scratch_register);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, scratch_register);
  // Load the FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table, with the
  // first field at bits 10-11:
  Mov<uint32_t>(scratch_register, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(scratch_register);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with 2x shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(scratch_register);
  // Mask only the x87-RM bits.
  And<uint32_t>(scratch_register, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(fenv_scratch, scratch_register);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(scratch_register, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr, scratch_register);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsImmTranslate(const Operand& fenv_scratch,
                                                                 int8_t exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to change the
  // status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  if (x87_exceptions) {
    // Put exceptions into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
    // Put exceptions into MXCSR.
    Or<uint8_t>(mxcsr, x87_exceptions);
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsTranslate(Register exceptions,
                                                              const Operand& fenv_scratch,
                                                              Register x87_exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to change the
  // status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(x87_exceptions,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, x87_exceptions);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, x87_exceptions);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

// Note: the actual rounding mode comes in %cl, which is an implicit argument of this
// macro-instruction. All explicit arguments are temporaries.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRound(Register x87_sse_round,
                                                const Operand& cw_scratch,
                                                const Operand& mxcsr_scratch) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  // Load the FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table, with the
  // first field at bits 10-11:
  Mov<uint32_t>(x87_sse_round, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(x87_sse_round);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with 2x shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(x87_sse_round);
  // Mask only the x87-RM bits.
  And<uint32_t>(x87_sse_round, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(cw_scratch, x87_sse_round);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(x87_sse_round, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr_scratch, x87_sse_round);
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRoundImmTranslate(const Operand& cw_scratch,
                                                            const Operand& mxcsr_scratch,
                                                            int8_t rm) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  if (rm) {
    // If rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(cw_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
    // If rounding mode is not zero then convert the RISC-V rounding mode and store it in MXCSR.
    Or<uint32_t>(mxcsr_scratch,
                 ((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) & kMxcsrRoundingBits);
  }
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFle(Register result, XMMRegister src1, XMMRegister src2) {
  Cmples<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FormatTo, typename FormatFrom>
void MacroAssembler<Assembler>::MacroFCvtFloatToInteger(Register result, XMMRegister src) {
  Cvt<FormatFrom, FormatTo>(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFleAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmples<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFlt(Register result, XMMRegister src1, XMMRegister src2) {
  Cmplts<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFltAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmplts<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

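// NaN-box a 32-bit float: RISC-V requires a narrower float stored in a wider floating-point
// register to have all of its upper bits set to one (an improperly boxed value must be treated
// as NaN), so Or'ing with the kNanBox constant produces a properly boxed value.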
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBox(XMMRegister arg) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Por(arg, {.disp = constants_pool::kNanBox<Float32>});
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBoxAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpor(result, src, {.disp = constants_pool::kNanBox<Float32>});
}

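// Unbox a NaN-boxed float: Pcmpeq builds a per-lane mask that is all-ones where src matches the
// kNanBox pattern, and Pshufd copies the upper-lane compare result over the value lane. A
// properly boxed value thus passes through unchanged, while anything else is replaced with the
// boxed canonical NaN from kNanBoxedNans.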
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNan(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Pmov(result, src);
  Pcmpeq<typename TypeTraits<FloatType>::Int>(result, {.disp = constants_pool::kNanBox<Float32>});
  Pshufd(result, result, kShuffleDDBB);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNanAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpcmpeq<typename TypeTraits<FloatType>::Int>(
      result, src, {.disp = constants_pool::kNanBox<Float32>});
  Vpshufd(result, result, kShuffleDDBB);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Vpor(result, result, src);
}

}  // namespace berberis

#endif  // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_