/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_
#define RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_

#include "berberis/base/bit_util.h"
#include "berberis/intrinsics/macro_assembler.h"

namespace berberis {

namespace {

// Exceptions are at position 0 in both the x87 status word and MXCSR, but the rounding mode
// fields are at different positions in each.
constexpr int8_t kX87RmPosition = 10;
constexpr int8_t kMxcsrRmPosition = 13;
// Masks used to clean the exception and rounding mode fields.
constexpr int8_t kX87MxcsrExceptionBits = 0b11'1101;  // No denormal bit: RISC-V doesn't have it.
constexpr int16_t kX87RoundingBits = 0b11 << kX87RmPosition;
constexpr int16_t kMxcsrRoundingBits = 0b11 << kMxcsrRmPosition;
// Because the rounding mode is only two bits on x86 we can compress the table that converts
// from the RISC-V rounding mode to the x87/SSE rounding mode into one integer.
// Each element of the table is two bits:
// the FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table.
// Note: we never convert from the x86 rounding mode to the RISC-V rounding mode because there
// are more rounding modes on RISC-V, which means we have to keep them in the emulated CPU state.
constexpr int32_t kRiscVRoundingModes = 0b1110'0111'00;
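// A worked decoding of the packed table above, written out as compile-time checks. The shifts
// mirror the `(kRiscVRoundingModes << kRmPosition) >> (rm * 2)` logic used below; the x86
// rounding encodings are 0b00 = nearest, 0b01 = down, 0b10 = up, 0b11 = toward zero.
static_assert(((kRiscVRoundingModes >> 0) & 0b11) == 0b00);  // rm=0 (RNE) -> nearest
static_assert(((kRiscVRoundingModes >> 2) & 0b11) == 0b11);  // rm=1 (RTZ) -> toward zero
static_assert(((kRiscVRoundingModes >> 4) & 0b11) == 0b01);  // rm=2 (RDN) -> down
static_assert(((kRiscVRoundingModes >> 6) & 0b11) == 0b10);  // rm=3 (RUP) -> up
static_assert(((kRiscVRoundingModes >> 8) & 0b11) == 0b11);  // rm=4 (RMM) -> toward zero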

}  // namespace

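// Canonicalize NaN values: an ordered compare of src with itself yields an all-ones mask for
// non-NaN elements; the source value is kept where the mask is set and the canonical NaN is
// substituted where it is not.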
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNan(XMMRegister result, XMMRegister src) {
  Pmov(result, src);
  Cmpords<FloatType>(result, src);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNanAVX(XMMRegister result, XMMRegister src) {
  Vcmpords<FloatType>(result, src, src);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Vpor(result, result, src);
}

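// Implements the RISC-V FEQ comparison: the SSE equal-compare produces all-ones on equality,
// so moving the low element to a GPR and masking with 1 yields the 0/1 result RISC-V expects.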
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeq(Register result, XMMRegister src1, XMMRegister src2) {
  Cmpeqs<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeqAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmpeqs<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

// Note: the result is returned in %rax, which is an implicit argument of this macro-instruction.
// The explicit argument is a temporary needed to handle the Stmxcsr instruction.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeGetExceptionsTranslate(const Operand& mxcsr_scratch) {
  // Store the x87 status word in AX.
  Fnstsw();
  // Store MXCSR in the scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Merge the x87 status word and MXCSR.
  Or<uint32_t>(gpr_a, mxcsr_scratch);
  // Leave only exceptions.
  And<uint32_t>(gpr_a, kX87MxcsrExceptionBits);
  // Convert exception bits.
  Expand<uint64_t, uint8_t>(gpr_a,
                            {.index = gpr_a,
                             .scale = Assembler::kTimesOne,
                             .disp = constants_pool::kX87ToRiscVExceptions});
}

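// Layout of the fenv_scratch slot used by the MacroFeSet* functions below, as implied by the
// displacements in the code: the 28-byte x87 environment (FNSTENV format) starts at disp + 0
// with the control word at disp + 0 and the status word at disp + 4, and MXCSR is stored
// right after the environment at disp + 28.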
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundImmTranslate(
    const Operand& fenv_scratch,
    int8_t exceptions_and_rm) {
  int8_t exceptions = exceptions_and_rm & 0b1'1111;
  int8_t rm = static_cast<uint8_t>(exceptions_and_rm) >> 5;
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  if (x87_exceptions) {
    // If exceptions are not zero then put them into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
  }
  if (rm) {
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(fenv_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
  }
  if (exceptions_and_rm) {
    // If exceptions or the rounding mode are not zero then convert the RISC-V rounding mode and
    // store it, together with the exceptions, in MXCSR.
    Or<uint32_t>(mxcsr,
                 x87_exceptions | (((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) &
                                   kMxcsrRoundingBits));
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundTranslate(Register exceptions,
                                                                      const Operand& fenv_scratch,
                                                                      Register scratch_register) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(scratch_register,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, scratch_register);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, scratch_register);
  // The FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table from bits 10-11:
  Mov<uint32_t>(scratch_register, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(scratch_register);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with two shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(scratch_register);
  // Mask only the x87-RM bits.
  And<uint32_t>(scratch_register, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(fenv_scratch, scratch_register);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(scratch_register, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr, scratch_register);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsImmTranslate(const Operand& fenv_scratch,
                                                                 int8_t exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  if (x87_exceptions) {
    // Put exceptions into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
    // Put exceptions into MXCSR.
    Or<uint8_t>(mxcsr, x87_exceptions);
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsTranslate(Register exceptions,
                                                              const Operand& fenv_scratch,
                                                              Register x87_exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(x87_exceptions,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, x87_exceptions);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, x87_exceptions);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

// Note: the actual rounding mode comes in %cl, which is an implicit argument of this
// macro-instruction. All explicit arguments are temporaries.
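// For example, with rm=3 (RUP) in CL, the two ShrByCl shifts below move the table right by six
// bits, leaving 0b10 in the control word's RM field, which is the x87 round-up encoding.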
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRound(Register x87_sse_round,
                                                const Operand& cw_scratch,
                                                const Operand& mxcsr_scratch) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  // The FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table from bits 10-11:
  Mov<uint32_t>(x87_sse_round, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(x87_sse_round);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with two shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(x87_sse_round);
  // Mask only the x87-RM bits.
  And<uint32_t>(x87_sse_round, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(cw_scratch, x87_sse_round);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(x87_sse_round, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr_scratch, x87_sse_round);
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRoundImmTranslate(const Operand& cw_scratch,
                                                            const Operand& mxcsr_scratch,
                                                            int8_t rm) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  if (rm) {
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(cw_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in
    // MXCSR.
    Or<uint32_t>(mxcsr_scratch,
                 ((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) & kMxcsrRoundingBits);
  }
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

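// Implements the RISC-V FLE comparison: the SSE less-or-equal compare produces all-ones when
// src1 <= src2, so moving the low element to a GPR and masking with 1 yields the 0/1 result.
// MacroFlt below follows the same pattern with a less-than compare.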
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFle(Register result, XMMRegister src1, XMMRegister src2) {
  Cmples<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FormatTo, typename FormatFrom>
void MacroAssembler<Assembler>::MacroFCvtFloatToInteger(Register result, XMMRegister src) {
  Cvt<FormatFrom, FormatTo>(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFleAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmples<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFlt(Register result, XMMRegister src1, XMMRegister src2) {
  Cmplts<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFltAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmplts<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

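// NaN-boxing per the RISC-V spec: a 32-bit float kept in a 64-bit FP register must have its
// upper 32 bits set to all-ones; Or-ing with the kNanBox<Float32> constant applies that pattern.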
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBox(XMMRegister arg) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Por(arg, {.disp = constants_pool::kNanBox<Float32>});
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBoxAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpor(result, src, {.disp = constants_pool::kNanBox<Float32>});
}

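// Unboxing is the reverse check: compare the register against the NaN-box pattern, spread the
// upper-half comparison result over the low element (the Pshufd with kShuffleDDBB), keep the
// float when it is properly boxed, and substitute the boxed canonical NaN otherwise.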
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNan(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Pmov(result, src);
  Pcmpeq<typename TypeTraits<FloatType>::Int>(result, {.disp = constants_pool::kNanBox<Float32>});
  Pshufd(result, result, kShuffleDDBB);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNanAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpcmpeq<typename TypeTraits<FloatType>::Int>(
      result, src, {.disp = constants_pool::kNanBox<Float32>});
  Vpshufd(result, result, kShuffleDDBB);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Vpor(result, result, src);
}

}  // namespace berberis

#endif  // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_