/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_
#define RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_

#include "berberis/base/bit_util.h"
#include "berberis/intrinsics/macro_assembler.h"

namespace berberis {

namespace {

// Exceptions are at position 0 in both the x87 status word and MXCSR, but the rounding mode
// fields are at different positions in each.
constexpr int8_t kX87RmPosition = 10;
constexpr int8_t kMxcsrRmPosition = 13;
// Masks used to clean the exception and rounding mode fields.
constexpr int8_t kX87MxcsrExceptionBits = 0b11'1101;  // No denormal bit: RISC-V doesn't have it.
constexpr int16_t kX87RoundingBits = 0b11 << kX87RmPosition;
constexpr int16_t kMxcsrRoundingBits = 0b11 << kMxcsrRmPosition;
// Because the rounding mode is only two bits on x86 we can compress the table that converts
// from the RISC-V rounding mode to the x87/SSE rounding mode into one integer.
// Each element of the table is two bits:
// the FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table.
// Note: we never convert from the x86 rounding mode to the RISC-V rounding mode because there
// are more rounding modes on RISC-V, which means we have to keep them in the emulated CPU state.
constexpr int32_t kRiscVRoundingModes = 0b1110'0111'00;
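// A worked decoding of the packed table above, written out as compile-time checks. The shifts
// mirror the `(kRiscVRoundingModes << kRmPosition) >> (rm * 2)` logic used below; the x86
// rounding encodings are 0b00 = nearest, 0b01 = down, 0b10 = up, 0b11 = toward zero.
static_assert(((kRiscVRoundingModes >> 0) & 0b11) == 0b00);  // rm=0 (RNE) -> nearest
static_assert(((kRiscVRoundingModes >> 2) & 0b11) == 0b11);  // rm=1 (RTZ) -> toward zero
static_assert(((kRiscVRoundingModes >> 4) & 0b11) == 0b01);  // rm=2 (RDN) -> down
static_assert(((kRiscVRoundingModes >> 6) & 0b11) == 0b10);  // rm=3 (RUP) -> up
static_assert(((kRiscVRoundingModes >> 8) & 0b11) == 0b11);  // rm=4 (RMM) -> toward zero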

}  // namespace

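// Canonicalize NaN values: an ordered compare of src with itself yields an all-ones mask for
// non-NaN elements; the source value is kept where the mask is set and the canonical NaN is
// substituted where it is not.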
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNan(XMMRegister result, XMMRegister src) {
  Pmov(result, src);
  Cmpords<FloatType>(result, src);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNanAVX(XMMRegister result, XMMRegister src) {
  Vcmpords<FloatType>(result, src, src);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  Vpor(result, result, src);
}

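// Implements the RISC-V FEQ comparison: the SSE equal-compare produces all-ones on equality,
// so moving the low element to a GPR and masking with 1 yields the 0/1 result RISC-V expects.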
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeq(Register result, XMMRegister src1, XMMRegister src2) {
  Cmpeqs<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeqAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmpeqs<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

// Note: the result is returned in %rax, which is an implicit argument of this macro-instruction.
// The explicit argument is a temporary needed to handle the Stmxcsr instruction.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeGetExceptionsTranslate(const Operand& mxcsr_scratch) {
  // Store the x87 status word in AX.
  Fnstsw();
  // Store MXCSR in the scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Merge the x87 status word and MXCSR.
  Or<uint32_t>(gpr_a, mxcsr_scratch);
  // Leave only exceptions.
  And<uint32_t>(gpr_a, kX87MxcsrExceptionBits);
  // Convert exception bits.
  Expand<uint64_t, uint8_t>(gpr_a,
                            {.index = gpr_a,
                             .scale = Assembler::kTimesOne,
                             .disp = constants_pool::kX87ToRiscVExceptions});
}

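// Layout of the fenv_scratch slot used by the MacroFeSet* functions below, as implied by the
// displacements in the code: the 28-byte x87 environment (FNSTENV format) starts at disp + 0
// with the control word at disp + 0 and the status word at disp + 4, and MXCSR is stored
// right after the environment at disp + 28.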
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundImmTranslate(
    const Operand& fenv_scratch,
    int8_t exceptions_and_rm) {
  int8_t exceptions = exceptions_and_rm & 0b1'1111;
  int8_t rm = static_cast<uint8_t>(exceptions_and_rm) >> 5;
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  if (x87_exceptions) {
    // If exceptions are not zero then put them into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
  }
  if (rm) {
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(fenv_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
  }
  if (exceptions_and_rm) {
    // If exceptions or the rounding mode are not zero then convert the RISC-V rounding mode and
    // store it, together with the exceptions, in MXCSR.
    Or<uint32_t>(mxcsr,
                 x87_exceptions | (((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) &
                                   kMxcsrRoundingBits));
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundTranslate(Register exceptions,
                                                                      const Operand& fenv_scratch,
                                                                      Register scratch_register) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(scratch_register,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field and exception bits in MXCSR.
  And<uint32_t>(mxcsr, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, scratch_register);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, scratch_register);
  // The FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table from bits 10-11:
  Mov<uint32_t>(scratch_register, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(scratch_register);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with two shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(scratch_register);
  // Mask only the x87-RM bits.
  And<uint32_t>(scratch_register, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(fenv_scratch, scratch_register);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(scratch_register, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr, scratch_register);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsImmTranslate(const Operand& fenv_scratch,
                                                                 int8_t exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // Convert RISC-V exceptions into x87 exceptions.
  uint8_t x87_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  if (x87_exceptions) {
    // Put exceptions into the x87 environment.
    Or<uint8_t>(x87_status_word, x87_exceptions);
    // Put exceptions into MXCSR.
    Or<uint8_t>(mxcsr, x87_exceptions);
  }
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsTranslate(Register exceptions,
                                                              const Operand& fenv_scratch,
                                                              Register x87_exceptions) {
  // Note: in 32-bit/64-bit mode the status word is at offset 4, not 2 as one may imagine.
  // The two bytes after the control word are ignored.
  Operand x87_status_word = {.base = fenv_scratch.base,
                             .index = fenv_scratch.index,
                             .scale = fenv_scratch.scale,
                             .disp = fenv_scratch.disp + 4};
  // Place MXCSR right after the 28-byte x87 environment.
  Operand mxcsr = {.base = fenv_scratch.base,
                   .index = fenv_scratch.index,
                   .scale = fenv_scratch.scale,
                   .disp = fenv_scratch.disp + 28};
  // We have to store the whole floating point environment since it's not possible to just change
  // the status word without affecting other state.
  Fnstenv(fenv_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr);
  // Convert exceptions from RISC-V format to x87 format.
  Mov<uint8_t>(x87_exceptions,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Clean exceptions in the x87 environment.
  And<uint8_t>(x87_status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Clean exception bits in MXCSR.
  And<uint8_t>(mxcsr, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Put exceptions into the x87 environment.
  Or<uint8_t>(x87_status_word, x87_exceptions);
  // Put exceptions into MXCSR.
  Or<uint8_t>(mxcsr, x87_exceptions);
  // Load the x87 environment.
  Fldenv(fenv_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr);
}

// Note: the actual rounding mode comes in %cl, which is an implicit argument of this
// macro-instruction. All explicit arguments are temporaries.
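// For example, with rm=3 (RUP) in CL, the two ShrByCl shifts below move the table right by six
// bits, leaving 0b10 in the control word's RM field, which is the x87 round-up encoding.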
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRound(Register x87_sse_round,
                                                const Operand& cw_scratch,
                                                const Operand& mxcsr_scratch) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  // The FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table from bits 10-11:
  Mov<uint32_t>(x87_sse_round, kRiscVRoundingModes << kX87RmPosition);
  // Shift by “rm” to get the appropriate bits, suitable for the x87 FPU control word.
  ShrByCl<uint32_t>(x87_sse_round);
  // Each field is two bits so we need to shift by “rm” twice.
  // By doing it with two shifts we keep “rm” in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(x87_sse_round);
  // Mask only the x87-RM bits.
  And<uint32_t>(x87_sse_round, kX87RoundingBits);
  // Push the x87-RM field into the x87 control word.
  Or<uint16_t>(cw_scratch, x87_sse_round);
  // Move the x87-RM field into the MXCSR-RM field.
  Shl<uint32_t>(x87_sse_round, int8_t{3});
  // Push the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr_scratch, x87_sse_round);
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRoundImmTranslate(const Operand& cw_scratch,
                                                            const Operand& mxcsr_scratch,
                                                            int8_t rm) {
  // Store the x87 control word in the first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in the second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean out the x87-RM field in the x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean out the MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  if (rm) {
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in the
    // control word.
    Or<uint16_t>(cw_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
    // If the rounding mode is not zero then convert the RISC-V rounding mode and store it in
    // MXCSR.
    Or<uint32_t>(mxcsr_scratch,
                 ((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) & kMxcsrRoundingBits);
  }
  // Load the new control word into the x87 FPU.
  Fldcw(cw_scratch);
  // Load MXCSR.
  Ldmxcsr(mxcsr_scratch);
}

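// Implements the RISC-V FLE comparison: the SSE less-or-equal compare produces all-ones when
// src1 <= src2, so moving the low element to a GPR and masking with 1 yields the 0/1 result.
// MacroFlt below follows the same pattern with a less-than compare.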
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFle(Register result, XMMRegister src1, XMMRegister src2) {
  Cmples<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FormatTo, typename FormatFrom>
void MacroAssembler<Assembler>::MacroFCvtFloatToInteger(Register result, XMMRegister src) {
  Cvt<FormatFrom, FormatTo>(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFleAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmples<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFlt(Register result, XMMRegister src1, XMMRegister src2) {
  Cmplts<FloatType>(src1, src2);
  Mov<FloatType>(result, src1);
  And<int32_t>(result, 1);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFltAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  Vcmplts<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  And<int32_t>(result, 1);
}

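// NaN-boxing per the RISC-V spec: a 32-bit float kept in a 64-bit FP register must have its
// upper 32 bits set to all-ones; Or-ing with the kNanBox<Float32> constant applies that pattern.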
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBox(XMMRegister arg) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Por(arg, {.disp = constants_pool::kNanBox<Float32>});
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBoxAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpor(result, src, {.disp = constants_pool::kNanBox<Float32>});
}

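// Unboxing is the reverse check: compare the register against the NaN-box pattern, spread the
// upper-half comparison result over the low element (the Pshufd with kShuffleDDBB), keep the
// float when it is properly boxed, and substitute the boxed canonical NaN otherwise.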
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNan(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Pmov(result, src);
  Pcmpeq<typename TypeTraits<FloatType>::Int>(result, {.disp = constants_pool::kNanBox<Float32>});
  Pshufd(result, result, kShuffleDDBB);
  Pand(src, result);
  Pandn(result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Por(result, src);
}

template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNanAVX(XMMRegister result, XMMRegister src) {
  static_assert(std::is_same_v<FloatType, Float32>);

  Vpcmpeq<typename TypeTraits<FloatType>::Int>(
      result, src, {.disp = constants_pool::kNanBox<Float32>});
  Vpshufd(result, result, kShuffleDDBB);
  Vpand(src, src, result);
  Vpandn(result, result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  Vpor(result, result, src);
}

}  // namespace berberis

#endif  // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_