| //! Encoding tables for x86 ISAs. |
| |
| use super::registers::*; |
| use crate::bitset::BitSet; |
| use crate::cursor::{Cursor, FuncCursor}; |
| use crate::flowgraph::ControlFlowGraph; |
| use crate::ir::condcodes::{FloatCC, IntCC}; |
| use crate::ir::types::*; |
| use crate::ir::{self, Function, Inst, InstBuilder, MemFlags}; |
| use crate::isa::constraints::*; |
| use crate::isa::enc_tables::*; |
| use crate::isa::encoding::base_size; |
| use crate::isa::encoding::{Encoding, RecipeSizing}; |
| use crate::isa::RegUnit; |
| use crate::isa::{self, TargetIsa}; |
| use crate::legalizer::expand_as_libcall; |
| use crate::predicates; |
| use crate::regalloc::RegDiversions; |
| |
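// The tables below are generated at build time (by the `cranelift-codegen-meta` crate) and
// spliced in from `OUT_DIR`.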
| include!(concat!(env!("OUT_DIR"), "/encoding-x86.rs")); |
| include!(concat!(env!("OUT_DIR"), "/legalize-x86.rs")); |
| |
| /// Whether the REX prefix is needed for encoding extended registers (via REX.RXB). |
| /// |
| /// Normal x86 instructions have only 3 bits for encoding a register. |
/// The REX prefix adds the REX.R, REX.X, and REX.B bits, which are interpreted as the fourth
/// (most significant) bit of a register number: for example, `r9` (0b1001) is extended while
/// `rdx` (0b0010) is not.
| pub fn is_extended_reg(reg: RegUnit) -> bool { |
| // Extended registers have the fourth bit set. |
| reg as u8 & 0b1000 != 0 |
| } |
| |
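/// Whether the register requires a SIB byte when used as a memory base: the 3-bit encoding of
/// `rsp` and `r12` (0b100) is reserved to signal the presence of a SIB byte.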
| pub fn needs_sib_byte(reg: RegUnit) -> bool { |
| reg == RU::r12 as RegUnit || reg == RU::rsp as RegUnit |
| } |
| pub fn needs_offset(reg: RegUnit) -> bool { |
| reg == RU::r13 as RegUnit || reg == RU::rbp as RegUnit |
| } |
| pub fn needs_sib_byte_or_offset(reg: RegUnit) -> bool { |
| needs_sib_byte(reg) || needs_offset(reg) |
| } |
| |
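/// Checks whether the register allocated to input operand `op_index` of `inst` (after applying
/// any active register diversions) satisfies `condition_func`.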
| fn test_input( |
| op_index: usize, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| condition_func: fn(RegUnit) -> bool, |
| ) -> bool { |
| let in_reg = divert.reg(func.dfg.inst_args(inst)[op_index], &func.locations); |
| condition_func(in_reg) |
| } |
| |
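/// Checks whether the register allocated to result `result_index` of `inst` (after applying any
/// active register diversions) satisfies `condition_func`.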
| fn test_result( |
| result_index: usize, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| condition_func: fn(RegUnit) -> bool, |
| ) -> bool { |
| let out_reg = divert.reg(func.dfg.inst_results(inst)[result_index], &func.locations); |
| condition_func(out_reg) |
| } |
| |
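// The following size calculators adjust a recipe's base size at encoding time, once register
// assignments are known: a base register of `rsp`/`r12` costs an extra SIB byte, and
// `rbp`/`r13` costs an extra zero displacement byte (see `needs_sib_byte`/`needs_offset`).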
| fn size_plus_maybe_offset_for_inreg_0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_offset = test_input(0, inst, divert, func, needs_offset); |
| sizing.base_size + if needs_offset { 1 } else { 0 } |
| } |
| fn size_plus_maybe_offset_for_inreg_1( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_offset = test_input(1, inst, divert, func, needs_offset); |
| sizing.base_size + if needs_offset { 1 } else { 0 } |
| } |
| fn size_plus_maybe_sib_for_inreg_0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_sib = test_input(0, inst, divert, func, needs_sib_byte); |
| sizing.base_size + if needs_sib { 1 } else { 0 } |
| } |
| fn size_plus_maybe_sib_for_inreg_1( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_sib = test_input(1, inst, divert, func, needs_sib_byte); |
| sizing.base_size + if needs_sib { 1 } else { 0 } |
| } |
| fn size_plus_maybe_sib_or_offset_for_inreg_0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_sib_or_offset = test_input(0, inst, divert, func, needs_sib_byte_or_offset); |
| sizing.base_size + if needs_sib_or_offset { 1 } else { 0 } |
| } |
| fn size_plus_maybe_sib_or_offset_for_inreg_1( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| let needs_sib_or_offset = test_input(1, inst, divert, func, needs_sib_byte_or_offset); |
| sizing.base_size + if needs_sib_or_offset { 1 } else { 0 } |
| } |
| |
| /// Calculates the size while inferring if the first and second input registers (inreg0, inreg1) |
| /// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB or offset. |
| fn size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1( |
| sizing: &RecipeSizing, |
| enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_input(1, inst, divert, func, is_extended_reg); |
| size_plus_maybe_sib_or_offset_for_inreg_1(sizing, enc, inst, divert, func) |
| + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Calculates the size while inferring if the first and second input registers (inreg0, inreg1) |
| /// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB. |
| fn size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1( |
| sizing: &RecipeSizing, |
| enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_input(1, inst, divert, func, is_extended_reg); |
| size_plus_maybe_sib_for_inreg_1(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Calculates the size while inferring if the first input register (inreg0) and first output |
| /// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a |
| /// SIB or offset. |
| fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( |
| sizing: &RecipeSizing, |
| enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_result(0, inst, divert, func, is_extended_reg); |
| size_plus_maybe_sib_or_offset_for_inreg_0(sizing, enc, inst, divert, func) |
| + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Calculates the size while inferring if the first input register (inreg0) and first output |
| /// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a |
| /// SIB. |
| fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( |
| sizing: &RecipeSizing, |
| enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_result(0, inst, divert, func, is_extended_reg); |
| size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, for use with one input reg. |
| /// |
| /// A REX prefix is known to be emitted if either: |
| /// 1. The EncodingBits specify that REX.W is to be set. |
| /// 2. Registers are used that require REX.R or REX.B bits for encoding. |
| fn size_with_inferred_rex_for_inreg0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, based on the second operand. |
| fn size_with_inferred_rex_for_inreg1( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(1, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, based on the third operand. |
| fn size_with_inferred_rex_for_inreg2( |
| sizing: &RecipeSizing, |
| _: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(2, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, for use with two input registers. |
| /// |
| /// A REX prefix is known to be emitted if either: |
| /// 1. The EncodingBits specify that REX.W is to be set. |
| /// 2. Registers are used that require REX.R or REX.B bits for encoding. |
| fn size_with_inferred_rex_for_inreg0_inreg1( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_input(1, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, based on second and third operand. |
| fn size_with_inferred_rex_for_inreg1_inreg2( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(1, inst, divert, func, is_extended_reg) |
| || test_input(2, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, based on a single |
| /// input register and a single output register. |
| fn size_with_inferred_rex_for_inreg0_outreg0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(0, inst, divert, func, is_extended_reg) |
| || test_result(0, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, based on a single output register. |
| fn size_with_inferred_rex_for_outreg0( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_result(0, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
| /// Infers whether a dynamic REX prefix will be emitted, for use with CMOV. |
| /// |
/// CMOV uses 3 inputs, with the REX inferred from reg1 and reg2.
| fn size_with_inferred_rex_for_cmov( |
| sizing: &RecipeSizing, |
| _enc: Encoding, |
| inst: Inst, |
| divert: &RegDiversions, |
| func: &Function, |
| ) -> u8 { |
| // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed. |
| let needs_rex = test_input(1, inst, divert, func, is_extended_reg) |
| || test_input(2, inst, divert, func, is_extended_reg); |
| sizing.base_size + if needs_rex { 1 } else { 0 } |
| } |
| |
/// If the value's definition is a constant immediate, returns its unpacked value; otherwise
/// returns `None`.
| fn maybe_iconst_imm(pos: &FuncCursor, value: ir::Value) -> Option<i64> { |
| if let ir::ValueDef::Result(inst, _) = &pos.func.dfg.value_def(value) { |
| if let ir::InstructionData::UnaryImm { |
| opcode: ir::Opcode::Iconst, |
| imm, |
| } = &pos.func.dfg[*inst] |
| { |
| let value: i64 = (*imm).into(); |
| Some(value) |
| } else { |
| None |
| } |
| } else { |
| None |
| } |
| } |
| |
| /// Expand the `sdiv` and `srem` instructions using `x86_sdivmodx`. |
| fn expand_sdivrem( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let (x, y, is_srem) = match func.dfg[inst] { |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Sdiv, |
| args, |
| } => (args[0], args[1], false), |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Srem, |
| args, |
| } => (args[0], args[1], true), |
| _ => panic!("Need sdiv/srem: {}", func.dfg.display_inst(inst, None)), |
| }; |
| |
| let old_block = func.layout.pp_block(inst); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| pos.func.dfg.clear_results(inst); |
| |
| let avoid_div_traps = isa.flags().avoid_div_traps(); |
| |
| // If we can tolerate native division traps, sdiv doesn't need branching. |
| if !avoid_div_traps && !is_srem { |
| let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); |
| pos.ins().with_result(result).x86_sdivmodx(x, xhi, y); |
| pos.remove_inst(); |
| return; |
| } |
| |
    // If the divisor is a constant immediate other than 0 or -1, we can remove the corresponding
    // checks entirely. For the 0 and -1 immediates themselves, we'd ideally replace the
    // conditional traps with unconditional traps, but this requires more manipulation of the
    // dfg/cfg, which is out of scope here.
| let (could_be_zero, could_be_minus_one) = if let Some(imm) = maybe_iconst_imm(&pos, y) { |
| (imm == 0, imm == -1) |
| } else { |
| (true, true) |
| }; |
| |
| // Put in an explicit division-by-zero trap if the environment requires it. |
| if avoid_div_traps && could_be_zero { |
| pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero); |
| } |
| |
| if !could_be_minus_one { |
| let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); |
| let reuse = if is_srem { |
| [None, Some(result)] |
| } else { |
| [Some(result), None] |
| }; |
| pos.ins().with_results(reuse).x86_sdivmodx(x, xhi, y); |
| pos.remove_inst(); |
| return; |
| } |
| |
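    // At this point we need the full dynamic checks. A sketch of the expanded CLIF
    // (illustrative, not the exact printed form), where N is the lane width:
    //
    //         brif eq (ifcmp_imm y, -1), minus_one
    //         jump nominal
    //     nominal:
    //         xhi = sshr_imm x, N - 1
    //         quot, rem = x86_sdivmodx x, xhi, y    ; still traps on y == 0
    //         jump done(quot or rem)
    //     minus_one:
    //         ; srem: result = 0, since x % -1 == 0
    //         ; sdiv: trap on x == INT_MIN, otherwise result = 0 - x
    //         jump done(result)
    //     done(result):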
| // block handling the nominal case. |
| let nominal = pos.func.dfg.make_block(); |
| |
| // block handling the -1 divisor case. |
| let minus_one = pos.func.dfg.make_block(); |
| |
| // Final block with one argument representing the final result value. |
| let done = pos.func.dfg.make_block(); |
| |
| // Move the `inst` result value onto the `done` block. |
| pos.func.dfg.attach_block_param(done, result); |
| |
| // Start by checking for a -1 divisor which needs to be handled specially. |
| let is_m1 = pos.ins().ifcmp_imm(y, -1); |
| pos.ins().brif(IntCC::Equal, is_m1, minus_one, &[]); |
| pos.ins().jump(nominal, &[]); |
| |
| // Now it is safe to execute the `x86_sdivmodx` instruction which will still trap on division |
| // by zero. |
| pos.insert_block(nominal); |
| let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1); |
| let (quot, rem) = pos.ins().x86_sdivmodx(x, xhi, y); |
| let divres = if is_srem { rem } else { quot }; |
| pos.ins().jump(done, &[divres]); |
| |
| // Now deal with the -1 divisor case. |
| pos.insert_block(minus_one); |
| let m1_result = if is_srem { |
| // x % -1 = 0. |
| pos.ins().iconst(ty, 0) |
| } else { |
| // Explicitly check for overflow: Trap when x == INT_MIN. |
| debug_assert!(avoid_div_traps, "Native trapping divide handled above"); |
| let f = pos.ins().ifcmp_imm(x, -1 << (ty.lane_bits() - 1)); |
| pos.ins() |
| .trapif(IntCC::Equal, f, ir::TrapCode::IntegerOverflow); |
| // x / -1 = -x. |
| pos.ins().irsub_imm(x, 0) |
| }; |
| |
| // Recycle the original instruction as a jump. |
| pos.func.dfg.replace(inst).jump(done, &[m1_result]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, nominal); |
| cfg.recompute_block(pos.func, minus_one); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
| /// Expand the `udiv` and `urem` instructions using `x86_udivmodx`. |
| fn expand_udivrem( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let (x, y, is_urem) = match func.dfg[inst] { |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Udiv, |
| args, |
| } => (args[0], args[1], false), |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Urem, |
| args, |
| } => (args[0], args[1], true), |
| _ => panic!("Need udiv/urem: {}", func.dfg.display_inst(inst, None)), |
| }; |
| let avoid_div_traps = isa.flags().avoid_div_traps(); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| pos.func.dfg.clear_results(inst); |
| |
| // Put in an explicit division-by-zero trap if the environment requires it. |
| if avoid_div_traps { |
| let zero_check = if let Some(imm) = maybe_iconst_imm(&pos, y) { |
| // Ideally, we'd just replace the conditional trap with a trap when the immediate is |
| // zero, but this requires more manipulation of the dfg/cfg, which is out of scope |
| // here. |
| imm == 0 |
| } else { |
| true |
| }; |
| if zero_check { |
| pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero); |
| } |
| } |
| |
| // Now it is safe to execute the `x86_udivmodx` instruction. |
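    // `x86_udivmodx` divides the double-width value {xhi:x} by y; for a plain unsigned divide
    // the high half is simply zero.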
| let xhi = pos.ins().iconst(ty, 0); |
| let reuse = if is_urem { |
| [None, Some(result)] |
| } else { |
| [Some(result), None] |
| }; |
| pos.ins().with_results(reuse).x86_udivmodx(x, xhi, y); |
| pos.remove_inst(); |
| } |
| |
| /// Expand the `fmin` and `fmax` instructions using the x86 `x86_fmin` and `x86_fmax` |
| /// instructions. |
| fn expand_minmax( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] { |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Fmin, |
| args, |
| } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor), |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Fmax, |
| args, |
| } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band), |
| _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)), |
| }; |
| let old_block = func.layout.pp_block(inst); |
| |
| // We need to handle the following conditions, depending on how x and y compare: |
| // |
| // 1. LT or GT: The native `x86_opc` min/max instruction does what we need. |
| // 2. EQ: We need to use `bitwise_opc` to make sure that |
| // fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0. |
| // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical. |
| |
| // block handling case 1) where operands are ordered but not equal. |
| let one_block = func.dfg.make_block(); |
| |
| // block handling case 3) where one operand is NaN. |
| let uno_block = func.dfg.make_block(); |
| |
| // block that handles the unordered or equal cases 2) and 3). |
| let ueq_block = func.dfg.make_block(); |
| |
| // block handling case 2) where operands are ordered and equal. |
| let eq_block = func.dfg.make_block(); |
| |
| // Final block with one argument representing the final result value. |
| let done = func.dfg.make_block(); |
| |
| // The basic blocks are laid out to minimize branching for the common cases: |
| // |
| // 1) One branch not taken, one jump. |
| // 2) One branch taken. |
| // 3) Two branches taken, one jump. |
| |
| // Move the `inst` result value onto the `done` block. |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| func.dfg.clear_results(inst); |
| func.dfg.attach_block_param(done, result); |
| |
| // Test for case 1) ordered and not equal. |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y); |
| pos.ins().brnz(cmp_ueq, ueq_block, &[]); |
| pos.ins().jump(one_block, &[]); |
| |
| // Handle the common ordered, not equal (LT|GT) case. |
| pos.insert_block(one_block); |
| let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0; |
| let one_result = pos.func.dfg.first_result(one_inst); |
| pos.ins().jump(done, &[one_result]); |
| |
| // Case 3) Unordered. |
| // We know that at least one operand is a NaN that needs to be propagated. We simply use an |
| // `fadd` instruction which has the same NaN propagation semantics. |
| pos.insert_block(uno_block); |
| let uno_result = pos.ins().fadd(x, y); |
| pos.ins().jump(done, &[uno_result]); |
| |
| // Case 2) or 3). |
| pos.insert_block(ueq_block); |
| // Test for case 3) (UN) one value is NaN. |
| // TODO: When we get support for flag values, we can reuse the above comparison. |
| let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y); |
| pos.ins().brnz(cmp_uno, uno_block, &[]); |
| pos.ins().jump(eq_block, &[]); |
| |
| // We are now in case 2) where x and y compare EQ. |
| // We need a bitwise operation to get the sign right. |
| pos.insert_block(eq_block); |
| let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0; |
| let bw_result = pos.func.dfg.first_result(bw_inst); |
| // This should become a fall-through for this second most common case. |
| // Recycle the original instruction as a jump. |
| pos.func.dfg.replace(inst).jump(done, &[bw_result]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, one_block); |
| cfg.recompute_block(pos.func, uno_block); |
| cfg.recompute_block(pos.func, ueq_block); |
| cfg.recompute_block(pos.func, eq_block); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
/// This legalization converts a minimum/maximum operation into a sequence that matches
/// WebAssembly's NaN-handling semantics, which do not map directly onto x86's min/max
/// instructions. This logic is kept separate from [expand_minmax] above (the scalar version) for
/// code clarity.
| fn expand_minmax_vector( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let ty = func.dfg.ctrl_typevar(inst); |
| debug_assert!(ty.is_vector()); |
| let (x, y, x86_opcode, is_max) = match func.dfg[inst] { |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Fmin, |
| args, |
| } => (args[0], args[1], ir::Opcode::X86Fmin, false), |
| ir::InstructionData::Binary { |
| opcode: ir::Opcode::Fmax, |
| args, |
| } => (args[0], args[1], ir::Opcode::X86Fmax, true), |
| _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)), |
| }; |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
    // This sequence is complex due to how x86 handles NaNs and +0/-0. If, for a given lane, x86
    // finds a NaN in either input it returns the second operand's lane; likewise, if both inputs
    // are in {+0.0, -0.0} it returns the second operand's lane. To match the behavior of "return
    // the minimum of the operands, or a canonical NaN if either operand is NaN," we must compare
    // in both directions.
| let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y); |
| let forward = dfg.first_result(forward_inst); |
| let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x); |
| let backward = dfg.first_result(backward_inst); |
| |
| let (value, mask) = if is_max { |
| // For maximum: |
| // Find any differences between the forward and backward `max` operation. |
| let difference = pos.ins().bxor(forward, backward); |
| // Merge in the differences. |
| let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference); |
| let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference); |
| // Discover which lanes have NaNs in them. |
| let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value); |
| (value, find_nan_lanes_mask) |
| } else { |
| // For minimum: |
| // If either lane is a NaN, we want to use these bits, not the second operand bits. |
| let propagate_nans = pos.ins().bor(backward, forward); |
| // Find which lanes contain a NaN with an unordered comparison, filling the mask with |
| // 1s. |
| let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans); |
| let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask); |
| // Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs |
| // along this code path to be quieted and negative: after the upcoming shift and and_not, |
| // all upper bits (sign, exponent, and payload MSB) will be 1s. |
| let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask); |
| (tmp, bitcast_find_nan_lanes_mask) |
| }; |
| |
| // During this lowering we will need to know how many bits to shift by and what type to |
| // convert to when using an integer shift. Recall that an IEEE754 number looks like: |
| // `[sign bit] [exponent bits] [significand bits]` |
| // A quiet NaN has all exponent bits set to 1 and the most significant bit of the |
| // significand set to 1; a signaling NaN has the same exponent but the MSB of the |
| // significand is set to 0. The payload of the NaN is the remaining significand bits, and |
| // WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this |
| // canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB |
| // sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all |
| // NaNs produced with the mask will be negative (`-NaN`) which is allowed by the sign |
| // non-determinism in the spec: https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0 |
| let (shift_by, ty_as_int) = match ty { |
| F32X4 => (10, I32X4), |
| F64X2 => (13, I64X2), |
| _ => unimplemented!("this legalization only understands 128-bit floating point types"), |
| }; |
| |
| // In order to clear the NaN payload for canonical NaNs, we shift right the NaN lanes (all |
| // 1s) leaving 0s in the top bits. Remember that non-NaN lanes are all 0s so this has |
| // little effect. |
| let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask); |
| let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by); |
| let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask); |
| |
| // Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes, this is |
| // equivalent to `... & 1111...` but for NaN lanes this will only have 1s in the top bits, |
| // clearing the payload. |
| pos.func |
| .dfg |
| .replace(inst) |
| .band_not(value, shift_mask_as_float); |
| } |
| |
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
/// i64 with a pattern; the rest needs more code.
///
/// Note that this is the scalar implementation; for the vector implementation see
/// [expand_fcvt_from_uint_vector].
| fn expand_fcvt_from_uint( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
    let x = match func.dfg[inst] {
        ir::InstructionData::Unary {
            opcode: ir::Opcode::FcvtFromUint,
            arg,
        } => arg,
        _ => panic!("Need fcvt_from_uint: {}", func.dfg.display_inst(inst, None)),
    };
| let xty = func.dfg.value_type(x); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
    // Conversion from an unsigned int smaller than 64 bits is easy on x86-64.
| match xty { |
| ir::types::I8 | ir::types::I16 | ir::types::I32 => { |
| // TODO: This should be guarded by an ISA check. |
| let wide = pos.ins().uextend(ir::types::I64, x); |
| pos.func.dfg.replace(inst).fcvt_from_sint(ty, wide); |
| return; |
| } |
| ir::types::I64 => {} |
| _ => unimplemented!(), |
| } |
| |
| let old_block = pos.func.layout.pp_block(inst); |
| |
| // block handling the case where x >= 0. |
| let poszero_block = pos.func.dfg.make_block(); |
| |
| // block handling the case where x < 0. |
| let neg_block = pos.func.dfg.make_block(); |
| |
| // Final block with one argument representing the final result value. |
| let done = pos.func.dfg.make_block(); |
| |
| // Move the `inst` result value onto the `done` block. |
| pos.func.dfg.clear_results(inst); |
| pos.func.dfg.attach_block_param(done, result); |
| |
| // If x as a signed int is not negative, we can use the existing `fcvt_from_sint` instruction. |
| let is_neg = pos.ins().icmp_imm(IntCC::SignedLessThan, x, 0); |
| pos.ins().brnz(is_neg, neg_block, &[]); |
| pos.ins().jump(poszero_block, &[]); |
| |
| // Easy case: just use a signed conversion. |
| pos.insert_block(poszero_block); |
| let posres = pos.ins().fcvt_from_sint(ty, x); |
| pos.ins().jump(done, &[posres]); |
| |
| // Now handle the negative case. |
| pos.insert_block(neg_block); |
| |
| // Divide x by two to get it in range for the signed conversion, keep the LSB, and scale it |
| // back up on the FP side. |
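    // Or'ing the LSB back into the halved value implements round-to-odd for the discarded bit,
    // which avoids a double-rounding error when the `fadd` below doubles the result.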
| let ihalf = pos.ins().ushr_imm(x, 1); |
| let lsb = pos.ins().band_imm(x, 1); |
| let ifinal = pos.ins().bor(ihalf, lsb); |
| let fhalf = pos.ins().fcvt_from_sint(ty, ifinal); |
| let negres = pos.ins().fadd(fhalf, fhalf); |
| |
| // Recycle the original instruction as a jump. |
| pos.func.dfg.replace(inst).jump(done, &[negres]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, poszero_block); |
| cfg.recompute_block(pos.func, neg_block); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
| /// To convert packed unsigned integers to their float equivalents, we must legalize to a special |
/// AVX512 instruction (using MXCSR rounding) or use a long sequence of instructions. This logic is
| /// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform |
| /// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be |
| /// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745). |
| fn expand_fcvt_from_uint_vector( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtFromUint, |
| arg, |
| } = pos.func.dfg[inst] |
| { |
| let controlling_type = pos.func.dfg.ctrl_typevar(inst); |
| if controlling_type == F32X4 { |
| debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4); |
| let x86_isa = isa |
| .as_any() |
| .downcast_ref::<isa::x86::Isa>() |
| .expect("the target ISA must be x86 at this point"); |
| if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() { |
| // If we have certain AVX512 features, we can lower this instruction simply. |
| pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg); |
| } else { |
| // Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR, |
| // PBLENDW, PSUB, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS |
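                // The idea, per 32-bit lane x: split x into low = x & 0xFFFF and
                // high = x - low. `low` converts exactly as a signed value, and `high` (whose
                // bottom 16 bits are zero) is recovered as cvt(high >> 1) doubled, so the
                // result is cvt(low) + 2 * cvt(high >> 1).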
| let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg); |
| let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into()); |
| let zero = pos.ins().vconst(I16X8, zero_constant); |
| let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55); |
| let bitcast_low = pos.ins().raw_bitcast(I32X4, low); |
| let high = pos.ins().isub(arg, bitcast_low); |
| let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low); |
| let shift_high = pos.ins().ushr_imm(high, 1); |
| let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high); |
| let double_high = pos.ins().fadd(convert_high, convert_high); |
| pos.func.dfg.replace(inst).fadd(double_high, convert_low); |
| } |
| } else { |
| unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) |
| } |
| } |
| } |
| |
| fn expand_fcvt_to_sint( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| use crate::ir::immediates::{Ieee32, Ieee64}; |
| |
| let x = match func.dfg[inst] { |
| ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToSint, |
| arg, |
| } => arg, |
| _ => panic!("Need fcvt_to_sint: {}", func.dfg.display_inst(inst, None)), |
| }; |
| let old_block = func.layout.pp_block(inst); |
| let xty = func.dfg.value_type(x); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
| // Final block after the bad value checks. |
| let done = func.dfg.make_block(); |
| |
| // block for checking failure cases. |
| let maybe_trap_block = func.dfg.make_block(); |
| |
| // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or overflow. |
| // It produces an INT_MIN result instead. |
| func.dfg.replace(inst).x86_cvtt2si(ty, x); |
| |
| let mut pos = FuncCursor::new(func).after_inst(inst); |
| pos.use_srcloc(inst); |
| |
| let is_done = pos |
| .ins() |
| .icmp_imm(IntCC::NotEqual, result, 1 << (ty.lane_bits() - 1)); |
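    // `1 << (N - 1)` is the bit pattern of INT_MIN for an N-bit integer, the value
    // `x86_cvtt2si` produces on NaN or overflow.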
| pos.ins().brnz(is_done, done, &[]); |
| pos.ins().jump(maybe_trap_block, &[]); |
| |
| // We now have the following possibilities: |
| // |
| // 1. INT_MIN was actually the correct conversion result. |
| // 2. The input was NaN -> trap bad_toint |
| // 3. The input was out of range -> trap int_ovf |
| // |
| pos.insert_block(maybe_trap_block); |
| |
| // Check for NaN. |
| let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x); |
| pos.ins() |
| .trapnz(is_nan, ir::TrapCode::BadConversionToInteger); |
| |
| // Check for case 1: INT_MIN is the correct result. |
| // Determine the smallest floating point number that would convert to INT_MIN. |
| let mut overflow_cc = FloatCC::LessThan; |
| let output_bits = ty.lane_bits(); |
| let flimit = match xty { |
| ir::types::F32 => |
| // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so |
| // there are values less than -2^(N-1) that convert correctly to INT_MIN. |
| { |
| pos.ins().f32const(if output_bits < 32 { |
| overflow_cc = FloatCC::LessThanOrEqual; |
| Ieee32::fcvt_to_sint_negative_overflow(output_bits) |
| } else { |
| Ieee32::pow2(output_bits - 1).neg() |
| }) |
| } |
| ir::types::F64 => |
| // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so |
| // there are values less than -2^(N-1) that convert correctly to INT_MIN. |
| { |
| pos.ins().f64const(if output_bits < 64 { |
| overflow_cc = FloatCC::LessThanOrEqual; |
| Ieee64::fcvt_to_sint_negative_overflow(output_bits) |
| } else { |
| Ieee64::pow2(output_bits - 1).neg() |
| }) |
| } |
| _ => panic!("Can't convert {}", xty), |
| }; |
| let overflow = pos.ins().fcmp(overflow_cc, x, flimit); |
| pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow); |
| |
| // Finally, we could have a positive value that is too large. |
| let fzero = match xty { |
| ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)), |
| ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)), |
| _ => panic!("Can't convert {}", xty), |
| }; |
| let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero); |
| pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow); |
| |
| pos.ins().jump(done, &[]); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, maybe_trap_block); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
| fn expand_fcvt_to_sint_sat( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| use crate::ir::immediates::{Ieee32, Ieee64}; |
| |
| let x = match func.dfg[inst] { |
| ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToSintSat, |
| arg, |
| } => arg, |
| _ => panic!( |
| "Need fcvt_to_sint_sat: {}", |
| func.dfg.display_inst(inst, None) |
| ), |
| }; |
| |
| let old_block = func.layout.pp_block(inst); |
| let xty = func.dfg.value_type(x); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
| // Final block after the bad value checks. |
| let done_block = func.dfg.make_block(); |
| let intmin_block = func.dfg.make_block(); |
| let minsat_block = func.dfg.make_block(); |
| let maxsat_block = func.dfg.make_block(); |
| func.dfg.clear_results(inst); |
| func.dfg.attach_block_param(done_block, result); |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| // The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or |
| // overflow. It produces an INT_MIN result instead. |
| let cvtt2si = pos.ins().x86_cvtt2si(ty, x); |
| |
| let is_done = pos |
| .ins() |
| .icmp_imm(IntCC::NotEqual, cvtt2si, 1 << (ty.lane_bits() - 1)); |
| pos.ins().brnz(is_done, done_block, &[cvtt2si]); |
| pos.ins().jump(intmin_block, &[]); |
| |
| // We now have the following possibilities: |
| // |
| // 1. INT_MIN was actually the correct conversion result. |
| // 2. The input was NaN -> replace the result value with 0. |
| // 3. The input was out of range -> saturate the result to the min/max value. |
| pos.insert_block(intmin_block); |
| |
| // Check for NaN, which is truncated to 0. |
| let zero = pos.ins().iconst(ty, 0); |
| let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x); |
| pos.ins().brnz(is_nan, done_block, &[zero]); |
| pos.ins().jump(minsat_block, &[]); |
| |
| // Check for case 1: INT_MIN is the correct result. |
| // Determine the smallest floating point number that would convert to INT_MIN. |
| pos.insert_block(minsat_block); |
| let mut overflow_cc = FloatCC::LessThan; |
| let output_bits = ty.lane_bits(); |
| let flimit = match xty { |
| ir::types::F32 => |
| // An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so |
| // there are values less than -2^(N-1) that convert correctly to INT_MIN. |
| { |
| pos.ins().f32const(if output_bits < 32 { |
| overflow_cc = FloatCC::LessThanOrEqual; |
| Ieee32::fcvt_to_sint_negative_overflow(output_bits) |
| } else { |
| Ieee32::pow2(output_bits - 1).neg() |
| }) |
| } |
| ir::types::F64 => |
| // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so |
| // there are values less than -2^(N-1) that convert correctly to INT_MIN. |
| { |
| pos.ins().f64const(if output_bits < 64 { |
| overflow_cc = FloatCC::LessThanOrEqual; |
| Ieee64::fcvt_to_sint_negative_overflow(output_bits) |
| } else { |
| Ieee64::pow2(output_bits - 1).neg() |
| }) |
| } |
| _ => panic!("Can't convert {}", xty), |
| }; |
| |
| let overflow = pos.ins().fcmp(overflow_cc, x, flimit); |
| let min_imm = match ty { |
| ir::types::I32 => i32::min_value() as i64, |
| ir::types::I64 => i64::min_value(), |
| _ => panic!("Don't know the min value for {}", ty), |
| }; |
| let min_value = pos.ins().iconst(ty, min_imm); |
| pos.ins().brnz(overflow, done_block, &[min_value]); |
| pos.ins().jump(maxsat_block, &[]); |
| |
| // Finally, we could have a positive value that is too large. |
| pos.insert_block(maxsat_block); |
| let fzero = match xty { |
| ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)), |
| ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)), |
| _ => panic!("Can't convert {}", xty), |
| }; |
| |
| let max_imm = match ty { |
| ir::types::I32 => i32::max_value() as i64, |
| ir::types::I64 => i64::max_value(), |
| _ => panic!("Don't know the max value for {}", ty), |
| }; |
| let max_value = pos.ins().iconst(ty, max_imm); |
| |
| let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero); |
| pos.ins().brnz(overflow, done_block, &[max_value]); |
| |
| // Recycle the original instruction. |
| pos.func.dfg.replace(inst).jump(done_block, &[cvtt2si]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done_block); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, intmin_block); |
| cfg.recompute_block(pos.func, minsat_block); |
| cfg.recompute_block(pos.func, maxsat_block); |
| cfg.recompute_block(pos.func, done_block); |
| } |
| |
| /// This legalization converts a vector of 32-bit floating point lanes to signed integer lanes |
| /// using CVTTPS2DQ (see encoding of `x86_cvtt2si`). This logic is separate from [expand_fcvt_to_sint_sat] |
| /// above (the scalar version), only due to how the transform groups are set up; TODO if we change |
| /// the SIMD legalization groups, then this logic could be merged into [expand_fcvt_to_sint_sat] |
| /// (see https://github.com/bytecodealliance/wasmtime/issues/1745). |
| fn expand_fcvt_to_sint_sat_vector( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToSintSat, |
| arg, |
| } = pos.func.dfg[inst] |
| { |
| let controlling_type = pos.func.dfg.ctrl_typevar(inst); |
| if controlling_type == I32X4 { |
| debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); |
| // We must both quiet any NaNs--setting that lane to 0--and saturate any |
| // lanes that might overflow during conversion to the highest/lowest signed integer |
| // allowed in that lane. |
| |
            // Zero the NaN lanes: `fcmp eq` fills a lane with 1s only when it does not contain a
            // NaN, so ANDing the input with the comparison mask (ANDPS) zeroes exactly the NaN
            // lanes. We reuse the mask rather than doing the comparison twice (it is also needed
            // below to find differences).
| let zeroed_nans = pos.ins().fcmp(FloatCC::Equal, arg, arg); |
| let zeroed_nans_bitcast = pos.ins().raw_bitcast(F32X4, zeroed_nans); |
| let zeroed_nans_copy = pos.ins().band(arg, zeroed_nans_bitcast); |
| |
| // Find differences with the zeroed lanes (we will only use the MSB: 1 if positive or |
| // NaN, 0 otherwise). |
| let differences = pos.ins().bxor(zeroed_nans_bitcast, arg); |
| let differences_bitcast = pos.ins().raw_bitcast(I32X4, differences); |
| |
| // Convert the numeric lanes. CVTTPS2DQ will mark overflows with 0x80000000 (MSB set). |
| let converted = pos.ins().x86_cvtt2si(I32X4, zeroed_nans_copy); |
| |
| // Create a mask of all 1s only on positive overflow, 0s otherwise. This uses the MSB |
| // of `differences` (1 when positive or NaN) and the MSB of `converted` (1 on positive |
| // overflow). |
| let tmp = pos.ins().band(differences_bitcast, converted); |
| let mask = pos.ins().sshr_imm(tmp, 31); |
| |
| // Apply the mask to create 0x7FFFFFFF for positive overflow. XOR of all 0s (all other |
| // cases) has no effect. |
| pos.func.dfg.replace(inst).bxor(converted, mask); |
| } else { |
| unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) |
| } |
| } |
| } |
| |
| fn expand_fcvt_to_uint( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| use crate::ir::immediates::{Ieee32, Ieee64}; |
| |
| let x = match func.dfg[inst] { |
| ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToUint, |
| arg, |
| } => arg, |
| _ => panic!("Need fcvt_to_uint: {}", func.dfg.display_inst(inst, None)), |
| }; |
| |
| let old_block = func.layout.pp_block(inst); |
| let xty = func.dfg.value_type(x); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
    // block handling numbers < 2^(N-1).
| let below_uint_max_block = func.dfg.make_block(); |
| |
    // block handling numbers < 0.
| let below_zero_block = func.dfg.make_block(); |
| |
| // block handling numbers >= 2^(N-1). |
| let large = func.dfg.make_block(); |
| |
| // Final block after the bad value checks. |
| let done = func.dfg.make_block(); |
| |
| // Move the `inst` result value onto the `done` block. |
| func.dfg.clear_results(inst); |
| func.dfg.attach_block_param(done, result); |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in |
| // the destination integer type. |
| let pow2nm1 = match xty { |
| ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)), |
| ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)), |
| _ => panic!("Can't convert {}", xty), |
| }; |
| let is_large = pos.ins().ffcmp(x, pow2nm1); |
| pos.ins() |
| .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]); |
| pos.ins().jump(below_uint_max_block, &[]); |
| |
| // We need to generate a specific trap code when `x` is NaN, so reuse the flags from the |
| // previous comparison. |
| pos.insert_block(below_uint_max_block); |
| pos.ins().trapff( |
| FloatCC::Unordered, |
| is_large, |
| ir::TrapCode::BadConversionToInteger, |
| ); |
| |
| // Now we know that x < 2^(N-1) and not NaN. |
| let sres = pos.ins().x86_cvtt2si(ty, x); |
| let is_neg = pos.ins().ifcmp_imm(sres, 0); |
| pos.ins() |
| .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]); |
| pos.ins().jump(below_zero_block, &[]); |
| |
| pos.insert_block(below_zero_block); |
| pos.ins().trap(ir::TrapCode::IntegerOverflow); |
| |
| // Handle the case where x >= 2^(N-1) and not NaN. |
| pos.insert_block(large); |
| let adjx = pos.ins().fsub(x, pow2nm1); |
| let lres = pos.ins().x86_cvtt2si(ty, adjx); |
| let is_neg = pos.ins().ifcmp_imm(lres, 0); |
| pos.ins() |
| .trapif(IntCC::SignedLessThan, is_neg, ir::TrapCode::IntegerOverflow); |
| let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1)); |
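    // Add back the 2^(N-1) bias that was subtracted before the conversion; as a wrapping
    // integer add, this moves the result into the upper half of the unsigned range.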
| |
| // Recycle the original instruction as a jump. |
| pos.func.dfg.replace(inst).jump(done, &[lfinal]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, below_uint_max_block); |
| cfg.recompute_block(pos.func, below_zero_block); |
| cfg.recompute_block(pos.func, large); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
| fn expand_fcvt_to_uint_sat( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| use crate::ir::immediates::{Ieee32, Ieee64}; |
| |
| let x = match func.dfg[inst] { |
| ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToUintSat, |
| arg, |
| } => arg, |
| _ => panic!( |
| "Need fcvt_to_uint_sat: {}", |
| func.dfg.display_inst(inst, None) |
| ), |
| }; |
| |
| let old_block = func.layout.pp_block(inst); |
| let xty = func.dfg.value_type(x); |
| let result = func.dfg.first_result(inst); |
| let ty = func.dfg.value_type(result); |
| |
    // blocks handling numbers < 2^(N-1).
| let below_pow2nm1_or_nan_block = func.dfg.make_block(); |
| let below_pow2nm1_block = func.dfg.make_block(); |
| |
| // block handling numbers >= 2^(N-1). |
| let large = func.dfg.make_block(); |
| |
| // block handling numbers < 2^N. |
| let uint_large_block = func.dfg.make_block(); |
| |
| // Final block after the bad value checks. |
| let done = func.dfg.make_block(); |
| |
| // Move the `inst` result value onto the `done` block. |
| func.dfg.clear_results(inst); |
| func.dfg.attach_block_param(done, result); |
| |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| // Start by materializing the floating point constant 2^(N-1) where N is the number of bits in |
| // the destination integer type. |
| let pow2nm1 = match xty { |
| ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)), |
| ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)), |
| _ => panic!("Can't convert {}", xty), |
| }; |
| let zero = pos.ins().iconst(ty, 0); |
| let is_large = pos.ins().ffcmp(x, pow2nm1); |
| pos.ins() |
| .brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]); |
| pos.ins().jump(below_pow2nm1_or_nan_block, &[]); |
| |
| // We need to generate zero when `x` is NaN, so reuse the flags from the previous comparison. |
| pos.insert_block(below_pow2nm1_or_nan_block); |
| pos.ins().brff(FloatCC::Unordered, is_large, done, &[zero]); |
| pos.ins().jump(below_pow2nm1_block, &[]); |
| |
| // Now we know that x < 2^(N-1) and not NaN. If the result of the cvtt2si is positive, we're |
| // done; otherwise saturate to the minimum unsigned value, that is 0. |
| pos.insert_block(below_pow2nm1_block); |
| let sres = pos.ins().x86_cvtt2si(ty, x); |
| let is_neg = pos.ins().ifcmp_imm(sres, 0); |
| pos.ins() |
| .brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]); |
| pos.ins().jump(done, &[zero]); |
| |
| // Handle the case where x >= 2^(N-1) and not NaN. |
| pos.insert_block(large); |
| let adjx = pos.ins().fsub(x, pow2nm1); |
| let lres = pos.ins().x86_cvtt2si(ty, adjx); |
| let max_value = pos.ins().iconst( |
| ty, |
| match ty { |
| ir::types::I32 => u32::max_value() as i64, |
| ir::types::I64 => u64::max_value() as i64, |
| _ => panic!("Can't convert {}", ty), |
| }, |
| ); |
| let is_neg = pos.ins().ifcmp_imm(lres, 0); |
| pos.ins() |
| .brif(IntCC::SignedLessThan, is_neg, done, &[max_value]); |
| pos.ins().jump(uint_large_block, &[]); |
| |
| pos.insert_block(uint_large_block); |
| let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1)); |
| |
| // Recycle the original instruction as a jump. |
| pos.func.dfg.replace(inst).jump(done, &[lfinal]); |
| |
| // Finally insert a label for the completion. |
| pos.next_inst(); |
| pos.insert_block(done); |
| |
| cfg.recompute_block(pos.func, old_block); |
| cfg.recompute_block(pos.func, below_pow2nm1_or_nan_block); |
| cfg.recompute_block(pos.func, below_pow2nm1_block); |
| cfg.recompute_block(pos.func, large); |
| cfg.recompute_block(pos.func, uint_large_block); |
| cfg.recompute_block(pos.func, done); |
| } |
| |
// Lanes of an I32X4 filled with the maximum signed integer value, converted to F32X4 lanes:
// each lane holds 0x4F000000 (little-endian bytes below), i.e. 2^31, the nearest f32 to
// `i32::max_value()`.
| static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [ |
| 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, |
| ]; |
| |
| /// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes |
| /// using a long sequence of NaN quieting and truncation. This logic is separate from |
| /// [expand_fcvt_to_uint_sat] above (the scalar version), only due to how the transform groups are |
| /// set up; TODO if we change the SIMD legalization groups, then this logic could be merged into |
| /// [expand_fcvt_to_uint_sat] (see https://github.com/bytecodealliance/wasmtime/issues/1745). |
| fn expand_fcvt_to_uint_sat_vector( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Unary { |
| opcode: ir::Opcode::FcvtToUintSat, |
| arg, |
| } = pos.func.dfg[inst] |
| { |
| let controlling_type = pos.func.dfg.ctrl_typevar(inst); |
| if controlling_type == I32X4 { |
| debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); |
| // We must both quiet any NaNs--setting that lane to 0--and saturate any |
| // lanes that might overflow during conversion to the highest/lowest integer |
| // allowed in that lane. |
| let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into()); |
| let max_signed_constant = pos |
| .func |
| .dfg |
| .constants |
| .insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into()); |
| let zeroes = pos.ins().vconst(F32X4, zeroes_constant); |
| let max_signed = pos.ins().vconst(F32X4, max_signed_constant); |
| // Clamp the input to 0 for negative floating point numbers. TODO we need to |
| // convert NaNs to 0 but this doesn't do that? |
| let ge_zero = pos.ins().x86_fmax(arg, zeroes); |
| // Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert. |
| // For floating point numbers above this, CVTTPS2DQ returns the undefined value |
| // 0x80000000. |
| let minus_max_signed = pos.ins().fsub(ge_zero, max_signed); |
| let le_max_signed = |
| pos.ins() |
| .fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed); |
| // Identify lanes that have minus_max_signed > max_signed || minus_max_signed < 0. |
| // These lanes have the MSB set to 1 after the XOR. We are trying to calculate a |
| // valid, in-range addend. |
| let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed); |
| let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed); |
| let difference = pos |
| .ins() |
| .bxor(minus_max_signed_as_int, le_max_signed_as_int); |
| // Calculate amount to add above 0x7FFFFFF, zeroing out any lanes identified |
| // previously (MSB set to 1). |
| let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes); |
| let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int); |
| // Convert the original clamped number to an integer and add back in the addend |
| // (the part of the value above 0x7FFFFFF, since CVTTPS2DQ overflows with these). |
| let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero); |
| pos.func.dfg.replace(inst).iadd(converted, addend); |
| } else { |
| unreachable!( |
| "{} should not be legalized in expand_fcvt_to_uint_sat_vector", |
| pos.func.dfg.display_inst(inst, None) |
| ) |
| } |
| } |
| } |
| |
| /// Convert shuffle instructions. |
| fn convert_shuffle( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] { |
        // A mask-building helper: in 128-bit SIMD, mask bytes 0-15 indicate which lane to read
        // from, and a 1 in the most significant bit of a mask byte zeroes the lane (PSHUFB
        // semantics).
| let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; |
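        // For example, mask byte 17 names lane 1 of `b`: when the arguments differ, the PSHUFB
        // on `a` sees 0x80 and zeroes that lane, while the PSHUFB on `b` sees 17 - 16 = 1.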
| |
        // Resolve value aliases so that the `a == b` check below can detect when both arguments
        // are the same value; aliasing is the only concern here because actual copies will be
        // introduced later (in regalloc).
| let a = pos.func.dfg.resolve_aliases(args[0]); |
| let b = pos.func.dfg.resolve_aliases(args[1]); |
| let mask = pos |
| .func |
| .dfg |
| .immediates |
| .get(mask) |
| .expect("The shuffle immediate should have been recorded before this point") |
| .clone(); |
| if a == b { |
| // PSHUFB the first argument (since it is the same as the second). |
| let constructed_mask = mask |
| .iter() |
                // A mask byte greater than 15 refers to a lane in b, which here is the same
                // vector as a, so wrap it back into the 0-15 range.
| .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) |
| .map(zero_unknown_lane_index) |
| .collect(); |
| let handle = pos.func.dfg.constants.insert(constructed_mask); |
| // Move the built mask into another XMM register. |
| let a_type = pos.func.dfg.value_type(a); |
| let mask_value = pos.ins().vconst(a_type, handle); |
| // Shuffle the single incoming argument. |
| pos.func.dfg.replace(inst).x86_pshufb(a, mask_value); |
| } else { |
| // PSHUFB the first argument, placing zeroes for unused lanes. |
| let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); |
| let handle = pos.func.dfg.constants.insert(constructed_mask); |
| // Move the built mask into another XMM register. |
| let a_type = pos.func.dfg.value_type(a); |
| let mask_value = pos.ins().vconst(a_type, handle); |
| // Shuffle the first argument. |
| let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value); |
| |
| // PSHUFB the second argument, placing zeroes for unused lanes. |
| let constructed_mask = mask |
| .iter() |
| .map(|b| b.wrapping_sub(16)) |
| .map(zero_unknown_lane_index) |
| .collect(); |
| let handle = pos.func.dfg.constants.insert(constructed_mask); |
| // Move the built mask into another XMM register. |
| let b_type = pos.func.dfg.value_type(b); |
| let mask_value = pos.ins().vconst(b_type, handle); |
| // Shuffle the second argument. |
| let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value); |
| |
| // OR the vectors together to form the final shuffled value. |
| pos.func |
| .dfg |
| .replace(inst) |
| .bor(shuffled_first_arg, shuffled_second_arg); |
| |
| // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB |
| }; |
| } |
| } |
| |
/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
/// `extractlane` instruction.
| fn convert_extractlane( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::BinaryImm8 { |
| opcode: ir::Opcode::Extractlane, |
| arg, |
| imm: lane, |
| } = pos.func.dfg[inst] |
| { |
| // NOTE: the following legalization assumes that the upper bits of the XMM register do |
| // not need to be zeroed during extractlane. |
| let value_type = pos.func.dfg.value_type(arg); |
| if value_type.lane_type().is_float() { |
| // Floats are already in XMM registers and can stay there. |
| let shuffled = if lane != 0 { |
| // Replace the extractlane with a PSHUFD to get the float in the right place. |
| match value_type { |
| F32X4 => { |
| // Move the selected lane to the 0 lane. |
| let shuffle_mask: u8 = 0b00_00_00_00 | lane; |
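| // e.g. lane == 2 yields 0b00_00_00_10: result lane 0 reads source lane 2; the
| // remaining result lanes read lane 0 and are don't-cares here.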
| pos.ins().x86_pshufd(arg, shuffle_mask) |
| } |
| F64X2 => { |
| assert_eq!(lane, 1); |
| // Because we know the lane == 1, we move the upper 64 bits to the lower |
| // 64 bits, leaving the top 64 bits as-is. |
| let shuffle_mask = 0b11_10_11_10; |
| let bitcast = pos.ins().raw_bitcast(F32X4, arg); |
| pos.ins().x86_pshufd(bitcast, shuffle_mask) |
| } |
| _ => unreachable!(), |
| } |
| } else { |
| // Remove the extractlane instruction, leaving the float where it is. |
| arg |
| }; |
| // Then we must bitcast to the right type. |
| pos.func |
| .dfg |
| .replace(inst) |
| .raw_bitcast(value_type.lane_type(), shuffled); |
| } else { |
| // For non-floats, lower with the usual PEXTR* instruction. |
| pos.func.dfg.replace(inst).x86_pextr(arg, lane); |
| } |
| } |
| } |
| |
| /// Because floats live in XMM registers, we can keep them there when executing a CLIF
| /// `insertlane` instruction.
| fn convert_insertlane( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::TernaryImm8 { |
| opcode: ir::Opcode::Insertlane, |
| args: [vector, replacement], |
| imm: lane, |
| } = pos.func.dfg[inst] |
| { |
| let value_type = pos.func.dfg.value_type(vector); |
| if value_type.lane_type().is_float() { |
| // Floats are already in XMM registers and can stay there. |
| match value_type { |
| F32X4 => { |
| assert!(lane <= 3); |
| let immediate = 0b00_00_00_00 | lane << 4; |
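| // e.g. lane == 3 yields 0b00_11_00_00: source index 0, destination lane 3, and an
| // all-zero zeroing mask.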
| // INSERTPS: insert 32 bits from replacement (source index 0, immediate bits 7:6)
| // into vector at `lane` (shifted into immediate bits 5:4).
| pos.func |
| .dfg |
| .replace(inst) |
| .x86_insertps(vector, replacement, immediate) |
| } |
| F64X2 => { |
| let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types |
| if lane == 0 { |
| // Move the lowest quadword in replacement to vector without changing |
| // the upper bits. |
| pos.func |
| .dfg |
| .replace(inst) |
| .x86_movsd(vector, replacement_as_vector) |
| } else { |
| assert_eq!(lane, 1); |
| // Move the low 64 bits of the replacement value to the high 64 bits of the
| // vector.
| pos.func |
| .dfg |
| .replace(inst) |
| .x86_movlhps(vector, replacement_as_vector) |
| } |
| } |
| _ => unreachable!(), |
| }; |
| } else { |
| // For non-floats, lower with the usual PINSR* instruction. |
| pos.func |
| .dfg |
| .replace(inst) |
| .x86_pinsr(vector, replacement, lane); |
| } |
| } |
| } |
| |
| /// For SIMD or scalar integer negation, convert `ineg` to `vconst + isub` or `iconst + isub`. |
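| /// i.e. `ineg x` becomes `isub zero, x`, where the zero is materialized as a `vconst` for
| /// vectors and an `iconst` for scalars.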
| fn convert_ineg( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Unary { |
| opcode: ir::Opcode::Ineg, |
| arg, |
| } = pos.func.dfg[inst] |
| { |
| let value_type = pos.func.dfg.value_type(arg); |
| let zero_value = if value_type.is_vector() && value_type.lane_type().is_int() { |
| let zero_immediate = pos.func.dfg.constants.insert(vec![0; 16].into()); |
| pos.ins().vconst(value_type, zero_immediate) // this should be legalized to a PXOR |
| } else if value_type.is_int() { |
| pos.ins().iconst(value_type, 0) |
| } else { |
| panic!("Can't convert ineg of type {}", value_type) |
| }; |
| pos.func.dfg.replace(inst).isub(zero_value, arg); |
| } else { |
| unreachable!() |
| } |
| } |
| |
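| /// Move a scalar dword/qword value into an XMM register for the shift legalizations below: an
| /// I64 is split and inserted 32 bits at a time; anything else is assumed to be I32-sized and
| /// can be bitcast directly.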
| fn expand_dword_to_xmm<'f>(
| pos: &mut FuncCursor<'f>,
| arg: ir::Value,
| arg_type: ir::Type,
| ) -> ir::Value {
| if arg_type == I64 {
| let (arg_lo, arg_hi) = pos.ins().isplit(arg);
| let vector = pos.ins().scalar_to_vector(I32X4, arg_lo);
| let vector = pos.ins().insertlane(vector, arg_hi, 1);
| pos.ins().raw_bitcast(I64X2, vector)
| } else {
| pos.ins().bitcast(I64X2, arg)
| }
| } |
| |
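| /// The inverse of `expand_dword_to_xmm`: move the low lane(s) of an XMM value back into a
| /// scalar, replacing `inst` with the final `iconcat` or `ireduce`.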
| fn contract_dword_from_xmm<'f>( |
| pos: &mut FuncCursor<'f>, |
| inst: ir::Inst, |
| ret: ir::Value, |
| ret_type: ir::Type, |
| ) { |
| if ret_type == I64 { |
| let ret = pos.ins().raw_bitcast(I32X4, ret); |
| let ret_lo = pos.ins().extractlane(ret, 0); |
| let ret_hi = pos.ins().extractlane(ret, 1); |
| pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi); |
| } else { |
| let ret = pos.ins().extractlane(ret, 0); |
| pos.func.dfg.replace(inst).ireduce(ret_type, ret); |
| } |
| } |
| |
| // Masks for i8x16 unsigned right shift. |
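| // Row `n` (the 16 bytes at offset n * 16) holds `0xff >> n` repeated across the vector; ANDing
| // with row `n` clears the bits that an i16-wide right shift by `n` leaks into each byte.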
| static USHR_MASKS: [u8; 128] = [ |
| 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
| 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, |
| 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, |
| 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, |
| 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, |
| 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, |
| 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, |
| 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, |
| ]; |
| |
| // Convert a vector unsigned right shift. x86 has implementations for i16x8 and wider (see
| // `x86_psrl`), but for i8x16 we translate the shift to an i16x8 shift and mask off the bits that
| // leak between byte lanes. This conversion could be provided in the CDSL if varargs were usable
| // there (TODO): `load_complex` takes a variable argument list that the CDSL patterns in
| // legalize.rs cannot construct.
| fn convert_ushr( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Binary { |
| opcode: ir::Opcode::Ushr, |
| args: [arg0, arg1], |
| } = pos.func.dfg[inst] |
| { |
| // Note that for Wasm, the bounding of the shift index has already happened during
| // translation.
| let arg0_type = pos.func.dfg.value_type(arg0); |
| let arg1_type = pos.func.dfg.value_type(arg1); |
| assert!(!arg1_type.is_vector() && arg1_type.is_int()); |
| |
| // TODO it may be clearer to use scalar_to_vector here; the current issue is that
| // scalar_to_vector requires the produced vector to have a matching lane size (e.g. i32 ->
| // i32x4), whereas bitcast allows any-to-any conversions (e.g. i32 -> i64x2). This matters
| // because x86_psrl only accepts I64X2 as the shift index type--a restriction that could be
| // relaxed since it is not really meaningful.
| let shift_index = pos.ins().bitcast(I64X2, arg1); |
| |
| if arg0_type == I8X16 { |
| // First, shift the vector using an I16X8 shift. |
| let bitcasted = pos.ins().raw_bitcast(I16X8, arg0); |
| let shifted = pos.ins().x86_psrl(bitcasted, shift_index); |
| let shifted = pos.ins().raw_bitcast(I8X16, shifted); |
| |
| // Then, fix up the even lanes, whose upper bits picked up bits shifted in from the
| // neighboring odd byte of the same i16 lane. This uses the 128 mask bytes as a table
| // indexed by the shift amount; it is a substantial code-size increase but reduces the
| // instruction count slightly.
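| // Worked example for a shift by 3: the I16X8 shift leaves each even byte holding
| // (b >> 3) | (neighbor << 5); ANDing with mask row 3 (0x1f repeated) clears the three
| // leaked upper bits.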
| let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into()); |
| let mask_address = pos.ins().const_addr(isa.pointer_type(), masks); |
| let mask_offset = pos.ins().ishl_imm(arg1, 4); |
| let mask = |
| pos.ins() |
| .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0); |
| pos.func.dfg.replace(inst).band(shifted, mask); |
| } else if arg0_type.is_vector() { |
| // x86 has encodings for these shifts. |
| pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index); |
| } else if arg0_type == I64 { |
| // 64-bit shifts need to be legalized on x86_32.
| let x86_isa = isa |
| .as_any() |
| .downcast_ref::<isa::x86::Isa>() |
| .expect("the target ISA must be x86 at this point"); |
| if x86_isa.isa_flags.has_sse41() { |
| // If we have pinsrq/pextrq (SSE 4.1), legalize to that.
| let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type); |
| let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type); |
| let shifted = pos.ins().x86_psrl(value, amount); |
| contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type); |
| } else { |
| // Otherwise, legalize to a libcall.
| expand_as_libcall(inst, func, isa); |
| } |
| } else { |
| // Everything else should be already legal. |
| unreachable!() |
| } |
| } |
| } |
| |
| // Masks for i8x16 left shift. |
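| // Row `n` (the 16 bytes at offset n * 16) holds `(0xff << n) & 0xff` repeated across the vector;
| // ANDing with row `n` clears the bits that an i16-wide left shift by `n` leaks into each byte.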
| static SHL_MASKS: [u8; 128] = [ |
| 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
| 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, |
| 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, |
| 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, |
| 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, |
| 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, |
| 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, |
| 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| ]; |
| |
| // Convert a vector left shift. x86 has implementations for i16x8 and wider (see `x86_psll`),
| // but for i8x16 we translate the shift to an i16x8 shift and mask off the bits that leak between
| // byte lanes. This conversion could be provided in the CDSL if varargs were usable there (TODO):
| // `load_complex` takes a variable argument list that the CDSL patterns in legalize.rs cannot
| // construct.
| fn convert_ishl( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Binary { |
| opcode: ir::Opcode::Ishl, |
| args: [arg0, arg1], |
| } = pos.func.dfg[inst] |
| { |
| // Note that for Wasm, the bounding of the shift index has already happened during
| // translation.
| let arg0_type = pos.func.dfg.value_type(arg0); |
| let arg1_type = pos.func.dfg.value_type(arg1); |
| assert!(!arg1_type.is_vector() && arg1_type.is_int()); |
| |
| // TODO it may be clearer to use scalar_to_vector here; the current issue is that
| // scalar_to_vector requires the produced vector to have a matching lane size (e.g. i32 ->
| // i32x4), whereas bitcast allows any-to-any conversions (e.g. i32 -> i64x2). This matters
| // because x86_psll only accepts I64X2 as the shift index type--a restriction that could be
| // relaxed since it is not really meaningful.
| let shift_index = pos.ins().bitcast(I64X2, arg1); |
| |
| if arg0_type == I8X16 { |
| // First, shift the vector using an I16X8 shift. |
| let bitcasted = pos.ins().raw_bitcast(I16X8, arg0); |
| let shifted = pos.ins().x86_psll(bitcasted, shift_index); |
| let shifted = pos.ins().raw_bitcast(I8X16, shifted); |
| |
| // Then, fix up the odd lanes, whose lower bits picked up bits shifted in from the
| // neighboring even byte of the same i16 lane. This uses the 128 mask bytes as a table
| // indexed by the shift amount; it is a substantial code-size increase but reduces the
| // instruction count slightly.
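| // Worked example for a shift by 3: the I16X8 shift leaves each odd byte holding
| // (b << 3) | (neighbor >> 5); ANDing with mask row 3 (0xf8 repeated) clears the three
| // leaked lower bits.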
| let masks = pos.func.dfg.constants.insert(SHL_MASKS.as_ref().into()); |
| let mask_address = pos.ins().const_addr(isa.pointer_type(), masks); |
| let mask_offset = pos.ins().ishl_imm(arg1, 4); |
| let mask = |
| pos.ins() |
| .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0); |
| pos.func.dfg.replace(inst).band(shifted, mask); |
| } else if arg0_type.is_vector() { |
| // x86 has encodings for these shifts. |
| pos.func.dfg.replace(inst).x86_psll(arg0, shift_index); |
| } else if arg0_type == I64 { |
| // 64-bit shifts need to be legalized on x86_32.
| let x86_isa = isa |
| .as_any() |
| .downcast_ref::<isa::x86::Isa>() |
| .expect("the target ISA must be x86 at this point"); |
| if x86_isa.isa_flags.has_sse41() { |
| // If we have pinsrq/pextrq (SSE 4.1), legalize to that.
| let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type); |
| let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type); |
| let shifted = pos.ins().x86_psll(value, amount); |
| contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type); |
| } else { |
| // Otherwise, legalize to a libcall.
| expand_as_libcall(inst, func, isa); |
| } |
| } else { |
| // Everything else should be already legal. |
| unreachable!() |
| } |
| } |
| } |
| |
| /// Convert an `imul.i64x2` to a valid instruction sequence on x86, using AVX512 instructions
| /// when available and falling back to an SSE2-compatible sequence otherwise.
| fn convert_i64x2_imul( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| pos.use_srcloc(inst); |
| |
| if let ir::InstructionData::Binary { |
| opcode: ir::Opcode::Imul, |
| args: [arg0, arg1], |
| } = pos.func.dfg[inst] |
| { |
| let ty = pos.func.dfg.ctrl_typevar(inst); |
| if ty == I64X2 { |
| let x86_isa = isa |
| .as_any() |
| .downcast_ref::<isa::x86::Isa>() |
| .expect("the target ISA must be x86 at this point"); |
| if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() { |
| // If we have certain AVX512 features, we can lower this instruction simply. |
| pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1); |
| } else { |
| // Otherwise, we default to a rather lengthy SSE2-compatible sequence. It splits each
| // 64-bit lane into 32-bit high and low halves using shifts, then computes per lane,
| // with arg0 = concat(high0, low0) and arg1 = concat(high1, low1):
| // low0 * low1 + ((high0 * low1 + high1 * low0) << 32).
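| // Per lane, with a = 2^32 * high0 + low0 and b = 2^32 * high1 + low1:
| //   (a * b) mod 2^64 = (2^32 * (high0 * low1 + high1 * low0) + low0 * low1) mod 2^64
| // since the high0 * high1 term is shifted out entirely; that sum is exactly what the
| // instructions below form.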
| let high0 = pos.ins().ushr_imm(arg0, 32); |
| let mul0 = pos.ins().x86_pmuludq(high0, arg1); |
| let high1 = pos.ins().ushr_imm(arg1, 32); |
| let mul1 = pos.ins().x86_pmuludq(high1, arg0); |
| let addhigh = pos.ins().iadd(mul0, mul1); |
| let high = pos.ins().ishl_imm(addhigh, 32); |
| let low = pos.ins().x86_pmuludq(arg0, arg1); |
| pos.func.dfg.replace(inst).iadd(low, high); |
| } |
| } else { |
| unreachable!( |
| "{} should be encodable; it cannot be legalized by convert_i64x2_imul", |
| pos.func.dfg.display_inst(inst, None) |
| ); |
| } |
| } |
| } |
| |
| fn expand_tls_value( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| isa: &dyn TargetIsa, |
| ) { |
| use crate::settings::TlsModel; |
| |
| assert!( |
| isa.triple().architecture == target_lexicon::Architecture::X86_64, |
| "Not yet implemented for {:?}", |
| isa.triple(), |
| ); |
| |
| if let ir::InstructionData::UnaryGlobalValue { |
| opcode: ir::Opcode::TlsValue, |
| global_value, |
| } = func.dfg[inst] |
| { |
| let ctrl_typevar = func.dfg.ctrl_typevar(inst); |
| assert_eq!(ctrl_typevar, ir::types::I64); |
| |
| match isa.flags().tls_model() { |
| TlsModel::None => panic!("tls_model flag is not set."), |
| TlsModel::ElfGd => { |
| func.dfg.replace(inst).x86_elf_tls_get_addr(global_value); |
| } |
| TlsModel::Macho => { |
| func.dfg.replace(inst).x86_macho_tls_get_addr(global_value); |
| } |
| model => unimplemented!("tls_value for tls model {:?}", model), |
| } |
| } else { |
| unreachable!(); |
| } |
| } |
| |
| fn expand_load_splat( |
| inst: ir::Inst, |
| func: &mut ir::Function, |
| _cfg: &mut ControlFlowGraph, |
| _isa: &dyn TargetIsa, |
| ) { |
| let mut pos = FuncCursor::new(func).at_inst(inst); |
| |
| pos.use_srcloc(inst); |
| |
| let (ptr, offset, flags) = match pos.func.dfg[inst] { |
| ir::InstructionData::Load { |
| opcode: ir::Opcode::LoadSplat, |
| arg, |
| offset, |
| flags, |
| } => (arg, offset, flags), |
| _ => panic!( |
| "Expected load_splat: {}", |
| pos.func.dfg.display_inst(inst, None) |
| ), |
| }; |
| let ty = pos.func.dfg.ctrl_typevar(inst); |
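| // Load a single lane's worth of memory and broadcast it to every lane; e.g. a
| // `load_splat.i32x4` becomes `load.i32` followed by `splat.i32x4`.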
| let load = pos.ins().load(ty.lane_type(), flags, ptr, offset); |
| |
| pos.func.dfg.replace(inst).splat(ty, load); |
| } |