//! Lowering rules for X64.
use crate::data_value::DataValue;
use crate::ir::{
condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::{Flags, TlsModel};
use alloc::boxed::Box;
use alloc::vec::Vec;
use cranelift_codegen_shared::condcodes::CondCode;
use log::trace;
use regalloc::{Reg, RegClass, Writable};
use smallvec::{smallvec, SmallVec};
use std::convert::TryFrom;
use target_lexicon::Triple;
//=============================================================================
// Helpers for instruction lowering.
fn is_int_or_ref_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
types::R32 => panic!("shouldn't have 32-bit refs on x64"),
_ => false,
}
}
fn is_bool_ty(ty: Type) -> bool {
match ty {
types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
types::R32 => panic!("shouldn't have 32-bit refs on x64"),
_ => false,
}
}
/// Returns whether the type is valid for an atomic transaction. This is
/// target-word-size dependent and excludes booleans and reftypes.
fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 => true,
_ => false,
}
}
/// Returns the instruction that produces the given `input`, if that
/// instruction has Opcode `op`; otherwise returns `None`.
// TODO investigate failures with checking against the result index.
fn matches_input<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
op: Opcode,
) -> Option<IRInst> {
let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
inputs.inst.and_then(|(src_inst, _)| {
let data = ctx.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
None
})
}
/// Returns the instruction that produces the given `input`, if that
/// instruction's opcode is any of those specified in `ops`; otherwise `None`.
fn matches_input_any<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
ops: &[Opcode],
) -> Option<IRInst> {
let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
inputs.inst.and_then(|(src_inst, _)| {
let data = ctx.data(src_inst);
for &op in ops {
if data.opcode() == op {
return Some(src_inst);
}
}
None
})
}
/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
/// temporary register, returning that register.
fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
let from_bits = ty_bits(ty);
let masked = if from_bits < 64 {
c & ((1u64 << from_bits) - 1)
} else {
c
};
let cst_copy = ctx.alloc_tmp(ty);
for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
})
.into_iter()
{
ctx.emit(inst);
}
non_writable_value_regs(cst_copy)
}
/// Put the given input into possibly multiple registers, and mark it as used (side-effect).
fn put_input_in_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> ValueRegs<Reg> {
let ty = ctx.input_ty(spec.insn, spec.input);
let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
if let Some(c) = input.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
generate_constant(ctx, ty, c)
} else {
ctx.put_input_in_regs(spec.insn, spec.input)
}
}
/// Put the given input into a register, and mark it as used (side-effect).
fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
put_input_in_regs(ctx, spec)
.only_reg()
.expect("Multi-register value not expected")
}
/// Determines whether a load operation (indicated by `src_insn`) can be merged
/// into the current lowering point. If so, returns the address-base source (as
/// an `InsnInput`) and an offset from that address from which to perform the
/// load.
fn is_mergeable_load<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src_insn: IRInst,
) -> Option<(InsnInput, i32)> {
let insn_data = ctx.data(src_insn);
let inputs = ctx.num_inputs(src_insn);
if inputs != 1 {
return None;
}
let load_ty = ctx.output_ty(src_insn, 0);
if ty_bits(load_ty) < 32 {
// Narrower values are handled by ALU insts that are at least 32 bits
// wide, which is normally OK as we ignore upper bits; but if we
// generate, e.g., a direct-from-memory 32-bit add for a byte value and
// the byte is the last byte in a page, the extra bytes that we load are
// incorrectly accessed. So we only allow loads to merge for
// 32-bit-and-above widths.
return None;
}
// SIMD instructions can only be load-coalesced when the loaded value comes
// from an aligned address.
if load_ty.is_vector() && !insn_data.memflags().map_or(false, |f| f.aligned()) {
return None;
}
// Just testing the opcode is enough, because the width will always match if
// the type does (and the type should match if the CLIF is properly
// constructed).
if insn_data.opcode() == Opcode::Load {
let offset = insn_data
.load_store_offset()
.expect("load should have offset");
Some((
InsnInput {
insn: src_insn,
input: 0,
},
offset,
))
} else {
None
}
}
/// Put the given input into a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
if let Some(c) = inputs.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
let ty = ctx.input_ty(spec.insn, spec.input);
return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
ctx.sink_inst(src_insn);
let amode = lower_to_amode(ctx, addr_input, offset);
return RegMem::mem(amode);
}
}
RegMem::reg(
ctx.put_input_in_regs(spec.insn, spec.input)
.only_reg()
.unwrap(),
)
}
/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
ZeroExtendTo32,
ZeroExtendTo64,
SignExtendTo32,
#[allow(dead_code)] // not used just yet but may be used in the future!
SignExtendTo64,
}
/// Put the given input into a register, marking it as used, and perform a zero- or
/// sign-extension if required. (This obviously causes side effects.)
fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
ctx: &mut C,
spec: InsnInput,
ext_spec: ExtSpec,
) -> Reg {
let requested_size = match ext_spec {
ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
};
let input_size = ctx.input_ty(spec.insn, spec.input).bits();
let requested_ty = if requested_size == 32 {
types::I32
} else {
types::I64
};
let ext_mode = match (input_size, requested_size) {
(a, b) if a == b => return put_input_in_reg(ctx, spec),
(1, 8) => return put_input_in_reg(ctx, spec),
(a, b) => ExtMode::new(a, b)
.unwrap_or_else(|| panic!("invalid extension: {} -> {}", a, b)),
};
let src = input_to_reg_mem(ctx, spec);
let dst = ctx.alloc_tmp(requested_ty).only_reg().unwrap();
match ext_spec {
ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst))
}
ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst))
}
}
dst.to_reg()
}
/// Returns the given input as a 32-bit immediate, if it is a constant that the instruction
/// encoding can properly sign-extend; this has no side effects.
fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
input.constant.and_then(|x| {
// For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
// to 64 bits. For other sizes, it doesn't matter and we can just use the plain
// constant.
if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
Some(x as u32)
} else {
None
}
})
}
fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
ctx.get_input_as_source_or_const(spec.insn, spec.input)
.constant
}
/// Put the given input into an immediate, a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
let input_ty = ctx.input_ty(spec.insn, spec.input);
match non_reg_input_to_sext_imm(input, input_ty) {
Some(x) => RegMemImm::imm(x),
None => match input_to_reg_mem(ctx, spec) {
RegMem::Reg { reg } => RegMemImm::reg(reg),
RegMem::Mem { addr } => RegMemImm::mem(addr),
},
}
}
/// Emit an instruction to insert a value `src` into a lane of `dst`.
fn emit_insert_lane<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src: RegMem,
dst: Writable<Reg>,
lane: u8,
ty: Type,
) {
if !ty.is_float() {
let (sse_op, size) = match ty.lane_bits() {
8 => (SseOpcode::Pinsrb, OperandSize::Size32),
16 => (SseOpcode::Pinsrw, OperandSize::Size32),
32 => (SseOpcode::Pinsrd, OperandSize::Size32),
64 => (SseOpcode::Pinsrd, OperandSize::Size64),
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
};
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
} else if ty == types::F32 {
let sse_op = SseOpcode::Insertps;
// INSERTPS's immediate selects the source lane in bits 7:6 (here lane 0 of
// the replacement value) and the destination lane in bits 5:4 (shifted in
// below); bits 3:0 are a zero-mask, unused here.
let lane = 0b00_00_00_00 | lane << 4;
ctx.emit(Inst::xmm_rm_r_imm(
sse_op,
src,
dst,
lane,
OperandSize::Size32,
));
} else if ty == types::F64 {
let sse_op = match lane {
// Move the lowest quadword in replacement to vector without changing
// the upper bits.
0 => SseOpcode::Movsd,
// Move the low 64 bits of replacement vector to the high 64 bits of the
// vector.
1 => SseOpcode::Movlhps,
_ => unreachable!(),
};
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
} else {
panic!("unable to emit insertlane for type: {}", ty)
}
}
/// Emit an instruction to extract a lane of `src` into `dst`.
fn emit_extract_lane<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src: Reg,
dst: Writable<Reg>,
lane: u8,
ty: Type,
) {
if !ty.is_float() {
let (sse_op, size) = match ty.lane_bits() {
8 => (SseOpcode::Pextrb, OperandSize::Size32),
16 => (SseOpcode::Pextrw, OperandSize::Size32),
32 => (SseOpcode::Pextrd, OperandSize::Size32),
64 => (SseOpcode::Pextrd, OperandSize::Size64),
_ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
} else if ty == types::F32 || ty == types::F64 {
if lane == 0 {
// Remove the extractlane instruction, leaving the float where it is. The upper
// bits will remain unchanged; for correctness, this relies on Cranelift type
// checking to avoid using those bits.
ctx.emit(Inst::gen_move(dst, src, ty));
} else {
// Otherwise, shuffle the bits in `lane` to the lowest lane.
let sse_op = SseOpcode::Pshufd;
let mask = match ty {
// Move the value at `lane` to lane 0, copying existing value at lane 0 to
// other lanes. Again, this relies on Cranelift type checking to avoid
// using those bits.
types::F32 => {
assert!(lane > 0 && lane < 4);
0b00_00_00_00 | lane
}
// Move the value at lane 1 (the only remaining possibility, given the
// `lane == 0` case above) to lane 0 and leave lane 1 unchanged. The
// Cranelift type-checking assumption also applies here.
types::F64 => {
assert!(lane == 1);
0b11_10_11_10
}
_ => unreachable!(),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(
sse_op,
src,
dst,
mask,
OperandSize::Size32,
));
}
} else {
panic!("unable to emit extractlane for type: {}", ty)
}
}
/// Emits an int comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
///
/// Takes the condition code that will be tested, and returns
/// the condition code that should be used. This allows us to
/// synthesize comparisons out of multiple instructions for
/// special cases (e.g., 128-bit integers).
fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC {
let ty = ctx.input_ty(insn, 0);
let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
if ty == types::I128 {
// We need to compare both halves and combine the results appropriately.
let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let lhs = put_input_in_regs(ctx, inputs[0]);
let lhs_lo = lhs.regs()[0];
let lhs_hi = lhs.regs()[1];
let rhs = put_input_in_regs(ctx, inputs[1]);
let rhs_lo = RegMemImm::reg(rhs.regs()[0]);
let rhs_hi = RegMemImm::reg(rhs.regs()[1]);
match cc {
IntCC::Equal => {
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
ctx.emit(Inst::setcc(CC::Z, cmp1));
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
ctx.emit(Inst::setcc(CC::Z, cmp2));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cmp1.to_reg()),
cmp2,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
cmp2,
));
IntCC::NotEqual
}
IntCC::NotEqual => {
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
ctx.emit(Inst::setcc(CC::NZ, cmp1));
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
ctx.emit(Inst::setcc(CC::NZ, cmp2));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(cmp1.to_reg()),
cmp2,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
cmp2,
));
IntCC::NotEqual
}
IntCC::SignedLessThan
| IntCC::SignedLessThanOrEqual
| IntCC::SignedGreaterThan
| IntCC::SignedGreaterThanOrEqual
| IntCC::UnsignedLessThan
| IntCC::UnsignedLessThanOrEqual
| IntCC::UnsignedGreaterThan
| IntCC::UnsignedGreaterThanOrEqual => {
// Result = (lhs_hi CC rhs_hi) ||
//          (lhs_hi == rhs_hi && lhs_lo CC_unsigned rhs_lo)
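//
// As a scalar model (illustration only), e.g. a signed-less-than
// decomposes as:
//
//     fn slt128(lhs_hi: i64, lhs_lo: u64, rhs_hi: i64, rhs_lo: u64) -> bool {
//         lhs_hi < rhs_hi || (lhs_hi == rhs_hi && lhs_lo < rhs_lo)
//     }
//
// Note the low-half comparison is unsigned (`u64`) even for signed
// conditions, which is why `cc.unsigned()` is used below.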
let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_hi, lhs_hi));
ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1));
ctx.emit(Inst::setcc(CC::Z, cmp2));
ctx.emit(Inst::cmp_rmi_r(OperandSize::Size64, rhs_lo, lhs_lo));
ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cmp2.to_reg()),
cmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(cmp1.to_reg()),
cmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
cmp3,
));
IntCC::NotEqual
}
_ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc),
}
} else {
// TODO Try to commute the operands (and invert the condition) if one is an immediate.
let lhs = put_input_in_reg(ctx, inputs[0]);
// We force the RHS into a register, and disallow load-op fusion, because we
// do not have a transitive guarantee that this cmp-site will be the sole
// user of the value. Consider: the icmp might be the only user of a load,
// but there may be multiple users of the icmp (e.g. select or bint
// instructions) that each invoke `emit_cmp()`. If we were to allow a load
// to sink to the *latest* one, but other sites did not permit sinking, then
// we would be missing the load for other cmp-sites.
let rhs = put_input_in_reg(ctx, inputs[1]);
// Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
// us dst - src at the machine instruction level, so invert operands.
ctx.emit(Inst::cmp_rmi_r(
OperandSize::from_ty(ty),
RegMemImm::reg(rhs),
lhs,
));
cc
}
}
/// A specification for a fcmp emission.
enum FcmpSpec {
/// Normal flow.
Normal,
/// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
/// happens with `InvertedEqualOrConditions`.
///
/// This is useful in contexts where it is hard or inefficient to produce a single instruction
/// (or sequence of instructions) that checks for an "AND" combination of condition codes; see
/// for instance the lowering of Select.
InvertEqual,
}
/// This explains how to interpret the results of an fcmp instruction.
enum FcmpCondResult {
/// The given condition code must be set.
Condition(CC),
/// Both condition codes must be set.
AndConditions(CC, CC),
/// Either of the condition codes must be set.
OrConditions(CC, CC),
/// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
/// of the condition codes must be set, and the user must invert the meaning of the
/// condition-code results. When the spec is set to `FcmpSpec::Normal`, this case can't be
/// reached.
InvertedEqualOrConditions(CC, CC),
}
/// Emits a float comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
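///
/// For reference (Intel SDM, `ucomiss`/`ucomisd`): the flags (ZF, PF, CF) are set to
/// (0, 0, 0) for greater-than, (0, 0, 1) for less-than, (1, 0, 0) for equal, and (1, 1, 1)
/// for unordered. Hence, for example:
///
/// ```text
/// equal      <=>  ZF == 1 && PF == 0    // FcmpCondResult::AndConditions(NP, Z)
/// not-equal  <=>  ZF == 0 || PF == 1    // FcmpCondResult::OrConditions(P, NZ)
/// ```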
fn emit_fcmp<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
mut cond_code: FloatCC,
spec: FcmpSpec,
) -> FcmpCondResult {
let (flip_operands, inverted_equal) = match cond_code {
FloatCC::LessThan
| FloatCC::LessThanOrEqual
| FloatCC::UnorderedOrGreaterThan
| FloatCC::UnorderedOrGreaterThanOrEqual => {
cond_code = cond_code.reverse();
(true, false)
}
FloatCC::Equal => {
let inverted_equal = match spec {
FcmpSpec::Normal => false,
FcmpSpec::InvertEqual => {
cond_code = FloatCC::NotEqual; // same as .inverse()
true
}
};
(false, inverted_equal)
}
_ => (false, false),
};
// Any CC constructed with `from_floatcc` below is only valid on flags set by a direct
// float comparison; emit that comparison here.
let op = match ctx.input_ty(insn, 0) {
types::F32 => SseOpcode::Ucomiss,
types::F64 => SseOpcode::Ucomisd,
_ => panic!("Bad input type to Fcmp"),
};
let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
let (lhs_input, rhs_input) = if flip_operands {
(inputs[1], inputs[0])
} else {
(inputs[0], inputs[1])
};
let lhs = put_input_in_reg(ctx, lhs_input);
// See above in `emit_cmp()`. We must only use the reg/reg form of the
// comparison in order to avoid issues with merged loads.
let rhs = put_input_in_reg(ctx, rhs_input);
ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));
match cond_code {
FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
FloatCC::NotEqual if inverted_equal => {
FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
}
FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
_ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
}
}
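/// Emits a bit-reversal sequence using the classic mask-and-shift swaps: swap adjacent 1-bit
/// units, then 2-bit units, and so on up to the operand width. A minimal scalar sketch of the
/// same algorithm for a byte (illustration only):
///
/// ```text
/// fn bitrev8(x: u8) -> u8 {
///     let x = ((x >> 1) & 0x55) | ((x & 0x55) << 1); // swap 1-bit units
///     let x = ((x >> 2) & 0x33) | ((x & 0x33) << 2); // swap 2-bit units
///     ((x >> 4) & 0x0f) | ((x & 0x0f) << 4)          // swap 4-bit units
/// }
/// ```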
fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>, ty: Type) {
let bits = ty.bits();
let const_mask = if bits == 64 {
0xffff_ffff_ffff_ffff
} else {
(1u64 << bits) - 1
};
let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, src, types::I64));
// Swap 1-bit units.
// tmp1 = src
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
// tmp2 = 0b0101..
ctx.emit(Inst::imm(
OperandSize::Size64,
0x5555_5555_5555_5555 & const_mask,
tmp2,
));
// tmp1 = src >> 1
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// tmp1 = (src >> 1) & 0b0101..
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
// tmp2 = src & 0b0101..
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
// tmp2 = (src & 0b0101..) << 1
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(1),
tmp2,
));
// tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
// Swap 2-bit units.
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
ctx.emit(Inst::imm(
OperandSize::Size64,
0x3333_3333_3333_3333 & const_mask,
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(2),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(2),
tmp2,
));
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
// Swap 4-bit units.
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
ctx.emit(Inst::imm(
OperandSize::Size64,
0x0f0f_0f0f_0f0f_0f0f & const_mask,
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(4),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(4),
tmp2,
));
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
if bits > 8 {
// Swap 8-bit units.
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
ctx.emit(Inst::imm(
OperandSize::Size64,
0x00ff_00ff_00ff_00ff & const_mask,
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(8),
tmp2,
));
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
}
if bits > 16 {
// Swap 16-bit units.
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
ctx.emit(Inst::imm(
OperandSize::Size64,
0x0000_ffff_0000_ffff & const_mask,
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(16),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(16),
tmp2,
));
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
}
if bits > 32 {
// Swap 32-bit units.
ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64));
ctx.emit(Inst::imm(
OperandSize::Size64,
0x0000_0000_ffff_ffff & const_mask,
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(32),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(tmp0.to_reg()),
tmp2,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(32),
tmp2,
));
ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp1.to_reg()),
tmp0,
));
}
ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
}
fn emit_shl_i128<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src: ValueRegs<Reg>,
dst: ValueRegs<Writable<Reg>>,
amt_src: Reg,
) {
let src_lo = src.regs()[0];
let src_hi = src.regs()[1];
let dst_lo = dst.regs()[0];
let dst_hi = dst.regs()[1];
// mov tmp1, src_lo
// shl tmp1, amt_src
// mov tmp2, src_hi
// shl tmp2, amt_src
// mov amt, 64
// sub amt, amt_src
// mov tmp3, src_lo
// shr tmp3, amt
// xor dst_lo, dst_lo
// test amt_src, 127
// cmovz tmp3, dst_lo
// or tmp3, tmp2
// mov amt, amt_src
// and amt, 64
// cmovz dst_hi, tmp3
// cmovz dst_lo, tmp1
// cmovnz dst_hi, tmp1
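//
// A scalar model of the same selection logic (illustration only, not the
// emitted code). x86 masks 64-bit shift counts to 6 bits, and the
// `test amt_src, 127` + cmovz pair zeroes the bogus `lo >> 64` carry term
// when the masked amount is zero:
//
//     fn shl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
//         let amt = amt & 127;
//         if amt & 64 == 0 {
//             let carry = lo.checked_shr(64 - amt).unwrap_or(0);
//             (lo << amt, (hi << amt) | carry)
//         } else {
//             (0, lo << (amt - 64))
//         }
//     }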
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt_src,
types::I64,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
None,
tmp1,
));
ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt_src,
types::I64,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
None,
tmp2,
));
ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
amt,
));
ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt.to_reg(),
types::I64,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
None,
tmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst_lo.to_reg()),
dst_lo,
));
ctx.emit(Inst::test_rmi_r(
OperandSize::Size64,
RegMemImm::imm(127),
amt_src,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(dst_lo.to_reg()),
tmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp2.to_reg()),
tmp3,
));
// This isn't semantically necessary, but it keeps the
// register allocator happy, because it cannot otherwise
// infer that cmovz + cmovnz always defines dst_hi.
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst_hi.to_reg()),
dst_hi,
));
ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(64),
amt,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(tmp3.to_reg()),
dst_hi,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(tmp1.to_reg()),
dst_lo,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::NZ,
RegMem::reg(tmp1.to_reg()),
dst_hi,
));
}
fn emit_shr_i128<C: LowerCtx<I = Inst>>(
ctx: &mut C,
src: ValueRegs<Reg>,
dst: ValueRegs<Writable<Reg>>,
amt_src: Reg,
is_signed: bool,
) {
let src_lo = src.regs()[0];
let src_hi = src.regs()[1];
let dst_lo = dst.regs()[0];
let dst_hi = dst.regs()[1];
// mov tmp1, src_hi
// {u,s}shr tmp1, amt_src
// mov tmp2, src_lo
// ushr tmp2, amt_src
// mov amt, 64
// sub amt, amt_src
// mov tmp3, src_hi
// shl tmp3, amt
// xor dst_lo, dst_lo
// test amt_src, 127
// cmovz tmp3, dst_lo
// or tmp3, tmp2
// if is_signed:
// mov dst_hi, src_hi
// sshr dst_hi, 63 // get the sign bit
// else:
// xor dst_hi, dst_hi
// mov amt, amt_src
// and amt, 64
// cmovz dst_hi, tmp1
// cmovz dst_lo, tmp3
// cmovnz dst_lo, tmp1
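//
// A scalar model of the unsigned case (illustration only); the signed case
// differs only in using arithmetic right shifts of `hi` and a sign fill
// instead of zero for the >= 64 case:
//
//     fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
//         let amt = amt & 127;
//         if amt & 64 == 0 {
//             let carry = hi.checked_shl(64 - amt).unwrap_or(0);
//             ((lo >> amt) | carry, hi >> amt)
//         } else {
//             (hi >> (amt - 64), 0)
//         }
//     }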
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let shift_kind = if is_signed {
ShiftKind::ShiftRightArithmetic
} else {
ShiftKind::ShiftRightLogical
};
ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt_src,
types::I64,
));
ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));
ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt_src,
types::I64,
));
// N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
None,
tmp2,
));
ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
amt,
));
ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rcx()),
amt.to_reg(),
types::I64,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
None,
tmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst_lo.to_reg()),
dst_lo,
));
ctx.emit(Inst::test_rmi_r(
OperandSize::Size64,
RegMemImm::imm(127),
amt_src,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(dst_lo.to_reg()),
tmp3,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp2.to_reg()),
tmp3,
));
if is_signed {
ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightArithmetic,
Some(63),
dst_hi,
));
} else {
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst_hi.to_reg()),
dst_hi,
));
}
// This isn't semantically necessary, but it keeps the
// register allocator happy, because it cannot otherwise
// infer that cmovz + cmovnz always defines dst_lo.
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst_lo.to_reg()),
dst_lo,
));
ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(64),
amt,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(tmp1.to_reg()),
dst_hi,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(tmp3.to_reg()),
dst_lo,
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::NZ,
RegMem::reg(tmp1.to_reg()),
dst_lo,
));
}
fn make_libcall_sig<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
call_conv: CallConv,
ptr_ty: Type,
) -> Signature {
let mut sig = Signature::new(call_conv);
for i in 0..ctx.num_inputs(insn) {
sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
}
for i in 0..ctx.num_outputs(insn) {
sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
}
if call_conv.extends_baldrdash() {
// Adds the special VMContext parameter to the signature.
sig.params
.push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
}
sig
}
fn emit_vm_call<C: LowerCtx<I = Inst>>(
ctx: &mut C,
flags: &Flags,
triple: &Triple,
libcall: LibCall,
insn: IRInst,
inputs: SmallVec<[InsnInput; 4]>,
outputs: SmallVec<[InsnOutput; 2]>,
) -> CodegenResult<()> {
let extname = ExternalName::LibCall(libcall);
let dist = if flags.use_colocated_libcalls() {
RelocDistance::Near
} else {
RelocDistance::Far
};
// TODO avoid recreating signatures for every single Libcall function.
let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);
let caller_conv = ctx.abi().call_conv();
let mut abi = X64ABICaller::from_func(&sig, &extname, dist, caller_conv, flags)?;
abi.emit_stack_pre_adjust(ctx);
let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
assert_eq!(inputs.len() + vm_context, abi.num_args());
for (i, input) in inputs.iter().enumerate() {
let arg_reg = put_input_in_reg(ctx, *input);
abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
}
if call_conv.extends_baldrdash() {
let vm_context_vreg = ctx
.get_vm_context()
.expect("should have a VMContext to pass to libcall funcs");
abi.emit_copy_regs_to_arg(ctx, inputs.len(), ValueRegs::one(vm_context_vreg));
}
abi.emit_call(ctx);
for (i, output) in outputs.iter().enumerate() {
let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
}
abi.emit_stack_post_adjust(ctx);
Ok(())
}
/// Matches a shift by a constant amount less than or equal to 3, returning the shifted input
/// and that amount. The goal is to embed the shift within an address mode.
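///
/// E.g., `iadd base, (ishl index, 2)` can lower to the x86 address mode `base + index * 4`:
/// the scale is `1 << shift_amt`, and x86 scales are at most 8, hence the limit of 3.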
fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
ctx: &mut C,
spec: InsnInput,
) -> Option<(InsnInput, u8)> {
matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
match input_to_imm(
ctx,
InsnInput {
insn: shift,
input: 1,
},
) {
Some(shift_amt) if shift_amt <= 3 => Some((
InsnInput {
insn: shift,
input: 0,
},
shift_amt as u8,
)),
_ => None,
}
})
}
/// Lowers an instruction to one of the x86 addressing modes.
///
/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: i32) -> Amode {
let flags = ctx
.memflags(spec.insn)
.expect("Instruction with amode should have memflags");
// We now either have an add that we can fold into an address mode, or some other input,
// plus the final offset.
if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
let add_inputs = &[
InsnInput {
insn: add,
input: 0,
},
InsnInput {
insn: add,
input: 1,
},
];
// TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
// aren't happening in the wasm case. We could do better, given some range analysis.
let (base, index, shift) = if let Some((shift_input, shift_amt)) =
matches_small_constant_shift(ctx, add_inputs[0])
{
(
put_input_in_reg(ctx, add_inputs[1]),
put_input_in_reg(ctx, shift_input),
shift_amt,
)
} else if let Some((shift_input, shift_amt)) =
matches_small_constant_shift(ctx, add_inputs[1])
{
(
put_input_in_reg(ctx, add_inputs[0]),
put_input_in_reg(ctx, shift_input),
shift_amt,
)
} else {
for i in 0..=1 {
// Try to pierce through uextend.
if let Some(uextend) = matches_input(
ctx,
InsnInput {
insn: add,
input: i,
},
Opcode::Uextend,
) {
if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
// Zero the upper bits.
let input_size = ctx.input_ty(uextend, 0).bits() as u64;
let shift: u64 = 64 - input_size;
let uext_cst: u64 = (cst << shift) >> shift;
let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
if low32_will_sign_extend_to_64(final_offset as u64) {
let base = put_input_in_reg(ctx, add_inputs[1 - i]);
return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
}
}
}
// If it's a constant, add it directly!
if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
let final_offset = (offset as i64).wrapping_add(cst as i64);
if low32_will_sign_extend_to_64(final_offset as u64) {
let base = put_input_in_reg(ctx, add_inputs[1 - i]);
return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
}
}
}
(
put_input_in_reg(ctx, add_inputs[0]),
put_input_in_reg(ctx, add_inputs[1]),
0,
)
};
return Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags);
}
let input = put_input_in_reg(ctx, spec);
Amode::imm_reg(offset as u32, input).with_flags(flags)
}
fn emit_moves<C: LowerCtx<I = Inst>>(
ctx: &mut C,
dst: ValueRegs<Writable<Reg>>,
src: ValueRegs<Reg>,
ty: Type,
) {
let (_, tys) = Inst::rc_for_type(ty).unwrap();
for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) {
ctx.emit(Inst::gen_move(*dst, *src, *ty));
}
}
fn emit_cmoves<C: LowerCtx<I = Inst>>(
ctx: &mut C,
size: u8,
cc: CC,
src: ValueRegs<Reg>,
dst: ValueRegs<Writable<Reg>>,
) {
let size = size / src.len() as u8;
let size = u8::max(size, 4); // at least 32 bits
for (dst, src) in dst.regs().iter().zip(src.regs().iter()) {
ctx.emit(Inst::cmove(
OperandSize::from_bytes(size.into()),
cc,
RegMem::reg(*src),
*dst,
));
}
}
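/// Emits a count-leading-zeros sequence based on `bsr`: `clz(x) = (bits - 1) - bsr(x)`, with a
/// `cmov` substituting -1 for the undefined `bsr` result on a zero input so that
/// `clz(0) == bits`.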
fn emit_clz<C: LowerCtx<I = Inst>>(
ctx: &mut C,
orig_ty: Type,
ty: Type,
src: Reg,
dst: Writable<Reg>,
) {
let src = RegMem::reg(src);
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::from_ty(ty), u64::MAX, dst));
ctx.emit(Inst::unary_rm_r(
OperandSize::from_ty(ty),
UnaryRmROpcode::Bsr,
src,
tmp,
));
ctx.emit(Inst::cmove(
OperandSize::from_ty(ty),
CC::Z,
RegMem::reg(dst.to_reg()),
tmp,
));
ctx.emit(Inst::imm(
OperandSize::from_ty(ty),
orig_ty.bits() as u64 - 1,
dst,
));
ctx.emit(Inst::alu_rmi_r(
if ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
},
AluRmiROpcode::Sub,
RegMemImm::reg(tmp.to_reg()),
dst,
));
}
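/// Emits a count-trailing-zeros sequence based on `bsf`: `ctz(x) = bsf(x)`, with a `cmov`
/// substituting `bits` for the undefined `bsf` result on a zero input so that
/// `ctz(0) == bits`.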
fn emit_ctz<C: LowerCtx<I = Inst>>(
ctx: &mut C,
orig_ty: Type,
ty: Type,
src: Reg,
dst: Writable<Reg>,
) {
let src = RegMem::reg(src);
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp));
ctx.emit(Inst::unary_rm_r(
OperandSize::from_ty(ty),
UnaryRmROpcode::Bsf,
src,
dst,
));
ctx.emit(Inst::cmove(
OperandSize::from_ty(ty),
CC::Z,
RegMem::reg(tmp.to_reg()),
dst,
));
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.
/// Actually codegen an instruction's results into registers.
fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
flags: &Flags,
isa_flags: &x64_settings::Flags,
triple: &Triple,
) -> CodegenResult<()> {
let op = ctx.data(insn).opcode();
let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
.map(|i| InsnInput { insn, input: i })
.collect();
let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
.map(|i| InsnOutput { insn, output: i })
.collect();
let ty = if !outputs.is_empty() {
Some(ctx.output_ty(insn, 0))
} else {
None
};
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
let value = ctx
.get_constant(insn)
.expect("constant value for iconst et al");
let dst = get_output_reg(ctx, outputs[0]);
for inst in Inst::gen_constant(dst, value as u128, ty.unwrap(), |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
}) {
ctx.emit(inst);
}
}
Opcode::Iadd
| Opcode::IaddIfcout
| Opcode::SaddSat
| Opcode::UaddSat
| Opcode::Isub
| Opcode::SsubSat
| Opcode::UsubSat
| Opcode::AvgRound
| Opcode::Band
| Opcode::Bor
| Opcode::Bxor => {
let ty = ty.unwrap();
if ty.lane_count() > 1 {
let sse_op = match op {
Opcode::Iadd => match ty {
types::I8X16 => SseOpcode::Paddb,
types::I16X8 => SseOpcode::Paddw,
types::I32X4 => SseOpcode::Paddd,
types::I64X2 => SseOpcode::Paddq,
_ => panic!("Unsupported type for packed iadd instruction: {}", ty),
},
Opcode::SaddSat => match ty {
types::I8X16 => SseOpcode::Paddsb,
types::I16X8 => SseOpcode::Paddsw,
_ => panic!("Unsupported type for packed sadd_sat instruction: {}", ty),
},
Opcode::UaddSat => match ty {
types::I8X16 => SseOpcode::Paddusb,
types::I16X8 => SseOpcode::Paddusw,
_ => panic!("Unsupported type for packed uadd_sat instruction: {}", ty),
},
Opcode::Isub => match ty {
types::I8X16 => SseOpcode::Psubb,
types::I16X8 => SseOpcode::Psubw,
types::I32X4 => SseOpcode::Psubd,
types::I64X2 => SseOpcode::Psubq,
_ => panic!("Unsupported type for packed isub instruction: {}", ty),
},
Opcode::SsubSat => match ty {
types::I8X16 => SseOpcode::Psubsb,
types::I16X8 => SseOpcode::Psubsw,
_ => panic!("Unsupported type for packed ssub_sat instruction: {}", ty),
},
Opcode::UsubSat => match ty {
types::I8X16 => SseOpcode::Psubusb,
types::I16X8 => SseOpcode::Psubusw,
_ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
},
Opcode::AvgRound => match ty {
types::I8X16 => SseOpcode::Pavgb,
types::I16X8 => SseOpcode::Pavgw,
_ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
},
Opcode::Band => match ty {
types::F32X4 => SseOpcode::Andps,
types::F64X2 => SseOpcode::Andpd,
_ => SseOpcode::Pand,
},
Opcode::Bor => match ty {
types::F32X4 => SseOpcode::Orps,
types::F64X2 => SseOpcode::Orpd,
_ => SseOpcode::Por,
},
Opcode::Bxor => match ty {
types::F32X4 => SseOpcode::Xorps,
types::F64X2 => SseOpcode::Xorpd,
_ => SseOpcode::Pxor,
},
_ => panic!("Unsupported packed instruction: {}", op),
};
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// Move the `lhs` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, lhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
} else if ty == types::I128 || ty == types::B128 {
let alu_ops = match op {
Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And),
Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or),
Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
_ => panic!("Unsupported opcode with 128-bit integers: {:?}", op),
};
let lhs = put_input_in_regs(ctx, inputs[0]);
let rhs = put_input_in_regs(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]);
assert_eq!(lhs.len(), 2);
assert_eq!(rhs.len(), 2);
assert_eq!(dst.len(), 2);
// For add, sub, and, or, xor: just do ops on lower then upper
// half. Carry-flag propagation is implicit (add/adc, sub/sbb).
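//
// A scalar model of the add case (illustration only):
//
//     fn add128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
//         let (lo, carry) = lhs.0.overflowing_add(rhs.0);                // add
//         let hi = lhs.1.wrapping_add(rhs.1).wrapping_add(carry as u64); // adc
//         (lo, hi)
//     }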
ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
alu_ops.0,
RegMemImm::reg(rhs.regs()[0]),
dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
alu_ops.1,
RegMemImm::reg(rhs.regs()[1]),
dst.regs()[1],
));
} else {
let size = if ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
let alu_op = match op {
Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
Opcode::Isub => AluRmiROpcode::Sub,
Opcode::Band => AluRmiROpcode::And,
Opcode::Bor => AluRmiROpcode::Or,
Opcode::Bxor => AluRmiROpcode::Xor,
_ => unreachable!(),
};
let (lhs, rhs) = match op {
Opcode::Iadd
| Opcode::IaddIfcout
| Opcode::Band
| Opcode::Bor
| Opcode::Bxor => {
// For commutative operations, try to commute operands if one is an
// immediate or direct memory reference. Do so by converting LHS to RMI; if
// reg, then always convert RHS to RMI; else, use LHS as RMI and convert
// RHS to reg.
let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
if let RegMemImm::Reg { reg: lhs_reg } = lhs {
let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
(lhs_reg, rhs)
} else {
let rhs_reg = put_input_in_reg(ctx, inputs[1]);
(rhs_reg, lhs)
}
}
Opcode::Isub => (
put_input_in_reg(ctx, inputs[0]),
input_to_reg_mem_imm(ctx, inputs[1]),
),
_ => unreachable!(),
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
}
}
Opcode::Imul => {
let ty = ty.unwrap();
// Check for the ext_mul_* patterns, which are lowered here under imul. We must check
// the operand opcodes first, since checking the types alone is not enough.
if matches_input_any(
ctx,
inputs[0],
&[
Opcode::SwidenHigh,
Opcode::SwidenLow,
Opcode::UwidenHigh,
Opcode::UwidenLow,
],
)
.is_some()
{
// Optimized ext_mul_* lowerings are based on optimized lowerings
// here: https://github.com/WebAssembly/simd/pull/376
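// E.g., in the i16x8.extmul_high_i8x16_s case below: `palignr` with an
// immediate of 8 rotates the high eight byte lanes of each operand into the
// low half, `pmovsxbw` sign-extends those bytes to words, and `pmullw`
// multiplies the widened lanes.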
if let Some(swiden0_high) = matches_input(ctx, inputs[0], Opcode::SwidenHigh) {
if let Some(swiden1_high) = matches_input(ctx, inputs[1], Opcode::SwidenHigh) {
let swiden_input = &[
InsnInput {
insn: swiden0_high,
input: 0,
},
InsnInput {
insn: swiden1_high,
input: 0,
},
];
let input0_ty = ctx.input_ty(swiden0_high, 0);
let input1_ty = ctx.input_ty(swiden1_high, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, swiden_input[0]);
let rhs = put_input_in_reg(ctx, swiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_high_i8x16_s
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(lhs),
Writable::from_reg(lhs),
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovsxbw,
RegMem::reg(lhs),
Writable::from_reg(lhs),
));
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(rhs),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovsxbw,
RegMem::reg(dst.to_reg()),
dst,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(lhs), dst));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_high_i16x8_s
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_high_i32x4_s
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuldq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
// Note swiden_high only allows types: I8X16, I16X8, and I32X4
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if let Some(swiden0_low) = matches_input(ctx, inputs[0], Opcode::SwidenLow) {
if let Some(swiden1_low) = matches_input(ctx, inputs[1], Opcode::SwidenLow) {
let swiden_input = &[
InsnInput {
insn: swiden0_low,
input: 0,
},
InsnInput {
insn: swiden1_low,
input: 0,
},
];
let input0_ty = ctx.input_ty(swiden0_low, 0);
let input1_ty = ctx.input_ty(swiden1_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, swiden_input[0]);
let rhs = put_input_in_reg(ctx, swiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_low_i8x16_s
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovsxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_low_i16x8_s
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpcklwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_low_i32x4_s
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuldq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
// Note swiden_low only allows types: I8X16, I16X8, and I32X4
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if let Some(uwiden0_high) = matches_input(ctx, inputs[0], Opcode::UwidenHigh)
{
if let Some(uwiden1_high) = matches_input(ctx, inputs[1], Opcode::UwidenHigh) {
let uwiden_input = &[
InsnInput {
insn: uwiden0_high,
input: 0,
},
InsnInput {
insn: uwiden1_high,
input: 0,
},
];
let input0_ty = ctx.input_ty(uwiden0_high, 0);
let input1_ty = ctx.input_ty(uwiden1_high, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, uwiden_input[0]);
let rhs = put_input_in_reg(ctx, uwiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_high_i8x16_u
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(lhs),
Writable::from_reg(lhs),
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovzxbw,
RegMem::reg(lhs),
Writable::from_reg(lhs),
));
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(rhs),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovzxbw,
RegMem::reg(dst.to_reg()),
dst,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(lhs), dst));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_high_i16x8_u
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhuw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_high_i32x4_u
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
// Note uwiden_high only allows types: I8X16, I16X8, and I32X4
_ => panic!("Unsupported extmul_high_unsigned type"),
}
}
} else if let Some(uwiden0_low) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
if let Some(uwiden1_low) = matches_input(ctx, inputs[1], Opcode::UwidenLow) {
let uwiden_input = &[
InsnInput {
insn: uwiden0_low,
input: 0,
},
InsnInput {
insn: uwiden1_low,
input: 0,
},
];
let input0_ty = ctx.input_ty(uwiden0_low, 0);
let input1_ty = ctx.input_ty(uwiden1_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, uwiden_input[0]);
let rhs = put_input_in_reg(ctx, uwiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_low_i8x16_u
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovzxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_low_i16x8_u
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhuw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpcklwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_low_i32x4_u
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
// Note uwiden_low only allows types: I8X16, I16X8, and I32X4
_ => panic!("Unsupported extmul_low_unsigned type"),
}
}
} else {
panic!("Unsupported imul operation for type: {}", ty);
}
} else if ty == types::I64X2 {
// Eventually one of these should be `input_to_reg_mem` (TODO).
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512dq_simd() {
// With the right AVX512 features (VL + DQ) this operation
// can lower to a single operation.
ctx.emit(Inst::xmm_rm_r_evex(
Avx512Opcode::Vpmullq,
RegMem::reg(rhs),
lhs,
dst,
));
} else {
// Otherwise, for I64X2 multiplication we describe a lane A as being
// composed of a 32-bit upper half "Ah" and a 32-bit lower half
// "Al". The 32-bit long hand multiplication can then be written
// as:
// Ah Al
// * Bh Bl
// -----
// Al * Bl
// + (Ah * Bl) << 32
// + (Al * Bh) << 32
//
// So for each lane we will compute:
// A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
//
// Note, the algorithm will use pmuldq which operates directly
// on the lower 32-bit (Al or Bl) of a lane and writes the
// result to the full 64-bits of the lane of the destination.
// For this reason we don't need shifts to isolate the lower
// 32-bits, however, we will need to use shifts to isolate the
// high 32-bits when doing calculations, i.e., Ah == A >> 32.
//
// The full sequence then is as follows:
// A' = A
// A' = A' >> 32
// A' = Ah' * Bl
// B' = B
// B' = B' >> 32
// B' = Bh' * Al
// B' = B' + A'
// B' = B' << 32
// A' = A
// A' = Al' * Bl
// A' = A' + B'
// dst = A'
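//
// A per-lane scalar model of the computation (illustration only):
//
//     fn mul_lane(a: u64, b: u64) -> u64 {
//         let (al, ah) = (a & 0xffff_ffff, a >> 32);
//         let (bl, bh) = (b & 0xffff_ffff, b >> 32);
//         let cross = ah.wrapping_mul(bl).wrapping_add(al.wrapping_mul(bh));
//         al.wrapping_mul(bl).wrapping_add(cross << 32)
//     }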
// A' = A
let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
// A' = A' >> 32
// A' = Ah' * Bl
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psrlq,
RegMemImm::imm(32),
rhs_1,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(lhs.clone()),
rhs_1,
));
// B' = B
let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
ctx.emit(Inst::gen_move(lhs_1, lhs, ty));
// B' = B' >> 32
// B' = Bh' * Al
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psrlq,
RegMemImm::imm(32),
lhs_1,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
// B' = B' + A'
// B' = B' << 32
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddq,
RegMem::reg(rhs_1.to_reg()),
lhs_1,
));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psllq,
RegMemImm::imm(32),
lhs_1,
));
// A' = A
// A' = Al' * Bl
// A' = A' + B'
// dst = A'
ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(lhs.clone()),
rhs_1,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddq,
RegMem::reg(lhs_1.to_reg()),
rhs_1,
));
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
}
} else if ty.lane_count() > 1 {
// Emit single instruction lowerings for the remaining vector
// multiplications.
let sse_op = match ty {
types::I16X8 => SseOpcode::Pmullw,
types::I32X4 => SseOpcode::Pmulld,
_ => panic!("Unsupported type for packed imul instruction: {}", ty),
};
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// Move the `lhs` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, lhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
} else if ty == types::I128 || ty == types::B128 {
// Handle 128-bit multiplications.
let lhs = put_input_in_regs(ctx, inputs[0]);
let rhs = put_input_in_regs(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]);
assert_eq!(lhs.len(), 2);
assert_eq!(rhs.len(), 2);
assert_eq!(dst.len(), 2);
// mul:
// dst_lo = lhs_lo * rhs_lo
// dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo
//
// so we emit:
// mov dst_lo, lhs_lo
// mul dst_lo, rhs_lo
// mov dst_hi, lhs_lo
// mul dst_hi, rhs_hi
// mov tmp, lhs_hi
// mul tmp, rhs_lo
// add dst_hi, tmp
// mov rax, lhs_lo
// umulhi rhs_lo // implicit rax arg/dst
// add dst_hi, rax
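//
// As a scalar model (illustration only):
//
//     fn mul128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
//         let lo = lhs.0.wrapping_mul(rhs.0);
//         let umulhi = ((lhs.0 as u128 * rhs.0 as u128) >> 64) as u64;
//         let hi = umulhi
//             .wrapping_add(lhs.0.wrapping_mul(rhs.1))
//             .wrapping_add(lhs.1.wrapping_mul(rhs.0));
//         (lo, hi)
//     }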
let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Mul,
RegMemImm::reg(rhs.regs()[0]),
dst.regs()[0],
));
ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Mul,
RegMemImm::reg(rhs.regs()[1]),
dst.regs()[1],
));
ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Mul,
RegMemImm::reg(rhs.regs()[0]),
tmp,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(tmp.to_reg()),
dst.regs()[1],
));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
lhs.regs()[0],
types::I64,
));
ctx.emit(Inst::mul_hi(
OperandSize::Size64,
/* signed = */ false,
RegMem::reg(rhs.regs()[0]),
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(regs::rdx()),
dst.regs()[1],
));
} else {
let size = if ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
let alu_op = AluRmiROpcode::Mul;
// For commutative operations, try to commute operands if one is
// an immediate or direct memory reference. Do so by converting
// LHS to RMI; if reg, then always convert RHS to RMI; else, use
// LHS as RMI and convert RHS to reg.
let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
let (lhs, rhs) = if let RegMemImm::Reg { reg: lhs_reg } = lhs {
let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
(lhs_reg, rhs)
} else {
let rhs_reg = put_input_in_reg(ctx, inputs[1]);
(rhs_reg, lhs)
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
}
}
Opcode::BandNot => {
let ty = ty.unwrap();
debug_assert!(ty.is_vector() && ty.bytes() == 16);
let lhs = input_to_reg_mem(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let sse_op = match ty {
types::F32X4 => SseOpcode::Andnps,
types::F64X2 => SseOpcode::Andnpd,
_ => SseOpcode::Pandn,
};
// Note the flipping of operands: the `rhs` operand is used as the destination instead
// of the `lhs` as in the other bit operations above (e.g. `band`).
ctx.emit(Inst::gen_move(dst, rhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
}
Opcode::Iabs => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
if ty == types::I64X2 {
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512f_simd() {
ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
} else {
// If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to
// contain the results of `0 - src` and then blend in those results with
// `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or,
// conversely, if `src` was originally positive).
// Emit all 0s into the `tmp` register.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
// Subtract the lanes from 0 and set up `dst`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
// Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics
// require the "choice" mask to be in XMM0.
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::xmm0()),
tmp.to_reg(),
ty,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
}
} else if ty.is_vector() {
let opcode = match ty {
types::I8X16 => SseOpcode::Pabsb,
types::I16X8 => SseOpcode::Pabsw,
types::I32X4 => SseOpcode::Pabsd,
_ => panic!("Unsupported type for packed iabs instruction: {}", ty),
};
ctx.emit(Inst::xmm_unary_rm_r(opcode, src, dst));
} else {
unimplemented!("iabs is unimplemented for non-vector type: {}", ty);
}
}
Opcode::Imax | Opcode::Umax | Opcode::Imin | Opcode::Umin => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
if ty.is_vector() {
let sse_op = match op {
Opcode::Imax => match ty {
types::I8X16 => SseOpcode::Pmaxsb,
types::I16X8 => SseOpcode::Pmaxsw,
types::I32X4 => SseOpcode::Pmaxsd,
_ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
},
Opcode::Umax => match ty {
types::I8X16 => SseOpcode::Pmaxub,
types::I16X8 => SseOpcode::Pmaxuw,
types::I32X4 => SseOpcode::Pmaxud,
_ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
},
Opcode::Imin => match ty {
types::I8X16 => SseOpcode::Pminsb,
types::I16X8 => SseOpcode::Pminsw,
types::I32X4 => SseOpcode::Pminsd,
_ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
},
Opcode::Umin => match ty {
types::I8X16 => SseOpcode::Pminub,
types::I16X8 => SseOpcode::Pminuw,
types::I32X4 => SseOpcode::Pminud,
_ => panic!("Unsupported type for packed {} instruction: {}", op, ty),
},
_ => unreachable!("This is a bug: the external and internal `match op` should be over the same opcodes."),
};
// Move the `lhs` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, lhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
} else {
panic!("Unsupported type for {} instruction: {}", op, ty);
}
}
Opcode::Bnot => {
let ty = ty.unwrap();
if ty.is_vector() {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, ty));
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp));
ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
} else if ty == types::I128 || ty == types::B128 {
let src = put_input_in_regs(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64));
ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[0]));
ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64));
ctx.emit(Inst::not(OperandSize::Size64, dst.regs()[1]));
} else if ty.is_bool() {
unimplemented!("bool bnot")
} else {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, ty));
ctx.emit(Inst::not(OperandSize::from_ty(ty), dst));
}
}
Opcode::Bitselect => {
let ty = ty.unwrap();
let condition = put_input_in_reg(ctx, inputs[0]);
let if_true = put_input_in_reg(ctx, inputs[1]);
let if_false = input_to_reg_mem(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if ty.is_vector() {
let tmp1 = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp1, if_true, ty));
ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));
let tmp2 = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp2, condition, ty));
ctx.emit(Inst::and_not(ty, if_false, tmp2));
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
} else {
unimplemented!("no lowering for scalar bitselect instruction")
}
}
Opcode::Vselect => {
let ty = ty.unwrap();
let condition = put_input_in_reg(ctx, inputs[0]);
let condition_ty = ctx.input_ty(insn, 0);
let if_true = input_to_reg_mem(ctx, inputs[1]);
let if_false = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if ty.is_vector() {
// `vselect` relies on the bit representation of the condition:
// vector boolean types are defined in Cranelift to be all 1s or
// all 0s. This lowering relies on that fact to use x86's
// variable blend instructions, which look at the _high bit_ of
// the condition mask. All the bits of vector booleans will
// match (all 1s or all 0s), so we can just use the high bit.
assert!(condition_ty.lane_type().is_bool());
// Variable blend instructions expect the condition mask to be
// in XMM0.
let xmm0 = Writable::from_reg(regs::xmm0());
ctx.emit(Inst::gen_move(xmm0, condition, ty));
// Match up the source and destination registers for regalloc.
ctx.emit(Inst::gen_move(dst, if_false, ty));
// Technically PBLENDVB would work in all cases (since the bytes
// inside the mask will be all 1s or all 0s, we could blend
// byte-by-byte instead of, e.g., word-by-word), but
// type-specialized versions are included here for clarity when
// troubleshooting and due to slight improvements in
// latency/throughput on certain processor families.
let opcode = match condition_ty {
types::B64X2 => SseOpcode::Blendvpd,
types::B32X4 => SseOpcode::Blendvps,
types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
_ => unimplemented!("unable lower vselect for type: {}", condition_ty),
};
ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
} else {
unimplemented!("no lowering for scalar vselect instruction")
}
}
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
// Scalar shifts on x86 have various encodings:
// - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
// - shift by an immediate amount, e.g. `SAL r/m8, imm8`
// - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
// This implementation uses the last two encoding methods.
let (size, lhs) = match dst_ty {
types::I8 | types::I16 => match op {
Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
Opcode::Ushr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
),
Opcode::Sshr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
),
Opcode::Rotl | Opcode::Rotr => (
OperandSize::from_ty(dst_ty),
put_input_in_reg(ctx, inputs[0]),
),
_ => unreachable!(),
},
types::I32 | types::I64 => (
OperandSize::from_ty(dst_ty),
put_input_in_reg(ctx, inputs[0]),
),
_ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
};
let (count, rhs) =
if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
// Mask count, according to Cranelift's semantics.
let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
(Some(cst), None)
} else {
// We can ignore the upper half of a multi-reg shift amount, because we
// take the shift amount mod 2^(lhs_width) anyway.
(None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
_ => unreachable!(),
};
let w_rcx = Writable::from_reg(regs::rcx());
ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
if count.is_none() {
ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
}
ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
} else if dst_ty == types::I128 {
let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
let src = put_input_in_regs(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]);
match op {
Opcode::Ishl => {
emit_shl_i128(ctx, src, dst, amt_src);
}
Opcode::Ushr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
}
Opcode::Sshr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
}
Opcode::Rotl => {
// (mov tmp, src)
// (shl.i128 tmp, amt)
// (mov dst, src)
// (ushr.i128 dst, 128-amt)
// (or dst, tmp)
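//
// Illustrative identity (a plain-Rust sketch, not the emitted code): the
// rotate is composed from the two 128-bit shifts, whose amounts the shift
// helpers take mod 128:
//
//     fn rotl128(x: u128, n: u32) -> u128 {
//         let n = n & 127;
//         x.wrapping_shl(n) | x.wrapping_shr((128 - n) & 127)
//     }
//
// (`rotr` below is the mirror image, swapping the shift directions.)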
let tmp = ctx.alloc_tmp(types::I128);
emit_shl_i128(ctx, src, tmp, amt_src);
let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
inv_amt,
));
emit_shr_i128(
ctx,
src,
dst,
inv_amt.to_reg(),
/* is_signed = */ false,
);
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[0].to_reg()),
dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[1].to_reg()),
dst.regs()[1],
));
}
Opcode::Rotr => {
// (mov tmp, src)
// (ushr.i128 tmp, amt)
// (mov dst, src)
// (shl.i128 dst, 128-amt)
// (or dst, tmp)
let tmp = ctx.alloc_tmp(types::I128);
emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
inv_amt,
));
emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[0].to_reg()),
dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[1].to_reg()),
dst.regs()[1],
));
}
_ => unreachable!(),
}
} else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
// incorrect bits to 0s (see below for handling signs in `sshr.i8x16`).
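//
// Illustrative sketch (plain Rust, not the emitted code) of why the mask
// fixup works for a logical right shift: a 16-bit shift lets bits from the
// neighboring byte leak into each byte's top bits, and AND-ing with
// `0xff >> amt`, replicated per byte, removes exactly those leaked bits:
//
//     fn ushr_two_bytes_via_u16(pair: u16, amt: u32) -> u16 {
//         assert!(amt < 8);
//         let shifted = pair >> amt;               // 16x8-style shift
//         let mask = (0xffu16 >> amt) * 0x0101;    // same mask in both bytes
//         shifted & mask
//     }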
let src = put_input_in_reg(ctx, inputs[0]);
let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// If necessary, move the shift index into the lowest bits of a vector register.
let shift_by_moved = match &shift_by {
RegMemImm::Imm { .. } => shift_by.clone(),
RegMemImm::Reg { reg } => {
let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::reg(*reg),
OperandSize::Size32,
tmp_shift_by,
));
RegMemImm::reg(tmp_shift_by.to_reg())
}
RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
};
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
// the others must be fixed up with the mask below.
let shift_opcode = match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
ctx.emit(Inst::gen_move(dst, src, dst_ty));
ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst));
// Choose which mask to use to fixup the shifted lanes. Since we must use a 16x8 shift, we need to fix
// up the bits that migrate from one half of the lane to the other. Each 16-byte mask (which rustfmt
// forces to multiple lines) is indexed by the shift amount: e.g. if we shift right by 0 (no movement),
// we want to retain all the bits so we mask with `0xff`; if we shift right by 1, we want to retain all
// bits except the MSB so we mask with `0x7f`; etc.
const USHR_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
const SHL_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];
let mask = match op {
Opcode::Ishl => &SHL_MASKS,
Opcode::Ushr => &USHR_MASKS,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
};
// Figure out the address of the shift mask.
let mask_address = match shift_by {
RegMemImm::Imm { simm32 } => {
// When the shift amount is known, we can statically (i.e. at compile time) determine the mask to
// use and only emit that.
debug_assert!(simm32 < 8);
let mask_offset = simm32 as usize * 16;
let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(
&mask[mask_offset..mask_offset + 16],
));
SyntheticAmode::ConstantOffset(mask_constant)
}
RegMemImm::Reg { reg } => {
// Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct
// mask offset in the table. We do this using LEA to find the base address of the mask table and then
// complex addressing to offset to the right mask: `base_address + shift_by * 16` (each mask is 16 bytes).
let base_mask_address = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let mask_offset = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
ctx.emit(Inst::lea(
SyntheticAmode::ConstantOffset(mask_constant),
base_mask_address,
));
ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftLeft,
Some(4),
mask_offset,
));
Amode::imm_reg_reg_shift(
0,
base_mask_address.to_reg(),
mask_offset.to_reg(),
0,
)
.into()
}
RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
};
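// Illustrative offset computation (plain Rust, mirroring the dynamic case
// above): each mask row is 16 bytes, so the row's offset is `shift_by * 16`,
// i.e. `shift_by << 4`:
//
//     fn mask_row(masks: &[u8; 128], shift_by: usize) -> &[u8] {
//         let off = (shift_by & 7) << 4;
//         &masks[off..off + 16]
//     }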
// Load the mask into a temporary register, `mask_value`.
let mask_value = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));
// Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future,
// this AND instruction could be coalesced with the load above.
let sse_op = match dst_ty {
types::F32X4 => SseOpcode::Andps,
types::F64X2 => SseOpcode::Andpd,
_ => SseOpcode::Pand,
};
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
} else if dst_ty == types::I8X16 && op == Opcode::Sshr {
// Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
// `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
// approach here: separate the low and high lanes, shift them separately, and merge them into the final
// result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
// low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
// shifted_low.i16x8 = shift each lane of `low`
// high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
// shifted_high.i16x8 = shift each lane of `high`
// dst.i8x16 = [s0'', s1'', ..., s15'']
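//
// Illustrative sketch (plain Rust, per byte) of the widen/shift/narrow
// trick: duplicating the byte into both halves of an i16 and arithmetic-
// shifting by `amt + 8` leaves the correctly sign-extended result in the
// low byte, which PACKSSWB then extracts (saturation is a no-op here):
//
//     fn sshr_byte(s: i8, amt: u32) -> i8 {
//         assert!(amt < 8);
//         let widened = (((s as u8 as u16) << 8) | (s as u8 as u16)) as i16;
//         (widened >> (amt + 8)) as i8
//     }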
let src = put_input_in_reg(ctx, inputs[0]);
let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
let shift_by_ty = ctx.input_ty(insn, 1);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
// bits, relying on PSRAW to fill in the upper bits appropriately.
let bigger_shift_by = match shift_by {
// When we know the shift amount at compile time, we add the extra shift amount statically.
RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
// Otherwise we add instructions to add the extra shift amount and move the value into an XMM
// register.
RegMemImm::Reg { reg } => {
let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
let size = if shift_by_ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
let imm = RegMemImm::imm(8);
ctx.emit(Inst::alu_rmi_r(
size,
AluRmiROpcode::Add,
imm,
bigger_shift_by_gpr,
));
let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::from(bigger_shift_by_gpr),
OperandSize::Size32,
bigger_shift_by_xmm,
));
RegMemImm::reg(bigger_shift_by_xmm.to_reg())
}
RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
};
// Unpack and shift the lower lanes of `src` into the `dst` register.
ctx.emit(Inst::gen_move(dst, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by.clone(),
dst,
));
// Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhbw,
RegMem::from(upper_lanes),
upper_lanes,
));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by,
upper_lanes,
));
// Merge the upper and lower shifted lanes into `dst`.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Packsswb,
RegMem::from(upper_lanes),
dst,
));
} else if dst_ty == types::I64X2 && op == Opcode::Sshr {
// The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
// like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but we do not
// currently emit it (TODO: add an alternate lowering using VPSRAQ when those feature sets are
// available). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
// scalar instruction, and insert the shifted values back into the `dst` XMM register.
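//
// Per-lane, this computes (illustrative plain Rust):
//
//     fn sshr_i64x2(v: [i64; 2], amt: u32) -> [i64; 2] {
//         [v[0] >> (amt & 63), v[1] >> (amt & 63)]
//     }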
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, dst_ty));
// Extract the upper and lower lanes into temporary GPRs.
let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
// Shift each value.
let mut shift = |reg: Writable<Reg>| {
let kind = ShiftKind::ShiftRightArithmetic;
if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
// Mask the shift amount according to Cranelift's semantics.
let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
ctx.emit(Inst::shift_r(
OperandSize::Size64,
kind,
Some(shift_by),
reg,
));
} else {
let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
let w_rcx = Writable::from_reg(regs::rcx());
ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
};
};
shift(lower_lane);
shift(upper_lane);
// Insert the scalar values back into the `dst` vector.
emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
} else {
// For the remaining packed shifts not covered above, x86 has implementations that can either:
// - shift using an immediate
// - shift using a dynamic value given in the lower bits of another XMM register.
let src = put_input_in_reg(ctx, inputs[0]);
let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let sse_op = match dst_ty {
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
Opcode::Sshr => SseOpcode::Psraw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
Opcode::Sshr => SseOpcode::Psrad,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {
Opcode::Ishl => SseOpcode::Psllq,
Opcode::Ushr => SseOpcode::Psrlq,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
_ => unreachable!(),
};
// If necessary, move the shift index into the lowest bits of a vector register.
let shift_by = match shift_by {
RegMemImm::Imm { .. } => shift_by,
RegMemImm::Reg { reg } => {
let tmp_shift_by = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::reg(reg),
OperandSize::Size32,
tmp_shift_by,
));
RegMemImm::reg(tmp_shift_by.to_reg())
}
RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
};
// Move the `src` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, src, dst_ty));
ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
}
}
Opcode::Ineg => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
if ty.is_vector() {
// Zeroes out a register and then does a packed subtraction
// of the input from that register.
let src = input_to_reg_mem(ctx, inputs[0]);
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
let subtract_opcode = match ty {
types::I8X16 => SseOpcode::Psubb,
types::I16X8 => SseOpcode::Psubw,
types::I32X4 => SseOpcode::Psubd,
types::I64X2 => SseOpcode::Psubq,
_ => panic!("Unsupported type for Ineg instruction, found {}", ty),
};
// Note we must zero out a tmp instead of using the destination register, since
// the destination could be an alias for the source input register.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(tmp.to_reg()),
tmp,
));
ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(tmp.to_reg()),
dst,
));
} else {
let src = put_input_in_reg(ctx, inputs[0]);
ctx.emit(Inst::gen_move(dst, src, ty));
ctx.emit(Inst::neg(OperandSize::from_ty(ty), dst));
}
}
Opcode::Clz => {
let orig_ty = ty.unwrap();
if isa_flags.use_lzcnt() && (orig_ty == types::I32 || orig_ty == types::I64) {
// We can use a plain lzcnt instruction here. Note no special handling is required
// for zero inputs, because the machine instruction does what the CLIF expects for
// zero, i.e. it returns the operand width in bits.
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::unary_rm_r(
OperandSize::from_ty(orig_ty),
UnaryRmROpcode::Lzcnt,
src,
dst,
));
return Ok(());
}
// General formula using bit-scan reverse (BSR):
// mov -1, %dst
// bsr %src, %tmp
// cmovz %dst, %tmp
// mov $(size_bits - 1), %dst
// sub %tmp, %dst
if orig_ty == types::I128 {
// clz upper, tmp1
// clz lower, dst
// add dst, 64
// cmp tmp1, 64
// cmovnz tmp1, dst
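//
// Illustrative sketch (plain Rust) of that composition, relying on the
// 64-bit clz returning 64 for a zero input:
//
//     fn clz128(lo: u64, hi: u64) -> u64 {
//         let upper = hi.leading_zeros() as u64; // 64 when hi == 0
//         if upper != 64 { upper } else { lo.leading_zeros() as u64 + 64 }
//     }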
let dsts = get_output_reg(ctx, outputs[0]);
let dst = dsts.regs()[0];
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let srcs = put_input_in_regs(ctx, inputs[0]);
let src_lo = srcs.regs()[0];
let src_hi = srcs.regs()[1];
emit_clz(ctx, types::I64, types::I64, src_hi, tmp1);
emit_clz(ctx, types::I64, types::I64, src_lo, dst);
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(64),
dst,
));
ctx.emit(Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::imm(64),
tmp1.to_reg(),
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::NZ,
RegMem::reg(tmp1.to_reg()),
dst,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dsts.regs()[1].to_reg()),
dsts.regs()[1],
));
} else {
let (ext_spec, ty) = match orig_ty {
types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
a if a == types::I32 || a == types::I64 => (None, a),
_ => unreachable!(),
};
let src = if let Some(ext_spec) = ext_spec {
extend_input_to_reg(ctx, inputs[0], ext_spec)
} else {
put_input_in_reg(ctx, inputs[0])
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
emit_clz(ctx, orig_ty, ty, src, dst);
}
}
Opcode::Ctz => {
let orig_ty = ctx.input_ty(insn, 0);
if isa_flags.use_bmi1() && (orig_ty == types::I32 || orig_ty == types::I64) {
// We can use a plain tzcnt instruction here. Note no special handling is required
// for zero inputs, because the machine instruction does what the CLIF expects for
// zero, i.e. it returns the operand width in bits.
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::unary_rm_r(
OperandSize::from_ty(orig_ty),
UnaryRmROpcode::Tzcnt,
src,
dst,
));
return Ok(());
}
// General formula using bit-scan forward (BSF):
// bsf %src, %dst
// mov $(size_bits), %tmp
// cmovz %tmp, %dst
if orig_ty == types::I128 {
// ctz src_lo, dst
// ctz src_hi, tmp1
// add tmp1, 64
// cmp dst, 64
// cmovz tmp1, dst
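//
// Illustrative sketch (plain Rust) of that composition, relying on the
// 64-bit ctz returning 64 for a zero input:
//
//     fn ctz128(lo: u64, hi: u64) -> u64 {
//         let lower = lo.trailing_zeros() as u64; // 64 when lo == 0
//         if lower != 64 { lower } else { hi.trailing_zeros() as u64 + 64 }
//     }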
let dsts = get_output_reg(ctx, outputs[0]);
let dst = dsts.regs()[0];
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let srcs = put_input_in_regs(ctx, inputs[0]);
let src_lo = srcs.regs()[0];
let src_hi = srcs.regs()[1];
emit_ctz(ctx, types::I64, types::I64, src_lo, dst);
emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1);
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(64),
tmp1,
));
ctx.emit(Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::imm(64),
dst.to_reg(),
));
ctx.emit(Inst::cmove(
OperandSize::Size64,
CC::Z,
RegMem::reg(tmp1.to_reg()),
dst,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dsts.regs()[1].to_reg()),
dsts.regs()[1],
));
} else {
let ty = if orig_ty.bits() < 32 {
types::I32
} else {
orig_ty
};
debug_assert!(ty == types::I32 || ty == types::I64);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
emit_ctz(ctx, orig_ty, ty, src, dst);
}
}
Opcode::Popcnt => {
let ty_tmp = ty.unwrap();
if !ty_tmp.is_vector() {
let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
a if a == types::I32 || a == types::I64 || a == types::I128 => (None, a),
_ => unreachable!(),
};
if isa_flags.use_popcnt() {
match ty {
types::I32 | types::I64 => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::unary_rm_r(
OperandSize::from_ty(ty),
UnaryRmROpcode::Popcnt,
src,
dst,
));
return Ok(());
}
types::I128 => {
// The number of ones in a 128-bit value is the plain sum of the number of
// ones in its low and high parts. No risk of overflow here.
let dsts = get_output_reg(ctx, outputs[0]);
let dst = dsts.regs()[0];
let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let srcs = put_input_in_regs(ctx, inputs[0]);
let src_lo = srcs.regs()[0];
let src_hi = srcs.regs()[1];
ctx.emit(Inst::unary_rm_r(
OperandSize::Size64,
UnaryRmROpcode::Popcnt,
RegMem::reg(src_lo),
dst,
));
ctx.emit(Inst::unary_rm_r(
OperandSize::Size64,
UnaryRmROpcode::Popcnt,
RegMem::reg(src_hi),
tmp,
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(tmp.to_reg()),
dst,
));
// Zero the result's high component.
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dsts.regs()[1].to_reg()),
dsts.regs()[1],
));
return Ok(());
}
_ => {}
}
}
let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec {
(
smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))],
ty,
)
} else if ty == types::I128 {
let regs = put_input_in_regs(ctx, inputs[0]);
(
smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])],
types::I64,
)
} else {
// N.B.: explicitly put the input in a reg here because the width of the instruction
// into which this RM op goes may not match the width of the input type (in fact,
// it won't for i32.popcnt), and we don't want a larger-than-necessary load.
(smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty)
};
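// Illustrative sketch (plain Rust) of the branch-free popcount emitted
// below; the cascaded shift-and-mask with 0x7777... leaves each nibble of
// `n` holding its own bit count, which the 0x0f0f.../0x0101... steps then
// sum into the top byte:
//
//     fn popcnt64(x: u64) -> u64 {
//         let mut n = x;
//         let mut t = x;
//         for _ in 0..3 {
//             t = (t >> 1) & 0x7777_7777_7777_7777;
//             n = n.wrapping_sub(t);
//         }
//         let n = (n.wrapping_add(n >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;
//         n.wrapping_mul(0x0101_0101_0101_0101) >> 56
//     }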
let mut dsts: SmallVec<[Reg; 2]> = smallvec![];
for src in srcs {
let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
dsts.push(dst.to_reg());
if ty == types::I64 {
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap();
// mov src, tmp1
ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// mov 0x7777_7777_7777_7777, cst
ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
// and cst, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cst.to_reg()),
tmp1,
));
// mov src, tmp2
ctx.emit(Inst::mov64_rm_r(src, tmp2));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// and cst, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cst.to_reg()),
tmp1,
));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// and cst, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cst.to_reg()),
tmp1,
));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// mov tmp2, dst
ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
// shr $4, dst
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(4),
dst,
));
// add tmp2, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(tmp2.to_reg()),
dst,
));
// mov $0x0F0F_0F0F_0F0F_0F0F, cst
ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
// and cst, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::reg(cst.to_reg()),
dst,
));
// mov $0x0101_0101_0101_0101, cst
ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
// mul cst, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Mul,
RegMemImm::reg(cst.to_reg()),
dst,
));
// shr $56, dst
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(56),
dst,
));
} else {
assert_eq!(ty, types::I32);
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
// mov src, tmp1
ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// and $0x7777_7777, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::imm(0x77777777),
tmp1,
));
// mov src, tmp2
ctx.emit(Inst::mov64_rm_r(src, tmp2));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// and $0x7777_7777, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::imm(0x77777777),
tmp1,
));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// shr $1, tmp1
ctx.emit(Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(1),
tmp1,
));
// and $0x7777_7777, tmp1
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::imm(0x77777777),
tmp1,
));
// sub tmp1, tmp2
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Sub,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
// mov tmp2, dst
ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst));
// shr $4, dst
ctx.emit(Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(4),
dst,
));
// add tmp2, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Add,
RegMemImm::reg(tmp2.to_reg()),
dst,
));
// and $0x0F0F_0F0F, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::imm(0x0F0F0F0F),
dst,
));
// mul $0x0101_0101, dst
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Mul,
RegMemImm::imm(0x01010101),
dst,
));
// shr $24, dst
ctx.emit(Inst::shift_r(
OperandSize::Size32,
ShiftKind::ShiftRightLogical,
Some(24),
dst,
));
}
}
if dsts.len() == 1 {
let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64));
} else {
assert!(dsts.len() == 2);
let final_dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(dsts[1]),
final_dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(final_dst.regs()[1].to_reg()),
final_dst.regs()[1],
));
}
} else {
// Lower `popcount` for vectors.
let ty = ty.unwrap();
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512bitalg_simd() {
// When AVX512VL and AVX512BITALG are available,
// `popcnt.i8x16` can be lowered to a single instruction.
assert_eq!(ty, types::I8X16);
ctx.emit(Inst::xmm_unary_rm_r_evex(
Avx512Opcode::Vpopcntb,
RegMem::reg(src),
dst,
));
} else {
// Otherwise, we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf)
//
//__m128i count_bytes ( __m128i v) {
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
// __m128i low_mask = _mm_set1_epi8 (0x0f);
// __m128i lo = _mm_and_si128 (v, low_mask);
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
// __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
// __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
// return _mm_add_epi8 (cnt1, cnt2);
//}
//
// Details of the above algorithm can be found in the reference noted above, but the basics
// are to create a lookup table that pre-populates the popcnt value for each nibble [0, 15].
// The algorithm uses shifts and masks to isolate each 4-bit section of the vector, PSHUFB
// to perform the table lookup, and adds the two per-nibble counts together.
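//
// Illustrative per-byte version (plain Rust) of the same lookup idea:
//
//     fn popcnt_byte(b: u8) -> u8 {
//         const POPCOUNT_4BIT: [u8; 16] =
//             [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
//         POPCOUNT_4BIT[(b & 0x0f) as usize] + POPCOUNT_4BIT[(b >> 4) as usize]
//     }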
// __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static POPCOUNT_4BIT: [u8; 16] = [
0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03,
0x02, 0x03, 0x03, 0x04,
];
let lookup = ctx.use_constant(VCodeConstantData::WellKnown(&POPCOUNT_4BIT));
// Create a mask for the lower 4 bits of each subword.
static LOW_MASK: [u8; 16] = [0x0F; 16];
let low_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&LOW_MASK));
let low_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(low_mask_const, low_mask, ty));
// __m128i lo = _mm_and_si128 (v, low_mask);
let lo = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::gen_move(lo, low_mask.to_reg(), types::I8X16));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pand, RegMem::reg(src), lo));
// __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
ctx.emit(Inst::gen_move(dst, src, ty));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrlw, RegMemImm::imm(4), dst));
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp, low_mask.to_reg(), types::I8X16));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pand,
RegMem::reg(dst.to_reg()),
tmp,
));
// __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(lookup, tmp2, ty));
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), types::I8X16));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::reg(lo.to_reg()),
dst,
));
// __m128i cnt2 = _mm_shuffle_epi8(lookup, hi);
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::reg(tmp.to_reg()),
tmp2,
));
// return _mm_add_epi8(cnt1, cnt2);
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddb,
RegMem::reg(tmp2.to_reg()),
dst,
));
}
}
}
Opcode::Bitrev => {
let ty = ctx.input_ty(insn, 0);
assert!(
ty == types::I8
|| ty == types::I16
|| ty == types::I32
|| ty == types::I64
|| ty == types::I128
);
if ty == types::I128 {
let src = put_input_in_regs(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]);
emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64);
emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64);
} else {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
emit_bitrev(ctx, src, dst, ty);
}
}
Opcode::IsNull | Opcode::IsInvalid => {
// Null references are represented by the constant value 0; invalid references are
// represented by the constant value -1. See `define_reftypes()` in
// `meta/src/isa/x86/encodings.rs` to confirm.
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ctx.input_ty(insn, 0);
let imm = match op {
Opcode::IsNull => {
// TODO could use tst src, src for IsNull
0
}
Opcode::IsInvalid => {
// We can do a 32-bit comparison even in 64-bit mode, as the constant is then
// sign-extended.
0xffffffff
}
_ => unreachable!(),
};
ctx.emit(Inst::cmp_rmi_r(
OperandSize::from_ty(ty),
RegMemImm::imm(imm),
src,
));
ctx.emit(Inst::setcc(CC::Z, dst));
}
Opcode::Uextend | Opcode::Sextend | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => {
let src_ty = ctx.input_ty(insn, 0);
let dst_ty = ctx.output_ty(insn, 0);
if src_ty == types::I128 {
assert!(dst_ty.bits() <= 64);
assert!(op == Opcode::Ireduce);
let src = put_input_in_regs(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64));
} else if dst_ty == types::I128 {
assert!(src_ty.bits() <= 64);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]);
assert!(op == Opcode::Uextend || op == Opcode::Sextend);
// Extend to 64 bits first.
let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64);
if let Some(ext_mode) = ext_mode {
if op == Opcode::Sextend {
ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
} else {
ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0]));
}
} else {
ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0]));
}
// Now generate the top 64 bits.
if op == Opcode::Sextend {
// Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits
// to spread the sign bit across all bits.
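// For example, 0x8000_0000_0000_0000u64 as i64 >> 63 (arithmetic) yields
// 0xFFFF_FFFF_FFFF_FFFF, while any non-negative value yields 0.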
ctx.emit(Inst::gen_move(
dst.regs()[1],
dst.regs()[0].to_reg(),
types::I64,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightArithmetic,
Some(63),
dst.regs()[1],
));
} else {
// Zero-extend: just zero the top word.
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(dst.regs()[1].to_reg()),
dst.regs()[1],
));
}
} else {
// Sextend requires a sign-extended move, but all the other opcodes are simply a move
// from a zero-extended source. Here is why this works, in each case:
//
// - Breduce, Bextend: changing width of a boolean. We
// represent a bool as a 0 or -1, so Breduce can mask, while
// Bextend must sign-extend.
//
// - Ireduce: changing width of an integer. Smaller ints are stored with undefined
// high-order bits, so we can simply do a copy.
let is_sextend = match op {
Opcode::Sextend | Opcode::Bextend => true,
_ => false,
};
if src_ty == types::I32 && dst_ty == types::I64 && !is_sextend {
// As a particular x64 extra-pattern-matching opportunity, all the ALU opcodes on
// 32 bits will zero-extend the upper 32 bits, so we need not generate a
// zero-extended move in this case.
// TODO add loads and shifts here.
if let Some(_) = matches_input_any(
ctx,
inputs[0],
&[
Opcode::Iadd,
Opcode::IaddIfcout,
Opcode::Isub,
Opcode::Imul,
Opcode::Band,
Opcode::Bor,
Opcode::Bxor,
],
) {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, types::I64));
return Ok(());
}
}
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
assert_eq!(
src_ty.bits() < dst_ty.bits(),
ext_mode.is_some(),
"unexpected extension: {} -> {}",
src_ty,
dst_ty
);
if let Some(ext_mode) = ext_mode {
if is_sextend {
ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst));
} else {
ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst));
}
} else {
ctx.emit(Inst::mov64_rm_r(src, dst));
}
}
}
Opcode::Bint => {
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
// with 1 to keep just the LSB, giving a 0/1-valued integer result.
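// Illustrative (plain Rust): with that encoding,
//
//     fn bint(b: i64) -> i64 { b & 1 } // -1 & 1 == 1; 0 & 1 == 0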
let rn = put_input_in_reg(ctx, inputs[0]);
let rd = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
ctx.emit(Inst::gen_move(rd.regs()[0], rn, types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
rd.regs()[0],
));
if ty == types::I128 {
let upper = rd.regs()[1];
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(upper.to_reg()),
upper,
));
}
}
Opcode::Icmp => {
let condcode = ctx.data(insn).cond_code().unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ctx.input_ty(insn, 0);
if !ty.is_vector() {
let condcode = emit_cmp(ctx, insn, condcode);
let cc = CC::from_intcc(condcode);
ctx.emit(Inst::setcc(cc, dst));
} else {
assert_eq!(ty.bits(), 128);
let eq = |ty| match ty {
types::I8X16 => SseOpcode::Pcmpeqb,
types::I16X8 => SseOpcode::Pcmpeqw,
types::I32X4 => SseOpcode::Pcmpeqd,
types::I64X2 => SseOpcode::Pcmpeqq,
_ => panic!(
"Unable to find an instruction for {} for type: {}",
condcode, ty
),
};
let gt = |ty| match ty {
types::I8X16 => SseOpcode::Pcmpgtb,
types::I16X8 => SseOpcode::Pcmpgtw,
types::I32X4 => SseOpcode::Pcmpgtd,
types::I64X2 => SseOpcode::Pcmpgtq,
_ => panic!(
"Unable to find an instruction for {} for type: {}",
condcode, ty
),
};
let maxu = |ty| match ty {
types::I8X16 => SseOpcode::Pmaxub,
types::I16X8 => SseOpcode::Pmaxuw,
types::I32X4 => SseOpcode::Pmaxud,
_ => panic!(
"Unable to find an instruction for {} for type: {}",
condcode, ty
),
};
let mins = |ty| match ty {
types::I8X16 => SseOpcode::Pminsb,
types::I16X8 => SseOpcode::Pminsw,
types::I32X4 => SseOpcode::Pminsd,
_ => panic!(
"Unable to find an instruction for {} for type: {}",
condcode, ty
),
};
let minu = |ty| match ty {
types::I8X16 => SseOpcode::Pminub,
types::I16X8 => SseOpcode::Pminuw,
types::I32X4 => SseOpcode::Pminud,
_ => panic!(
"Unable to find an instruction for {} for type: {}",
condcode, ty
),
};
// Here we decide which operand to use as the read/write `dst` (ModRM reg field) and
// which to use as the read `input` (ModRM r/m field). In the normal case we use
// Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for the
// less-than cases so that we can reuse the greater-than implementation.
//
// In a surprising twist, the operands for i64x2 `gte`/`sle` must also be flipped
// from the normal order because of the special-case lowering for these instructions
// (i.e. we use PCMPGTQ with flipped operands and negate the result).
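//
// Illustrative identities being used (plain-Rust / pseudo-notation):
//
//   a < b        ==  b > a                  // operand flip
//   a <= b       ==  !(a > b)               // i64x2 path: PCMPGTQ, then negate
//   a >= b  (s)  ==  (min_s(a, b) == b)     // PMINS* + PCMPEQ*
//   a >= b  (u)  ==  (min_u(a, b) == b)     // PMINU* + PCMPEQ*
//   a >  b  (u)  ==  !(max_u(a, b) == b)    // PMAXU* + PCMPEQ*, then negate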
let input = match condcode {
IntCC::SignedLessThanOrEqual if ty == types::I64X2 => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
ctx.emit(Inst::gen_move(dst, lhs, ty));
rhs
}
IntCC::SignedGreaterThanOrEqual if ty == types::I64X2 => {
let lhs = input_to_reg_mem(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
ctx.emit(Inst::gen_move(dst, rhs, ty));
lhs
}
IntCC::SignedLessThan
| IntCC::SignedLessThanOrEqual
| IntCC::UnsignedLessThan
| IntCC::UnsignedLessThanOrEqual => {
let lhs = input_to_reg_mem(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
ctx.emit(Inst::gen_move(dst, rhs, ty));
lhs
}
_ => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
ctx.emit(Inst::gen_move(dst, lhs, ty));
rhs
}
};
match condcode {
IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
IntCC::NotEqual => {
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
// Emit all 1s into the `tmp` register.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
// Invert the result of the `PCMPEQ*`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
}
IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
}
IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
if ty != types::I64X2 =>
{
ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
}
IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
if ty == types::I64X2 =>
{
// The PMINS* instruction is only available in AVX512VL/F so we must instead
// compare with flipped operands and negate the result (emitting one more
// instruction).
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst));
// Emit all 1s into the `tmp` register.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
// Invert the result of the `PCMPGT*`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
}
IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
// Emit all 1s into the `tmp` register.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
// Invert the result of the `PCMPEQ*`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
}
IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
}
_ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
}
}
}
Opcode::Fcmp => {
let cond_code = ctx.data(insn).fp_cond_code().unwrap();
let input_ty = ctx.input_ty(insn, 0);
if !input_ty.is_vector() {
// Unordered is returned by setting ZF, PF, CF <- 111
// Greater than by ZF, PF, CF <- 000
// Less than by ZF, PF, CF <- 001
// Equal by ZF, PF, CF <- 100
//
// Checking the result of comiss is somewhat annoying because you don't have setcc
// instructions that explicitly check simultaneously for the condition (i.e. eq, le,
// gt, etc) *and* orderedness.
//
// So that might mean we need more than one setcc check and then a logical "and" or
// "or" to determine both, in some cases. However, knowing that if the parity bit is
// set then the result was considered unordered, and that whenever the parity bit is
// set both the ZF and CF flag bits must also be set, we can get away with using
// one setcc for most condition codes.
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit(Inst::setcc(cc, dst));
}
FcmpCondResult::AndConditions(cc1, cc2) => {
let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, dst));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::reg(tmp.to_reg()),
dst,
));
}
FcmpCondResult::OrConditions(cc1, cc2) => {
let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, dst));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.to_reg()),
dst,
));
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else {
let op = match input_ty {
types::F32X4 => SseOpcode::Cmpps,
types::F64X2 => SseOpcode::Cmppd,
_ => panic!("Bad input type to fcmp: {}", input_ty),
};
// Since some packed comparisons are not available, some of the condition codes
// must be inverted, with a corresponding `flip` of the operands.
let (imm, flip) = match cond_code {
FloatCC::GreaterThan => (FcmpImm::LessThan, true),
FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
FloatCC::UnorderedOrLessThanOrEqual => {
(FcmpImm::UnorderedOrGreaterThanOrEqual, true)
}
FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
panic!("unsupported float condition code: {}", cond_code)
}
_ => (FcmpImm::from(cond_code), false),
};
// Determine the operands of the comparison, possibly by flipping them.
let (lhs, rhs) = if flip {
(
put_input_in_reg(ctx, inputs[1]),
input_to_reg_mem(ctx, inputs[0]),
)
} else {
(
put_input_in_reg(ctx, inputs[0]),
input_to_reg_mem(ctx, inputs[1]),
)
};
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
// but ensures that the registers are the same to match x86's read-write operand
// encoding.
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, lhs, input_ty));
// Emit the comparison.
ctx.emit(Inst::xmm_rm_r_imm(
op,
rhs,
dst,
imm.encode(),
OperandSize::Size32,
));
}
}
Opcode::FallthroughReturn | Opcode::Return => {
for i in 0..ctx.num_inputs(insn) {
let src_reg = put_input_in_regs(ctx, inputs[i]);
let retval_reg = ctx.retval(i);
let ty = ctx.input_ty(insn, i);
assert!(src_reg.len() == retval_reg.len());
let (_, tys) = Inst::rc_for_type(ty)?;
for ((&src, &dst), &ty) in src_reg
.regs()
.iter()
.zip(retval_reg.regs().iter())
.zip(tys.iter())
{
ctx.emit(Inst::gen_move(dst, src, ty));
}
}
// N.B.: the Ret itself is generated by the ABI.
}
Opcode::Call | Opcode::CallIndirect => {
let caller_conv = ctx.abi().call_conv();
let (mut abi, inputs) = match op {
Opcode::Call => {
let (extname, dist) = ctx.call_target(insn).unwrap();
let sig = ctx.call_sig(insn).unwrap();
assert_eq!(inputs.len(), sig.params.len());
assert_eq!(outputs.len(), sig.returns.len());
(
X64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
&inputs[..],
)
}
Opcode::CallIndirect => {
let ptr = put_input_in_reg(ctx, inputs[0]);
let sig = ctx.call_sig(insn).unwrap();
assert_eq!(inputs.len() - 1, sig.params.len());
assert_eq!(outputs.len(), sig.returns.len());
(
X64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
&inputs[1..],
)
}
_ => unreachable!(),
};
abi.emit_stack_pre_adjust(ctx);
assert_eq!(inputs.len(), abi.num_args());
for i in abi.get_copy_to_arg_order() {
let input = inputs[i];
let arg_regs = put_input_in_regs(ctx, input);
abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
}
abi.emit_call(ctx);
for (i, output) in outputs.iter().enumerate() {
let retval_regs = get_output_reg(ctx, *output);
abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
}
abi.emit_stack_post_adjust(ctx);
}
Opcode::Debugtrap => {
ctx.emit(Inst::Hlt);
}
Opcode::Trap | Opcode::ResumableTrap => {
let trap_code = ctx.data(insn).trap_code().unwrap();
ctx.emit_safepoint(Inst::Ud2 { trap_code });
}
Opcode::Trapif | Opcode::Trapff => {
let trap_code = ctx.data(insn).trap_code().unwrap();
if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
let cond_code = ctx.data(insn).cond_code().unwrap();
// The flags must not have been clobbered by any other instruction between the
// iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
// simply use the flags here.
let cc = CC::from_intcc(cond_code);
ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
} else if op == Opcode::Trapif {
let cond_code = ctx.data(insn).cond_code().unwrap();
// Verification ensures that the input is always a single-def ifcmp.
let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
let cond_code = emit_cmp(ctx, ifcmp, cond_code);
let cc = CC::from_intcc(cond_code);
ctx.emit_safepoint(Inst::TrapIf { trap_code, cc });
} else {
let cond_code = ctx.data(insn).fp_cond_code().unwrap();
// Verification ensures that the input is always a single-def ffcmp.
let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit_safepoint(Inst::TrapIf { trap_code, cc })
}
FcmpCondResult::AndConditions(cc1, cc2) => {
// A bit unfortunate, but materialize the flags in their own register, and
// check against this.
let tmp = ctx.alloc_tmp(types::I32).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I32).only_reg().unwrap();
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, tmp2));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::And,
RegMemImm::reg(tmp.to_reg()),
tmp2,
));
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
cc: CC::NZ,
});
}
FcmpCondResult::OrConditions(cc1, cc2) => {
ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc1 });
ctx.emit_safepoint(Inst::TrapIf { trap_code, cc: cc2 });
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
};
};
}
Opcode::F64const => {
// TODO use cmpeqpd for all 1s.
let value = ctx.get_constant(insn).unwrap();
let dst = get_output_reg(ctx, outputs[0]);
for inst in Inst::gen_constant(dst, value as u128, types::F64, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
}) {
ctx.emit(inst);
}
}
Opcode::F32const => {
// TODO use cmpeqps for all 1s.
let value = ctx.get_constant(insn).unwrap();
let dst = get_output_reg(ctx, outputs[0]);
for inst in Inst::gen_constant(dst, value as u128, types::F32, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
}) {
ctx.emit(inst);
}
}
Opcode::WideningPairwiseDotProductS => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, lhs, ty));
if ty == types::I32X4 {
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
} else {
panic!(
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
ty
);
}
}
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
let lhs = put_input_in_reg(ctx, inputs[0]);
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
// but ensures that the registers are the same to match x86's read-write operand
// encoding.
ctx.emit(Inst::gen_move(dst, lhs, ty));
// Note: min and max can't be handled here, because of the way Cranelift defines them:
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
// instruction will return the second operand if either operand is a NaN.
let sse_op = match ty {
types::F32 => match op {
Opcode::Fadd => SseOpcode::Addss,
Opcode::Fsub => SseOpcode::Subss,
Opcode::Fmul => SseOpcode::Mulss,
Opcode::Fdiv => SseOpcode::Divss,
_ => unreachable!(),
},
types::F64 => match op {
Opcode::Fadd => SseOpcode::Addsd,
Opcode::Fsub => SseOpcode::Subsd,
Opcode::Fmul => SseOpcode::Mulsd,
Opcode::Fdiv => SseOpcode::Divsd,
_ => unreachable!(),
},
types::F32X4 => match op {
Opcode::Fadd => SseOpcode::Addps,
Opcode::Fsub => SseOpcode::Subps,
Opcode::Fmul => SseOpcode::Mulps,
Opcode::Fdiv => SseOpcode::Divps,
_ => unreachable!(),
},
types::F64X2 => match op {
Opcode::Fadd => SseOpcode::Addpd,
Opcode::Fsub => SseOpcode::Subpd,
Opcode::Fmul => SseOpcode::Mulpd,
Opcode::Fdiv => SseOpcode::Divpd,
_ => unreachable!(),
},
_ => panic!(
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
ty
),
};
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
}
Opcode::Fmin | Opcode::Fmax => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let is_min = op == Opcode::Fmin;
let output_ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
if !output_ty.is_vector() {
let op_size = match output_ty {
types::F32 => OperandSize::Size32,
types::F64 => OperandSize::Size64,
_ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
};
ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
} else {
// X64's implementation of floating point min and floating point max does not
// propagate NaNs and +0's in a way that is friendly to the SIMD spec. For the
// scalar approach we use jumps to handle cases where NaN and +0 propagation is
// not consistent with what is needed. However for packed floating point min and
// floating point max we implement a different approach to avoid the sequence
// of jumps that would be required on a per lane basis. Because we do not need to
// lower labels and jumps but do need ctx for creating temporaries we implement
// the lowering here in lower.rs instead of emit.rs as is done in the case for scalars.
// The outline of the approach is as follows:
//
// First we perform the min/max in both directions. This is because, when an
// operand's lane contains a NaN, or when the lanes of the two operands
// contain 0s with mismatched signs, x64 returns the second operand
// regardless of its contents. So, in order to capture NaNs and to normalize
// NaNs and 0 values, we perform the operation in both directions and merge
// the results. We then normalize the results: we build a mask of the lanes
// containing NaNs, and use that mask to turn NaNs into quiet NaNs and to
// normalize 0s.
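//
// For example, minps returns its source operand whenever either operand is
// NaN, so min(NaN, 1.0) and min(1.0, NaN) disagree; likewise minps(+0.0, -0.0)
// returns the source's zero. OR-ing the results of both directions captures
// the NaN bits and the -0.0 in either case.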
//
// The following sequence is generated for min:
//
// movap{s,d} %lhs, %tmp
// minp{s,d} %dst, %tmp
// minp{s,d} %lhs, %dst
// orp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// orp{s,d} %dst, %tmp
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
//
// and for max the sequence is:
//
// movap{s,d} %lhs, %tmp
// maxp{s,d} %dst, %tmp
// maxp{s,d} %lhs, %dst
// xorp{s,d} %tmp, %dst
// orp{s,d} %dst, %tmp
// subp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
if is_min {
let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
match output_ty {
types::F32X4 => (
SseOpcode::Movaps,
SseOpcode::Minps,
SseOpcode::Orps,
SseOpcode::Cmpps,
SseOpcode::Psrld,
10,
SseOpcode::Andnps,
),
types::F64X2 => (
SseOpcode::Movapd,
SseOpcode::Minpd,
SseOpcode::Orpd,
SseOpcode::Cmppd,
SseOpcode::Psrlq,
13,
SseOpcode::Andnpd,
),
_ => unimplemented!("unsupported op type {:?}", output_ty),
};
// Copy lhs into tmp
let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
// Perform min in reverse direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
// Perform min in original direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
// X64 handles propagation of -0s and NaNs differently between the left and
// right operands. After doing the min in both directions, this OR will
// guarantee capture of -0s and NaNs in our tmp register.
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
// Compare unordered to create mask for lanes containing NaNs and then use
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
// TODO: Would a check for NaN and then a jump be better here in the
// common case than continuing on to normalize NaNs that might not exist?
let cond = FcmpImm::from(FloatCC::Unordered);
ctx.emit(Inst::xmm_rm_r_imm(
cmp_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
cond.encode(),
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// The dst register holds a mask for lanes containing NaNs.
// We take that mask and shift it in preparation for creating a different
// mask to normalize NaNs (create a quiet NaN) by zeroing out the
// appropriate number of least significant bits. We shift right each lane
// by 10 bits (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits
// (1 sign + 11 exp. + 1 MSB sig.) for F64X2.
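// For example, an F32X4 NaN lane compares unordered to itself, so the mask
// lane is 0xFFFFFFFF; shifting right by 10 leaves 0x003FFFFF, and the ANDN
// (~dst & tmp) below then yields 0xFFC00000, a canonical quiet NaN.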
ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
// Finally we do a nand with the tmp register to produce the final results
// in the dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
} else {
let (
mov_op,
max_op,
xor_op,
or_op,
sub_op,
cmp_op,
shift_op,
shift_by,
andn_op,
) = match output_ty {
types::F32X4 => (
SseOpcode::Movaps,
SseOpcode::Maxps,
SseOpcode::Xorps,
SseOpcode::Orps,
SseOpcode::Subps,
SseOpcode::Cmpps,
SseOpcode::Psrld,
10,
SseOpcode::Andnps,
),
types::F64X2 => (
SseOpcode::Movapd,
SseOpcode::Maxpd,
SseOpcode::Xorpd,
SseOpcode::Orpd,
SseOpcode::Subpd,
SseOpcode::Cmppd,
SseOpcode::Psrlq,
13,
SseOpcode::Andnpd,
),
_ => unimplemented!("unsupported op type {:?}", output_ty),
};
// Copy lhs into tmp.
let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
// Perform max in reverse direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Perform max in original direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
// Get the difference between the two results and store in tmp.
// Max uses a different approach than min to account for potential
// discrepancies with plus/minus 0.
ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
// X64 handles propagation of -0s and NaNs differently between the left and
// right operands. After doing the max in both directions, this OR will
// guarantee capture of 0s and NaNs in our tmp register.
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Capture NaNs and sign discrepancies.
ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Compare unordered to create mask for lanes containing NaNs and then use
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
let cond = FcmpImm::from(FloatCC::Unordered);
ctx.emit(Inst::xmm_rm_r_imm(
cmp_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
cond.encode(),
OperandSize::Size32,
));
// The dst register holds a mask for lanes containing NaNs.
// We take that mask and shift it in preparation for creating a different
// mask to normalize NaNs (create a quiet NaN) by zeroing out the
// appropriate number of least significant bits. We shift right each lane
// by 10 bits (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits
// (1 sign + 11 exp. + 1 MSB sig.) for F64X2.
ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
// Finally we do a nand with the tmp register to produce the final results
// in the dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
}
}
}
Opcode::FminPseudo | Opcode::FmaxPseudo => {
// We can't guarantee the LHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, rhs, ty));
let sse_opcode = match (ty, op) {
(types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
(types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
(types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
(types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
_ => unimplemented!("unsupported type {} for {}", ty, op),
};
ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
}
Opcode::Sqrt => {
// We can't guarantee the operand (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
let sse_op = match ty {
types::F32 => SseOpcode::Sqrtss,
types::F64 => SseOpcode::Sqrtsd,
types::F32X4 => SseOpcode::Sqrtps,
types::F64X2 => SseOpcode::Sqrtpd,
_ => panic!(
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
ty
),
};
ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
}
Opcode::Fpromote => {
// We can't guarantee the operand (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
}
Opcode::FvpromoteLow => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtps2pd, src, dst));
}
Opcode::Fdemote => {
// We can't guarantee the operand (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
}
Opcode::Fvdemote => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtpd2ps, src, dst));
}
Opcode::FcvtFromSint => {
let output_ty = ty.unwrap();
if !output_ty.is_vector() {
let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
types::I32 => (None, OperandSize::Size32),
types::I64 => (None, OperandSize::Size64),
_ => unreachable!(),
};
let src = match ext_spec {
Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
None => RegMem::reg(put_input_in_reg(ctx, inputs[0])),
};
let opcode = if output_ty == types::F32 {
SseOpcode::Cvtsi2ss
} else {
assert_eq!(output_ty, types::F64);
SseOpcode::Cvtsi2sd
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
} else {
let ty = ty.unwrap();
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let opcode = match ctx.input_ty(insn, 0) {
types::I32X4 => SseOpcode::Cvtdq2ps,
_ => {
unimplemented!("unable to use type {} for op {}", ctx.input_ty(insn, 0), op)
}
};
ctx.emit(Inst::gen_move(dst, src, ty));
ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
}
}
Opcode::FcvtLowFromSint => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtdq2pd, src, dst));
}
Opcode::FcvtFromUint => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
let input_ty = ctx.input_ty(insn, 0);
if !ty.is_vector() {
match input_ty {
types::I8 | types::I16 | types::I32 => {
// Conversion from an unsigned int smaller than 64-bit is easy: zero-extend +
// do a signed conversion (which won't overflow).
let opcode = if ty == types::F32 {
SseOpcode::Cvtsi2ss
} else {
assert_eq!(ty, types::F64);
SseOpcode::Cvtsi2sd
};
let src = RegMem::reg(extend_input_to_reg(
ctx,
inputs[0],
ExtSpec::ZeroExtendTo64,
));
ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
}
types::I64 => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(src_copy, src, types::I64));
let tmp_gpr1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp_gpr2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::cvt_u64_to_float_seq(
if ty == types::F64 {
OperandSize::Size64
} else {
OperandSize::Size32
},
src_copy,
tmp_gpr1,
tmp_gpr2,
dst,
));
}
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
};
} else if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
let uwiden_input = InsnInput {
insn: uwiden,
input: 0,
};
let src = put_input_in_reg(ctx, uwiden_input);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let input_ty = ctx.input_ty(uwiden, 0);
let output_ty = ctx.output_ty(insn, 0);
// The use of `matches_input` above obscures which Wasm instruction is
// ultimately being lowered here; check that the types are as expected for
// F64x2ConvertLowI32x4U.
debug_assert!(input_ty == types::I32X4 && output_ty == types::F64X2);
// Algorithm uses unpcklps to help create a float that is equivalent
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
// every value of the mantissa represents a corresponding uint32 number.
// When we subtract 0x1.0p52 we are left with double(src).
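// For example, for a lane holding 5u32, unpcklps pairs it with 0x43300000
// to form the f64 bit pattern 0x4330000000000005, which is exactly
// 2^52 + 5.0; subtracting 0x1.0p52 (2^52) then leaves 5.0.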
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
static UINT_MASK: [u8; 16] = [
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00,
];
let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
ctx.emit(Inst::xmm_load_const(
uint_mask_const,
uint_mask,
types::I32X4,
));
// Creates 0x1.0p52 + double(src)
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Unpcklps,
RegMem::from(uint_mask),
dst,
));
static UINT_MASK_HIGH: [u8; 16] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x30, 0x43,
];
let uint_mask_high_const =
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
uint_mask_high_const,
uint_mask_high,
types::I32X4,
));
// 0x1.0p52 + double(src) - 0x1.0p52
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Subpd,
RegMem::from(uint_mask_high),
dst,
));
} else {
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512f_simd() {
// When AVX512VL and AVX512F are available,
// `fcvt_from_uint` can be lowered to a single instruction.
ctx.emit(Inst::xmm_unary_rm_r_evex(
Avx512Opcode::Vcvtudq2ps,
RegMem::reg(src),
dst,
));
} else {
// Converting packed unsigned integers to packed floats
// requires a few steps. There is no single-instruction
// lowering for converting unsigned integers to floats, but
// there is one for converting packed signed integers to
// floats (cvtdq2ps). In the steps below we isolate the
// upper half (16 bits) and lower half (16 bits) of each
// lane and then convert each half separately using
// cvtdq2ps, which is meant for signed integers. For this to
// work for the upper half bits, we must first shift them
// right by 1 (divide by 2) to ensure the most significant
// bit is 0 and the value is not treated as signed, and then
// double the value after the conversion. Finally we add the
// two converted values, and the addition rounds correctly.
//
// Sequence:
// -> A = 0xffffffff
// -> Ah = 0xffff0000
// -> Al = 0x0000ffff
// -> Convert(Al) // Convert int to float
// -> Ah = Ah >> 1 // Shift right 1 to ensure Ah's conversion isn't treated as signed
// -> Convert(Ah) // Convert .. with no loss of significant digits from the previous shift
// -> Ah = Ah + Ah // Double Ah to account for the shift right before the conversion.
// -> dst = Ah + Al // Add the two floats together
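//
// For example, a lane holding 0xFFFFFFFF splits into Al = 0x0000FFFF
// (-> 65535.0) and Ah = 0xFFFF0000 (>> 1 = 0x7FFF8000 -> 2147450880.0,
// doubled to 4294901760.0); the final addition rounds 4294901760.0 + 65535.0
// to 4294967296.0, the correctly rounded f32 result for u32::MAX.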
// Create a temporary register
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(src),
tmp,
));
ctx.emit(Inst::gen_move(dst, src, ty));
// Get the low 16 bits
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp));
// Get the high 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst));
// Convert the low 16 bits
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp));
// Shift the high bits by 1, convert, and double to get the correct value.
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(dst.to_reg()),
dst,
));
// Add together the two converted values.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Addps,
RegMem::reg(tmp.to_reg()),
dst,
));
}
}
}
Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let input_ty = ctx.input_ty(insn, 0);
if !input_ty.is_vector() {
let src_size = if input_ty == types::F32 {
OperandSize::Size32
} else {
assert_eq!(input_ty, types::F64);
OperandSize::Size64
};
let output_ty = ty.unwrap();
let dst_size = if output_ty == types::I32 {
OperandSize::Size32
} else {
assert_eq!(output_ty, types::I64);
OperandSize::Size64
};
let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;
let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(src_copy, src, input_ty));
let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap();
let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap();
if to_signed {
ctx.emit(Inst::cvt_float_to_sint_seq(
src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
));
} else {
ctx.emit(Inst::cvt_float_to_uint_seq(
src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm,
));
}
} else {
if op == Opcode::FcvtToSintSat {
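// Overview: cvttps2dq returns 0x80000000 for NaN and out-of-range lanes, so
// the sequence below first zeroes NaN lanes (via a self-compare mask) and
// afterwards flips positive-overflow lanes from 0x80000000 to 0x7FFFFFFF.
// For example, a lane holding 3e9 converts to 0x80000000; since its sign
// bit was clear, the final pxor patches it to 0x7FFFFFFF.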
// Sets destination to zero if float is NaN
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(src),
tmp,
));
ctx.emit(Inst::gen_move(dst, src, input_ty));
let cond = FcmpImm::from(FloatCC::Equal);
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Cmpps,
RegMem::reg(tmp.to_reg()),
tmp,
cond.encode(),
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Andps,
RegMem::reg(tmp.to_reg()),
dst,
));
// Set the top bit of each tmp lane iff the corresponding (non-NaN) float
// was positive; this is used below to detect positive-overflow lanes.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(dst.to_reg()),
tmp,
));
// Convert the packed float to packed doubleword.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Cvttps2dq,
RegMem::reg(dst.to_reg()),
dst,
));
// Set top bit only if < 0
// Saturate lane with sign (top) bit.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pand,
RegMem::reg(dst.to_reg()),
tmp,
));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp));
// On overflow 0x80000000 is returned to a lane.
// Below sets positive overflow lanes to 0x7FFFFFFF
// Keeps negative overflow lanes as is.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(tmp.to_reg()),
dst,
));
} else if op == Opcode::FcvtToUintSat {
// The algorithm for converting floats to unsigned ints is a little tricky. The
// complication arises because our conversion instruction (cvttps2dq) produces
// signed 32-bit ints, covering only the positive range 0x0..0x7FFFFFFF
// (0..INT_MAX), while an unsigned integer additionally has the extended range
// (INT_MAX+1)..UINT_MAX (0x80000000..0xFFFFFFFF). It's this extended range that
// needs to be accounted for as a special case, since cvttps2dq only converts as
// high as INT_MAX (0x7FFFFFFF); conveniently, though, it sets underflows and
// overflows (smaller than MIN_INT or larger than MAX_INT) to INT_MAX+1
// (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX spans exactly as
// many values as 0..INT_MAX, we can correctly account for and convert every
// value in this range if we simply subtract INT_MAX+1 before doing the
// cvttps2dq conversion. After the subtraction, every value originally in
// (INT_MAX+1)..UINT_MAX lies in the range (0..INT_MAX).
// After the conversion we add INT_MAX+1 back to this converted value, noting again that
// values we are trying to account for were already set to INT_MAX+1 during the original conversion.
// We simply have to create a mask and make sure we are adding together only the lanes that need
// to be accounted for. Digesting it all, the steps are:
//
// Step 1 - Account for NaN and negative floats by setting these src values to zero.
// Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
// reasons described above.
// Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
// Step 4 - Subtract INT_MAX+1 from the copy set (tmp1). Note, all zero and negative values are those
// values that were originally in the range (0..INT_MAX). This will come in handy during
// step 7 when we zero negative lanes.
// Step 5 - Create a bit mask for tmp1 marking the lanes that, even after the subtraction,
// remain at least INT_MAX+1 (i.e. lanes whose original value exceeds UINT_MAX).
// Step 6 - Convert the second set of values (tmp1)
// Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
// converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
// as this will allow us to properly saturate overflow lanes when adding to 0x80000000
// Step 8 - Add the original converted src and the converted tmp1, where float values originally less
// than or equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
// UINT_MAX will add together (INT_MAX+1) + (SRC - (INT_MAX+1)), and float values originally
// greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
//
//
// The table below illustrates the result after each step where it matters for the converted set.
// Note the original value range (original src set) is the final dst in Step 8:
//
// Original src set:
// | Original Value Range | Step 1 | Step 3 | Step 8 |
// | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
//
// Copied src set (tmp1):
// | Step 2 | Step 4 |
// | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
//
// | Step 6 | Step 7 |
// | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
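//
// Worked example for a lane holding 3e9 (greater than INT_MAX): the first
// cvttps2dq overflows to 0x80000000; tmp1 = 3e9 - 2^31 = 852516352.0
// converts to 0x32D05E00; the overflow mask is 0 for this lane (it does
// not exceed UINT_MAX), so steps 5-7 leave the value alone, and the final
// paddd yields 0x80000000 + 0x32D05E00 = 0xB2D05E00 = 3000000000.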
// Create temporaries
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
// We are converting to an unsigned int, so if a float src lane is negative
// or NaN, first set it to zero.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
ctx.emit(Inst::gen_move(dst, src, input_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
// Set tmp2 to INT_MAX+1. It is important to note here that it looks like
// we are only converting INT_MAX (0x7FFFFFFF), but in fact, because
// single-precision IEEE-754 floats can only accurately represent
// contiguous integers up to 2^23, outside of this range the conversion
// rounds to the closest integer it can represent. In the case of INT_MAX,
// the value gets represented as 0x4f000000, which is the integer value
// INT_MAX+1.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Cvtdq2ps,
RegMem::from(tmp2),
tmp2,
));
// Make a copy of these lanes and then do the first conversion.
// Overflow lanes greater than the maximum allowed signed value will
// be set to 0x80000000. Negative and NaN lanes will be 0x0.
ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst));
// Set lanes to src - max_signed_int
ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
// Create mask for all positive lanes to saturate (i.e. greater than
// or equal to the maximum allowable unsigned int).
let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Cmpps,
RegMem::from(tmp1),
tmp2,
cond.encode(),
OperandSize::Size32,
));
// Convert those set of lanes that have the max_signed_int factored out.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Cvttps2dq,
RegMem::from(tmp1),
tmp1,
));
// Prepare converted lanes by zeroing negative lanes and prepping lanes
// that have positive overflow (based on the mask) by setting these lanes
// to 0x7FFFFFFF
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
// Add this second set of converted lanes to the original to properly handle
// values greater than max signed int.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
} else {
// Since this branch is also guarded by a check for vector types,
// neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here,
// as vector variants of those opcodes do not exist. The first two
// branches cover all reachable cases.
unreachable!();
}
}
}
Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if output_ty.is_vector() {
match op {
Opcode::SwidenLow => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(src), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::reg(src), dst));
}
_ => unreachable!(),
},
Opcode::SwidenHigh => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
}
_ => unreachable!(),
},
Opcode::UwidenLow => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(src), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
}
_ => unreachable!(
"In UwidenLow: input_ty {:?}, output_ty {:?}",
input_ty, output_ty
),
},
Opcode::UwidenHigh => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
}
(types::I16X8, types::I32X4) => {
ctx.emit(Inst::gen_move(dst, src, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(src),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
}
_ => unreachable!(),
},
_ => unreachable!(),
}
} else {
panic!("Unsupported non-vector type for widen instruction {:?}", ty);
}
}
Opcode::Snarrow | Opcode::Unarrow => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
let src1 = put_input_in_reg(ctx, inputs[0]);
let src2 = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if output_ty.is_vector() {
match op {
Opcode::Snarrow => match (input_ty, output_ty) {
(types::I16X8, types::I8X16) => {
ctx.emit(Inst::gen_move(dst, src1, input_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
}
(types::I32X4, types::I16X8) => {
ctx.emit(Inst::gen_move(dst, src1, input_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
}
_ => unreachable!(),
},
Opcode::Unarrow => match (input_ty, output_ty) {
(types::I16X8, types::I8X16) => {
ctx.emit(Inst::gen_move(dst, src1, input_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
}
(types::I32X4, types::I16X8) => {
ctx.emit(Inst::gen_move(dst, src1, input_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
}
_ => unreachable!(),
},
_ => unreachable!(),
}
} else {
panic!("Unsupported non-vector type for widen instruction {:?}", ty);
}
}
Opcode::Bitcast => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
match (input_ty, output_ty) {
(types::F32, types::I32) => {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Movd,
src,
dst,
OperandSize::Size32,
));
}
(types::I32, types::F32) => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
src,
OperandSize::Size32,
dst,
));
}
(types::F64, types::I64) => {
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Movq,
src,
dst,
OperandSize::Size64,
));
}
(types::I64, types::F64) => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movq,
src,
OperandSize::Size64,
dst,
));
}
_ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
}
}
Opcode::Fabs | Opcode::Fneg => {
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// In both cases, generate a constant and apply a single binary instruction:
// - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
// src with it.
// - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
// src with it.
let output_ty = ty.unwrap();
if !output_ty.is_vector() {
let (val, opcode): (u64, _) = match output_ty {
types::F32 => match op {
Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
_ => unreachable!(),
},
types::F64 => match op {
Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
_ => unreachable!(),
},
_ => panic!("unexpected type {:?} for Fabs", output_ty),
};
for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
}) {
ctx.emit(inst);
}
ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
} else {
// Eventually vector constants should be available in `gen_constant` and this block
// can be merged with the one above (TODO).
if output_ty.bits() == 128 {
// Move the source to the same register as `dst`; this may not emit an
// actual move but ensures that the registers are the same, to match x86's
// read-write operand encoding.
let src = put_input_in_reg(ctx, inputs[0]);
ctx.emit(Inst::gen_move(dst, src, output_ty));
// Generate an all-1s constant in an XMM register. This uses CMPPS but could
// have used CMPPD with the same effect. Note that we zero the temp we
// allocate, because otherwise the register we use could start out holding
// a NaN, in which case the CMPPS would fail, since NaN != NaN.
let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorps, RegMem::from(tmp), tmp));
let cond = FcmpImm::from(FloatCC::Equal);
let cmpps = Inst::xmm_rm_r_imm(
SseOpcode::Cmpps,
RegMem::reg(tmp.to_reg()),
tmp,
cond.encode(),
OperandSize::Size32,
);
ctx.emit(cmpps);
// Shift the all 1s constant to generate the mask.
let lane_bits = output_ty.lane_bits();
let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
(Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
(Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
(Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
(Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
_ => unreachable!(
"unexpected opcode and lane size: {:?}, {} bits",
op, lane_bits
),
};
let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
ctx.emit(shift);
// Apply shifted mask (XOR or AND).
let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
ctx.emit(mask);
} else {
panic!("unexpected type {:?} for Fabs", output_ty);
}
}
}
Opcode::Fcopysign => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let ty = ty.unwrap();
// We're going to generate the following sequence:
//
// movabs $INT_MIN, tmp_gpr1
// mov{d,q} tmp_gpr1, tmp_xmm1
// movap{s,d} tmp_xmm1, dst
// andnp{s,d} src_1, dst
// movap{s,d} src_2, tmp_xmm2
// andp{s,d} tmp_xmm1, tmp_xmm2
// orp{s,d} tmp_xmm2, dst
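//
// For example, fcopysign(1.0, -2.0) as F32: the mask is 0x80000000; ANDN
// clears the sign of 1.0 (0x3F800000), AND extracts the sign of -2.0
// (0x80000000), and OR combines them into 0xBF800000 = -1.0.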
let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
let tmp_xmm2 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
types::F32 => (
0x8000_0000,
SseOpcode::Movaps,
SseOpcode::Andnps,
SseOpcode::Andps,
SseOpcode::Orps,
),
types::F64 => (
0x8000_0000_0000_0000,
SseOpcode::Movapd,
SseOpcode::Andnpd,
SseOpcode::Andpd,
SseOpcode::Orpd,
),
_ => {
panic!("unexpected type {:?} for copysign", ty);
}
};
for inst in Inst::gen_constant(ValueRegs::one(tmp_xmm1), sign_bit_cst, ty, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
}) {
ctx.emit(inst);
}
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2));
ctx.emit(Inst::xmm_rm_r(
and_op,
RegMem::reg(tmp_xmm1.to_reg()),
tmp_xmm2,
));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
}
Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
let ty = ty.unwrap();
if isa_flags.use_sse41() {
let mode = match op {
Opcode::Ceil => RoundImm::RoundUp,
Opcode::Floor => RoundImm::RoundDown,
Opcode::Nearest => RoundImm::RoundNearest,
Opcode::Trunc => RoundImm::RoundZero,
_ => panic!("unexpected opcode {:?} in Ceil/Floor/Nearest/Trunc", op),
};
let op = match ty {
types::F32 => SseOpcode::Roundss,
types::F64 => SseOpcode::Roundsd,
types::F32X4 => SseOpcode::Roundps,
types::F64X2 => SseOpcode::Roundpd,
_ => panic!("unexpected type {:?} in Ceil/Floor/Nearest/Trunc", ty),
};
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
op,
src,
dst,
mode.encode(),
OperandSize::Size32,
));
} else {
// Lower to VM calls when there's no access to SSE4.1.
// Note, for vector types on platforms that don't support sse41
// the execution will panic here.
let libcall = match (op, ty) {
(Opcode::Ceil, types::F32) => LibCall::CeilF32,
(Opcode::Ceil, types::F64) => LibCall::CeilF64,
(Opcode::Floor, types::F32) => LibCall::FloorF32,
(Opcode::Floor, types::F64) => LibCall::FloorF64,
(Opcode::Nearest, types::F32) => LibCall::NearestF32,
(Opcode::Nearest, types::F64) => LibCall::NearestF64,
(Opcode::Trunc, types::F32) => LibCall::TruncF32,
(Opcode::Trunc, types::F64) => LibCall::TruncF64,
_ => panic!(
"unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
ty, op
),
};
emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
}
}
Opcode::Load
| Opcode::Uload8
| Opcode::Sload8
| Opcode::Uload16
| Opcode::Sload16
| Opcode::Uload32
| Opcode::Sload32
| Opcode::LoadComplex
| Opcode::Uload8Complex
| Opcode::Sload8Complex
| Opcode::Uload16Complex
| Opcode::Sload16Complex
| Opcode::Uload32Complex
| Opcode::Sload32Complex
| Opcode::Sload8x8
| Opcode::Uload8x8
| Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload32x2
| Opcode::Uload32x2 => {
let offset = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
types::I8
}
Opcode::Sload16
| Opcode::Uload16
| Opcode::Sload16Complex
| Opcode::Uload16Complex => types::I16,
Opcode::Sload32
| Opcode::Uload32
| Opcode::Sload32Complex
| Opcode::Uload32Complex => types::I32,
Opcode::Sload8x8
| Opcode::Uload8x8
| Opcode::Sload8x8Complex
| Opcode::Uload8x8Complex => types::I8X8,
Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload16x4Complex
| Opcode::Uload16x4Complex => types::I16X4,
Opcode::Sload32x2
| Opcode::Uload32x2
| Opcode::Sload32x2Complex
| Opcode::Uload32x2Complex => types::I32X2,
Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
_ => unimplemented!(),
};
let ext_mode = ExtMode::new(elem_ty.bits(), 64);
let sign_extend = match op {
Opcode::Sload8
| Opcode::Sload8Complex
| Opcode::Sload16
| Opcode::Sload16Complex
| Opcode::Sload32
| Opcode::Sload32Complex
| Opcode::Sload8x8
| Opcode::Sload8x8Complex
| Opcode::Sload16x4
| Opcode::Sload16x4Complex
| Opcode::Sload32x2
| Opcode::Sload32x2Complex => true,
_ => false,
};
let amode = match op {
Opcode::Load
| Opcode::Uload8
| Opcode::Sload8
| Opcode::Uload16
| Opcode::Sload16
| Opcode::Uload32
| Opcode::Sload32
| Opcode::Sload8x8
| Opcode::Uload8x8
| Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload32x2
| Opcode::Uload32x2 => {
assert_eq!(inputs.len(), 1, "only one input for load operands");
lower_to_amode(ctx, inputs[0], offset)
}
Opcode::LoadComplex
| Opcode::Uload8Complex
| Opcode::Sload8Complex
| Opcode::Uload16Complex
| Opcode::Sload16Complex
| Opcode::Uload32Complex
| Opcode::Sload32Complex
| Opcode::Sload8x8Complex
| Opcode::Uload8x8Complex
| Opcode::Sload16x4Complex
| Opcode::Uload16x4Complex
| Opcode::Sload32x2Complex
| Opcode::Uload32x2Complex => {
assert_eq!(
inputs.len(),
2,
"can't handle more than two inputs in complex load"
);
let base = put_input_in_reg(ctx, inputs[0]);
let index = put_input_in_reg(ctx, inputs[1]);
let shift = 0;
let flags = ctx.memflags(insn).expect("load should have memflags");
Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
}
_ => unreachable!(),
};
if elem_ty == types::I128 {
let dsts = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0]));
ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1]));
} else {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
match (sign_extend, is_xmm) {
(true, false) => {
// The load is sign-extended only when the output size is lower than 64 bits,
// so ext-mode is defined in this case.
ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
}
(false, false) => {
if elem_ty.bytes() == 8 {
// Use a plain load.
ctx.emit(Inst::mov64_m_r(amode, dst))
} else {
// Use a zero-extended load.
ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
}
}
(_, true) => {
ctx.emit(match elem_ty {
types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
types::I8X8 => {
if sign_extend {
Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst)
}
}
types::I16X4 => {
if sign_extend {
Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst)
}
}
types::I32X2 => {
if sign_extend {
Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst)
}
}
_ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
}
// TODO Specialize for different types: MOVUPD, MOVDQU
_ => unreachable!(
"unexpected type for load: {:?} - {:?}",
elem_ty,
elem_ty.bits()
),
});
}
}
}
}
Opcode::Store
| Opcode::Istore8
| Opcode::Istore16
| Opcode::Istore32
| Opcode::StoreComplex
| Opcode::Istore8Complex
| Opcode::Istore16Complex
| Opcode::Istore32Complex => {
let offset = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
_ => unreachable!(),
};
let addr = match op {
Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
assert_eq!(inputs.len(), 2, "only one input for store memory operands");
lower_to_amode(ctx, inputs[1], offset)
}
Opcode::StoreComplex
| Opcode::Istore8Complex
| Opcode::Istore16Complex
| Opcode::Istore32Complex => {
assert_eq!(
inputs.len(),
3,
"can't handle more than two inputs in complex store"
);
let base = put_input_in_reg(ctx, inputs[1]);
let index = put_input_in_reg(ctx, inputs[2]);
let shift = 0;
let flags = ctx.memflags(insn).expect("store should have memflags");
Amode::imm_reg_reg_shift(offset as u32, base, index, shift).with_flags(flags)
}
_ => unreachable!(),
};
if elem_ty == types::I128 {
let srcs = put_input_in_regs(ctx, inputs[0]);
ctx.emit(Inst::store(types::I64, srcs.regs()[0], addr.clone()));
ctx.emit(Inst::store(types::I64, srcs.regs()[1], addr.offset(8)));
} else {
let src = put_input_in_reg(ctx, inputs[0]);
ctx.emit(Inst::store(elem_ty, src, addr));
}
}
Opcode::AtomicRmw => {
// This is a simple, general-case atomic update, based on a loop involving
// `cmpxchg`. Note that we could do much better than this in the case where the old
// value at the location (that is to say, the SSA `Value` computed by this CLIF
// instruction) is not required. In that case, we could instead implement this
// using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
// the case where the old value is required, for the `add` and `sub` cases, we can
// use the single instruction `lock xadd`. However, those improvements have been
// left for another day.
// TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
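// The emitted sequence is roughly the following sketch (the precise
// encoding lives with the `AtomicRmwSeq` pseudo-instruction in emit.rs):
//
// mov (%r9), %rax
// again:
// mov %rax, %tmp
// <op> %r10, %tmp
// lock cmpxchg %tmp, (%r9)
// jnz again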
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let mut addr = put_input_in_reg(ctx, inputs[0]);
let mut arg2 = put_input_in_reg(ctx, inputs[1]);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
// Make sure that both args are in virtual regs, since in effect we have to do a
// parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
// guaranteed safe if either is in a real reg.
addr = ctx.ensure_in_vreg(addr, types::I64);
arg2 = ctx.ensure_in_vreg(arg2, types::I64);
// Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
// operates at whatever width is specified by `ty`, so there's no need to
// zero-extend `arg2` in the case of `ty` being I8/I16/I32.
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::r9()),
addr,
types::I64,
));
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::r10()),
arg2,
types::I64,
));
// Now the AtomicRmwSeq (pseudo-) instruction itself
let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
ctx.emit(Inst::AtomicRmwSeq { ty: ty_access, op });
// And finally, copy the preordained AtomicRmwSeq output reg to its destination.
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
Opcode::AtomicCas => {
// This is very similar to, but not identical to, the `AtomicRmw` case. As with
// `AtomicRmw`, there's no need to zero-extend narrow values here.
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let addr = lower_to_amode(ctx, inputs[0], 0);
let expected = put_input_in_reg(ctx, inputs[1]);
let replacement = put_input_in_reg(ctx, inputs[2]);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
// Move the expected value into %rax. Because there's only one fixed register on
// the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
// `AtomicRmw` case.
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
expected,
types::I64,
));
ctx.emit(Inst::LockCmpxchg {
ty: ty_access,
src: replacement,
dst: addr.into(),
});
// And finally, copy the old value at the location to its destination reg.
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
Opcode::AtomicLoad => {
// This is a normal load. The x86-TSO memory model provides sufficient sequencing
// to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
// need for any fence instructions.
let data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let addr = lower_to_amode(ctx, inputs[0], 0);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
let rm = RegMem::mem(addr);
if ty_access == types::I64 {
ctx.emit(Inst::mov64_rm_r(rm, data));
} else {
let ext_mode = ExtMode::new(ty_access.bits(), 64).expect(&format!(
"invalid extension during AtomicLoad: {} -> {}",
ty_access.bits(),
64
));
ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data));
}
}
Opcode::AtomicStore => {
// This is a normal store, followed by an `mfence` instruction.
let data = put_input_in_reg(ctx, inputs[0]);
let addr = lower_to_amode(ctx, inputs[1], 0);
let ty_access = ctx.input_ty(insn, 0);
assert!(is_valid_atomic_transaction_ty(ty_access));
ctx.emit(Inst::store(ty_access, data, addr));
ctx.emit(Inst::Fence {
kind: FenceKind::MFence,
});
}
Opcode::Fence => {
ctx.emit(Inst::Fence {
kind: FenceKind::MFence,
});
}
Opcode::FuncAddr => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let (extname, _) = ctx.call_target(insn).unwrap();
let extname = extname.clone();
ctx.emit(Inst::LoadExtName {
dst,
name: Box::new(extname),
offset: 0,
});
}
Opcode::SymbolValue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
let extname = extname.clone();
ctx.emit(Inst::LoadExtName {
dst,
name: Box::new(extname),
offset,
});
}
Opcode::StackAddr => {
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset,
} => (stack_slot, offset),
_ => unreachable!(),
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst = ctx
.abi()
.stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
ctx.emit(inst);
}
Opcode::Select => {
let flag_input = inputs[0];
if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
// For equal, we flip the operands, because we can't test a conjunction of
// CPU flags with a single cmove; see InvertedEqualOrConditions doc comment.
let (lhs_input, rhs_input) = match cond_code {
FloatCC::Equal => (inputs[2], inputs[1]),
_ => (inputs[1], inputs[2]),
};
let ty = ctx.output_ty(insn, 0);
let rhs = put_input_in_regs(ctx, rhs_input);
let dst = get_output_reg(ctx, outputs[0]);
let lhs = put_input_in_regs(ctx, lhs_input);
// We request inversion of Equal to NotEqual here: taking LHS if equal would mean
// take it if both CC::NP and CC::Z are set, the conjunction of which can't be
// modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
// select operation, and invert the equal to a not-equal here.
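// (After ucomis*, "equal" is ZF=1 && PF=0, a conjunction that a single
// cmov cannot test, whereas "not equal" is ZF=0 || PF=1, which two
// chained cmovs can handle.)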
let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
// Keep this sync'd with the lowering of the select inputs above.
assert_eq!(cond_code, FloatCC::Equal);
}
emit_moves(ctx, dst, rhs, ty);
let operand_size = if ty == types::F64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
match fcmp_results {
FcmpCondResult::Condition(cc) => {
if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 {
let size = ty.bytes() as u8;
emit_cmoves(ctx, size, cc, lhs, dst);
} else {
ctx.emit(Inst::xmm_cmove(
operand_size,
cc,
RegMem::reg(lhs.only_reg().unwrap()),
dst.only_reg().unwrap(),
));
}
}
FcmpCondResult::AndConditions(_, _) => {
unreachable!(
"can't AND with select; see above comment about inverting equal"
);
}
FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
| FcmpCondResult::OrConditions(cc1, cc2) => {
if is_int_or_ref_ty(ty) || ty == types::I128 {
let size = ty.bytes() as u8;
emit_cmoves(ctx, size, cc1, lhs.clone(), dst);
emit_cmoves(ctx, size, cc2, lhs, dst);
} else {
ctx.emit(Inst::xmm_cmove(
operand_size,
cc1,
RegMem::reg(lhs.only_reg().unwrap()),
dst.only_reg().unwrap(),
));
ctx.emit(Inst::xmm_cmove(
operand_size,
cc2,
RegMem::reg(lhs.only_reg().unwrap()),
dst.only_reg().unwrap(),
));
}
}
}
} else {
let ty = ty.unwrap();
let size = ty.bytes() as u8;
let lhs = put_input_in_regs(ctx, inputs[1]);
let rhs = put_input_in_regs(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
let cond_code = ctx.data(icmp).cond_code().unwrap();
let cond_code = emit_cmp(ctx, icmp, cond_code);
CC::from_intcc(cond_code)
} else {
let sel_ty = ctx.input_ty(insn, 0);
let size = OperandSize::from_ty(sel_ty);
let test = put_input_in_reg(ctx, flag_input);
let test_input = if sel_ty == types::B1 {
// The input is a boolean value; test the LSB for nonzero with:
// test reg, 1
RegMemImm::imm(1)
} else {
// The input is an integer; test the whole value for
// nonzero with:
// test reg, reg
//
// (It doesn't make sense to have a boolean wider than
// one bit here -- which bit would cause us to select an
// input?)
assert!(!is_bool_ty(sel_ty));
RegMemImm::reg(test)
};
ctx.emit(Inst::test_rmi_r(size, test_input, test));
CC::NZ
};
// This doesn't affect the flags.
emit_moves(ctx, dst, rhs, ty);
if is_int_or_ref_ty(ty) || ty == types::I128 {
emit_cmoves(ctx, size, cc, lhs, dst);
} else {
debug_assert!(ty == types::F32 || ty == types::F64);
ctx.emit(Inst::xmm_cmove(
if ty == types::F64 {
OperandSize::Size64
} else {
OperandSize::Size32
},
cc,
RegMem::reg(lhs.only_reg().unwrap()),
dst.only_reg().unwrap(),
));
}
}
}
Opcode::Selectif | Opcode::SelectifSpectreGuard => {
let lhs = put_input_in_regs(ctx, inputs[1]);
let rhs = put_input_in_regs(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
// Verification ensures that the input is always a single-def ifcmp.
let cmp_insn = ctx
.get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
.inst
.unwrap()
.0;
debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
let cond_code = ctx.data(insn).cond_code().unwrap();
let cond_code = emit_cmp(ctx, cmp_insn, cond_code);
let cc = CC::from_intcc(cond_code);
if is_int_or_ref_ty(ty) || ty == types::I128 {
let size = ty.bytes() as u8;
emit_moves(ctx, dst, rhs, ty);
emit_cmoves(ctx, size, cc, lhs, dst);
} else {
debug_assert!(ty == types::F32 || ty == types::F64);
emit_moves(ctx, dst, rhs, ty);
ctx.emit(Inst::xmm_cmove(
if ty == types::F64 {
OperandSize::Size64
} else {
OperandSize::Size32
},
cc,
RegMem::reg(lhs.only_reg().unwrap()),
dst.only_reg().unwrap(),
));
}
}
Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
let kind = match op {
Opcode::Udiv => DivOrRemKind::UnsignedDiv,
Opcode::Sdiv => DivOrRemKind::SignedDiv,
Opcode::Urem => DivOrRemKind::UnsignedRem,
Opcode::Srem => DivOrRemKind::SignedRem,
_ => unreachable!(),
};
let is_div = kind.is_div();
let input_ty = ctx.input_ty(insn, 0);
let size = OperandSize::from_ty(input_ty);
let dividend = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
dividend,
input_ty,
));
// Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
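// (For example, i32::MIN % -1: the quotient i32::MIN / -1 = 2^31 is not
// representable, so a raw `idiv` would fault; the checked sequence instead
// yields the mathematically correct remainder, 0.)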
if flags.avoid_div_traps() || op == Opcode::Srem {
// A vcode meta-instruction is used to lower the inline checks, since they embed
// pc-relative offsets that must not change, thus requiring regalloc to not
// interfere by introducing spills and reloads.
//
// Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
// regalloc is aware of the coalescing opportunity between rax/rdx and the
// destination register.
let divisor = put_input_in_reg(ctx, inputs[1]);
let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));
let tmp = if op == Opcode::Sdiv && size == OperandSize::Size64 {
Some(ctx.alloc_tmp(types::I64).only_reg().unwrap())
} else {
None
};
// TODO use xor
ctx.emit(Inst::imm(
OperandSize::Size32,
0,
Writable::from_reg(regs::rdx()),
));
ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
} else {
// We don't want more than one trap record for a single instruction,
// so let's not allow the "mem" case (load-op merging) here; force
// divisor into a register instead.
let divisor = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
// Fill in the high parts:
if kind.is_signed() {
// Sign-extend the sign bit of AL into AH for size 1, or of rax into rdx
// for the larger sizes, for signed opcodes.
ctx.emit(Inst::sign_extend_data(size));
} else if input_ty == types::I8 {
ctx.emit(Inst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
));
} else {
// zero for unsigned opcodes.
ctx.emit(Inst::imm(
OperandSize::Size64,
0,
Writable::from_reg(regs::rdx()),
));
}
// Emit the actual idiv.
ctx.emit(Inst::div(size, kind.is_signed(), divisor));
}
// Move the result back into the destination reg.
if is_div {
// The quotient is in rax.
ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
} else {
if size == OperandSize::Size8 {
// The remainder is in AH. Right-shift by 8 bits then move from rax.
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
Writable::from_reg(regs::rax()),
));
ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
} else {
// The remainder is in rdx.
ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
}
}
}
Opcode::Umulhi | Opcode::Smulhi => {
let input_ty = ctx.input_ty(insn, 0);
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// Move lhs in %rax.
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
lhs,
input_ty,
));
// Emit the actual mul or imul.
let signed = op == Opcode::Smulhi;
ctx.emit(Inst::mul_hi(OperandSize::from_ty(input_ty), signed, rhs));
// Read the result from the high part (stored in %rdx).
ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
}
Opcode::GetPinnedReg => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
}
Opcode::SetPinnedReg => {
let src = put_input_in_reg(ctx, inputs[0]);
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::pinned_reg()),
src,
types::I64,
));
}
Opcode::Vconst => {
let used_constant = if let &InstructionData::UnaryConst {
constant_handle, ..
} = ctx.data(insn)
{
ctx.use_constant(VCodeConstantData::Pool(
constant_handle,
ctx.get_constant_data(constant_handle).clone(),
))
} else {
unreachable!("vconst should always have unary_const format")
};
// TODO use Inst::gen_constant() instead.
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::xmm_load_const(used_constant, dst, ty));
}
Opcode::RawBitcast => {
// A raw_bitcast is just a mechanism for correcting the type of V128 values (see
// https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
// instruction should emit no machine code but a move is necessary to give the register
// allocator a definition for the output virtual register.
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, src, ty));
}
Opcode::Shuffle => {
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let lhs_ty = ctx.input_ty(insn, 0);
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let mask = match ctx.get_immediate(insn) {
Some(DataValue::V128(bytes)) => bytes.to_vec(),
_ => unreachable!("shuffle should always have a 16-byte immediate"),
};
// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
// 1 in the most significant position zeroes the lane.
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
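// For example, PSHUFB zeroes any output lane whose mask byte has its most
// significant bit set, so mapping an out-of-range index to 0b10000000
// forces that lane to zero.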
ctx.emit(Inst::gen_move(dst, rhs, ty));
if rhs == lhs {
// If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
// register. We statically build `constructed_mask` to zero out any unknown lane
// indices (this may not be strictly necessary: the verifier could reject invalid
// mask values) and remap the indexes so they all point into the single `dst` vector.
let constructed_mask = mask
.iter()
// A mask value greater than 15 refers to a lane of the second input, which
// here is the same vector, so remap it into the 0-15 range.
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
.map(zero_unknown_lane_index)
.collect();
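// For example, with `lhs == rhs` and a mask beginning [0, 16, 1, 17, ...],
// values 16 and 17 name lanes 0 and 1 of the (identical) second input, so the
// constructed PSHUFB mask begins [0, 0, 1, 1, ...].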
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
// After loading the constructed mask into a temporary register, we use it to
// shuffle the `dst` register (which, in this case, holds the same value as
// both inputs).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
} else {
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
assert!(
mask.iter().all(|b| *b < 32),
"shuffle mask values must be between 0 and 31"
);
// Load the mask into the destination register.
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
// VPERMI2B has the exact semantics of Wasm's shuffle:
// permute the bytes in `src1` and `src2` using byte indexes
// in `dst` and store the byte results in `dst`.
ctx.emit(Inst::xmm_rm_r_evex(
Avx512Opcode::Vpermi2b,
RegMem::reg(rhs),
lhs,
dst,
));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
// we build the `constructed_mask` for each case statically.
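// For example, with a mask beginning [0, 16, ...]: the lhs mask becomes
// [0, 0x80, ...] (keep lane 0, zero the second byte) and the rhs mask becomes
// [0x80, 0, ...], so OR-ing the two PSHUFB results recombines one lane from
// each input.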
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
let constructed_mask =
mask.iter().cloned().map(zero_unknown_lane_index).collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
// PSHUFB the second argument, placing zeroes for unused lanes.
let constructed_mask = mask
.iter()
.map(|b| b.wrapping_sub(16))
.map(zero_unknown_lane_index)
.collect();
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
}
}
}
Opcode::Swizzle => {
// SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
// requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
// the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
// semantics match the Wasm SIMD semantics for this instruction.
// The instruction format maps to variables like: %dst = swizzle %src, %mask
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src = put_input_in_reg(ctx, inputs[0]);
let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
// Inform the register allocator that `src` and `dst` should be in the same register.
ctx.emit(Inst::gen_move(dst, src, ty));
// Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
static ZERO_MASK_VALUE: [u8; 16] = [
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
0x70, 0x70,
];
let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
// Use the `zero_mask` on a writable `swizzle_mask`.
let swizzle_mask = Writable::from_reg(swizzle_mask);
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddusb,
RegMem::from(zero_mask),
swizzle_mask,
));
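// Why adding 0x70 works: an in-range index 0..=15 becomes 0x70..=0x7f (top bit
// clear, low four bits intact), while any index >= 16 becomes >= 0x80 (via the
// carry, or saturation for large values), setting the top bit that makes
// PSHUFB zero the lane. For example, 5 -> 0x75 (selects lane 5) but
// 17 -> 0x81 (lane zeroed).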
// Shuffle `dst` using the fixed-up `swizzle_mask`.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(swizzle_mask),
dst,
));
}
Opcode::Insertlane => {
// The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let in_vec = put_input_in_reg(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 1);
debug_assert!(!src_ty.is_vector());
let src = input_to_reg_mem(ctx, inputs[1]);
let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
*imm
} else {
unreachable!();
};
debug_assert!(lane < ty.lane_count() as u8);
ctx.emit(Inst::gen_move(dst, in_vec, ty));
emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
}
Opcode::Extractlane => {
// The instruction format maps to variables like: %dst = extractlane %src, %lane
let ty = ty.unwrap();
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = put_input_in_reg(ctx, inputs[0]);
let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
*imm
} else {
unreachable!();
};
debug_assert!(lane < src_ty.lane_count() as u8);
emit_extract_lane(ctx, src, dst, lane, ty);
}
Opcode::ScalarToVector => {
// When moving a scalar value to a vector register, we must handle several
// situations:
// 1. a scalar float is already in an XMM register, so we simply move it
// 2. a scalar of any other type resides in a GPR register: MOVD moves the bits to an
// XMM register and zeroes the upper bits
// 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
// the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
// MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
// unused load.
let src = input_to_reg_mem(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let dst_ty = ty.unwrap();
assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
match src {
RegMem::Reg { reg } => {
if src_ty.is_float() {
// Case 1: when moving a scalar float, we simply move from one XMM register
// to another, expecting the register allocator to elide this. Here we
// assume that the upper bits of a scalar float have not been tampered with
// (the same assumption the old backend makes).
ctx.emit(Inst::gen_move(dst, reg, dst_ty));
} else {
// Case 2: when moving a scalar value of any other type, use MOVD to zero
// the upper lanes.
let src_size = match src_ty.bits() {
32 => OperandSize::Size32,
64 => OperandSize::Size64,
_ => unimplemented!("invalid source size for type: {}", src_ty),
};
ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
}
}
RegMem::Mem { .. } => {
// Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
// MOVSS/MOVSD instruction.
let opcode = match src_ty.bits() {
32 => SseOpcode::Movss,
64 => SseOpcode::Movsd,
_ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
};
ctx.emit(Inst::xmm_mov(opcode, src, dst));
}
}
}
Opcode::Splat => {
let ty = ty.unwrap();
assert_eq!(ty.bits(), 128);
let src_ty = ctx.input_ty(insn, 0);
assert!(src_ty.bits() < 128);
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// We know that splat will overwrite all of the lanes of `dst` but it takes several
// instructions to do so. Because of the multiple instructions, there is no good way to
// declare `dst` a `def` except with the following pseudo-instruction.
ctx.emit(Inst::xmm_uninit_value(dst));
// TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
// and VPBROADCAST*.
match ty.lane_bits() {
8 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
// Shuffle the lowest byte lane to all other lanes.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
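// (PSHUFB with an all-zero mask reads byte 0 into every lane,
// broadcasting the value just inserted at lane 0.)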
}
16 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
// Shuffle the lowest two lanes to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::from(dst),
dst,
0,
OperandSize::Size32,
))
}
32 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
// Shuffle the lowest lane to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::from(dst),
dst,
0,
OperandSize::Size32,
))
}
64 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
}
_ => panic!("Invalid type to splat: {}", ty),
}
}
Opcode::VanyTrue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = put_input_in_reg(ctx, inputs[0]);
// Set the ZF if the result is all zeroes.
ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
// If the ZF is not set, place a 1 in `dst`.
ctx.emit(Inst::setcc(CC::NZ, dst));
}
Opcode::VallTrue => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let src_ty = ctx.input_ty(insn, 0);
assert_eq!(src_ty.bits(), 128);
let src = input_to_reg_mem(ctx, inputs[0]);
let eq = |ty: Type| match ty.lane_bits() {
8 => SseOpcode::Pcmpeqb,
16 => SseOpcode::Pcmpeqw,
32 => SseOpcode::Pcmpeqd,
64 => SseOpcode::Pcmpeqq,
_ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
};
// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
// Compare to see what lanes are filled with all 1s.
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
// Set the ZF if the result is all zeroes.
ctx.emit(Inst::xmm_cmp_rm_r(
SseOpcode::Ptest,
RegMem::from(tmp),
tmp.to_reg(),
));
// If the ZF is set, place a 1 in `dst`.
ctx.emit(Inst::setcc(CC::Z, dst));
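// For example, for an i32x4 `src` of [1, 0, 3, 4]: PCMPEQD leaves
// [0, !0, 0, 0] in `tmp`, PTEST clears the ZF, and SETcc writes 0:
// not all lanes are true.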
}
Opcode::VhighBits => {
let src = put_input_in_reg(ctx, inputs[0]);
let src_ty = ctx.input_ty(insn, 0);
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
// the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
// the instruction can access additional registers when used with a REX.R prefix. The
// default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
// Manual, vol. 2). This being the case, we will always clear REX.W since its use is
// unnecessary (`OperandSize` is used for setting/clearing REX.W).
let size = OperandSize::Size32;
match src_ty {
types::I8X16 | types::B8X16 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
}
types::I32X4 | types::B32X4 | types::F32X4 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
}
types::I64X2 | types::B64X2 | types::F64X2 => {
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
}
types::I16X8 | types::B16X8 => {
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
// here we:
// - duplicate the 16-bit lanes of `src` into 8-bit lanes:
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
// - use PMOVMSKB to gather the high bits, which now appear twice
// - shift right by 8 bits to drop one of the duplicate copies.
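// For example, with lanes beginning [0x8000, 0x7fff, ...]: PACKSSWB
// saturates each 16-bit lane to a signed byte (0x80, 0x7f, ...), preserving
// its sign bit, so PMOVMSKB yields the 8 lane sign bits twice and the shift
// keeps a single copy.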
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp, src, src_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
ctx.emit(Inst::xmm_to_gpr(
SseOpcode::Pmovmskb,
tmp.to_reg(),
dst,
size,
));
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
dst,
));
}
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
}
}
Opcode::Iconcat => {
let ty = ctx.output_ty(insn, 0);
assert_eq!(
ty,
types::I128,
"Iconcat not expected to be used for non-128-bit type"
);
assert_eq!(ctx.input_ty(insn, 0), types::I64);
assert_eq!(ctx.input_ty(insn, 1), types::I64);
let lo = put_input_in_reg(ctx, inputs[0]);
let hi = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
}
Opcode::Isplit => {
let ty = ctx.input_ty(insn, 0);
assert_eq!(
ty,
types::I128,
"Iconcat not expected to be used for non-128-bit type"
);
assert_eq!(ctx.output_ty(insn, 0), types::I64);
assert_eq!(ctx.output_ty(insn, 1), types::I64);
let src = put_input_in_regs(ctx, inputs[0]);
let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
}
Opcode::TlsValue => match flags.tls_model() {
TlsModel::ElfGd => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let (name, _, _) = ctx.symbol_value(insn).unwrap();
let symbol = name.clone();
ctx.emit(Inst::ElfTlsGetAddr { symbol });
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
TlsModel::Macho => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let (name, _, _) = ctx.symbol_value(insn).unwrap();
let symbol = name.clone();
ctx.emit(Inst::MachOTlsGetAddr { symbol });
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
_ => {
todo!(
"Unimplemented TLS model in x64 backend: {:?}",
flags.tls_model()
);
}
},
Opcode::SqmulRoundSat => {
// Lane-wise saturating rounding multiplication in Q15 format.
// Optimal lowering taken from instruction proposal https://github.com/WebAssembly/simd/pull/365:
// y = i16x8.q15mulr_sat_s(a, b) is lowered to:
//   MOVDQA   xmm_y, xmm_a
//   MOVDQA   xmm_tmp, wasm_i16x8_splat(0x8000)
//   PMULHRSW xmm_y, xmm_b
//   PCMPEQW  xmm_tmp, xmm_y
//   PXOR     xmm_y, xmm_tmp
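// The PCMPEQW/PXOR fixup handles the single overflow case:
// PMULHRSW(0x8000, 0x8000) wraps to 0x8000 (-1.0 in Q15) instead of
// saturating to 0x7fff (+1.0), so lanes equal to 0x8000 are flipped to
// 0x7fff by XOR-ing with the all-ones compare result.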
let input_ty = ctx.input_ty(insn, 0);
let src1 = put_input_in_reg(ctx, inputs[0]);
let src2 = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src1, input_ty));
static SAT_MASK: [u8; 16] = [
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
0x00, 0x80,
];
let mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&SAT_MASK));
let mask = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mask_const, mask, types::I16X8));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhrsw, RegMem::reg(src2), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pcmpeqw,
RegMem::reg(dst.to_reg()),
mask,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(mask.to_reg()),
dst,
));
}
// Unimplemented opcodes below. These are not currently used by Wasm
// lowering or other known embeddings, but should be either supported or
// removed eventually.
Opcode::Uload8x8Complex
| Opcode::Sload8x8Complex
| Opcode::Uload16x4Complex
| Opcode::Sload16x4Complex
| Opcode::Uload32x2Complex
| Opcode::Sload32x2Complex => {
unimplemented!("Vector load {:?} not implemented", op);
}
Opcode::Cls => unimplemented!("Cls not supported"),
Opcode::Fma => unimplemented!("Fma not supported"),
Opcode::BorNot | Opcode::BxorNot => {
unimplemented!("or-not / xor-not opcodes not implemented");
}
Opcode::Bmask => unimplemented!("Bmask not implemented"),
Opcode::Trueif | Opcode::Trueff => unimplemented!("trueif / trueff not implemented"),
Opcode::ConstAddr => unimplemented!("ConstAddr not implemented"),
Opcode::Vsplit | Opcode::Vconcat => {
unimplemented!("Vector split/concat ops not implemented.");
}
Opcode::Uunarrow => {
unimplemented!("unimplemented lowering for opcode {:?}", op);
}
// Opcodes that should be removed by legalization. These should
// eventually be removed if/when we replace in-situ legalization with
// something better.
Opcode::Ifcmp | Opcode::Ffcmp => {
panic!("Should never reach ifcmp/ffcmp as isel root!");
}
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm
| Opcode::SdivImm
| Opcode::UremImm
| Opcode::SremImm
| Opcode::IrsubImm
| Opcode::IaddCin
| Opcode::IaddIfcin
| Opcode::IaddCout
| Opcode::IaddCarry
| Opcode::IaddIfcarry
| Opcode::IsubBin
| Opcode::IsubIfbin
| Opcode::IsubBout
| Opcode::IsubIfbout
| Opcode::IsubBorrow
| Opcode::IsubIfborrow
| Opcode::BandImm
| Opcode::BorImm
| Opcode::BxorImm
| Opcode::RotlImm
| Opcode::RotrImm
| Opcode::IshlImm
| Opcode::UshrImm
| Opcode::SshrImm
| Opcode::IcmpImm
| Opcode::IfcmpImm => {
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::StackLoad | Opcode::StackStore => {
panic!("Direct stack memory access not supported; should have been legalized");
}
Opcode::GlobalValue => {
panic!("global_value should have been removed by legalization!");
}
Opcode::HeapAddr => {
panic!("heap_addr should have been removed by legalization!");
}
Opcode::TableAddr => {
panic!("table_addr should have been removed by legalization!");
}
Opcode::Safepoint => {
panic!("safepoint instructions not used by new backend's safepoints!");
}
Opcode::Spill
| Opcode::Fill
| Opcode::FillNop
| Opcode::Regmove
| Opcode::CopySpecial
| Opcode::CopyToSsa
| Opcode::CopyNop
| Opcode::AdjustSpDown
| Opcode::AdjustSpUpImm
| Opcode::AdjustSpDownImm
| Opcode::IfcmpSp
| Opcode::Regspill
| Opcode::Regfill
| Opcode::Copy
| Opcode::DummySargT => {
panic!("Unused opcode should not be encountered.");
}
Opcode::JumpTableEntry | Opcode::JumpTableBase => {
panic!("Should not appear: we handle BrTable directly");
}
Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
}
Opcode::Jump
| Opcode::Fallthrough
| Opcode::Brz
| Opcode::Brnz
| Opcode::BrIcmp
| Opcode::Brif
| Opcode::Brff
| Opcode::IndirectJumpTableBr
| Opcode::BrTable => {
panic!("Branch opcode reached non-branch lowering logic!");
}
Opcode::X86Udivmodx
| Opcode::X86Sdivmodx
| Opcode::X86Umulx
| Opcode::X86Smulx
| Opcode::X86Cvtt2si
| Opcode::X86Fmin
| Opcode::X86Fmax
| Opcode::X86Push
| Opcode::X86Pop
| Opcode::X86Bsr
| Opcode::X86Bsf
| Opcode::X86Pblendw
| Opcode::X86Pshufd
| Opcode::X86Pshufb
| Opcode::X86Pextr
| Opcode::X86Pinsr
| Opcode::X86Insertps
| Opcode::X86Movsd
| Opcode::X86Movlhps
| Opcode::X86Palignr
| Opcode::X86Psll
| Opcode::X86Psrl
| Opcode::X86Psra
| Opcode::X86Ptest
| Opcode::X86Pmaxs
| Opcode::X86Pmaxu
| Opcode::X86Pmins
| Opcode::X86Pminu
| Opcode::X86Pmullq
| Opcode::X86Pmuludq
| Opcode::X86Punpckh
| Opcode::X86Punpckl
| Opcode::X86Vcvtudq2ps
| Opcode::X86ElfTlsGetAddr
| Opcode::X86MachoTlsGetAddr => {
panic!("x86-specific opcode in supposedly arch-neutral IR!");
}
Opcode::Nop => {
// Nothing.
}
}
Ok(())
}
//=============================================================================
// Lowering-backend trait implementation.
impl LowerBackend for X64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[MachLabel],
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
// it may be an unconditional branch, a fallthrough, a return, or a
// trap. These conditions are verified by `is_ebb_basic()` during the
// verifier pass.
assert!(branches.len() <= 2);
if branches.len() == 2 {
// Must be a conditional branch followed by an unconditional branch.
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
trace!(
"lowering two-branch group: opcodes are {:?} and {:?}",
op0,
op1
);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = targets[0];
// not_taken target is the target of the second branch, even if it is a Fallthrough
// instruction: because we reorder blocks while we lower, the fallthrough in the new
// order is not (necessarily) the same as the fallthrough in CLIF. So we use the
// explicitly-provided target.
let not_taken = targets[1];
match op0 {
Opcode::Brz | Opcode::Brnz => {
let flag_input = InsnInput {
insn: branches[0],
input: 0,
};
let src_ty = ctx.input_ty(branches[0], 0);
if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
let cond_code = ctx.data(icmp).cond_code().unwrap();
let cond_code = emit_cmp(ctx, icmp, cond_code);
let cond_code = if op0 == Opcode::Brz {
cond_code.inverse()
} else {
cond_code
};
let cc = CC::from_intcc(cond_code);
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
let cond_code = if op0 == Opcode::Brz {
cond_code.inverse()
} else {
cond_code
};
match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
}
FcmpCondResult::AndConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
}
FcmpCondResult::OrConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1, taken));
ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else if src_ty == types::I128 {
let src = put_input_in_regs(
ctx,
InsnInput {
insn: branches[0],
input: 0,
},
);
let (half_cc, comb_op) = match op0 {
Opcode::Brz => (CC::Z, AluRmiROpcode::And8),
Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8),
_ => unreachable!(),
};
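// For brz, all 128 bits must be zero, so AND the two per-half "is zero"
// bytes; for brnz, any set bit suffices, so OR the per-half "is non-zero"
// bytes. Either way, the branch is taken when the combined byte is non-zero.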
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::imm(0),
src.regs()[0],
));
ctx.emit(Inst::setcc(half_cc, tmp1));
ctx.emit(Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::imm(0),
src.regs()[1],
));
ctx.emit(Inst::setcc(half_cc, tmp2));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size32,
comb_op,
RegMemImm::reg(tmp1.to_reg()),
tmp2,
));
ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
} else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
let src = put_input_in_reg(
ctx,
InsnInput {
insn: branches[0],
input: 0,
},
);
let cc = match op0 {
Opcode::Brz => CC::Z,
Opcode::Brnz => CC::NZ,
_ => unreachable!(),
};
// See case for `Opcode::Select` above re: testing the
// boolean input.
let test_input = if src_ty == types::B1 {
// test src, 1
RegMemImm::imm(1)
} else {
assert!(!is_bool_ty(src_ty));
// test src, src
RegMemImm::reg(src)
};
ctx.emit(Inst::test_rmi_r(
OperandSize::from_ty(src_ty),
test_input,
src,
));
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else {
unimplemented!("brz/brnz with non-int type {:?}", src_ty);
}
}
Opcode::BrIcmp => {
let src_ty = ctx.input_ty(branches[0], 0);
if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
let lhs = put_input_in_reg(
ctx,
InsnInput {
insn: branches[0],
input: 0,
},
);
let rhs = input_to_reg_mem_imm(
ctx,
InsnInput {
insn: branches[0],
input: 1,
},
);
let cc = CC::from_intcc(ctx.data(branches[0]).cond_code().unwrap());
// Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
// us dst - src at the machine instruction level, so invert operands.
ctx.emit(Inst::cmp_rmi_r(OperandSize::from_ty(src_ty), rhs, lhs));
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else {
unimplemented!("bricmp with non-int type {:?}", src_ty);
}
}
Opcode::Brif => {
let flag_input = InsnInput {
insn: branches[0],
input: 0,
};
if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) {
let cond_code = ctx.data(branches[0]).cond_code().unwrap();
let cond_code = emit_cmp(ctx, ifcmp, cond_code);
let cc = CC::from_intcc(cond_code);
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp) {
let operand = put_input_in_reg(
ctx,
InsnInput {
insn: ifcmp_sp,
input: 0,
},
);
let ty = ctx.input_ty(ifcmp_sp, 0);
ctx.emit(Inst::cmp_rmi_r(
OperandSize::from_ty(ty),
RegMemImm::reg(regs::rsp()),
operand,
));
let cond_code = ctx.data(branches[0]).cond_code().unwrap();
let cc = CC::from_intcc(cond_code);
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else {
// Should be disallowed by flags checks in verifier.
unimplemented!("Brif with non-ifcmp input");
}
}
Opcode::Brff => {
let flag_input = InsnInput {
insn: branches[0],
input: 0,
};
if let Some(ffcmp) = matches_input(ctx, flag_input, Opcode::Ffcmp) {
let cond_code = ctx.data(branches[0]).fp_cond_code().unwrap();
match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
}
FcmpCondResult::AndConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
}
FcmpCondResult::OrConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1, taken));
ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else {
// Should be disallowed by flags checks in verifier.
unimplemented!("Brff with input not from ffcmp");
}
}
_ => panic!("unexpected branch opcode: {:?}", op0),
}
} else {
assert_eq!(branches.len(), 1);
// Must be an unconditional branch or trap.
let op = ctx.data(branches[0]).opcode();
match op {
Opcode::Jump | Opcode::Fallthrough => {
ctx.emit(Inst::jmp_known(targets[0]));
}
Opcode::BrTable => {
let jt_size = targets.len() - 1;
assert!(jt_size <= u32::max_value() as usize);
let jt_size = jt_size as u32;
let idx = extend_input_to_reg(
ctx,
InsnInput {
insn: branches[0],
input: 0,
},
ExtSpec::ZeroExtendTo32,
);
// Bounds-check (compute flags from idx - jt_size) and branch to default.
ctx.emit(Inst::cmp_rmi_r(
OperandSize::Size32,
RegMemImm::imm(jt_size),
idx,
));
// Emit the compound instruction that does:
//
// lea $jt, %rA
// movsbl [%rA, %rIndex, 2], %rB
// add %rB, %rA
// j *%rA
// [jt entries]
//
// This must be *one* instruction in the vcode because we cannot allow regalloc
// to insert any spills/fills in the middle of the sequence; otherwise, the
// lea PC-rel offset to the jumptable would be incorrect. (The alternative
// is to introduce a relocation pass for inlined jumptables, which is much
// worse.)
// This temporary is used as a signed integer of 64-bits (to hold addresses).
let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
// This temporary is used as a signed integer of 32-bits (for the wasm-table
// index) and then 64-bits (address addend). The small lie about the I64 type
// is benign, since the temporary is dead after this instruction (and its
// Cranelift type is thus unused).
let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
let targets_for_term: Vec<MachLabel> = targets.to_vec();
let default_target = targets[0];
let jt_targets: Vec<MachLabel> = targets.iter().skip(1).cloned().collect();
ctx.emit(Inst::JmpTableSeq {
idx,
tmp1,
tmp2,
default_target,
targets: jt_targets,
targets_for_term,
});
}
_ => panic!("Unknown branch type {:?}", op),
}
}
Ok(())
}
fn maybe_pinned_reg(&self) -> Option<Reg> {
Some(regs::pinned_reg())
}
}