//! Lowering rules for X64.
// ISLE integration glue.
pub(super) mod isle;
use crate::ir::{types, ExternalName, Inst as IRInst, LibCall, Opcode, Type};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::Flags;
use smallvec::SmallVec;
use target_lexicon::Triple;
//=============================================================================
// Helpers for instruction lowering.
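/// Returns whether the given type is a scalar integer, boolean, or reference
/// type, i.e. one that fits in a single 64-bit general-purpose register.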
fn is_int_or_ref_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
types::R32 => panic!("shouldn't have 32-bit refs on x64"),
_ => false,
}
}
/// Returns the instruction that produces the specified `input` if that
/// instruction has opcode `op`; otherwise returns `None`.
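///
/// For example (an illustrative sketch; `insn` and the operand index are
/// hypothetical), a lowering rule can check whether operand 0 of `insn` is
/// produced by an `iadd`:
///
/// ```text
/// if let Some(add) = matches_input(ctx, InsnInput { insn, input: 0 }, Opcode::Iadd) {
///     // `add` is the CLIF `iadd` instruction feeding operand 0 of `insn`.
/// }
/// ```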
// TODO investigate failures with checking against the result index.
fn matches_input(ctx: &mut Lower<Inst>, input: InsnInput, op: Opcode) -> Option<IRInst> {
let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
inputs.inst.as_inst().and_then(|(src_inst, _)| {
let data = ctx.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
None
})
}
/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
/// temporary register, returning that register.
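///
/// As a small worked example of the masking below (values are illustrative):
/// for `ty = I8`, `from_bits = 8`, so `c = 0x1234` is truncated to its low
/// 8 bits:
///
/// ```text
/// let masked = 0x1234u64 & ((1u64 << 8) - 1); // == 0x34
/// ```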
fn generate_constant(ctx: &mut Lower<Inst>, ty: Type, c: u64) -> ValueRegs<Reg> {
let from_bits = ty_bits(ty);
let masked = if from_bits < 64 {
c & ((1u64 << from_bits) - 1)
} else {
c
};
let cst_copy = ctx.alloc_tmp(ty);
for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
ctx.alloc_tmp(ty).only_reg().unwrap()
})
.into_iter()
{
ctx.emit(inst);
}
non_writable_value_regs(cst_copy)
}
/// Put the given input into possibly multiple registers, and mark it as used (side-effect).
fn put_input_in_regs(ctx: &mut Lower<Inst>, spec: InsnInput) -> ValueRegs<Reg> {
let ty = ctx.input_ty(spec.insn, spec.input);
let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
if let Some(c) = input.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
generate_constant(ctx, ty, c)
} else {
ctx.put_input_in_regs(spec.insn, spec.input)
}
}
/// Put the given input into a register, and mark it as used (side-effect).
fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg {
put_input_in_regs(ctx, spec)
.only_reg()
.expect("Multi-register value not expected")
}
/// Determines whether a load operation (indicated by `src_insn`) can be merged
/// into the current lowering point. If so, returns the address-base source (as
/// an `InsnInput`) and the constant offset from that base at which to perform
/// the load.
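///
/// For example (illustrative; register choices are hypothetical), a 64-bit
/// load feeding an `iadd`:
///
/// ```text
/// v2 = load.i64 v1+16
/// v3 = iadd.i64 v0, v2
/// ```
///
/// can be merged into the consuming ALU instruction, yielding a single x64
/// instruction along the lines of `add rax, [rsi + 16]` instead of a separate
/// load followed by an add.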
fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInput, i32)> {
let insn_data = ctx.data(src_insn);
let inputs = ctx.num_inputs(src_insn);
if inputs != 1 {
return None;
}
let load_ty = ctx.output_ty(src_insn, 0);
if ty_bits(load_ty) < 32 {
// Narrower values are handled by ALU insts that are at least 32 bits
// wide, which is normally OK as we ignore the upper bits; but, if we
// generate, e.g., a direct-from-memory 32-bit add for a byte value and
// the byte is the last byte in a page, the extra data that we load is
// incorrectly accessed. So we only allow loads to merge for
// 32-bit-and-above widths.
return None;
}
// SIMD instructions can only be load-coalesced when the loaded value comes
// from an aligned address.
if load_ty.is_vector() && !insn_data.memflags().map_or(false, |f| f.aligned()) {
return None;
}
// Just testing the opcode is enough, because the width will always match if
// the type does (and the type should match if the CLIF is properly
// constructed).
if insn_data.opcode() == Opcode::Load {
let offset = insn_data
.load_store_offset()
.expect("load should have offset");
Some((
InsnInput {
insn: src_insn,
input: 0,
},
offset,
))
} else {
None
}
}
fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
ctx.get_input_as_source_or_const(spec.insn, spec.input)
.constant
}
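/// Emits a call to the runtime library routine `libcall`: the values in
/// `inputs` are copied into the callee's argument locations, the call is
/// emitted, and the return values are copied back into `outputs`, using the
/// libcall calling convention for the given `triple`.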
fn emit_vm_call(
ctx: &mut Lower<Inst>,
flags: &Flags,
triple: &Triple,
libcall: LibCall,
inputs: &[Reg],
outputs: &[Writable<Reg>],
) -> CodegenResult<()> {
let extname = ExternalName::LibCall(libcall);
let dist = if flags.use_colocated_libcalls() {
RelocDistance::Near
} else {
RelocDistance::Far
};
// TODO avoid recreating signatures for every single Libcall function.
let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
let sig = libcall.signature(call_conv);
let caller_conv = ctx.abi().call_conv(ctx.sigs());
if !ctx.sigs().have_abi_sig_for_signature(&sig) {
ctx.sigs_mut()
.make_abi_sig_from_ir_signature::<X64ABIMachineSpec>(sig.clone(), flags)?;
}
let mut abi =
X64Caller::from_libcall(ctx.sigs(), &sig, &extname, dist, caller_conv, flags.clone())?;
abi.emit_stack_pre_adjust(ctx);
assert_eq!(inputs.len(), abi.num_args(ctx.sigs()));
for (i, input) in inputs.iter().enumerate() {
for inst in abi.gen_copy_regs_to_arg(ctx, i, ValueRegs::one(*input)) {
ctx.emit(inst);
}
}
abi.emit_call(ctx);
for (i, output) in outputs.iter().enumerate() {
for inst in abi.gen_copy_retval_to_regs(ctx, i, ValueRegs::one(*output)) {
ctx.emit(inst);
}
}
abi.emit_stack_post_adjust(ctx);
Ok(())
}
/// Returns the shifted operand and the shift amount if the given input is a
/// left shift by a constant value less than or equal to 3. The goal is to
/// embed it within an address mode.
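///
/// For example (an illustrative sketch; values and registers are
/// hypothetical), a left shift by 3 can supply the `index * 8` component of
/// an address mode:
///
/// ```text
/// v2 = iconst.i64 3
/// v3 = ishl v1, v2
/// v4 = iadd v0, v3
/// v5 = load.i64 v4     ; can lower to roughly `mov rax, [rdi + rsi*8]`
/// ```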
fn matches_small_constant_shift(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<(InsnInput, u8)> {
matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
match input_to_imm(
ctx,
InsnInput {
insn: shift,
input: 1,
},
) {
Some(shift_amt) if shift_amt <= 3 => Some((
InsnInput {
insn: shift,
input: 0,
},
shift_amt as u8,
)),
_ => None,
}
})
}
/// Lowers an instruction to one of the x86 addressing modes.
///
/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
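///
/// The result has the general x64 form `base + index * 2^shift + offset`;
/// the `index`/`shift` components are used only when one of the add operands
/// is a small constant left shift (see `matches_small_constant_shift`).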
fn lower_to_amode(ctx: &mut Lower<Inst>, spec: InsnInput, offset: i32) -> Amode {
let flags = ctx
.memflags(spec.insn)
.expect("Instruction with amode should have memflags");
// We now have either an add whose operands we can try to fold into the
// address mode, or some other input that must go in a register; in both
// cases we also fold in the final offset.
if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
let add_inputs = &[
InsnInput {
insn: add,
input: 0,
},
InsnInput {
insn: add,
input: 1,
},
];
// TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
// aren't happening in the wasm case. We could do better, given some range analysis.
let (base, index, shift) = if let Some((shift_input, shift_amt)) =
matches_small_constant_shift(ctx, add_inputs[0])
{
(
put_input_in_reg(ctx, add_inputs[1]),
put_input_in_reg(ctx, shift_input),
shift_amt,
)
} else if let Some((shift_input, shift_amt)) =
matches_small_constant_shift(ctx, add_inputs[1])
{
(
put_input_in_reg(ctx, add_inputs[0]),
put_input_in_reg(ctx, shift_input),
shift_amt,
)
} else {
for i in 0..=1 {
// Try to pierce through uextend.
if let Some(uextend) = matches_input(
ctx,
InsnInput {
insn: add,
input: i,
},
Opcode::Uextend,
) {
if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
// Zero the upper bits.
let input_size = ctx.input_ty(uextend, 0).bits() as u64;
let shift: u64 = 64 - input_size;
let uext_cst: u64 = (cst << shift) >> shift;
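// E.g. (illustrative), for a 32-bit source: shift = 32, so a constant
// stored as 0xffff_ffff_8000_0000 becomes 0x0000_0000_8000_0000.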
let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
if low32_will_sign_extend_to_64(final_offset as u64) {
let base = put_input_in_reg(ctx, add_inputs[1 - i]);
return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
}
}
}
// If it's a constant, add it directly!
if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
let final_offset = (offset as i64).wrapping_add(cst as i64);
if low32_will_sign_extend_to_64(final_offset as u64) {
let base = put_input_in_reg(ctx, add_inputs[1 - i]);
return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
}
}
}
(
put_input_in_reg(ctx, add_inputs[0]),
put_input_in_reg(ctx, add_inputs[1]),
0,
)
};
return Amode::imm_reg_reg_shift(
offset as u32,
Gpr::new(base).unwrap(),
Gpr::new(index).unwrap(),
shift,
)
.with_flags(flags);
}
let input = put_input_in_reg(ctx, spec);
Amode::imm_reg(offset as u32, input).with_flags(flags)
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.
/// Actually codegen an instruction's results into registers.
fn lower_insn_to_regs(
ctx: &mut Lower<Inst>,
insn: IRInst,
flags: &Flags,
isa_flags: &x64_settings::Flags,
triple: &Triple,
) -> CodegenResult<()> {
let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
.map(|i| InsnOutput { insn, output: i })
.collect();
if let Ok(()) = isle::lower(ctx, triple, flags, isa_flags, &outputs, insn) {
return Ok(());
}
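// If we get here, ISLE did not have a lowering rule for this instruction.
// The match below documents expected coverage: opcodes implemented in ISLE
// are unreachable, and the remaining arms flag unimplemented opcodes or ones
// that should have been removed before lowering.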
let op = ctx.data(insn).opcode();
match op {
Opcode::Iconst
| Opcode::Bconst
| Opcode::F32const
| Opcode::F64const
| Opcode::Null
| Opcode::Iadd
| Opcode::IaddIfcout
| Opcode::SaddSat
| Opcode::UaddSat
| Opcode::Isub
| Opcode::SsubSat
| Opcode::UsubSat
| Opcode::AvgRound
| Opcode::Band
| Opcode::Bor
| Opcode::Bxor
| Opcode::Imul
| Opcode::BandNot
| Opcode::Iabs
| Opcode::Imax
| Opcode::Umax
| Opcode::Imin
| Opcode::Umin
| Opcode::Bnot
| Opcode::Bitselect
| Opcode::Vselect
| Opcode::Ushr
| Opcode::Sshr
| Opcode::Ishl
| Opcode::Rotl
| Opcode::Rotr
| Opcode::Ineg
| Opcode::Trap
| Opcode::ResumableTrap
| Opcode::Clz
| Opcode::Ctz
| Opcode::Popcnt
| Opcode::Bitrev
| Opcode::IsNull
| Opcode::IsInvalid
| Opcode::Uextend
| Opcode::Sextend
| Opcode::Breduce
| Opcode::Bextend
| Opcode::Ireduce
| Opcode::Bint
| Opcode::Debugtrap
| Opcode::WideningPairwiseDotProductS
| Opcode::Fadd
| Opcode::Fsub
| Opcode::Fmul
| Opcode::Fdiv
| Opcode::Fmin
| Opcode::Fmax
| Opcode::FminPseudo
| Opcode::FmaxPseudo
| Opcode::Sqrt
| Opcode::Fpromote
| Opcode::FvpromoteLow
| Opcode::Fdemote
| Opcode::Fvdemote
| Opcode::Fma
| Opcode::Icmp
| Opcode::Fcmp
| Opcode::Load
| Opcode::Uload8
| Opcode::Sload8
| Opcode::Uload16
| Opcode::Sload16
| Opcode::Uload32
| Opcode::Sload32
| Opcode::Sload8x8
| Opcode::Uload8x8
| Opcode::Sload16x4
| Opcode::Uload16x4
| Opcode::Sload32x2
| Opcode::Uload32x2
| Opcode::Store
| Opcode::Istore8
| Opcode::Istore16
| Opcode::Istore32
| Opcode::AtomicRmw
| Opcode::AtomicCas
| Opcode::AtomicLoad
| Opcode::AtomicStore
| Opcode::Fence
| Opcode::FuncAddr
| Opcode::SymbolValue
| Opcode::Return
| Opcode::Call
| Opcode::CallIndirect
| Opcode::Trapif
| Opcode::Trapff
| Opcode::GetFramePointer
| Opcode::GetStackPointer
| Opcode::GetReturnAddress
| Opcode::Select
| Opcode::Selectif
| Opcode::SelectifSpectreGuard
| Opcode::FcvtFromSint
| Opcode::FcvtLowFromSint
| Opcode::FcvtFromUint
| Opcode::FcvtToUint
| Opcode::FcvtToSint
| Opcode::FcvtToUintSat
| Opcode::FcvtToSintSat
| Opcode::IaddPairwise
| Opcode::UwidenHigh
| Opcode::UwidenLow
| Opcode::SwidenHigh
| Opcode::SwidenLow
| Opcode::Snarrow
| Opcode::Unarrow
| Opcode::Bitcast
| Opcode::Fabs
| Opcode::Fneg
| Opcode::Fcopysign
| Opcode::Ceil
| Opcode::Floor
| Opcode::Nearest
| Opcode::Trunc
| Opcode::StackAddr
| Opcode::Udiv
| Opcode::Urem
| Opcode::Sdiv
| Opcode::Srem
| Opcode::Umulhi
| Opcode::Smulhi
| Opcode::GetPinnedReg
| Opcode::SetPinnedReg
| Opcode::Vconst
| Opcode::RawBitcast
| Opcode::Insertlane
| Opcode::Shuffle
| Opcode::Swizzle
| Opcode::Extractlane
| Opcode::ScalarToVector
| Opcode::Splat
| Opcode::VanyTrue
| Opcode::VallTrue
| Opcode::VhighBits
| Opcode::Iconcat
| Opcode::Isplit
| Opcode::TlsValue
| Opcode::SqmulRoundSat
| Opcode::Uunarrow
| Opcode::Nop => {
let ty = if !outputs.is_empty() {
Some(ctx.output_ty(insn, 0))
} else {
None
};
unreachable!(
"implemented in ISLE: inst = `{}`, type = `{:?}`",
ctx.dfg().display_inst(insn),
ty
)
}
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
// Unimplemented opcodes below. These are not currently used by Wasm
// lowering or other known embeddings, but should either be supported or
// removed eventually.
Opcode::ExtractVector => {
unimplemented!("ExtractVector not supported");
}
Opcode::Cls => unimplemented!("Cls not supported"),
Opcode::BorNot | Opcode::BxorNot => {
unimplemented!("or-not / xor-not opcodes not implemented");
}
Opcode::Bmask => unimplemented!("Bmask not implemented"),
Opcode::Trueif | Opcode::Trueff => unimplemented!("trueif / trueff not implemented"),
Opcode::Vsplit | Opcode::Vconcat => {
unimplemented!("Vector split/concat ops not implemented.");
}
// Opcodes that should be removed by legalization. These should
// eventually be removed if/when we replace in-situ legalization with
// something better.
Opcode::Ifcmp | Opcode::Ffcmp => {
panic!("Should never reach ifcmp/ffcmp as isel root!");
}
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm
| Opcode::SdivImm
| Opcode::UremImm
| Opcode::SremImm
| Opcode::IrsubImm
| Opcode::IaddCin
| Opcode::IaddIfcin
| Opcode::IaddCout
| Opcode::IaddCarry
| Opcode::IaddIfcarry
| Opcode::IsubBin
| Opcode::IsubIfbin
| Opcode::IsubBout
| Opcode::IsubIfbout
| Opcode::IsubBorrow
| Opcode::IsubIfborrow
| Opcode::BandImm
| Opcode::BorImm
| Opcode::BxorImm
| Opcode::RotlImm
| Opcode::RotrImm
| Opcode::IshlImm
| Opcode::UshrImm
| Opcode::SshrImm
| Opcode::IcmpImm
| Opcode::IfcmpImm => {
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::StackLoad
| Opcode::StackStore
| Opcode::DynamicStackStore
| Opcode::DynamicStackLoad => {
panic!("Direct stack memory access not supported; should have been legalized");
}
Opcode::GlobalValue => {
panic!("global_value should have been removed by legalization!");
}
Opcode::HeapAddr => {
panic!("heap_addr should have been removed by legalization!");
}
Opcode::TableAddr => {
panic!("table_addr should have been removed by legalization!");
}
Opcode::Copy => {
panic!("Unused opcode should not be encountered.");
}
Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
}
Opcode::Jump
| Opcode::Brz
| Opcode::Brnz
| Opcode::BrIcmp
| Opcode::Brif
| Opcode::Brff
| Opcode::BrTable => {
panic!("Branch opcode reached non-branch lowering logic!");
}
}
}
//=============================================================================
// Lowering-backend trait implementation.
impl LowerBackend for X64Backend {
type MInst = Inst;
fn lower(&self, ctx: &mut Lower<Inst>, ir_inst: IRInst) -> CodegenResult<()> {
lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
}
fn lower_branch_group(
&self,
ctx: &mut Lower<Inst>,
branches: &[IRInst],
targets: &[MachLabel],
) -> CodegenResult<()> {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
// it may be an unconditional branch, a fallthrough, a return, or a
// trap. These conditions are verified by `is_ebb_basic()` during the
// verifier pass.
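// For example (illustrative), a block ending in
//   brnz v0, block2
//   jump block3
// arrives here as a two-element branch group: the conditional branch first,
// followed by the unconditional `jump`.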
assert!(branches.len() <= 2);
if branches.len() == 2 {
let op1 = ctx.data(branches[1]).opcode();
assert_eq!(op1, Opcode::Jump);
}
if let Ok(()) = isle::lower_branch(
ctx,
&self.triple,
&self.flags,
&self.x64_flags,
branches[0],
targets,
) {
return Ok(());
}
unreachable!(
"implemented in ISLE: branch = `{}`",
ctx.dfg().display_inst(branches[0]),
);
}
fn maybe_pinned_reg(&self) -> Option<Reg> {
Some(regs::pinned_reg())
}
}