// vendor/cranelift-codegen/src/isa/aarch64/lower_inst.rs - toolchain/rustc - Git at Google

 //! Lower a single Cranelift instruction into vcode.

 use crate::binemit::CodeOffset;
 use crate::ir::condcodes::FloatCC;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode};
 use crate::isa::aarch64::settings as aarch64_settings;
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::settings::Flags;
 use crate::{CodegenError, CodegenResult};

 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;

 use regalloc::Writable;

 use alloc::boxed::Box;
 use alloc::vec::Vec;
 use core::convert::TryFrom;

 use super::lower::*;

 /// Actually codegen an instruction's results into registers.
 pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
     flags: &Flags,
     isa_flags: &aarch64_settings::Flags,
 ) -> CodegenResult<()> {
     let op = ctx.data(insn).opcode();
     let inputs = insn_inputs(ctx, insn);
     let outputs = insn_outputs(ctx, insn);
     let ty = if outputs.len() > 0 {
         Some(ctx.output_ty(insn, 0))
     } else {
         None
     };

     match op {
         Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
             let value = ctx.get_constant(insn).unwrap();
             // Sign extend constant if necessary
             let value = match ty.unwrap() {
                 I8 => (((value as i64) << 56) >> 56) as u64,
                 I16 => (((value as i64) << 48) >> 48) as u64,
                 I32 => (((value as i64) << 32) >> 32) as u64,
                 I64 | R64 => value,
                 ty if ty.is_bool() => value,
                 ty => unreachable!("Unknown type for const: {}", ty),
             };
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             lower_constant_u64(ctx, rd, value);
         }
         Opcode::F32const => {
             let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             lower_constant_f32(ctx, rd, value);
         }
         Opcode::F64const => {
             let value = f64::from_bits(ctx.get_constant(insn).unwrap());
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             lower_constant_f64(ctx, rd, value);
         }
         Opcode::Iadd => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let mul_insn =
                     if let Some(mul_insn) = maybe_input_insn(ctx, inputs[1], Opcode::Imul) {
                         Some((mul_insn, 0))
                     } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                         Some((mul_insn, 1))
                     } else {
                         None
                     };
                 // If possible combine mul + add into madd.
                 if let Some((insn, addend_idx)) = mul_insn {
                     let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                     let rn_input = InsnInput { insn, input: 0 };
                     let rm_input = InsnInput { insn, input: 1 };

                     let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
                     let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
                     let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);

                     ctx.emit(Inst::AluRRRR {
                         alu_op,
                         rd,
                         rn,
                         rm,
                         ra,
                     });
                 } else {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                     let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                         ctx,
                         inputs[1],
                         ty_bits(ty),
                         NarrowValueMode::None,
                     );
                     let alu_op = if !negated {
                         choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                     } else {
                         choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                     };
                     ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                 }
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 ctx.emit(Inst::VecRRR {
                     rd,
                     rn,
                     rm,
                     alu_op: VecALUOp::Add,
                     size: VectorSize::from_ty(ty),
                 });
             }
         }
         Opcode::Isub => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                     ctx,
                     inputs[1],
                     ty_bits(ty),
                     NarrowValueMode::None,
                 );
                 let alu_op = if !negated {
                     choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                 } else {
                     choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                 };
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                 ctx.emit(Inst::VecRRR {
                     rd,
                     rn,
                     rm,
                     alu_op: VecALUOp::Sub,
                     size: VectorSize::from_ty(ty),
                 });
             }
         }
         Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
             let ty = ty.unwrap();
             assert!(ty.is_vector());
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

             let alu_op = match op {
                 Opcode::UaddSat => VecALUOp::Uqadd,
                 Opcode::SaddSat => VecALUOp::Sqadd,
                 Opcode::UsubSat => VecALUOp::Uqsub,
                 Opcode::SsubSat => VecALUOp::Sqsub,
                 _ => unreachable!(),
             };

             ctx.emit(Inst::VecRRR {
                 rd,
                 rn,
                 rm,
                 alu_op,
                 size: VectorSize::from_ty(ty),
             });
         }

         Opcode::Ineg => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let rn = zero_reg();
                 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                 ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
             } else {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 ctx.emit(Inst::VecMisc {
                     op: VecMisc2::Neg,
                     rd,
                     rn,
                     size: VectorSize::from_ty(ty),
                 });
             }
         }

         Opcode::Imul => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                 ctx.emit(Inst::AluRRRR {
                     alu_op,
                     rd,
                     rn,
                     rm,
                     ra: zero_reg(),
                 });
             } else {
                 if ty == I64X2 {
                     let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
                     let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();

                     // This I64X2 multiplication is performed with several 32-bit
                     // operations.

                     // 64-bit numbers x and y can be represented as:
                     //   x = a + 2^32(b)
                     //   y = c + 2^32(d)

                     // A 64-bit multiplication is:
                     //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
                     // Note: `2^64(bd)` can be ignored because it only contributes to
                     // bits 64 and above, which do not fit in the 64-bit result.

                     // This sequence implements a I64X2 multiply, where the registers
                     // `rn` and `rm` are split up into 32-bit components:
                     //   rn = |d|c|b|a|
                     //   rm = |h|g|f|e|
                     //
                     //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
                     //
                     //  The sequence is:
                     //  rev64 rd.4s, rm.4s
                     //  mul rd.4s, rd.4s, rn.4s
                     //  xtn tmp1.2s, rn.2d
                     //  addp rd.4s, rd.4s, rd.4s
                     //  xtn tmp2.2s, rm.2d
                     //  shll rd.2d, rd.2s, #32
                     //  umlal rd.2d, tmp2.2s, tmp1.2s

                     // Reverse the 32-bit elements in the 64-bit words.
                     //   rd = |g|h|e|f|
                     ctx.emit(Inst::VecMisc {
                         op: VecMisc2::Rev64,
                         rd,
                         rn: rm,
                         size: VectorSize::Size32x4,
                     });

                     // Calculate the high half components.
                     //   rd = |dg|ch|be|af|
                     //
                     // Note that this 32-bit multiply of the high half
                     // discards the bits that would overflow, same as
                     // if 64-bit operations were used. Also the Shll
                     // below would shift out the overflow bits anyway.
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Mul,
                         rd,
                         rn: rd.to_reg(),
                         rm: rn,
                         size: VectorSize::Size32x4,
                     });

                     // Extract the low half components of rn.
                     //   tmp1 = |c|a|
                     ctx.emit(Inst::VecMiscNarrow {
                         op: VecMiscNarrowOp::Xtn,
                         rd: tmp1,
                         rn,
                         size: VectorSize::Size32x2,
                         high_half: false,
                     });

                     // Sum the respective high half components.
                     //   rd = |dg+ch|be+af|dg+ch|be+af|
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Addp,
                         rd: rd,
                         rn: rd.to_reg(),
                         rm: rd.to_reg(),
                         size: VectorSize::Size32x4,
                     });

                     // Extract the low half components of rm.
                     //   tmp2 = |g|e|
                     ctx.emit(Inst::VecMiscNarrow {
                         op: VecMiscNarrowOp::Xtn,
                         rd: tmp2,
                         rn: rm,
                         size: VectorSize::Size32x2,
                         high_half: false,
                     });

                     // Shift the summed high half components into the high half of each 64-bit lane.
                     //   rd = |dg+ch << 32|be+af << 32|
                     ctx.emit(Inst::VecMisc {
                         op: VecMisc2::Shll,
                         rd,
                         rn: rd.to_reg(),
                         size: VectorSize::Size32x2,
                     });

                     // Multiply the low components together, and accumulate with the high
                     // half.
                     //   rd = |rd[1] + cg|rd[0] + ae|
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Umlal,
                         rd,
                         rn: tmp2.to_reg(),
                         rm: tmp1.to_reg(),
                         size: VectorSize::Size32x2,
                     });
                 } else {
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Mul,
                         rd,
                         rn,
                         rm,
                         size: VectorSize::from_ty(ty),
                     });
                 }
             }
         }

         Opcode::Umulhi | Opcode::Smulhi => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let is_signed = op == Opcode::Smulhi;
             let input_ty = ctx.input_ty(insn, 0);
             assert!(ctx.input_ty(insn, 1) == input_ty);
             assert!(ctx.output_ty(insn, 0) == input_ty);

             match input_ty {
                 I64 => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                     let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                     let alu_op = if is_signed {
                         ALUOp::SMulH
                     } else {
                         ALUOp::UMulH
                     };
                     ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                 }
                 I32 | I16 | I8 => {
                     let narrow_mode = if is_signed {
                         NarrowValueMode::SignExtend64
                     } else {
                         NarrowValueMode::ZeroExtend64
                     };
                     let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                     let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                     let ra = zero_reg();
                     ctx.emit(Inst::AluRRRR {
                         alu_op: ALUOp3::MAdd64,
                         rd,
                         rn,
                         rm,
                         ra,
                     });
                     let shift_op = if is_signed {
                         ALUOp::Asr64
                     } else {
                         ALUOp::Lsr64
                     };
                     let shift_amt = match input_ty {
                         I32 => 32,
                         I16 => 16,
                         I8 => 8,
                         _ => unreachable!(),
                     };
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: shift_op,
                         rd,
                         rn: rd.to_reg(),
                         immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
                     });
                 }
                 _ => {
                     panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
                 }
             }
         }

         Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
             let is_signed = match op {
                 Opcode::Udiv | Opcode::Urem => false,
                 Opcode::Sdiv | Opcode::Srem => true,
                 _ => unreachable!(),
             };
             let is_rem = match op {
                 Opcode::Udiv | Opcode::Sdiv => false,
                 Opcode::Urem | Opcode::Srem => true,
                 _ => unreachable!(),
             };
             let narrow_mode = if is_signed {
                 NarrowValueMode::SignExtend64
             } else {
                 NarrowValueMode::ZeroExtend64
             };
             // TODO: Add SDiv32 to implement 32-bit directly, rather
             // than extending the input.
             let div_op = if is_signed {
                 ALUOp::SDiv64
             } else {
                 ALUOp::UDiv64
             };

             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
             let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
             // The div instruction does not trap on divide by zero or signed overflow
             // so checks are inserted below.
             //
             //   div rd, rn, rm
             ctx.emit(Inst::AluRRR {
                 alu_op: div_op,
                 rd,
                 rn,
                 rm,
             });

             if is_rem {
                 // Remainder (rn % rm) is implemented as:
                 //
                 //   tmp = rn / rm
                 //   rd = rn - (tmp*rm)
                 //
                 // use 'rd' for tmp and you have:
                 //
                 //   div rd, rn, rm       ; rd = rn / rm
                 //   cbnz rm, #8          ; branch over trap
                 //   udf                  ; divide by zero
                 //   msub rd, rd, rm, rn  ; rd = rn - rd * rm

                 // Check for divide by 0.
                 let trap_code = TrapCode::IntegerDivisionByZero;
                 ctx.emit(Inst::TrapIf {
                     trap_code,
                     kind: CondBrKind::Zero(rm),
                 });

                 ctx.emit(Inst::AluRRRR {
                     alu_op: ALUOp3::MSub64,
                     rd: rd,
                     rn: rd.to_reg(),
                     rm: rm,
                     ra: rn,
                 });
             } else {
                 if div_op == ALUOp::SDiv64 {
                     //   cbnz rm, #8
                     //   udf ; divide by zero
                     //   cmn rm, 1
                     //   ccmp rn, 1, #nzcv, eq
                     //   b.vc #8
                     //   udf ; signed overflow

                     // Check for divide by 0.
                     let trap_code = TrapCode::IntegerDivisionByZero;
                     ctx.emit(Inst::TrapIf {
                         trap_code,
                         kind: CondBrKind::Zero(rm),
                     });

                     // Check for signed overflow. The only case is min_value / -1.
                     let ty = ty.unwrap();
                     // The following checks must be done in 32-bit or 64-bit, depending
                     // on the input type, even though the initial div instruction is
                     // currently always done in 64-bit.
                     let size = OperandSize::from_ty(ty);
                     // Check RHS is -1.
                     ctx.emit(Inst::AluRRImm12 {
                         alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
                         rd: writable_zero_reg(),
                         rn: rm,
                         imm12: Imm12::maybe_from_u64(1).unwrap(),
                     });
                     // Check LHS is min_value, by subtracting 1 and branching if
                     // there is overflow.
                     ctx.emit(Inst::CCmpImm {
                         size,
                         rn,
                         imm: UImm5::maybe_from_u8(1).unwrap(),
                         nzcv: NZCV::new(false, false, false, false),
                         cond: Cond::Eq,
                     });
                     let trap_code = TrapCode::IntegerOverflow;
                     ctx.emit(Inst::TrapIf {
                         trap_code,
                         kind: CondBrKind::Cond(Cond::Vs),
                     });
                 } else {
                     //   cbnz rm, #8
                     //   udf ; divide by zero

                     // Check for divide by 0.
                     let trap_code = TrapCode::IntegerDivisionByZero;
                     ctx.emit(Inst::TrapIf {
                         trap_code,
                         kind: CondBrKind::Zero(rm),
                     });
                 }
             }
         }

         Opcode::Uextend | Opcode::Sextend => {
             let output_ty = ty.unwrap();
             let input_ty = ctx.input_ty(insn, 0);
             let from_bits = ty_bits(input_ty) as u8;
             let to_bits = ty_bits(output_ty) as u8;
             let to_bits = std::cmp::max(32, to_bits);
             assert!(from_bits <= to_bits);
             if from_bits < to_bits {
                 let signed = op == Opcode::Sextend;
                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                 if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                     let idx =
                         if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
                             *imm
                         } else {
                             unreachable!();
                         };
                     let input = InsnInput {
                         insn: extract_insn,
                         input: 0,
                     };
                     let rn = put_input_in_reg(ctx, input, NarrowValueMode::None);
                     let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));

                     if signed {
                         let scalar_size = OperandSize::from_ty(output_ty);

                         ctx.emit(Inst::MovFromVecSigned {
                             rd,
                             rn,
                             idx,
                             size,
                             scalar_size,
                         });
                     } else {
                         ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                     }
                 } else {
                     // If we reach this point, we weren't able to incorporate the extend as
                     // a register-mode on another instruction, so we have a 'None'
                     // narrow-value/extend mode here, and we emit the explicit instruction.
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                     ctx.emit(Inst::Extend {
                         rd,
                         rn,
                         signed,
                         from_bits,
                         to_bits,
                     });
                 }
             }
         }

         Opcode::Bnot => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                 let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                 // NOT rd, rm ==> ORR_NOT rd, zero, rm
                 ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
             } else {
                 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 ctx.emit(Inst::VecMisc {
                     op: VecMisc2::Not,
                     rd,
                     rn: rm,
                     size: VectorSize::from_ty(ty),
                 });
             }
         }

         Opcode::Band
         | Opcode::Bor
         | Opcode::Bxor
         | Opcode::BandNot
         | Opcode::BorNot
         | Opcode::BxorNot => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
             if !ty.is_vector() {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                 let alu_op = match op {
                     Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
                     Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
                     Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
                     Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
                     Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
                     Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
                     _ => unreachable!(),
                 };
                 ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
             } else {
                 let alu_op = match op {
                     Opcode::Band => VecALUOp::And,
                     Opcode::BandNot => VecALUOp::Bic,
                     Opcode::Bor => VecALUOp::Orr,
                     Opcode::Bxor => VecALUOp::Eor,
                     _ => unreachable!(),
                 };

                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                 ctx.emit(Inst::VecRRR {
                     alu_op,
                     rd,
                     rn,
                     rm,
                     size: VectorSize::from_ty(ty),
                 });
             }
         }

         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
             let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             if !ty.is_vector() {
                 let size = OperandSize::from_bits(ty_bits(ty));
                 let narrow_mode = match (op, size) {
                     (Opcode::Ishl, _) => NarrowValueMode::None,
                     (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
                     (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
                     (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
                     (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
                     _ => unreachable!(),
                 };
                 let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                 let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
                 let alu_op = match op {
                     Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
                     Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
                     Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
                     _ => unreachable!(),
                 };
                 ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
             } else {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let size = VectorSize::from_ty(ty);
                 let (alu_op, is_right_shift) = match op {
                     Opcode::Ishl => (VecALUOp::Sshl, false),
                     Opcode::Ushr => (VecALUOp::Ushl, true),
                     Opcode::Sshr => (VecALUOp::Sshl, true),
                     _ => unreachable!(),
                 };

                 let rm = if is_right_shift {
                     // Right shifts are implemented with a negative left shift.
                     let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                     let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                     let rn = zero_reg();
                     ctx.emit(Inst::AluRRR {
                         alu_op: ALUOp::Sub32,
                         rd: tmp,
                         rn,
                         rm,
                     });
                     tmp.to_reg()
                 } else {
                     put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                 };

                 ctx.emit(Inst::VecDup { rd, rn: rm, size });

                 ctx.emit(Inst::VecRRR {
                     alu_op,
                     rd,
                     rn,
                     rm: rd.to_reg(),
                     size,
                 });
             }
         }

         Opcode::Rotr | Opcode::Rotl => {
             // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
             // effectively a right rotation of N - K places, if N is the integer's bit size. We
             // implement left rotations with this trick.
             //
             // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
             //
             // For a < 32-bit rotate-right, we synthesize this as:
             //
             //    rotr rd, rn, rm
             //
             //       =>
             //
             //    zero-extend rn, <32-or-64>
             //    and tmp_masked_rm, rm, <bitwidth - 1>
             //    sub tmp1, tmp_masked_rm, <bitwidth>
             //    sub tmp1, zero, tmp1  ; neg
             //    lsr tmp2, rn, tmp_masked_rm
             //    lsl rd, rn, tmp1
             //    orr rd, rd, tmp2
             //
             // For a constant amount, we can instead do:
             //
             //    zero-extend rn, <32-or-64>
             //    lsr tmp2, rn, #<shiftimm>
             //    lsl rd, rn, <bitwidth - shiftimm>
             //    orr rd, rd, tmp2

             let is_rotl = op == Opcode::Rotl;

             let ty = ty.unwrap();
             let ty_bits_size = ty_bits(ty) as u8;

             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(
                 ctx,
                 inputs[0],
                 if ty_bits_size <= 32 {
                     NarrowValueMode::ZeroExtend32
                 } else {
                     NarrowValueMode::ZeroExtend64
                 },
             );
             let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));

             if ty_bits_size == 32 || ty_bits_size == 64 {
                 let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
                 match rm {
                     ResultRegImmShift::ImmShift(mut immshift) => {
                         if is_rotl {
                             immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                         }
                         immshift.imm &= ty_bits_size - 1;
                         ctx.emit(Inst::AluRRImmShift {
                             alu_op,
                             rd,
                             rn,
                             immshift,
                         });
                     }

                     ResultRegImmShift::Reg(rm) => {
                         let rm = if is_rotl {
                             // Really ty_bits_size - rm, but the upper bits of the result are
                             // ignored (because of the implicit masking done by the instruction),
                             // so this is equivalent to negating the input.
                             let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                             let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                             ctx.emit(Inst::AluRRR {
                                 alu_op,
                                 rd: tmp,
                                 rn: zero_reg(),
                                 rm,
                             });
                             tmp.to_reg()
                         } else {
                             rm
                         };
                         ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                     }
                 }
             } else {
                 debug_assert!(ty_bits_size < 32);

                 match rm {
                     ResultRegImmShift::Reg(reg) => {
                         let reg = if is_rotl {
                             // Really ty_bits_size - rn, but the upper bits of the result are
                             // ignored (because of the implicit masking done by the instruction),
                             // so this is equivalent to negating the input.
                             let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                             ctx.emit(Inst::AluRRR {
                                 alu_op: ALUOp::Sub32,
                                 rd: tmp,
                                 rn: zero_reg(),
                                 rm: reg,
                             });
                             tmp.to_reg()
                         } else {
                             reg
                         };

                         // Explicitly mask the rotation count.
                         let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
                         ctx.emit(Inst::AluRRImmLogic {
                             alu_op: ALUOp::And32,
                             rd: tmp_masked_rm,
                             rn: reg,
                             imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
                         });
                         let tmp_masked_rm = tmp_masked_rm.to_reg();

                         let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                         let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
                         ctx.emit(Inst::AluRRImm12 {
                             alu_op: ALUOp::Sub32,
                             rd: tmp1,
                             rn: tmp_masked_rm,
                             imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
                         });
                         ctx.emit(Inst::AluRRR {
                             alu_op: ALUOp::Sub32,
                             rd: tmp1,
                             rn: zero_reg(),
                             rm: tmp1.to_reg(),
                         });
                         ctx.emit(Inst::AluRRR {
                             alu_op: ALUOp::Lsr32,
                             rd: tmp2,
                             rn,
                             rm: tmp_masked_rm,
                         });
                         ctx.emit(Inst::AluRRR {
                             alu_op: ALUOp::Lsl32,
                             rd,
                             rn,
                             rm: tmp1.to_reg(),
                         });
                         ctx.emit(Inst::AluRRR {
                             alu_op: ALUOp::Orr32,
                             rd,
                             rn: rd.to_reg(),
                             rm: tmp2.to_reg(),
                         });
                     }

                     ResultRegImmShift::ImmShift(mut immshift) => {
                         if is_rotl {
                             immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                         }
                         immshift.imm &= ty_bits_size - 1;

                         let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                         ctx.emit(Inst::AluRRImmShift {
                             alu_op: ALUOp::Lsr32,
                             rd: tmp1,
                             rn,
                             immshift: immshift.clone(),
                         });

                         let amount = immshift.value() & (ty_bits_size - 1);
                         let opp_shift =
                             ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
                         ctx.emit(Inst::AluRRImmShift {
                             alu_op: ALUOp::Lsl32,
                             rd,
                             rn,
                             immshift: opp_shift,
                         });

                         ctx.emit(Inst::AluRRR {
                             alu_op: ALUOp::Orr32,
                             rd,
                             rn: rd.to_reg(),
                             rm: tmp1.to_reg(),
                         });
                     }
                 }
             }
         }

         Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
             // Bit reversal and leading/trailing bit counts.
             //
             // The machine instructions only exist at 32- and 64-bit width, so I8 and I16
             // values are processed as 32-bit values and the result is corrected afterwards:
             //
             // - `bitrev`: the reversed bits end up in the top of the 32-bit register and
             //   are shifted back down.
             // - `clz`: the zero-extended input has `32 - bits` extra leading zeros, which
             //   are subtracted from the count.
             // - `cls`: the input must be *sign*-extended (so the extension bits equal the
             //   sign bit); the `32 - bits` extra sign bits are subtracted from the count.
             // - `ctz`: computed as `clz(bitrev(x))`; after shifting the reversed bits down
             //   the value again has `32 - bits` extra leading zeros to subtract.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();

             // `op_ty` is the width the instruction operates at; `excess_bits` is how many
             // bits wider that is than the value type (only set for I8/I16).
             let (op_ty, excess_bits) = match ty {
                 I8 => (I32, Some(24)),
                 I16 => (I32, Some(16)),
                 I32 => (I32, None),
                 I64 => (I64, None),
                 _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
             };

             let narrow_mode = match (op, excess_bits) {
                 // The bits above the value are shifted out again below, so they may hold
                 // anything.
                 (Opcode::Bitrev, _) | (Opcode::Ctz, _) => NarrowValueMode::None,
                 // Narrow `cls` must see copies of the sign bit above the value.
                 (Opcode::Cls, Some(_)) => NarrowValueMode::SignExtend32,
                 _ if ty_bits(ty) == 64 => NarrowValueMode::ZeroExtend64,
                 _ => NarrowValueMode::ZeroExtend32,
             };
             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);

             // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
             // to a clz, and bitrev as the main operation.
             let uses_rbit = op == Opcode::Bitrev || op == Opcode::Ctz;
             let first_op = if op == Opcode::Ctz { Opcode::Bitrev } else { op };
             ctx.emit(Inst::BitRR {
                 rd,
                 rn,
                 op: BitOp::from((first_op, op_ty)),
             });

             if uses_rbit {
                 // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
                 // the reversed result in the highest n bits, so we need to shift them down into
                 // place.
                 if let Some(s) = excess_bits {
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: ALUOp::Lsr32,
                         rd,
                         rn: rd.to_reg(),
                         immshift: ImmShift::maybe_from_u64(s).unwrap(),
                     });
                 }
             }

             if op == Opcode::Ctz {
                 ctx.emit(Inst::BitRR {
                     op: BitOp::from((Opcode::Clz, op_ty)),
                     rd,
                     rn: rd.to_reg(),
                 });
             }

             // The counting ops ran at 32-bit width on a narrow value: remove the bits that
             // were counted only because of the extension to 32 bits.
             if op != Opcode::Bitrev {
                 if let Some(s) = excess_bits {
                     ctx.emit(Inst::AluRRImm12 {
                         alu_op: ALUOp::Sub32,
                         rd,
                         rn: rd.to_reg(),
                         imm12: Imm12::maybe_from_u64(s).unwrap(),
                     });
                 }
             }
         }

         Opcode::Popcnt => {
             // Population count goes through a vector register: move the scalar over,
             // count bits per byte, sum the per-byte counts (when the value spans more
             // than one byte) and move byte 0 of the result back.
             //
             //   fmov tmp, rn
             //   cnt  tmp.8b, tmp.8b
             //   addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (nothing for 8-bit inputs)
             //   umov rd, tmp.b[0]
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
             let vtmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

             ctx.emit(Inst::MovToFpu {
                 rd: vtmp,
                 rn: src,
                 size,
             });
             ctx.emit(Inst::VecMisc {
                 op: VecMisc2::Cnt,
                 rd: vtmp,
                 rn: vtmp.to_reg(),
                 size: VectorSize::Size8x8,
             });

             // Pick the instruction (if any) that folds the per-byte counts into byte 0.
             let reduction = match ScalarSize::from_ty(ty) {
                 // A single byte needs no summing.
                 ScalarSize::Size8 => None,
                 // ADDP is usually cheaper than ADDV.
                 ScalarSize::Size16 => Some(Inst::VecRRR {
                     alu_op: VecALUOp::Addp,
                     rd: vtmp,
                     rn: vtmp.to_reg(),
                     rm: vtmp.to_reg(),
                     size: VectorSize::Size8x8,
                 }),
                 ScalarSize::Size32 | ScalarSize::Size64 => Some(Inst::VecLanes {
                     op: VecLanesOp::Addv,
                     rd: vtmp,
                     rn: vtmp.to_reg(),
                     size: VectorSize::Size8x8,
                 }),
                 sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
             };
             if let Some(inst) = reduction {
                 ctx.emit(inst);
             }

             ctx.emit(Inst::MovFromVec {
                 rd: dst,
                 rn: vtmp.to_reg(),
                 idx: 0,
                 size: VectorSize::Size8x16,
             });
         }

         Opcode::Load
         | Opcode::Uload8
         | Opcode::Sload8
         | Opcode::Uload16
         | Opcode::Sload16
         | Opcode::Uload32
         | Opcode::Sload32
         | Opcode::LoadComplex
         | Opcode::Uload8Complex
         | Opcode::Sload8Complex
         | Opcode::Uload16Complex
         | Opcode::Sload16Complex
         | Opcode::Uload32Complex
         | Opcode::Sload32Complex
         | Opcode::Sload8x8
         | Opcode::Uload8x8
         | Opcode::Sload16x4
         | Opcode::Uload16x4
         | Opcode::Sload32x2
         | Opcode::Uload32x2
         | Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
         | Opcode::Sload16x4Complex
         | Opcode::Uload32x2Complex
         | Opcode::Sload32x2Complex => {
             // All scalar and widening-vector loads share one lowering: `lower_load`
             // hands the destination register, element type and address to the closure
             // below, which picks the load instruction and, for the widening vector
             // loads, appends the lane extension.
             let is_signed_scalar = match op {
                 Opcode::Sload8
                 | Opcode::Sload16
                 | Opcode::Sload32
                 | Opcode::Sload8Complex
                 | Opcode::Sload16Complex
                 | Opcode::Sload32Complex => true,
                 _ => false,
             };
             let flags = ctx
                 .memflags(insn)
                 .expect("Load instruction should have memflags");

             lower_load(
                 ctx,
                 insn,
                 &inputs[..],
                 outputs[0],
                 |ctx, rd, elem_ty, mem| {
                     let in_vec_reg = ty_has_float_or_vec_representation(elem_ty);
                     let load = match (ty_bits(elem_ty), is_signed_scalar, in_vec_reg) {
                         (1, _, _) | (8, false, _) => Inst::ULoad8 { rd, mem, flags },
                         (8, true, _) => Inst::SLoad8 { rd, mem, flags },
                         (16, false, _) => Inst::ULoad16 { rd, mem, flags },
                         (16, true, _) => Inst::SLoad16 { rd, mem, flags },
                         (32, false, false) => Inst::ULoad32 { rd, mem, flags },
                         (32, true, false) => Inst::SLoad32 { rd, mem, flags },
                         (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
                         (64, _, false) => Inst::ULoad64 { rd, mem, flags },
                         // Note that we treat some of the vector loads as scalar floating-point loads,
                         // which is correct in a little endian environment.
                         (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
                         (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
                         _ => panic!("Unsupported size in load"),
                     };
                     ctx.emit(load);

                     // The `NxM` loads fetch a 64-bit vector and then widen every lane in
                     // place; the plain and `Complex` variants widen identically.
                     let widen = match op {
                         Opcode::Sload8x8 | Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                         Opcode::Uload8x8 | Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                         Opcode::Sload16x4 | Opcode::Sload16x4Complex => {
                             Some(VecExtendOp::Sxtl16)
                         }
                         Opcode::Uload16x4 | Opcode::Uload16x4Complex => {
                             Some(VecExtendOp::Uxtl16)
                         }
                         Opcode::Sload32x2 | Opcode::Sload32x2Complex => {
                             Some(VecExtendOp::Sxtl32)
                         }
                         Opcode::Uload32x2 | Opcode::Uload32x2Complex => {
                             Some(VecExtendOp::Uxtl32)
                         }
                         _ => None,
                     };

                     if let Some(t) = widen {
                         ctx.emit(Inst::VecExtend {
                             t,
                             rd,
                             rn: rd.to_reg(),
                             high_half: false,
                         });
                     }
                 },
             );
         }

         Opcode::Store
         | Opcode::Istore8
         | Opcode::Istore16
         | Opcode::Istore32
         | Opcode::StoreComplex
         | Opcode::Istore8Complex
         | Opcode::Istore16Complex
         | Opcode::Istore32Complex => {
             // Stores: the truncating `istoreN` forms fix the stored width themselves,
             // a plain store takes it from the type of the stored value (input 0).
             let off = ctx.data(insn).load_store_offset().unwrap();
             let elem_ty = match op {
                 Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                 Opcode::Istore8 | Opcode::Istore8Complex => I8,
                 Opcode::Istore16 | Opcode::Istore16Complex => I16,
                 Opcode::Istore32 | Opcode::Istore32Complex => I32,
                 _ => unreachable!(),
             };
             let in_vec_reg = ty_has_float_or_vec_representation(elem_ty);
             let flags = ctx
                 .memflags(insn)
                 .expect("Store instruction should have memflags");

             // Inputs after the first one form the address.
             let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
             let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

             let store = match (ty_bits(elem_ty), in_vec_reg) {
                 (1, _) => Inst::Store8 { rd, mem, flags },
                 (8, _) => Inst::Store8 { rd, mem, flags },
                 (16, _) => Inst::Store16 { rd, mem, flags },
                 (32, true) => Inst::FpuStore32 { rd, mem, flags },
                 (32, false) => Inst::Store32 { rd, mem, flags },
                 (64, true) => Inst::FpuStore64 { rd, mem, flags },
                 (64, false) => Inst::Store64 { rd, mem, flags },
                 (128, _) => Inst::FpuStore128 { rd, mem, flags },
                 _ => panic!("Unsupported size in store"),
             };
             ctx.emit(store);
         }

         Opcode::StackAddr => {
             // Materialize the address of a stack slot (plus a constant offset) by asking
             // the ABI implementation for the instruction that computes it.
             let (slot, slot_offset) = if let InstructionData::StackLoad {
                 opcode: Opcode::StackAddr,
                 stack_slot,
                 offset,
             } = *ctx.data(insn)
             {
                 (stack_slot, offset)
             } else {
                 unreachable!()
             };
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let slot_offset: i32 = slot_offset.into();
             // The ABI helper takes an unsigned offset; a negative one is a panic here.
             let slot_offset = u32::try_from(slot_offset).unwrap();
             let addr_inst = ctx.abi().stackslot_addr(slot, slot_offset, rd);
             ctx.emit(addr_inst);
         }

         Opcode::AtomicRmw => {
             // The `AtomicRMW` pseudo-instruction works on fixed registers: address in
             // x25, second operand in x26, result in x27.
             let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let operand = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty_access = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(ty_access));

             // Make sure that both args are in virtual regs, since in effect
             // we have to do a parallel copy to get them safely to the AtomicRMW input
             // regs, and that's not guaranteed safe if either is in a real reg.
             let addr = ctx.ensure_in_vreg(addr, I64);
             let operand = ctx.ensure_in_vreg(operand, I64);

             // Move the args to the preordained AtomicRMW input regs
             let fixed_addr = Writable::from_reg(xreg(25));
             let fixed_operand = Writable::from_reg(xreg(26));
             ctx.emit(Inst::gen_move(fixed_addr, addr, I64));
             ctx.emit(Inst::gen_move(fixed_operand, operand, I64));

             // Now the AtomicRMW insn itself
             let rmw_op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
             ctx.emit(Inst::AtomicRMW {
                 ty: ty_access,
                 op: rmw_op,
             });

             // And finally, copy the preordained AtomicRMW output reg to its destination.
             ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
             // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
         }

         Opcode::AtomicCas => {
             // Compare-and-swap: a single CAS instruction when LSE is enabled, otherwise
             // the `AtomicCASLoop` pseudo-instruction operating on fixed registers.
             let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
             let ty_access = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(ty_access));

             if !isa_flags.use_lse() {
                 // This is very similar to, but not identical to, the AtomicRmw case.  Note
                 // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
                 // about zero-extending narrow (I8/I16/I32) values here.
                 // Make sure that all three args are in virtual regs.  See corresponding comment
                 // for `Opcode::AtomicRmw` above.
                 let addr = ctx.ensure_in_vreg(addr, I64);
                 let expected = ctx.ensure_in_vreg(expected, I64);
                 let replacement = ctx.ensure_in_vreg(replacement, I64);

                 // Move the args to the preordained AtomicCASLoop input regs
                 let fixed_addr = Writable::from_reg(xreg(25));
                 ctx.emit(Inst::gen_move(fixed_addr, addr, I64));
                 let fixed_expected = Writable::from_reg(xreg(26));
                 ctx.emit(Inst::gen_move(fixed_expected, expected, I64));
                 let fixed_replacement = Writable::from_reg(xreg(28));
                 ctx.emit(Inst::gen_move(fixed_replacement, replacement, I64));

                 // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
                 ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
                 // And finally, copy the preordained AtomicCASLoop output reg to its destination.
                 ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
                 // Also, x24 and x28 are trashed.  `fn aarch64_get_regs` must mention that.
             } else {
                 // The destination register doubles as the `rs` operand of the CAS, so
                 // it is seeded with the expected value first.
                 ctx.emit(Inst::gen_move(r_dst, expected, ty_access));
                 ctx.emit(Inst::AtomicCAS {
                     rs: r_dst,
                     rt: replacement,
                     rn: addr,
                     ty: ty_access,
                 });
             }
         }

         Opcode::AtomicLoad => {
             // Atomic load: the access type is the type of the loaded result.
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let access_ty = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(access_ty));
             let load = Inst::AtomicLoad {
                 ty: access_ty,
                 r_data: dst,
                 r_addr: addr,
             };
             ctx.emit(load);
         }

         Opcode::AtomicStore => {
             // Atomic store: input 0 is the data, input 1 the address; the access type
             // is the type of the stored data.
             let data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let access_ty = ctx.input_ty(insn, 0);
             assert!(is_valid_atomic_transaction_ty(access_ty));
             let store = Inst::AtomicStore {
                 ty: access_ty,
                 r_data: data,
                 r_addr: addr,
             };
             ctx.emit(store);
         }

         // A fence lowers to the backend's single `Fence` instruction.
         Opcode::Fence => {
             ctx.emit(Inst::Fence {});
         }

         // Direct stack loads/stores are not lowered by this backend; reaching one here
         // aborts compilation with a panic.
         Opcode::StackLoad | Opcode::StackStore => {
             panic!("Direct stack memory access not supported; should not be used by Wasm");
         }

         // `heap_addr` is expected to have been expanded by legalization before lowering.
         Opcode::HeapAddr => {
             panic!("heap_addr should have been removed by legalization!");
         }

         // `table_addr` is expected to have been expanded by legalization before lowering.
         Opcode::TableAddr => {
             panic!("table_addr should have been removed by legalization!");
         }

         // No lowering exists for `const_addr` here; reaching it panics.
         Opcode::ConstAddr => unimplemented!(),

         // A no-op produces no machine code.
         Opcode::Nop => {
             // Nothing.
         }

         Opcode::Select => {
             // Step 1: get the selector into the condition flags. A selector produced by
             // an `icmp`/`fcmp` (possibly through a `bint`) is fused into the compare;
             // anything else is tested against zero.
             let selector = inputs[0];
             let fused_icmp = maybe_input_insn_via_conv(ctx, selector, Opcode::Icmp, Opcode::Bint);
             let cond = if let Some(icmp_insn) = fused_icmp {
                 let cc = ctx.data(icmp_insn).cond_code().unwrap();
                 let cond = lower_condcode(cc);
                 lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, condcode_is_signed(cc));
                 cond
             } else if let Some(fcmp_insn) =
                 maybe_input_insn_via_conv(ctx, selector, Opcode::Fcmp, Opcode::Bint)
             {
                 let cc = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                 let cond = lower_fp_condcode(cc);
                 lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                 cond
             } else {
                 let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) <= 32 {
                     (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
                 } else {
                     (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
                 };

                 let selector_reg = put_input_in_reg(ctx, selector, narrow_mode);
                 // cmp rcond, #0
                 ctx.emit(Inst::AluRRR {
                     alu_op: cmp_op,
                     rd: writable_zero_reg(),
                     rn: selector_reg,
                     rm: zero_reg(),
                 });
                 Cond::Ne
             };

             // Step 2: conditional select of the flavour matching the result's register
             // class and width.
             // csel.cond rd, rn, rm
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
             let out_ty = ctx.output_ty(insn, 0);
             let csel = match (ty_has_float_or_vec_representation(out_ty), ty_bits(out_ty)) {
                 (true, 32) => Inst::FpuCSel32 { cond, rd, rn, rm },
                 (true, 64) => Inst::FpuCSel64 { cond, rd, rn, rm },
                 (true, 128) => Inst::VecCSel { cond, rd, rn, rm },
                 _ => Inst::CSel { cond, rd, rn, rm },
             };
             ctx.emit(csel);
         }

         Opcode::Selectif | Opcode::SelectifSpectreGuard => {
             // Select between inputs 1 and 2 based on a condition code applied to the
             // flags produced by an `ifcmp`.
             let condcode = ctx.data(insn).cond_code().unwrap();
             let cond = lower_condcode(condcode);
             let is_signed = condcode_is_signed(condcode);
             // Verification ensures that the input is always a
             // single-def ifcmp.
             let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
             lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);

             // csel.COND rd, rn, rm
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
             let ty = ctx.output_ty(insn, 0);
             let bits = ty_bits(ty);
             let is_float = ty_has_float_or_vec_representation(ty);
             if is_float && bits == 32 {
                 ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
             } else if is_float && bits == 64 {
                 ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
             } else if is_float && bits == 128 {
                 // 128-bit vector values must not fall through to the integer `CSel`;
                 // handle them the same way `Opcode::Select` does.
                 ctx.emit(Inst::VecCSel { cond, rd, rn, rm });
             } else {
                 ctx.emit(Inst::CSel { cond, rd, rn, rm });
             }
         }

         Opcode::Bitselect | Opcode::Vselect => {
             // Bitwise select: each result bit comes from input 1 where the condition bit
             // is set and from input 2 where it is clear.
             let ty = ty.unwrap();
             if ty.is_vector() {
                 // Vector case: BSL uses its destination as the mask, so the condition is
                 // copied into the result register first.
                 let mask = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let if_set = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                 let if_clear = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                 ctx.emit(Inst::gen_move(rd, mask, ty));

                 ctx.emit(Inst::VecRRR {
                     alu_op: VecALUOp::Bsl,
                     rd,
                     rn: if_set,
                     rm: if_clear,
                     size: VectorSize::from_ty(ty),
                 });
             } else {
                 // Scalar case: (if_set & mask) | (if_clear & !mask).
                 debug_assert_ne!(Opcode::Vselect, op);
                 let masked_set = ctx.alloc_tmp(I64).only_reg().unwrap();
                 let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                 let mask = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let if_set = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                 let if_clear = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                 // AND rTmp, rn, rcond
                 ctx.emit(Inst::AluRRR {
                     alu_op: ALUOp::And64,
                     rd: masked_set,
                     rn: if_set,
                     rm: mask,
                 });
                 // BIC rd, rm, rcond
                 ctx.emit(Inst::AluRRR {
                     alu_op: ALUOp::AndNot64,
                     rd,
                     rn: if_clear,
                     rm: mask,
                 });
                 // ORR rd, rd, rTmp
                 ctx.emit(Inst::AluRRR {
                     alu_op: ALUOp::Orr64,
                     rd,
                     rn: rd.to_reg(),
                     rm: masked_set.to_reg(),
                 });
             }
         }

         Opcode::Trueif => {
             // Turn an integer condition on the flags of an `ifcmp` into a boolean value.
             let cc = ctx.data(insn).cond_code().unwrap();
             let cond = lower_condcode(cc);
             let signed = condcode_is_signed(cc);
             // Verification ensures that the input is always a
             // single-def ifcmp.
             let flags_producer = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
             lower_icmp_or_ifcmp_to_flags(ctx, flags_producer, signed);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             materialize_bool_result(ctx, insn, dst, cond);
         }

         Opcode::Trueff => {
             // Turn a floating-point condition on the flags of an `ffcmp` into a boolean
             // value.
             let cc = ctx.data(insn).fp_cond_code().unwrap();
             let cond = lower_fp_condcode(cc);
             let flags_producer = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
             lower_fcmp_or_ffcmp_to_flags(ctx, flags_producer);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             materialize_bool_result(ctx, insn, dst, cond);
         }

         Opcode::IsNull | Opcode::IsInvalid => {
             // Null references are represented by the constant value 0; invalid references are
             // represented by the constant value -1. See `define_reftypes()` in
             // `meta/src/isa/x86/encodings.rs` to confirm.
             //
             // Both tests are a flag-setting add/subtract of a small immediate whose
             // result is discarded, followed by materializing the "equal" condition.
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let reference = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ref_ty = ctx.input_ty(insn, 0);
             let (alu_op, imm) = if op == Opcode::IsNull {
                 // cmp rn, #0
                 (choose_32_64(ref_ty, ALUOp::SubS32, ALUOp::SubS64), 0)
             } else {
                 // cmn rn, #1
                 (choose_32_64(ref_ty, ALUOp::AddS32, ALUOp::AddS64), 1)
             };
             let imm = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(imm).unwrap());
             let flag_setter = alu_inst_imm12(alu_op, writable_zero_reg(), reference, imm);
             ctx.emit(flag_setter);
             materialize_bool_result(ctx, insn, dst, Cond::Eq);
         }

         Opcode::Copy => {
             // A copy is a register move typed by the copied value.
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let value_ty = ctx.input_ty(insn, 0);
             let mov = Inst::gen_move(dst, src, value_ty);
             ctx.emit(mov);
         }

         Opcode::Breduce | Opcode::Ireduce => {
             // Smaller integers/booleans are stored with high-order bits
             // undefined, so we can simply do a copy.
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             // The move is typed by the (wider) input type.
             let wide_ty = ctx.input_ty(insn, 0);
             let mov = Inst::gen_move(dst, src, wide_ty);
             ctx.emit(mov);
         }

         Opcode::Bextend | Opcode::Bmask => {
             // Bextend and Bmask both simply sign-extend. This works for:
             // - Bextend, because booleans are stored as 0 / -1, so we
             //   sign-extend the -1 to a -1 in the wider width.
             // - Bmask, because the resulting integer mask value must be
             //   all-ones (-1) if the argument is true.

             let from_ty = ctx.input_ty(insn, 0);
             let to_ty = ctx.output_ty(insn, 0);
             let from_bits = ty_bits(from_ty);
             let to_bits = ty_bits(to_ty);

             assert!(
                 from_bits <= 64 && to_bits <= 64,
                 "Vector Bextend not supported yet"
             );
             assert!(from_bits <= to_bits);

             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             if from_bits == to_bits {
                 // Same width (e.g. a `bmask` from b32 to i32): the bit pattern is
                 // already correct, but the result register still has to be defined,
                 // so copy the input into it.
                 ctx.emit(Inst::gen_move(rd, rn, to_ty));
             } else {
                 // The extend instruction only targets 32 or 64 bits; narrower results
                 // are produced at 32-bit width.
                 let to_bits = if to_bits == 64 {
                     64
                 } else {
                     assert!(to_bits <= 32);
                     32
                 };
                 let from_bits = from_bits as u8;
                 ctx.emit(Inst::Extend {
                     rd,
                     rn,
                     signed: true,
                     from_bits,
                     to_bits,
                 });
             }
         }

         Opcode::Bint => {
             // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
             // out the LSB to give a 0 / 1-valued integer result.
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             // Results of up to 32 bits use the 32-bit AND, wider ones the 64-bit AND.
             let wide = ty_bits(ctx.output_ty(insn, 0)) > 32;
             let imm_ty = if wide { I64 } else { I32 };
             let alu_op = if wide { ALUOp::And64 } else { ALUOp::And32 };
             let lsb_mask = ImmLogic::maybe_from_u64(1, imm_ty).unwrap();
             ctx.emit(Inst::AluRRImmLogic {
                 alu_op,
                 rd: dst,
                 rn: src,
                 imml: lsb_mask,
             });
         }

         Opcode::Bitcast => {
             // A bitcast keeps the bits and changes only the type. What has to be emitted
             // depends on whether the source and destination types are held in
             // float/vector registers or in integer registers.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src_ty = ctx.input_ty(insn, 0);
             let dst_ty = ctx.output_ty(insn, 0);
             let src_bits = ty_bits(src_ty);
             let src_in_vec = ty_has_float_or_vec_representation(src_ty);
             let dst_bits = ty_bits(dst_ty);
             let dst_in_vec = ty_has_float_or_vec_representation(dst_ty);

             debug_assert_eq!(src_bits, dst_bits);

             if src_in_vec == dst_in_vec {
                 // Same register class on both sides: a plain move.
                 let narrow_mode = if !src_in_vec {
                     NarrowValueMode::None
                 } else if src_bits <= 32 {
                     NarrowValueMode::ZeroExtend32
                 } else {
                     NarrowValueMode::ZeroExtend64
                 };
                 let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
                 ctx.emit(Inst::gen_move(rd, rm, dst_ty));
             } else if dst_in_vec {
                 // Integer register -> float/vector register.
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                 ctx.emit(Inst::MovToFpu {
                     rd,
                     rn,
                     size: ScalarSize::Size64,
                 });
             } else {
                 // Float/vector register -> integer register: read lane 0 at the
                 // destination's width.
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let size = VectorSize::from_lane_size(ScalarSize::from_bits(dst_bits), true);

                 ctx.emit(Inst::MovFromVec {
                     rd,
                     rn,
                     idx: 0,
                     size,
                 });
             }
         }

         Opcode::FallthroughReturn | Opcode::Return => {
             // Copy every returned value into the return-value registers
             // handed out by `ctx.retval`. The Ret itself is generated by the
             // ABI, not here.
             for (idx, &operand) in inputs.iter().enumerate() {
                 // N.B.: according to the AArch64 ABI, the top bits of a
                 // register (above the bits for the value's type) are
                 // undefined, so the return values are moved without any
                 // extension.
                 let from = put_input_in_regs(ctx, operand);
                 let to = ctx.retval(idx);
                 assert_eq!(from.len(), to.len());

                 // One type per register that makes up the value.
                 let value_ty = ctx.input_ty(insn, idx);
                 let (_, part_tys) = Inst::rc_for_type(value_ty)?;

                 let moves = from.regs().iter().zip(to.regs().iter()).zip(part_tys.iter());
                 for ((&src, &dst), &part_ty) in moves {
                     ctx.emit(Inst::gen_move(dst, src, part_ty));
                 }
             }
         }

         Opcode::Ifcmp | Opcode::Ffcmp => {
             // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff
             // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from
             // the same block, or a dominating block. In other words, it cannot pass through a BB
             // param (phi). The flags pass of the verifier will ensure this.
             panic!("Should never reach ifcmp as isel root!");
         }

         Opcode::Icmp => {
             // Integer comparison. Scalars are compared with a flag-setting
             // subtract followed by `materialize_bool_result`; vectors are
             // handed to `lower_vector_compare`.
             let condcode = ctx.data(insn).cond_code().unwrap();
             let cond = lower_condcode(condcode);
             let is_signed = condcode_is_signed(condcode);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             // Note: this is the type of the *input* operands, not of the
             // result.
             let ty = ctx.input_ty(insn, 0);
             let bits = ty_bits(ty);
             // Operands are extended to 32 or 64 bits, with the kind of
             // extension following the signedness of the condition code.
             let narrow_mode = match (bits <= 32, is_signed) {
                 (true, true) => NarrowValueMode::SignExtend32,
                 (true, false) => NarrowValueMode::ZeroExtend32,
                 (false, true) => NarrowValueMode::SignExtend64,
                 (false, false) => NarrowValueMode::ZeroExtend64,
             };
             let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);

             if !ty.is_vector() {
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
                 // The subtract targets the zero register, so only the flags
                 // it sets are kept.
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
                 materialize_bool_result(ctx, insn, rd, cond);
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
         }

         Opcode::Fcmp => {
             // Floating-point comparison. Scalars use a 32- or 64-bit FP
             // compare followed by `materialize_bool_result`; vectors are
             // handed to `lower_vector_compare`.
             let condcode = ctx.data(insn).fp_cond_code().unwrap();
             let cond = lower_fp_condcode(condcode);
             // Note: this is the type of the *input* operands, not of the
             // result.
             let ty = ctx.input_ty(insn, 0);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             if !ty.is_vector() {
                 // Only 32- and 64-bit scalar floats are handled; any other
                 // width panics.
                 match ty_bits(ty) {
                     32 => {
                         ctx.emit(Inst::FpuCmp32 { rn, rm });
                     }
                     64 => {
                         ctx.emit(Inst::FpuCmp64 { rn, rm });
                     }
                     _ => panic!("Bad float size"),
                 }
                 materialize_bool_result(ctx, insn, rd, cond);
             } else {
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
         }

         Opcode::JumpTableEntry | Opcode::JumpTableBase => {
             panic!("Should not appear: we handle BrTable directly");
         }

         Opcode::Debugtrap => {
             ctx.emit(Inst::Brk);
         }

         Opcode::Trap | Opcode::ResumableTrap => {
             let trap_code = ctx.data(insn).trap_code().unwrap();
             ctx.emit_safepoint(Inst::Udf { trap_code });
         }

         Opcode::Trapif | Opcode::Trapff => {
             let trap_code = ctx.data(insn).trap_code().unwrap();
             let flags_from_iadd =
                 maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some();

             // Work out which condition to trap on, first making sure the
             // flags it refers to have been produced.
             let cond = if flags_from_iadd {
                 // The flags must not have been clobbered by any other
                 // instruction between the iadd_ifcout and this instruction,
                 // as verified by the CLIF validator; so the flags are simply
                 // used as they are and no compare is emitted here.
                 lower_condcode(ctx.data(insn).cond_code().unwrap())
             } else if op == Opcode::Trapif {
                 let int_cc = ctx.data(insn).cond_code().unwrap();
                 let int_cond = lower_condcode(int_cc);
                 let signed = condcode_is_signed(int_cc);

                 // Verification ensures that the input is always a
                 // single-def ifcmp.
                 let producer = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                 lower_icmp_or_ifcmp_to_flags(ctx, producer, signed);
                 int_cond
             } else {
                 let float_cond = lower_fp_condcode(ctx.data(insn).fp_cond_code().unwrap());

                 // Verification ensures that the input is always a
                 // single-def ffcmp.
                 let producer = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
                 lower_fcmp_or_ffcmp_to_flags(ctx, producer);
                 float_cond
             };

             let trap = Inst::TrapIf {
                 trap_code,
                 kind: CondBrKind::Cond(cond),
             };
             ctx.emit_safepoint(trap);
         }

         Opcode::Safepoint => {
             panic!("safepoint instructions not used by new backend's safepoints!");
         }

         Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
             panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
         }

         Opcode::FuncAddr => {
             // Emits a `LoadExtName` of the call target's external name, with
             // a zero offset, into the output register.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let (extname, _) = ctx.call_target(insn).unwrap();
             // Clone the name so that it can be boxed into the instruction.
             let extname = extname.clone();
             ctx.emit(Inst::LoadExtName {
                 rd,
                 name: Box::new(extname),
                 offset: 0,
             });
         }

         Opcode::GlobalValue => {
             panic!("global_value should have been removed by legalization!");
         }

         Opcode::SymbolValue => {
             // Emits a `LoadExtName` of the symbol's external name into the
             // output register, using the offset reported by
             // `ctx.symbol_value`.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
             // Clone the name so that it can be boxed into the instruction.
             let extname = extname.clone();
             ctx.emit(Inst::LoadExtName {
                 rd,
                 name: Box::new(extname),
                 offset,
             });
         }

         Opcode::Call | Opcode::CallIndirect => {
             // Lowers direct and indirect calls. An ABI caller object is built
             // from the signature and is then asked to emit, in order: the
             // stack pre-adjustment, the argument copies, the call itself, the
             // return-value copies and the stack post-adjustment.
             let caller_conv = ctx.abi().call_conv();
             // `inputs` is rebound below to just the call arguments; for an
             // indirect call that excludes the callee pointer in `inputs[0]`.
             let (mut abi, inputs) = match op {
                 Opcode::Call => {
                     let (extname, dist) = ctx.call_target(insn).unwrap();
                     let extname = extname.clone();
                     let sig = ctx.call_sig(insn).unwrap();
                     // Operand and result counts must match the signature.
                     assert!(inputs.len() == sig.params.len());
                     assert!(outputs.len() == sig.returns.len());
                     (
                         AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
                         &inputs[..],
                     )
                 }
                 Opcode::CallIndirect => {
                     // The first input is the callee pointer.
                     let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                     let sig = ctx.call_sig(insn).unwrap();
                     // One input is the callee pointer, the rest are
                     // arguments.
                     assert!(inputs.len() - 1 == sig.params.len());
                     assert!(outputs.len() == sig.returns.len());
                     (
                         AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
                         &inputs[1..],
                     )
                 }
                 _ => unreachable!(),
             };

             abi.emit_stack_pre_adjust(ctx);
             assert!(inputs.len() == abi.num_args());
             // Arguments are copied in the order dictated by the ABI object,
             // which need not be the order in which they appear.
             for i in abi.get_copy_to_arg_order() {
                 let input = inputs[i];
                 let arg_regs = put_input_in_regs(ctx, input);
                 abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
             }
             abi.emit_call(ctx);
             for (i, output) in outputs.iter().enumerate() {
                 let retval_regs = get_output_reg(ctx, *output);
                 abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
             }
             abi.emit_stack_post_adjust(ctx);
         }

         Opcode::GetPinnedReg => {
             // Reads the pinned register with a 64-bit move into the output.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
         }

         Opcode::SetPinnedReg => {
             // Writes the input into the pinned register with a 64-bit move.
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
         }

         Opcode::Spill
         | Opcode::Fill
         | Opcode::FillNop
         | Opcode::Regmove
         | Opcode::CopySpecial
         | Opcode::CopyToSsa
         | Opcode::CopyNop
         | Opcode::AdjustSpDown
         | Opcode::AdjustSpUpImm
         | Opcode::AdjustSpDownImm
         | Opcode::IfcmpSp
         | Opcode::Regspill
         | Opcode::Regfill => {
             panic!("Unused opcode should not be encountered.");
         }

         Opcode::Jump
         | Opcode::Fallthrough
         | Opcode::Brz
         | Opcode::Brnz
         | Opcode::BrIcmp
         | Opcode::Brif
         | Opcode::Brff
         | Opcode::IndirectJumpTableBr
         | Opcode::BrTable => {
             panic!("Branch opcode reached non-branch lowering logic!");
         }

         Opcode::Vconst => {
             let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             lower_constant_f128(ctx, rd, value);
         }

         Opcode::RawBitcast => {
             // Lowered as a plain register move; note that the move is typed
             // with the *input* type rather than the output type.
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ctx.input_ty(insn, 0);
             ctx.emit(Inst::gen_move(rd, rm, ty));
         }

         Opcode::Extractlane => {
             // The lane index is carried as an 8-bit immediate on the
             // instruction; any other instruction format is impossible here.
             let lane = match ctx.data(insn) {
                 InstructionData::BinaryImm8 { imm, .. } => *imm,
                 _ => unreachable!(),
             };
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let vector = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let lanes = VectorSize::from_ty(ctx.input_ty(insn, 0));
             let lane_ty = ty.unwrap();

             let extract = if ty_has_int_representation(lane_ty) {
                 Inst::MovFromVec {
                     rd: dst,
                     rn: vector,
                     idx: lane,
                     size: lanes,
                 }
             } else if lane == 0 {
                 // Plain moves are faster on some processors.
                 Inst::gen_move(dst, vector, lane_ty)
             } else {
                 Inst::FpuMoveFromVec {
                     rd: dst,
                     rn: vector,
                     idx: lane,
                     size: lanes,
                 }
             };
             ctx.emit(extract);
         }

         Opcode::Insertlane => {
             // Copies the whole input vector into the destination and then
             // overwrites the selected lane with the second operand.
             //
             // The lane index is carried as an 8-bit immediate.
             let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
                 *imm
             } else {
                 unreachable!();
             };
             // Type of the value being inserted (second operand).
             let input_ty = ctx.input_ty(insn, 1);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             // `rm` is the vector operand, `rn` the value to insert.
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
             let size = VectorSize::from_ty(ty);

             ctx.emit(Inst::gen_move(rd, rm, ty));

             if ty_has_int_representation(input_ty) {
                 ctx.emit(Inst::MovToVec { rd, rn, idx, size });
             } else {
                 // Otherwise the value is taken from lane 0 of `rn` and
                 // written to lane `idx` of the destination.
                 ctx.emit(Inst::VecMovElement {
                     rd,
                     rn,
                     dest_idx: idx,
                     src_idx: 0,
                     size,
                 });
             }
         }

         Opcode::Splat => {
             // Broadcasts a scalar into every lane of the output vector. The
             // producer of the scalar is inspected so that constants and
             // loads can be handled specially:
             //   * a constant (directly, or through ireduce / breduce) goes
             //     to `lower_splat_const`;
             //   * a load is sunk into this instruction and emitted as a
             //     `VecLoadReplicate`;
             //   * anything else is duplicated from a register.
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let size = VectorSize::from_ty(ty.unwrap());

             // Case 1: the input is produced directly by a constant.
             if let Some((_, insn)) = maybe_input_insn_multi(
                 ctx,
                 inputs[0],
                 &[
                     Opcode::Bconst,
                     Opcode::F32const,
                     Opcode::F64const,
                     Opcode::Iconst,
                 ],
             ) {
                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
             // Case 2: an integer constant seen through an ireduce.
             } else if let Some(insn) =
                 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
             {
                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
             // Case 3: a boolean constant seen through a breduce.
             } else if let Some(insn) =
                 maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
             {
                 lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
             // Case 4: the input is produced by a load.
             } else if let Some((_, insn)) = maybe_input_insn_multi(
                 ctx,
                 inputs[0],
                 &[
                     Opcode::Uload8,
                     Opcode::Sload8,
                     Opcode::Uload16,
                     Opcode::Sload16,
                     Opcode::Uload32,
                     Opcode::Sload32,
                     Opcode::Load,
                 ],
             ) {
                 // The load is sunk into this instruction and lowered here.
                 ctx.sink_inst(insn);
                 let load_inputs = insn_inputs(ctx, insn);
                 let load_outputs = insn_outputs(ctx, insn);
                 lower_load(
                     ctx,
                     insn,
                     &load_inputs[..],
                     load_outputs[0],
                     // The destination and element type supplied by
                     // `lower_load` are ignored: the replicating load writes
                     // straight to the splat's own `rd`.
                     |ctx, _rd, _elem_ty, mem| {
                         let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                         // Emit the address computation only when
                         // `gen_load_addr` returns one.
                         let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
                         if let Some(addr_inst) = addr_inst {
                             ctx.emit(addr_inst);
                         }
                         ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
                     },
                 );
             // Case 5: duplicate from a register; the instruction used depends
             // on whether the scalar type has an integer representation.
             } else {
                 let input_ty = ctx.input_ty(insn, 0);
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let inst = if ty_has_int_representation(input_ty) {
                     Inst::VecDup { rd, rn, size }
                 } else {
                     Inst::VecDupFromFpu { rd, rn, size }
                 };

                 ctx.emit(inst);
             }
         }

         Opcode::ScalarToVector => {
             // Only I32 -> I32X4 and I64 -> I64X2 are supported; both are
             // lowered to a single `MovToFpu`. Every other combination
             // returns `CodegenError::Unsupported`.
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let input_ty = ctx.input_ty(insn, 0);
             if (input_ty == I32 && ty.unwrap() == I32X4)
                 || (input_ty == I64 && ty.unwrap() == I64X2)
             {
                 ctx.emit(Inst::MovToFpu {
                     rd,
                     rn,
                     size: ScalarSize::from_ty(input_ty),
                 });
             } else {
                 // `ty` is still the `Option` here, so it is printed in its
                 // `Debug` form.
                 return Err(CodegenError::Unsupported(format!(
                     "ScalarToVector: unsupported types {:?} -> {:?}",
                     input_ty, ty
                 )));
             }
         }

         Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();

             // cmeq vtmp.2d, vm.2d, #0
             // addp dtmp, vtmp.2d
             // fcmp dtmp, dtmp
             // cset xd, eq
             //
             // Note that after the ADDP the value of the temporary register will
             // be either 0 when all input elements are true, i.e. non-zero, or a
             // NaN otherwise (either -1 or -2 when represented as an integer);
             // NaNs are the only floating-point numbers that compare unequal to
             // themselves.

             ctx.emit(Inst::VecMisc {
                 op: VecMisc2::Cmeq0,
                 rd: tmp,
                 rn: rm,
                 size: VectorSize::Size64x2,
             });
             ctx.emit(Inst::VecRRPair {
                 op: VecPairOp::Addp,
                 rd: tmp,
                 rn: tmp.to_reg(),
             });
             ctx.emit(Inst::FpuCmp64 {
                 rn: tmp.to_reg(),
                 rm: tmp.to_reg(),
             });
             materialize_bool_result(ctx, insn, rd, Cond::Eq);
         }

         Opcode::VanyTrue | Opcode::VallTrue => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let src_ty = ctx.input_ty(insn, 0);
             let scratch = ctx.alloc_tmp(src_ty).only_reg().unwrap();
             let size = VectorSize::from_ty(src_ty);

             // The vector is first reduced with umaxp (any_true) or uminv
             // (all_true); the low 64 bits of the reduction are then moved to
             // the destination register and compared against zero. Shown
             // here for byte lanes:
             //
             // umaxp vtmp.16b, vm.16b, vm.16b / uminv btmp, vm.16b
             // mov xd, vtmp.d[0]
             // cmp xd, #0
             // cset xd, ne
             let reduction = match op {
                 Opcode::VanyTrue => Inst::VecRRR {
                     alu_op: VecALUOp::Umaxp,
                     rd: scratch,
                     rn: src,
                     rm: src,
                     size,
                 },
                 _ => Inst::VecLanes {
                     op: VecLanesOp::Uminv,
                     rd: scratch,
                     rn: src,
                     size,
                 },
             };
             ctx.emit(reduction);

             let move_low_half = Inst::MovFromVec {
                 rd: dst,
                 rn: scratch.to_reg(),
                 idx: 0,
                 size: VectorSize::Size64x2,
             };
             ctx.emit(move_low_half);

             // Flag-setting subtract of zero into the zero register: only the
             // flags are kept.
             let compare_with_zero = Inst::AluRRImm12 {
                 alu_op: ALUOp::SubS64,
                 rd: writable_zero_reg(),
                 rn: dst.to_reg(),
                 imm12: Imm12::zero(),
             };
             ctx.emit(compare_with_zero);

             materialize_bool_result(ctx, insn, dst, Cond::Ne);
         }

         Opcode::VhighBits => {
             let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ctx.input_ty(insn, 0);
             // The I8X16, I16X8 and I32X4 sequences below use two vector temporaries; the I16X8
             // and I32X4 sequences also use the integer temporary directly, and the I64X2
             // sequence uses only the integer temporary.  In the vector sequences the shift is
             // done early so as to give the register allocator the possibility of using the same
             // reg for `tmp_v1` and `src_v` in the case that this is the last use of `src_v`.
             // See https://github.com/WebAssembly/simd/pull/201 for the background and
             // derivation of these sequences.  Alternative sequences are discussed in
             // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
             // used here.
             let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
             let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
             let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
             match ty {
                 I8X16 => {
                     // sshr  tmp_v1.16b, src_v.16b, #7
                     // mov   tmp_r0, #0x0201
                     // movk  tmp_r0, #0x0804, lsl 16
                     // movk  tmp_r0, #0x2010, lsl 32
                     // movk  tmp_r0, #0x8040, lsl 48
                     // dup   tmp_v0.2d, tmp_r0
                     // and   tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
                     // ext   tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
                     // zip1  tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                     // addv  tmp_v0h, tmp_v0.8h
                     // mov   dst_r, tmp_v0.h[0]
                     ctx.emit(Inst::VecShiftImm {
                         op: VecShiftImmOp::Sshr,
                         rd: tmp_v1,
                         rn: src_v,
                         size: VectorSize::Size8x16,
                         imm: 7,
                     });
                     lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::And,
                         rd: tmp_v1,
                         rn: tmp_v1.to_reg(),
                         rm: tmp_v0.to_reg(),
                         size: VectorSize::Size8x16,
                     });
                     ctx.emit(Inst::VecExtract {
                         rd: tmp_v0,
                         rn: tmp_v1.to_reg(),
                         rm: tmp_v1.to_reg(),
                         imm4: 8,
                     });
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::Zip1,
                         rd: tmp_v0,
                         rn: tmp_v1.to_reg(),
                         rm: tmp_v0.to_reg(),
                         size: VectorSize::Size8x16,
                     });
                     ctx.emit(Inst::VecLanes {
                         op: VecLanesOp::Addv,
                         rd: tmp_v0,
                         rn: tmp_v0.to_reg(),
                         size: VectorSize::Size16x8,
                     });
                     ctx.emit(Inst::MovFromVec {
                         rd: dst_r,
                         rn: tmp_v0.to_reg(),
                         idx: 0,
                         size: VectorSize::Size16x8,
                     });
                 }
                 I16X8 => {
                     // sshr  tmp_v1.8h, src_v.8h, #15
                     // mov   tmp_r0, #0x1
                     // movk  tmp_r0, #0x2, lsl 16
                     // movk  tmp_r0, #0x4, lsl 32
                     // movk  tmp_r0, #0x8, lsl 48
                     // dup   tmp_v0.2d, tmp_r0
                     // shl   tmp_r0, tmp_r0, #4
                     // mov   tmp_v0.d[1], tmp_r0
                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                     // addv  tmp_v0h, tmp_v0.8h
                     // mov   dst_r, tmp_v0.h[0]
                     ctx.emit(Inst::VecShiftImm {
                         op: VecShiftImmOp::Sshr,
                         rd: tmp_v1,
                         rn: src_v,
                         size: VectorSize::Size16x8,
                         imm: 15,
                     });
                     lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
                     ctx.emit(Inst::VecDup {
                         rd: tmp_v0,
                         rn: tmp_r0.to_reg(),
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: ALUOp::Lsl64,
                         rd: tmp_r0,
                         rn: tmp_r0.to_reg(),
                         immshift: ImmShift { imm: 4 },
                     });
                     ctx.emit(Inst::MovToVec {
                         rd: tmp_v0,
                         rn: tmp_r0.to_reg(),
                         idx: 1,
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::And,
                         rd: tmp_v0,
                         rn: tmp_v1.to_reg(),
                         rm: tmp_v0.to_reg(),
                         size: VectorSize::Size8x16,
                     });
                     ctx.emit(Inst::VecLanes {
                         op: VecLanesOp::Addv,
                         rd: tmp_v0,
                         rn: tmp_v0.to_reg(),
                         size: VectorSize::Size16x8,
                     });
                     ctx.emit(Inst::MovFromVec {
                         rd: dst_r,
                         rn: tmp_v0.to_reg(),
                         idx: 0,
                         size: VectorSize::Size16x8,
                     });
                 }
                 I32X4 => {
                     // sshr  tmp_v1.4s, src_v.4s, #31
                     // mov   tmp_r0, #0x1
                     // movk  tmp_r0, #0x2, lsl 32
                     // dup   tmp_v0.2d, tmp_r0
                     // shl   tmp_r0, tmp_r0, #2
                     // mov   tmp_v0.d[1], tmp_r0
                     // and   tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                     // addv  tmp_v0s, tmp_v0.4s
                     // mov   dst_r, tmp_v0.s[0]
                     ctx.emit(Inst::VecShiftImm {
                         op: VecShiftImmOp::Sshr,
                         rd: tmp_v1,
                         rn: src_v,
                         size: VectorSize::Size32x4,
                         imm: 31,
                     });
                     lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
                     ctx.emit(Inst::VecDup {
                         rd: tmp_v0,
                         rn: tmp_r0.to_reg(),
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: ALUOp::Lsl64,
                         rd: tmp_r0,
                         rn: tmp_r0.to_reg(),
                         immshift: ImmShift { imm: 2 },
                     });
                     ctx.emit(Inst::MovToVec {
                         rd: tmp_v0,
                         rn: tmp_r0.to_reg(),
                         idx: 1,
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::VecRRR {
                         alu_op: VecALUOp::And,
                         rd: tmp_v0,
                         rn: tmp_v1.to_reg(),
                         rm: tmp_v0.to_reg(),
                         size: VectorSize::Size8x16,
                     });
                     ctx.emit(Inst::VecLanes {
                         op: VecLanesOp::Addv,
                         rd: tmp_v0,
                         rn: tmp_v0.to_reg(),
                         size: VectorSize::Size32x4,
                     });
                     ctx.emit(Inst::MovFromVec {
                         rd: dst_r,
                         rn: tmp_v0.to_reg(),
                         idx: 0,
                         size: VectorSize::Size32x4,
                     });
                 }
                 I64X2 => {
                     // mov dst_r, src_v.d[0]
                     // mov tmp_r0, src_v.d[1]
                     // lsr dst_r, dst_r, #63
                     // lsr tmp_r0, tmp_r0, #63
                     // add dst_r, dst_r, tmp_r0, lsl #1
                     ctx.emit(Inst::MovFromVec {
                         rd: dst_r,
                         rn: src_v,
                         idx: 0,
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::MovFromVec {
                         rd: tmp_r0,
                         rn: src_v,
                         idx: 1,
                         size: VectorSize::Size64x2,
                     });
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: ALUOp::Lsr64,
                         rd: dst_r,
                         rn: dst_r.to_reg(),
                         immshift: ImmShift::maybe_from_u64(63).unwrap(),
                     });
                     ctx.emit(Inst::AluRRImmShift {
                         alu_op: ALUOp::Lsr64,
                         rd: tmp_r0,
                         rn: tmp_r0.to_reg(),
                         immshift: ImmShift::maybe_from_u64(63).unwrap(),
                     });
                     ctx.emit(Inst::AluRRRShift {
                         alu_op: ALUOp::Add32,
                         rd: dst_r,
                         rn: dst_r.to_reg(),
                         rm: tmp_r0.to_reg(),
                         shiftop: ShiftOpAndAmt::new(
                             ShiftOp::LSL,
                             ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
                         ),
                     });
                 }
                 _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
             }
         }

         Opcode::Shuffle => {
             // Lowers a shuffle of two vectors to a two-register table lookup
             // (`VecTbl2`), with the immediate mask used as the index vector.
             let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             // 2 register table vector lookups require consecutive table registers;
             // we satisfy this constraint by hardcoding the usage of v29 and v30.
             let temp = writable_vreg(29);
             let temp2 = writable_vreg(30);
             let input_ty = ctx.input_ty(insn, 0);
             assert_eq!(input_ty, ctx.input_ty(insn, 1));
             // Make sure that both inputs are in virtual registers, since it is
             // not guaranteed that we can get them safely to the temporaries if
             // either is in a real register.
             let rn = ctx.ensure_in_vreg(rn, input_ty);
             let rn2 = ctx.ensure_in_vreg(rn2, input_ty);

             // The mask is materialized in `rd`, which then serves both as the
             // index operand (`rm`) and as the destination of the lookup.
             lower_constant_f128(ctx, rd, mask);
             ctx.emit(Inst::gen_move(temp, rn, input_ty));
             ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
             ctx.emit(Inst::VecTbl2 {
                 rd,
                 rn: temp.to_reg(),
                 rn2: temp2.to_reg(),
                 rm: rd.to_reg(),
                 is_extension: false,
             });
         }

         Opcode::Swizzle => {
             // Lowered to a single `VecTbl`: input 0 is passed as `rn` and
             // input 1 as `rm`. The second input is placed in a register first.
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let second = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let first = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

             ctx.emit(Inst::VecTbl {
                 rd: dst,
                 rn: first,
                 rm: second,
                 is_extension: false,
             });
         }

         Opcode::Vsplit | Opcode::Vconcat => {
             // These vector ops have no lowering on this backend yet. Report
             // that through the function's `CodegenResult` as an `Unsupported`
             // error (the same way the `WideningPairwiseDotProductS` arm
             // reports unsupported lane types) rather than panicking, so the
             // caller can handle the failure.
             return Err(CodegenError::Unsupported(format!(
                 "Unimplemented lowering: {:?}",
                 op
             )));
         }

         Opcode::Isplit => {
             // Only the i128 -> (i64, i64) form is handled: the two registers
             // backing the input are copied to the two outputs.
             assert_eq!(
                 ctx.input_ty(insn, 0),
                 I128,
                 "Isplit only implemented for i128's"
             );
             assert_eq!(ctx.output_ty(insn, 0), I64);
             assert_eq!(ctx.output_ty(insn, 1), I64);

             let wide_src = put_input_in_regs(ctx, inputs[0]);
             let lo_out = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let hi_out = get_output_reg(ctx, outputs[1]).only_reg().unwrap();

             // Register 0 of the pair feeds output 0, register 1 feeds output 1.
             ctx.emit(Inst::gen_move(lo_out, wide_src.regs()[0], I64));
             ctx.emit(Inst::gen_move(hi_out, wide_src.regs()[1], I64));
         }

         Opcode::Iconcat => {
             // Only the (i64, i64) -> i128 form is handled: each input is
             // copied into one of the two registers backing the output.
             assert_eq!(
                 ctx.output_ty(insn, 0),
                 I128,
                 "Iconcat only implemented for i128's"
             );
             assert_eq!(ctx.input_ty(insn, 0), I64);
             assert_eq!(ctx.input_ty(insn, 1), I64);

             let lo_in = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let hi_in = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let wide_dst = get_output_reg(ctx, outputs[0]);

             // Input 0 goes to register 0 of the pair, input 1 to register 1.
             ctx.emit(Inst::gen_move(wide_dst.regs()[0], lo_in, I64));
             ctx.emit(Inst::gen_move(wide_dst.regs()[1], hi_in, I64));
         }

         Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
             // Each opcode maps one-to-one onto a vector ALU min/max operation.
             let vec_op = match op {
                 Opcode::Imax => VecALUOp::Smax,
                 Opcode::Imin => VecALUOp::Smin,
                 Opcode::Umax => VecALUOp::Umax,
                 Opcode::Umin => VecALUOp::Umin,
                 _ => unreachable!(),
             };
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let lhs = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rhs = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             // The lane layout is derived from the result type.
             let lanes = VectorSize::from_ty(ty.unwrap());
             ctx.emit(Inst::VecRRR {
                 alu_op: vec_op,
                 rd: dst,
                 rn: lhs,
                 rm: rhs,
                 size: lanes,
             });
         }

         Opcode::WideningPairwiseDotProductS => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let lhs = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rhs = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let out_ty = ty.unwrap();

             // Only the i32x4 result form is lowered; anything else is
             // reported to the caller as unsupported.
             if out_ty != I32X4 {
                 return Err(CodegenError::Unsupported(format!(
                     "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
                     out_ty
                 )));
             }

             let scratch = ctx.alloc_tmp(I8X16).only_reg().unwrap();
             // The args have type I16X8.
             // "y = i32x4.dot_i16x8_s(a, b)"
             // => smull  tmp, a, b
             //    smull2 y,   a, b
             //    addp   y,   tmp, y
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Smull,
                 rd: scratch,
                 rn: lhs,
                 rm: rhs,
                 size: VectorSize::Size16x8,
             });
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Smull2,
                 rd: dst,
                 rn: lhs,
                 rm: rhs,
                 size: VectorSize::Size16x8,
             });
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Addp,
                 rd: dst,
                 rn: scratch.to_reg(),
                 rm: dst.to_reg(),
                 size: VectorSize::Size32x4,
             });
         }

         Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
             // Binary floating-point arithmetic: one vector instruction for
             // vector types, one scalar FPU instruction otherwise.
             let out_ty = ty.unwrap();
             let width = ty_bits(out_ty);
             let lhs = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rhs = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             if out_ty.is_vector() {
                 let vec_op = match op {
                     Opcode::Fadd => VecALUOp::Fadd,
                     Opcode::Fsub => VecALUOp::Fsub,
                     Opcode::Fmul => VecALUOp::Fmul,
                     Opcode::Fdiv => VecALUOp::Fdiv,
                     Opcode::Fmin => VecALUOp::Fmin,
                     Opcode::Fmax => VecALUOp::Fmax,
                     _ => unreachable!(),
                 };

                 ctx.emit(Inst::VecRRR {
                     rd: dst,
                     rn: lhs,
                     rm: rhs,
                     alu_op: vec_op,
                     size: VectorSize::from_ty(out_ty),
                 });
             } else {
                 // The scalar op is selected by opcode and operand width together.
                 let scalar_op = match (op, width) {
                     (Opcode::Fadd, 32) => FPUOp2::Add32,
                     (Opcode::Fsub, 32) => FPUOp2::Sub32,
                     (Opcode::Fmul, 32) => FPUOp2::Mul32,
                     (Opcode::Fdiv, 32) => FPUOp2::Div32,
                     (Opcode::Fmin, 32) => FPUOp2::Min32,
                     (Opcode::Fmax, 32) => FPUOp2::Max32,
                     (Opcode::Fadd, 64) => FPUOp2::Add64,
                     (Opcode::Fsub, 64) => FPUOp2::Sub64,
                     (Opcode::Fmul, 64) => FPUOp2::Mul64,
                     (Opcode::Fdiv, 64) => FPUOp2::Div64,
                     (Opcode::Fmin, 64) => FPUOp2::Min64,
                     (Opcode::Fmax, 64) => FPUOp2::Max64,
                     _ => panic!("Unknown op/bits combination"),
                 };
                 ctx.emit(Inst::FpuRRR {
                     fpu_op: scalar_op,
                     rd: dst,
                     rn: lhs,
                     rm: rhs,
                 });
             }
         }

         Opcode::FminPseudo | Opcode::FmaxPseudo => {
             let in_ty = ctx.input_ty(insn, 0);
             if in_ty != F32X4 && in_ty != F64X2 {
                 panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
             }

             // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
             // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

             // The comparison operands are swapped between the min and max forms.
             let (cmp_lhs, cmp_rhs) = if op == Opcode::FminPseudo {
                 (a, b)
             } else {
                 (b, a)
             };
             let cmp_lanes = if in_ty == F32X4 {
                 VectorSize::Size32x4
             } else {
                 VectorSize::Size64x2
             };

             // The output register doubles as the holder of the comparison
             // mask, since it is about to be overwritten anyway. That means
             // `fcmgt` writes `dst` while `bsl` still reads both inputs
             // afterwards, so `dst` cannot share a register with either input.
             // Regalloc should handle this correctly, nevertheless.
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Fcmgt,
                 rd: dst,
                 rn: cmp_lhs,
                 rm: cmp_rhs,
                 size: cmp_lanes,
             });
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Bsl,
                 rd: dst,
                 rn: b,
                 rm: a,
                 size: VectorSize::Size8x16,
             });
         }

         Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
             // Unary floating-point operations. The width used to pick the
             // scalar op is that of the *result* type.
             let out_ty = ty.unwrap();
             let out_width = ty_bits(out_ty);
             let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             if out_ty.is_vector() {
                 // Only abs, neg and sqrt have a vector lowering here.
                 let vec_op = match op {
                     Opcode::Sqrt => VecMisc2::Fsqrt,
                     Opcode::Fneg => VecMisc2::Fneg,
                     Opcode::Fabs => VecMisc2::Fabs,
                     _ => unimplemented!(),
                 };

                 ctx.emit(Inst::VecMisc {
                     op: vec_op,
                     rd: dst,
                     rn: src,
                     size: VectorSize::from_ty(out_ty),
                 });
             } else {
                 let scalar_op = match (op, out_width) {
                     (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
                     (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
                     (Opcode::Fneg, 32) => FPUOp1::Neg32,
                     (Opcode::Fneg, 64) => FPUOp1::Neg64,
                     (Opcode::Fabs, 32) => FPUOp1::Abs32,
                     (Opcode::Fabs, 64) => FPUOp1::Abs64,
                     // Promotion yields 64 bits and demotion yields 32 bits;
                     // the opposite result widths are rejected.
                     (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
                     (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
                     (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
                     (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
                     _ => panic!("Unknown op/bits combination"),
                 };
                 ctx.emit(Inst::FpuRR {
                     fpu_op: scalar_op,
                     rd: dst,
                     rn: src,
                 });
             }
         }

         Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
             // Rounding to an integral floating-point value.
             let out_ty = ctx.output_ty(insn, 0);
             if out_ty.is_vector() {
                 // The opcode alone picks the vector rounding instruction...
                 let vec_op = match op {
                     Opcode::Ceil => VecMisc2::Frintp,
                     Opcode::Floor => VecMisc2::Frintm,
                     Opcode::Trunc => VecMisc2::Frintz,
                     Opcode::Nearest => VecMisc2::Frintn,
                     _ => unreachable!(),
                 };
                 // ...and the type alone picks the lane layout. Only F32X4
                 // and F64X2 are accepted.
                 let lanes = match out_ty {
                     F32X4 => VectorSize::Size32x4,
                     F64X2 => VectorSize::Size64x2,
                     _ => panic!("Unknown op/ty combination (vector){:?}", out_ty),
                 };
                 let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                 ctx.emit(Inst::VecMisc {
                     op: vec_op,
                     rd: dst,
                     rn: src,
                     size: lanes,
                 });
             } else {
                 let width = ty_bits(out_ty);
                 let mode = match (op, width) {
                     (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
                     (Opcode::Floor, 32) => FpuRoundMode::Minus32,
                     (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
                     (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
                     (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
                     (Opcode::Floor, 64) => FpuRoundMode::Minus64,
                     (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
                     (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
                     _ => panic!("Unknown op/bits combination (scalar)"),
                 };
                 let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                 ctx.emit(Inst::FpuRound {
                     op: mode,
                     rd: dst,
                     rn: src,
                 });
             }
         }

         Opcode::Fma => {
             // Only 32- and 64-bit results are handled; the result width picks
             // the three-operand FPU op.
             let fused_op = match ty_bits(ctx.output_ty(insn, 0)) {
                 64 => FPUOp3::MAdd64,
                 32 => FPUOp3::MAdd32,
                 _ => panic!("Unknown op size"),
             };
             // Inputs 0, 1 and 2 become `rn`, `rm` and `ra` respectively.
             let src0 = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let src1 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let src2 = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             ctx.emit(Inst::FpuRRRR {
                 fpu_op: fused_op,
                 rd: dst,
                 rn: src0,
                 rm: src1,
                 ra: src2,
             });
         }

         Opcode::Fcopysign => {
             // Scalar copysign: the result is inputs[0] with its sign bit
             // replaced by the sign bit of inputs[1].
             //
             // Scalar NEON operations are used for 64-bit values and vector
             // operations (2S) for 32-bit values; in the latter case all bits
             // except the lowest 32 are still set to 0.
             //
             // Emitted sequence:
             //
             //  mov vd, vn
             //  ushr vtmp, vm, #63 / #31
             //  sli vd, vtmp, #63 / #31

             let out_ty = ctx.output_ty(insn, 0);
             let width = ty_bits(out_ty) as u8;
             assert!(width == 32 || width == 64);
             let magnitude = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let sign_src = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let scratch = ctx.alloc_tmp(F64).only_reg().unwrap();
             // Index of the sign bit: 31 or 63.
             let sign_pos = width - 1;

             // Step 1: start from a copy of the first operand.
             ctx.emit(Inst::gen_move(dst, magnitude, out_ty));

             // Step 2: shift the second operand's sign bit down to bit 0 of
             // the scratch register.
             let shr = FPURightShiftImm::maybe_from_u8(sign_pos, width).unwrap();
             let shr_op = choose_32_64(out_ty, FPUOpRI::UShr32(shr), FPUOpRI::UShr64(shr));
             ctx.emit(Inst::FpuRRI {
                 fpu_op: shr_op,
                 rd: scratch,
                 rn: sign_src,
             });

             // Step 3: shift that bit back up and insert it into the sign
             // position of the destination.
             let shl = FPULeftShiftImm::maybe_from_u8(sign_pos, width).unwrap();
             let sli_op = choose_32_64(out_ty, FPUOpRI::Sli32(shl), FPUOpRI::Sli64(shl));
             ctx.emit(Inst::FpuRRI {
                 fpu_op: sli_op,
                 rd: dst,
                 rn: scratch.to_reg(),
             });
         }

         // Trapping float-to-integer conversion. The emitted sequence is:
         //   1. compare the input with itself and trap with
         //      `BadConversionToInteger` if the comparison is unordered (NaN);
         //   2. compare against a lower bound and trap with `IntegerOverflow`
         //      if the input is too small;
         //   3. compare against an upper bound and trap with `IntegerOverflow`
         //      if the input is too large;
         //   4. perform the actual conversion.
         Opcode::FcvtToUint | Opcode::FcvtToSint => {
             let in_bits = ty_bits(ctx.input_ty(insn, 0));
             let out_bits = ty_bits(ctx.output_ty(insn, 0));
             let signed = op == Opcode::FcvtToSint;
             // 8-, 16- and 32-bit outputs all use the 32-bit conversion op; the
             // range checks below use bounds specific to the actual output width.
             let op = match (signed, in_bits, out_bits) {
                 (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
                 (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
                 (false, 32, 64) => FpuToIntOp::F32ToU64,
                 (true, 32, 64) => FpuToIntOp::F32ToI64,
                 (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
                 (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
                 (false, 64, 64) => FpuToIntOp::F64ToU64,
                 (true, 64, 64) => FpuToIntOp::F64ToI64,
                 _ => panic!("Unknown input/output-bits combination"),
             };

             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             // First, check for NaN: it's important that the NaN check (and its trap) comes
             // before the in-bounds check, per wasm semantics.

             // Check that the input is not a NaN: comparing a value with itself is unordered
             // only when that value is a NaN.
             if in_bits == 32 {
                 ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
             } else {
                 ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
             }
             let trap_code = TrapCode::BadConversionToInteger;
             ctx.emit(Inst::TrapIf {
                 trap_code,
                 kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
             });

             // Scratch register that holds each bound constant in turn.
             let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

             // Check that the input is in range, with "truncate towards zero" semantics. This means
             // we allow values that are slightly out of range:
             // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
             // can be represented), and strictly less than INT_MAX+1 (when this can be
             // represented).
             // - for unsigned conversions, we allow values strictly greater than -1, and strictly
             // less than UINT_MAX+1 (when this can be represented).

             if in_bits == 32 {
                 // From float32.
                 // `low_cond` is the comparison the input must satisfy against `low_bound`:
                 // strict when the bound is MIN-1 (or -1), non-strict when the bound is MIN itself.
                 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                     (true, 8) => (
                         i8::min_value() as f32 - 1.,
                         FloatCC::GreaterThan,
                         i8::max_value() as f32 + 1.,
                     ),
                     (true, 16) => (
                         i16::min_value() as f32 - 1.,
                         FloatCC::GreaterThan,
                         i16::max_value() as f32 + 1.,
                     ),
                     (true, 32) => (
                         i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
                         FloatCC::GreaterThanOrEqual,
                         i32::max_value() as f32 + 1.,
                     ),
                     (true, 64) => (
                         i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
                         FloatCC::GreaterThanOrEqual,
                         i64::max_value() as f32 + 1.,
                     ),
                     (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
                     (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
                     (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
                     (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
                     _ => panic!("Unknown input/output-bits combination"),
                 };

                 // Lower bound: trap unless the input satisfies `low_cond` (> or >=) against
                 // low_bound.
                 lower_constant_f32(ctx, tmp, low_bound);
                 ctx.emit(Inst::FpuCmp32 {
                     rn,
                     rm: tmp.to_reg(),
                 });
                 let trap_code = TrapCode::IntegerOverflow;
                 ctx.emit(Inst::TrapIf {
                     trap_code,
                     kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                 });

                 // Upper bound: trap unless the input is strictly less than high_bound.
                 lower_constant_f32(ctx, tmp, high_bound);
                 ctx.emit(Inst::FpuCmp32 {
                     rn,
                     rm: tmp.to_reg(),
                 });
                 let trap_code = TrapCode::IntegerOverflow;
                 ctx.emit(Inst::TrapIf {
                     trap_code,
                     kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                 });
             } else {
                 // From float64.
                 // Same scheme as the float32 case, with bounds computed in f64.
                 let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                     (true, 8) => (
                         i8::min_value() as f64 - 1.,
                         FloatCC::GreaterThan,
                         i8::max_value() as f64 + 1.,
                     ),
                     (true, 16) => (
                         i16::min_value() as f64 - 1.,
                         FloatCC::GreaterThan,
                         i16::max_value() as f64 + 1.,
                     ),
                     (true, 32) => (
                         i32::min_value() as f64 - 1.,
                         FloatCC::GreaterThan,
                         i32::max_value() as f64 + 1.,
                     ),
                     (true, 64) => (
                         i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an f64.
                         FloatCC::GreaterThanOrEqual,
                         i64::max_value() as f64 + 1.,
                     ),
                     (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
                     (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
                     (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
                     (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
                     _ => panic!("Unknown input/output-bits combination"),
                 };

                 // Lower bound: trap unless the input satisfies `low_cond` (> or >=) against
                 // low_bound.
                 lower_constant_f64(ctx, tmp, low_bound);
                 ctx.emit(Inst::FpuCmp64 {
                     rn,
                     rm: tmp.to_reg(),
                 });
                 let trap_code = TrapCode::IntegerOverflow;
                 ctx.emit(Inst::TrapIf {
                     trap_code,
                     kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                 });

                 // Upper bound: trap unless the input is strictly less than high_bound.
                 lower_constant_f64(ctx, tmp, high_bound);
                 ctx.emit(Inst::FpuCmp64 {
                     rn,
                     rm: tmp.to_reg(),
                 });
                 let trap_code = TrapCode::IntegerOverflow;
                 ctx.emit(Inst::TrapIf {
                     trap_code,
                     kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                 });
             };

             // Do the conversion.
             ctx.emit(Inst::FpuToInt { op, rd, rn });
         }

         Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
             // Integer-to-float conversion, signed or unsigned.
             let out_ty = ty.unwrap();
             let is_signed = op == Opcode::FcvtFromSint;
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             if out_ty.is_vector() {
                 let lane_cvt = if is_signed {
                     VecMisc2::Scvtf
                 } else {
                     VecMisc2::Ucvtf
                 };
                 let src = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

                 ctx.emit(Inst::VecMisc {
                     op: lane_cvt,
                     rd: dst,
                     rn: src,
                     size: VectorSize::from_ty(out_ty),
                 });
             } else {
                 let src_bits = ty_bits(ctx.input_ty(insn, 0));
                 let dst_bits = ty_bits(out_ty);
                 // Inputs of 32 bits or fewer share the 32-bit source forms;
                 // the extension mode chosen below widens them to 32 bits.
                 let cvt = match (is_signed, src_bits, dst_bits) {
                     (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
                     (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
                     (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
                     (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
                     (true, 64, 32) => IntToFpuOp::I64ToF32,
                     (false, 64, 32) => IntToFpuOp::U64ToF32,
                     (true, 64, 64) => IntToFpuOp::I64ToF64,
                     (false, 64, 64) => IntToFpuOp::U64ToF64,
                     _ => panic!("Unknown input/output-bits combination"),
                 };
                 // The extension kind follows the signedness, and its width
                 // follows the source form picked above.
                 let extend = match (is_signed, src_bits) {
                     (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
                     (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
                     (true, 64) => NarrowValueMode::SignExtend64,
                     (false, 64) => NarrowValueMode::ZeroExtend64,
                     _ => panic!("Unknown input size"),
                 };
                 let src = put_input_in_reg(ctx, inputs[0], extend);
                 ctx.emit(Inst::IntToFpu {
                     op: cvt,
                     rd: dst,
                     rn: src,
                 });
             }
         }

         // Saturating float-to-integer conversion. Vector types use a single
         // vector convert instruction. Scalars clamp the input to the output
         // range, replace NaN with zero, and then convert.
         Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
             let ty = ty.unwrap();
             let out_signed = op == Opcode::FcvtToSintSat;
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

             if ty.is_vector() {
                 let op = if out_signed {
                     VecMisc2::Fcvtzs
                 } else {
                     VecMisc2::Fcvtzu
                 };

                 ctx.emit(Inst::VecMisc {
                     op,
                     rd,
                     rn,
                     size: VectorSize::from_ty(ty),
                 });
             } else {
                 let in_ty = ctx.input_ty(insn, 0);
                 let in_bits = ty_bits(in_ty);
                 let out_bits = ty_bits(ty);
                 // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
                 // FMIN Vtmp2, Vin, Vtmp1
                 // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
                 // FMAX Vtmp2, Vtmp2, Vtmp1
                 // (if signed) FIMM Vtmp1, 0
                 // FCMP Vin, Vin
                 // FCSEL Vtmp2, Vtmp1, Vtmp2, NE  // on NaN, select 0
                 // convert Rout, Vtmp2

                 assert!(in_bits == 32 || in_bits == 64);
                 assert!(out_bits == 32 || out_bits == 64);

                 // The bounds use the inherent `min_value()`/`max_value()`
                 // helpers instead of `std::` paths, so this code does not
                 // require the `std` crate to be available.
                 let min: f64 = match (out_bits, out_signed) {
                     (32, true) => i32::min_value() as f64,
                     (64, true) => i64::min_value() as f64,
                     (32, false) | (64, false) => 0.0,
                     _ => unreachable!(),
                 };

                 let max = match (out_bits, out_signed) {
                     (32, true) => i32::max_value() as f64,
                     (32, false) => u32::max_value() as f64,
                     (64, true) => i64::max_value() as f64,
                     (64, false) => u64::max_value() as f64,
                     _ => unreachable!(),
                 };

                 // rtmp1 holds the constant currently being compared against;
                 // rtmp2 accumulates the clamped value.
                 let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
                 let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();

                 // Clamp from above: rtmp2 = min(input, max).
                 if in_bits == 32 {
                     lower_constant_f32(ctx, rtmp1, max as f32);
                 } else {
                     lower_constant_f64(ctx, rtmp1, max);
                 }
                 ctx.emit(Inst::FpuRRR {
                     fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
                     rd: rtmp2,
                     rn,
                     rm: rtmp1.to_reg(),
                 });
                 // Clamp from below: rtmp2 = max(rtmp2, min).
                 if in_bits == 32 {
                     lower_constant_f32(ctx, rtmp1, min as f32);
                 } else {
                     lower_constant_f64(ctx, rtmp1, min);
                 }
                 ctx.emit(Inst::FpuRRR {
                     fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
                     rd: rtmp2,
                     rn: rtmp2.to_reg(),
                     rm: rtmp1.to_reg(),
                 });
                 // The NaN fix-up below needs rtmp1 to be zero. For unsigned
                 // outputs it already is (the lower bound loaded above is
                 // 0.0); for signed outputs it holds the negative lower bound
                 // and must be reloaded.
                 if out_signed {
                     if in_bits == 32 {
                         lower_constant_f32(ctx, rtmp1, 0.0);
                     } else {
                         lower_constant_f64(ctx, rtmp1, 0.0);
                     }
                 }
                 // Compare the input with itself and select the zero in rtmp1
                 // when the comparison yields `Ne` (i.e. the input is NaN),
                 // otherwise keep the clamped value.
                 if in_bits == 32 {
                     ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
                     ctx.emit(Inst::FpuCSel32 {
                         rd: rtmp2,
                         rn: rtmp1.to_reg(),
                         rm: rtmp2.to_reg(),
                         cond: Cond::Ne,
                     });
                 } else {
                     ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
                     ctx.emit(Inst::FpuCSel64 {
                         rd: rtmp2,
                         rn: rtmp1.to_reg(),
                         rm: rtmp2.to_reg(),
                         cond: Cond::Ne,
                     });
                 }

                 let cvt = match (in_bits, out_bits, out_signed) {
                     (32, 32, false) => FpuToIntOp::F32ToU32,
                     (32, 32, true) => FpuToIntOp::F32ToI32,
                     (32, 64, false) => FpuToIntOp::F32ToU64,
                     (32, 64, true) => FpuToIntOp::F32ToI64,
                     (64, 32, false) => FpuToIntOp::F64ToU32,
                     (64, 32, true) => FpuToIntOp::F64ToI32,
                     (64, 64, false) => FpuToIntOp::F64ToU64,
                     (64, 64, true) => FpuToIntOp::F64ToI64,
                     _ => unreachable!(),
                 };
                 // Finally convert the clamped, NaN-free value.
                 ctx.emit(Inst::FpuToInt {
                     op: cvt,
                     rd,
                     rn: rtmp2.to_reg(),
                 });
             }
         }

         Opcode::IaddIfcout => {
             // A two-output add: the sum plus a flags value. It is needed for
             // the legalizer's explicit heap-check sequence (and possibly
             // elsewhere); the flags output only exists so that overflow can
             // be tested with `backend.unsigned_add_overflow_condition()`.
             //
             // CLIF validation ensures that no other flag-setting operation
             // sits between this instruction and the consumer of its flags
             // (e.g., a Trapif). The flags can therefore be communicated
             // implicitly through the processor flags, without generating
             // them into a register: it is enough to emit the flag-setting
             // form of the add (`adds`).
             //
             // For the same reason the second output is never produced
             // explicitly. Values of type `iflags` or `fflags` are never
             // materialized; any instruction consuming one looks directly for
             // the flags-producing instruction (which can always be found, by
             // construction) and merges it.

             // Otherwise this is lowered like a plain iadd, but with an AddS
             // opcode.
             let sum = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let lhs = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rhs = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let adds_op = choose_32_64(ty.unwrap(), ALUOp::AddS32, ALUOp::AddS64);
             ctx.emit(alu_inst_imm12(adds_op, sum, lhs, rhs));
         }

         // Immediate-operand ALU opcodes and carry/borrow arithmetic opcodes
         // have no lowering in this function: encountering one panics.
         // NOTE(review): presumably these are rewritten into other opcodes
         // before lowering runs — confirm against the legalizer.
         Opcode::IaddImm
         | Opcode::ImulImm
         | Opcode::UdivImm
         | Opcode::SdivImm
         | Opcode::UremImm
         | Opcode::SremImm
         | Opcode::IrsubImm
         | Opcode::IaddCin
         | Opcode::IaddIfcin
         | Opcode::IaddCout
         | Opcode::IaddCarry
         | Opcode::IaddIfcarry
         | Opcode::IsubBin
         | Opcode::IsubIfbin
         | Opcode::IsubBout
         | Opcode::IsubIfbout
         | Opcode::IsubBorrow
         | Opcode::IsubIfborrow
         | Opcode::BandImm
         | Opcode::BorImm
         | Opcode::BxorImm
         | Opcode::RotlImm
         | Opcode::RotrImm
         | Opcode::IshlImm
         | Opcode::UshrImm
         | Opcode::SshrImm
         | Opcode::IcmpImm
         | Opcode::IfcmpImm => {
             panic!("ALU+imm and ALU+carry ops should not appear here!");
         }

         #[cfg(feature = "x86")]
         Opcode::X86Udivmodx
         | Opcode::X86Sdivmodx
         | Opcode::X86Umulx
         | Opcode::X86Smulx
         | Opcode::X86Cvtt2si
         | Opcode::X86Fmin
         | Opcode::X86Fmax
         | Opcode::X86Push
         | Opcode::X86Pop
         | Opcode::X86Bsr
         | Opcode::X86Bsf
         | Opcode::X86Pblendw
         | Opcode::X86Pshufd
         | Opcode::X86Pshufb
         | Opcode::X86Pextr
         | Opcode::X86Pinsr
         | Opcode::X86Insertps
         | Opcode::X86Movsd
         | Opcode::X86Movlhps
         | Opcode::X86Palignr
         | Opcode::X86Psll
         | Opcode::X86Psrl
         | Opcode::X86Psra
         | Opcode::X86Ptest
         | Opcode::X86Pmaxs
         | Opcode::X86Pmaxu
         | Opcode::X86Pmins
         | Opcode::X86Pminu
         | Opcode::X86Pmullq
         | Opcode::X86Pmuludq
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl
         | Opcode::X86Vcvtudq2ps
         | Opcode::X86ElfTlsGetAddr
         | Opcode::X86MachoTlsGetAddr => {
             panic!("x86-specific opcode in supposedly arch-neutral IR!");
         }

         Opcode::DummySargT => unreachable!(),

         Opcode::Iabs => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
             ctx.emit(Inst::VecMisc {
                 op: VecMisc2::Abs,
                 rd,
                 rn,
                 size: VectorSize::from_ty(ty),
             });
         }
         Opcode::AvgRound => {
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
             ctx.emit(Inst::VecRRR {
                 alu_op: VecALUOp::Urhadd,
                 rd,
                 rn,
                 rm,
                 size: VectorSize::from_ty(ty),
             });
         }

         Opcode::Snarrow | Opcode::Unarrow => {
             let op = if op == Opcode::Snarrow {
                 VecMiscNarrowOp::Sqxtn
             } else {
                 VecMiscNarrowOp::Sqxtun
             };
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();

             ctx.emit(Inst::VecMiscNarrow {
                 op,
                 rd,
                 rn,
                 size: VectorSize::from_ty(ty),
                 high_half: false,
             });
             ctx.emit(Inst::VecMiscNarrow {
                 op,
                 rd,
                 rn: rn2,
                 size: VectorSize::from_ty(ty),
                 high_half: true,
             });
         }

         Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
             let lane_type = ty.unwrap().lane_type();
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let (t, high_half) = match (lane_type, op) {
                 (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
                 (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
                 (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
                 (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
                 (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
                 (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
                 (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
                 (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
                 _ => {
                     return Err(CodegenError::Unsupported(format!(
                         "Unsupported SIMD vector lane type: {:?}",
                         lane_type
                     )));
                 }
             };

             ctx.emit(Inst::VecExtend {
                 t,
                 rd,
                 rn,
                 high_half,
             });
         }

         Opcode::TlsValue => unimplemented!("tls_value"),
         Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
     }

     Ok(())
 }

 pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     branches: &[IRInst],
     targets: &[MachLabel],
 ) -> CodegenResult<()> {
     // A block should end with at most two branches. The first may be a
     // conditional branch; a conditional branch can be followed only by an
     // unconditional branch or fallthrough. Otherwise, if only one branch,
     // it may be an unconditional branch, a fallthrough, a return, or a
     // trap. These conditions are verified by `is_ebb_basic()` during the
     // verifier pass.
     assert!(branches.len() <= 2);

     if branches.len() == 2 {
         // Must be a conditional branch followed by an unconditional branch.
         let op0 = ctx.data(branches[0]).opcode();
         let op1 = ctx.data(branches[1]).opcode();

         assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
         let taken = BranchTarget::Label(targets[0]);
         // not_taken target is the target of the second branch, even if it is a Fallthrough
         // instruction: because we reorder blocks while we lower, the fallthrough in the new
         // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
         // explicitly-provided target.
         let not_taken = BranchTarget::Label(targets[1]);

         match op0 {
             Opcode::Brz | Opcode::Brnz => {
                 let flag_input = InsnInput {
                     insn: branches[0],
                     input: 0,
                 };
                 if let Some(icmp_insn) =
                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
                 {
                     let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                     let cond = lower_condcode(condcode);
                     let is_signed = condcode_is_signed(condcode);
                     let negated = op0 == Opcode::Brz;
                     let cond = if negated { cond.invert() } else { cond };

                     lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind: CondBrKind::Cond(cond),
                     });
                 } else if let Some(fcmp_insn) =
                     maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
                 {
                     let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                     let cond = lower_fp_condcode(condcode);
                     let negated = op0 == Opcode::Brz;
                     let cond = if negated { cond.invert() } else { cond };

                     lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind: CondBrKind::Cond(cond),
                     });
                 } else {
                     let rt = put_input_in_reg(
                         ctx,
                         InsnInput {
                             insn: branches[0],
                             input: 0,
                         },
                         NarrowValueMode::ZeroExtend64,
                     );
                     let kind = match op0 {
                         Opcode::Brz => CondBrKind::Zero(rt),
                         Opcode::Brnz => CondBrKind::NotZero(rt),
                         _ => unreachable!(),
                     };
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind,
                     });
                 }
             }
             Opcode::BrIcmp => {
                 let condcode = ctx.data(branches[0]).cond_code().unwrap();
                 let cond = lower_condcode(condcode);
                 let kind = CondBrKind::Cond(cond);

                 let is_signed = condcode_is_signed(condcode);
                 let ty = ctx.input_ty(branches[0], 0);
                 let bits = ty_bits(ty);
                 let narrow_mode = match (bits <= 32, is_signed) {
                     (true, true) => NarrowValueMode::SignExtend32,
                     (true, false) => NarrowValueMode::ZeroExtend32,
                     (false, true) => NarrowValueMode::SignExtend64,
                     (false, false) => NarrowValueMode::ZeroExtend64,
                 };
                 let rn = put_input_in_reg(
                     ctx,
                     InsnInput {
                         insn: branches[0],
                         input: 0,
                     },
                     narrow_mode,
                 );
                 let rm = put_input_in_rse_imm12(
                     ctx,
                     InsnInput {
                         insn: branches[0],
                         input: 1,
                     },
                     narrow_mode,
                 );

                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                 let rd = writable_zero_reg();
                 ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                 ctx.emit(Inst::CondBr {
                     taken,
                     not_taken,
                     kind,
                 });
             }

             Opcode::Brif => {
                 let condcode = ctx.data(branches[0]).cond_code().unwrap();
                 let cond = lower_condcode(condcode);
                 let kind = CondBrKind::Cond(cond);

                 let is_signed = condcode_is_signed(condcode);
                 let flag_input = InsnInput {
                     insn: branches[0],
                     input: 0,
                 };
                 if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
                     lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind,
                     });
                 } else {
                     // If the ifcmp result is actually placed in a
                     // register, we need to move it back into the flags.
                     let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                     ctx.emit(Inst::MovToNZCV { rn });
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind,
                     });
                 }
             }

             Opcode::Brff => {
                 let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
                 let cond = lower_fp_condcode(condcode);
                 let kind = CondBrKind::Cond(cond);
                 let flag_input = InsnInput {
                     insn: branches[0],
                     input: 0,
                 };
                 if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
                     lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind,
                     });
                 } else {
                     // If the ffcmp result is actually placed in a
                     // register, we need to move it back into the flags.
                     let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                     ctx.emit(Inst::MovToNZCV { rn });
                     ctx.emit(Inst::CondBr {
                         taken,
                         not_taken,
                         kind,
                     });
                 }
             }

             _ => unimplemented!(),
         }
     } else {
         // Must be an unconditional branch or an indirect branch.
         let op = ctx.data(branches[0]).opcode();
         match op {
             Opcode::Jump | Opcode::Fallthrough => {
                 assert!(branches.len() == 1);
                 // In the Fallthrough case, the machine-independent driver
                 // fills in `targets[0]` with our fallthrough block, so this
                 // is valid for both Jump and Fallthrough.
                 ctx.emit(Inst::Jump {
                     dest: BranchTarget::Label(targets[0]),
                 });
             }

             Opcode::BrTable => {
                 // Expand `br_table index, default, JT` to:
                 //
                 //   emit_island  // this forces an island at this point
                 //                // if the jumptable would push us past
                 //                // the deadline
                 //   subs idx, #jt_size
                 //   b.hs default
                 //   adr vTmp1, PC+16
                 //   ldr vTmp2, [vTmp1, idx, lsl #2]
                 //   add vTmp2, vTmp2, vTmp1
                 //   br vTmp2
                 //   [jumptable offsets relative to JT base]
                 let jt_size = targets.len() - 1;
                 assert!(jt_size <= std::u32::MAX as usize);

                 ctx.emit(Inst::EmitIsland {
                     needed_space: 4 * (6 + jt_size) as CodeOffset,
                 });

                 let ridx = put_input_in_reg(
                     ctx,
                     InsnInput {
                         insn: branches[0],
                         input: 0,
                     },
                     NarrowValueMode::ZeroExtend32,
                 );

                 let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                 let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();

                 // Bounds-check, leaving condition codes for JTSequence's
                 // branch to default target below.
                 if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
                     ctx.emit(Inst::AluRRImm12 {
                         alu_op: ALUOp::SubS32,
                         rd: writable_zero_reg(),
                         rn: ridx,
                         imm12,
                     });
                 } else {
                     lower_constant_u64(ctx, rtmp1, jt_size as u64);
                     ctx.emit(Inst::AluRRR {
                         alu_op: ALUOp::SubS32,
                         rd: writable_zero_reg(),
                         rn: ridx,
                         rm: rtmp1.to_reg(),
                     });
                 }

                 // Emit the compound instruction that does:
                 //
                 // b.hs default
                 // adr rA, jt
                 // ldrsw rB, [rA, rIndex, UXTW 2]
                 // add rA, rA, rB
                 // br rA
                 // [jt entries]
                 //
                 // This must be *one* instruction in the vcode because
                 // we cannot allow regalloc to insert any spills/fills
                 // in the middle of the sequence; otherwise, the ADR's
                 // PC-rel offset to the jumptable would be incorrect.
                 // (The alternative is to introduce a relocation pass
                 // for inlined jumptables, which is much worse, IMHO.)

                 let jt_targets: Vec<BranchTarget> = targets
                     .iter()
                     .skip(1)
                     .map(|bix| BranchTarget::Label(*bix))
                     .collect();
                 let default_target = BranchTarget::Label(targets[0]);
                 let targets_for_term: Vec<MachLabel> = targets.to_vec();
                 ctx.emit(Inst::JTSequence {
                     ridx,
                     rtmp1,
                     rtmp2,
                     info: Box::new(JTSequenceInfo {
                         targets: jt_targets,
                         default_target,
                         targets_for_term,
                     }),
                 });
             }

             _ => panic!("Unknown branch type!"),
         }
     }

     Ok(())
 }