| /* |
| * Released under the terms of the Apache 2.0 license with LLVM |
| * exception. See `LICENSE` for details. |
| */ |
| |
| use crate::{ion::data_structures::u64_key, Allocation, PReg}; |
| use core::fmt::Debug; |
| use smallvec::{smallvec, SmallVec}; |
| |
| /// A list of moves to be performed in sequence, with auxiliary data |
| /// attached to each. |
| pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>; |
| |
/// A list of moves to be performed in sequence, like a
/// `MoveVec<T>`, except that a not-yet-chosen scratch space may
/// occur as well, represented by `Allocation::none()`.
| #[derive(Clone, Debug)] |
| pub enum MoveVecWithScratch<T> { |
| /// No scratch was actually used. |
| NoScratch(MoveVec<T>), |
| /// A scratch space was used. |
| Scratch(MoveVec<T>), |
| } |
| |
| /// A `ParallelMoves` represents a list of alloc-to-alloc moves that |
| /// must happen in parallel -- i.e., all reads of sources semantically |
| /// happen before all writes of destinations, and destinations are |
| /// allowed to overwrite sources. It can compute a list of sequential |
| /// moves that will produce the equivalent data movement, possibly |
| /// using a scratch register if one is necessary. |
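///
/// ### Example
///
/// An illustrative sketch (not compiled as a doctest; `p0`, `p1` and
/// `p2` stand for arbitrary register `Allocation`s):
///
/// ```ignore
/// // `p0`, `p1`, `p2`: placeholder register Allocations.
/// let mut pm = ParallelMoves::new();
/// // Request a three-way rotation, semantically in parallel:
/// // p1 := p0, p2 := p1, p0 := p2.
/// pm.add(p0, p1, ());
/// pm.add(p1, p2, ());
/// pm.add(p2, p0, ());
/// // The cycle cannot be sequentialized without a temporary, so the
/// // resolved sequence contains `Allocation::none()` as a scratch
/// // placeholder.
/// assert!(pm.resolve().needs_scratch());
/// ```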
| pub struct ParallelMoves<T: Clone + Copy + Default> { |
| parallel_moves: MoveVec<T>, |
| } |
| |
| impl<T: Clone + Copy + Default + PartialEq> ParallelMoves<T> { |
| pub fn new() -> Self { |
| Self { |
| parallel_moves: smallvec![], |
| } |
| } |
| |
| pub fn add(&mut self, from: Allocation, to: Allocation, t: T) { |
| self.parallel_moves.push((from, to, t)); |
| } |
| |
| fn sources_overlap_dests(&self) -> bool { |
| // Assumes `parallel_moves` has already been sorted by `dst` |
| // in `resolve()` below. The O(n log n) cost of this loop is no |
| // worse than the sort we already did. |
| for &(src, _, _) in &self.parallel_moves { |
| if self |
| .parallel_moves |
| .binary_search_by_key(&src, |&(_, dst, _)| dst) |
| .is_ok() |
| { |
| return true; |
| } |
| } |
| false |
| } |
| |
| /// Resolve the parallel-moves problem to a sequence of separate |
| /// moves, such that the combined effect of the sequential moves |
| /// is as-if all of the moves added to this `ParallelMoves` |
| /// resolver happened in parallel. |
| /// |
    /// Sometimes, if there is a cycle, a scratch register is
    /// necessary to allow the moves to occur sequentially. In this
    /// case, `Allocation::none()` appears in the returned move list
    /// to stand for the scratch register. The caller may choose to
    /// always keep a separate scratch register unused, so that this
    /// placeholder can be trivially rewritten; or it may dynamically
    /// search for or create a free register at the point where the
    /// scratch is needed.
| pub fn resolve(mut self) -> MoveVecWithScratch<T> { |
| // Easy case: zero or one move. Just return our vec. |
| if self.parallel_moves.len() <= 1 { |
| return MoveVecWithScratch::NoScratch(self.parallel_moves); |
| } |
| |
| // Sort moves so that we can efficiently test for presence. |
| // For that purpose it doesn't matter whether we sort by |
| // source or destination, but later we'll want them sorted |
| // by destination. |
| self.parallel_moves |
| .sort_by_key(|&(src, dst, _)| u64_key(dst.bits(), src.bits())); |
| |
| // Duplicate moves cannot change the semantics of this |
| // parallel move set, so remove them. This is cheap since we |
| // just sorted the list. |
| self.parallel_moves.dedup(); |
| |
| // General case: some moves overwrite dests that other moves |
| // read as sources. We'll use a general algorithm. |
| // |
        // *Important property*: because we expect that each register
        // has only one writer (otherwise the effect of the parallel
        // move is undefined), each move can be blocked by at most one
        // other move: the unique move, if any, that overwrites its
        // source. Thus, we *can only have simple cycles* (those that
        // are a ring of nodes, i.e., with only one path from a node
        // back to itself); there are no SCCs that are more complex
        // than that. We leverage this fact below to avoid having to
        // do a full Tarjan SCC DFS (with lowest-index computation,
        // etc.): instead, as soon as we find a cycle, we know we have
        // the full cycle and we can do a cyclic move sequence and
        // continue.
| |
| // Check that each destination has only one writer. |
| if cfg!(debug_assertions) { |
| let mut last_dst = None; |
| for &(_, dst, _) in &self.parallel_moves { |
                if let Some(last_dst) = last_dst {
                    debug_assert!(last_dst != dst);
                }
| last_dst = Some(dst); |
| } |
| } |
| |
| // Moving an allocation into itself is technically a cycle but |
| // should have no effect, as long as there are no other writes |
| // into that destination. |
| self.parallel_moves.retain(|&mut (src, dst, _)| src != dst); |
| |
| // Do any dests overlap sources? If not, we can also just |
| // return the list. |
| if !self.sources_overlap_dests() { |
| return MoveVecWithScratch::NoScratch(self.parallel_moves); |
| } |
| |
        // Construct a mapping from move indices to moves they must
        // come before. Any given move must come before the move that
        // overwrites its source, if any; since we sorted the moves by
        // destination above, we can find that move with a binary
        // search.
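        //
        // For example, given the (hypothetical) sorted parallel set
        //
        //     moves[0] = A := B
        //     moves[1] = B := C
        //
        // move 0 reads B and move 1 overwrites B, so we get
        // `must_come_before[0] == 1`, and `must_come_before[1] ==
        // NONE` because nothing overwrites C.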
| const NONE: usize = usize::MAX; |
| let must_come_before: SmallVec<[usize; 16]> = self |
| .parallel_moves |
| .iter() |
| .map(|&(src, _, _)| { |
| self.parallel_moves |
| .binary_search_by_key(&src, |&(_, dst, _)| dst) |
| .unwrap_or(NONE) |
| }) |
| .collect(); |
| |
| // Do a simple stack-based DFS and emit moves in postorder, |
| // then reverse at the end for RPO. Unlike Tarjan's SCC |
| // algorithm, we can emit a cycle as soon as we find one, as |
| // noted above. |
| #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| enum State { |
            /// Not yet visited; not on the stack.
            ToDo,
            /// On the stack; visit in progress.
            Pending,
            /// Fully visited; no longer on the stack.
            Done,
| } |
| let mut ret: MoveVec<T> = smallvec![]; |
| let mut stack: SmallVec<[usize; 16]> = smallvec![]; |
| let mut state: SmallVec<[State; 16]> = smallvec![State::ToDo; self.parallel_moves.len()]; |
| let mut scratch_used = false; |
| |
| while let Some(next) = state.iter().position(|&state| state == State::ToDo) { |
| stack.push(next); |
| state[next] = State::Pending; |
| |
| while let Some(&top) = stack.last() { |
| debug_assert_eq!(state[top], State::Pending); |
| let next = must_come_before[top]; |
| if next == NONE || state[next] == State::Done { |
| ret.push(self.parallel_moves[top]); |
| state[top] = State::Done; |
| stack.pop(); |
| while let Some(top) = stack.pop() { |
| ret.push(self.parallel_moves[top]); |
| state[top] = State::Done; |
| } |
| } else if state[next] == State::ToDo { |
| stack.push(next); |
| state[next] = State::Pending; |
| } else { |
| // Found a cycle -- emit a cyclic-move sequence |
| // for the cycle on the top of stack, then normal |
| // moves below it. Recall that these moves will be |
| // reversed in sequence, so from the original |
| // parallel move set |
| // |
                    // { B := A, C := B, A := C }
| // |
| // we will generate something like: |
| // |
| // A := scratch |
| // B := A |
| // C := B |
| // scratch := C |
| // |
| // which will become: |
| // |
| // scratch := C |
| // C := B |
| // B := A |
| // A := scratch |
| debug_assert_ne!(top, next); |
| state[top] = State::Done; |
| stack.pop(); |
| |
| let (scratch_src, dst, dst_t) = self.parallel_moves[top]; |
| scratch_used = true; |
| |
| ret.push((Allocation::none(), dst, dst_t)); |
| while let Some(move_idx) = stack.pop() { |
| state[move_idx] = State::Done; |
| ret.push(self.parallel_moves[move_idx]); |
| |
| if move_idx == next { |
| break; |
| } |
| } |
| ret.push((scratch_src, Allocation::none(), T::default())); |
| } |
| } |
| } |
| |
| ret.reverse(); |
| |
| if scratch_used { |
| MoveVecWithScratch::Scratch(ret) |
| } else { |
| MoveVecWithScratch::NoScratch(ret) |
| } |
| } |
| } |
| |
| impl<T> MoveVecWithScratch<T> { |
| /// Fills in the scratch space, if needed, with the given |
| /// register/allocation and returns a final list of moves. The |
| /// scratch register must not occur anywhere in the parallel-move |
| /// problem given to the resolver that produced this |
| /// `MoveVecWithScratch`. |
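    ///
    /// ### Example
    ///
    /// An illustrative sketch; `scratch` is a hypothetical
    /// `Allocation` known not to appear in the move set:
    ///
    /// ```ignore
    /// // `scratch`: a placeholder for a free Allocation.
    /// let moves: MoveVec<()> = pm.resolve().with_scratch(scratch);
    /// // Every `Allocation::none()` placeholder has been replaced by
    /// // `scratch`, so no `none` allocations remain.
    /// assert!(moves.iter().all(|&(src, dst, _)| !src.is_none() && !dst.is_none()));
    /// ```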
| pub fn with_scratch(self, scratch: Allocation) -> MoveVec<T> { |
| match self { |
| MoveVecWithScratch::NoScratch(moves) => moves, |
| MoveVecWithScratch::Scratch(mut moves) => { |
| for (src, dst, _) in &mut moves { |
| debug_assert!( |
| *src != scratch && *dst != scratch, |
| "Scratch register should not also be an actual source or dest of moves" |
| ); |
| debug_assert!( |
| !(src.is_none() && dst.is_none()), |
| "Move resolution should not have produced a scratch-to-scratch move" |
| ); |
| if src.is_none() { |
| *src = scratch; |
| } |
| if dst.is_none() { |
| *dst = scratch; |
| } |
| } |
| moves |
| } |
| } |
| } |
| |
| /// Unwrap without a scratch register. |
| pub fn without_scratch(self) -> Option<MoveVec<T>> { |
| match self { |
| MoveVecWithScratch::NoScratch(moves) => Some(moves), |
| MoveVecWithScratch::Scratch(..) => None, |
| } |
| } |
| |
| /// Do we need a scratch register? |
| pub fn needs_scratch(&self) -> bool { |
| match self { |
| MoveVecWithScratch::NoScratch(..) => false, |
| MoveVecWithScratch::Scratch(..) => true, |
| } |
| } |
| } |
| |
| /// Final stage of move resolution: finding or using scratch |
| /// registers, creating them if necessary by using stackslots, and |
| /// ensuring that the final list of moves contains no stack-to-stack |
| /// moves. |
| /// |
/// The resolved list of moves may need one or two scratch registers,
/// and maybe a stackslot, to ensure these conditions. Our general
/// strategy proceeds in two steps.
| /// |
| /// First, we find a scratch register, so we only have to worry about |
| /// a list of moves, all with real locations as src and dest. If we're |
| /// lucky and there are any registers not allocated at this |
| /// program-point, we can use a real register. Otherwise, we use an |
| /// extra stackslot. This is fine, because at this step, |
| /// stack-to-stack moves are OK. |
| /// |
| /// Then, we resolve stack-to-stack moves into stack-to-reg / |
| /// reg-to-stack pairs. For this, we try to allocate a second free |
| /// register. If unavailable, we create a new scratch stackslot to |
| /// serve as a backup of one of the in-use registers, then borrow that |
| /// register as the scratch register in the middle of stack-to-stack |
| /// moves. |
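///
/// ### Example
///
/// An illustrative sketch; the `env` helpers and `last_resort_preg`
/// are hypothetical stand-ins for caller-provided state:
///
/// ```ignore
/// // `env` and `last_resort_preg` are hypothetical caller state.
/// let resolver = MoveAndScratchResolver {
///     find_free_reg: || env.find_free_reg_at_this_point(),
///     get_stackslot: || env.allocate_extra_spillslot(),
///     is_stack_alloc: |alloc| env.is_stack_alloc(alloc),
///     borrowed_scratch_reg: last_resort_preg,
/// };
/// let final_moves = resolver.compute(moves_with_scratch);
/// ```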
| pub struct MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc> |
| where |
| GetReg: FnMut() -> Option<Allocation>, |
| GetStackSlot: FnMut() -> Allocation, |
| IsStackAlloc: Fn(Allocation) -> bool, |
| { |
    /// Closure that finds us a free register (if any) at the current
    /// program point.
| pub find_free_reg: GetReg, |
| /// Closure that gets us a stackslot, if needed. |
| pub get_stackslot: GetStackSlot, |
| /// Closure to determine whether an `Allocation` refers to a stack slot. |
| pub is_stack_alloc: IsStackAlloc, |
| /// Use this register if no free register is available to use as a |
| /// temporary in stack-to-stack moves. If we do use this register |
| /// for that purpose, its value will be restored by the end of the |
| /// move sequence. Provided by caller and statically chosen. This is |
| /// a very last-ditch option, so static choice is OK. |
| pub borrowed_scratch_reg: PReg, |
| } |
| |
| impl<GetReg, GetStackSlot, IsStackAlloc> MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc> |
| where |
| GetReg: FnMut() -> Option<Allocation>, |
| GetStackSlot: FnMut() -> Allocation, |
| IsStackAlloc: Fn(Allocation) -> bool, |
| { |
| pub fn compute<T: Debug + Default + Copy>( |
| mut self, |
| moves: MoveVecWithScratch<T>, |
| ) -> MoveVec<T> { |
| let moves = if moves.needs_scratch() { |
| // Now, find a scratch allocation in order to resolve cycles. |
| let scratch = (self.find_free_reg)().unwrap_or_else(|| (self.get_stackslot)()); |
| trace!("scratch resolver: scratch alloc {:?}", scratch); |
| |
| moves.with_scratch(scratch) |
| } else { |
| moves.without_scratch().unwrap() |
| }; |
| |
| // Do we have any stack-to-stack moves? Fast return if not. |
| let stack_to_stack = moves |
| .iter() |
| .any(|&(src, dst, _)| self.is_stack_to_stack_move(src, dst)); |
| if !stack_to_stack { |
| return moves; |
| } |
| |
| // Allocate a scratch register for stack-to-stack move expansion. |
| let (scratch_reg, save_slot) = if let Some(reg) = (self.find_free_reg)() { |
| trace!( |
| "scratch resolver: have free stack-to-stack scratch preg: {:?}", |
| reg |
| ); |
| (reg, None) |
| } else { |
| let reg = Allocation::reg(self.borrowed_scratch_reg); |
| // Stackslot into which we need to save the stack-to-stack |
| // scratch reg before doing any stack-to-stack moves, if we stole |
| // the reg. |
| let save = (self.get_stackslot)(); |
| trace!( |
| "scratch resolver: stack-to-stack borrowing {:?} with save stackslot {:?}", |
| reg, |
| save |
| ); |
| (reg, Some(save)) |
| }; |
| |
| // Mutually exclusive flags for whether either scratch_reg or |
| // save_slot need to be restored from the other. Initially, |
| // scratch_reg has a value we should preserve and save_slot |
| // has garbage. |
| let mut scratch_dirty = false; |
| let mut save_dirty = true; |
| |
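        // For a (hypothetical) stack-to-stack move `slot1 := slot0`
        // using a borrowed scratch register `r`, the loop below emits:
        //
        //     save_slot := r    (only if `r` was stolen and not yet saved)
        //     r := slot0
        //     slot1 := r
        //
        // and leaves `scratch_dirty` set until `r` is restored from
        // `save_slot`.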
| let mut result = smallvec![]; |
| for &(src, dst, data) in &moves { |
| // Do we have a stack-to-stack move? If so, resolve. |
| if self.is_stack_to_stack_move(src, dst) { |
| trace!("scratch resolver: stack to stack: {:?} -> {:?}", src, dst); |
| |
| // If the selected scratch register is stolen from the |
| // set of in-use registers, then we need to save the |
| // current contents of the scratch register before using |
| // it as a temporary. |
| if let Some(save_slot) = save_slot { |
                // However, we may have already done so for an earlier
                // stack-to-stack move, in which case we don't need to
                // do it again.
| if save_dirty { |
| debug_assert!(!scratch_dirty); |
| result.push((scratch_reg, save_slot, T::default())); |
| save_dirty = false; |
| } |
| } |
| |
| // We can't move directly from one stack slot to another |
| // on any architecture we care about, so stack-to-stack |
| // moves must go via a scratch register. |
| result.push((src, scratch_reg, data)); |
| result.push((scratch_reg, dst, data)); |
| scratch_dirty = true; |
| } else { |
| // This is not a stack-to-stack move, but we need to |
| // make sure that the scratch register is in the correct |
| // state if this move interacts with that register. |
| if src == scratch_reg && scratch_dirty { |
| // We're copying from the scratch register so if |
| // it was stolen for a stack-to-stack move then we |
| // need to make sure it has the correct contents, |
| // not whatever was temporarily copied into it. If |
| // we got scratch_reg from find_free_reg then it |
| // had better not have been used as the source of |
| // a move. So if we're here it's because we fell |
| // back to the caller-provided last-resort scratch |
| // register, and we must therefore have a save-slot |
| // allocated too. |
| debug_assert!(!save_dirty); |
| let save_slot = save_slot.expect("move source should not be a free register"); |
| result.push((save_slot, scratch_reg, T::default())); |
| scratch_dirty = false; |
| } |
| if dst == scratch_reg { |
| // We are writing something to the scratch register |
| // so it doesn't matter what was there before. We |
| // can avoid restoring it, but we will need to save |
| // it again before the next stack-to-stack move. |
| scratch_dirty = false; |
| save_dirty = true; |
| } |
| result.push((src, dst, data)); |
| } |
| } |
| |
| // Now that all the stack-to-stack moves are done, restore the |
| // scratch register if necessary. |
| if let Some(save_slot) = save_slot { |
| if scratch_dirty { |
| debug_assert!(!save_dirty); |
| result.push((save_slot, scratch_reg, T::default())); |
| } |
| } |
| |
| trace!("scratch resolver: got {:?}", result); |
| result |
| } |
| |
| fn is_stack_to_stack_move(&self, src: Allocation, dst: Allocation) -> bool { |
| (self.is_stack_alloc)(src) && (self.is_stack_alloc)(dst) |
| } |
| } |