vendor/blake2b_simd/src/many.rs - toolchain/rustc - Git at Google

 //! Interfaces for hashing multiple inputs at once, using SIMD more
 //! efficiently.
 //!
 //! The throughput of these interfaces is comparable to BLAKE2bp, about twice
 //! the throughput of regular BLAKE2b when AVX2 is available.
 //!
 //! These interfaces can accept any number of inputs, and the implementation
 //! does its best to parallelize them. In general, the more inputs you can pass
 //! in at once the better. If you need to batch your inputs in smaller groups,
 //! see the [`degree`](fn.degree.html) function for a good batch size.
 //!
 //! The implementation keeps working in parallel even when inputs are of
 //! different lengths, by managing a working set of jobs whose input isn't yet
 //! exhausted. However, if one or two inputs are much longer than the others,
 //! and they're encountered only at the end, there might not be any remaining
 //! work to parallelize them with. In this case, sorting the inputs
 //! longest-first can improve parallelism.
 //!
 //! # Example
 //!
 //! ```
 //! use blake2b_simd::{blake2b, State, many::update_many};
 //!
 //! let mut states = [
 //!     State::new(),
 //!     State::new(),
 //!     State::new(),
 //!     State::new(),
 //! ];
 //!
 //! let inputs = [
 //!     &b"foo"[..],
 //!     &b"bar"[..],
 //!     &b"baz"[..],
 //!     &b"bing"[..],
 //! ];
 //!
 //! update_many(states.iter_mut().zip(inputs.iter()));
 //!
 //! for (state, input) in states.iter_mut().zip(inputs.iter()) {
 //!     assert_eq!(blake2b(input), state.finalize());
 //! }
 //! ```

 use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
 use crate::state_words_to_bytes;
 use crate::Count;
 use crate::Hash;
 use crate::Params;
 use crate::State;
 use crate::Word;
 use crate::BLOCKBYTES;
 use arrayref::array_mut_ref;
 use arrayvec::ArrayVec;
 use core::fmt;

 /// The largest possible value of [`degree`](fn.degree.html) on the target
 /// platform.
 ///
 /// Note that this constant reflects the parallelism degree supported by this
 /// crate, so it will change over time as support is added or removed. For
 /// example, when Rust stabilizes AVX-512 support and this crate adds an
 /// AVX-512 implementation, this constant will double on x86 targets. If that
 /// implementation is an optional feature (e.g. because it's nightly-only), the
 /// value of this constant will depend on that optional feature also.
 pub const MAX_DEGREE: usize = guts::MAX_DEGREE;

 /// The parallelism degree of the implementation, detected at runtime. If you
 /// hash your inputs in small batches, making the batch size a multiple of
 /// `degree` will generally give good performance.
 ///
 /// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
 /// hashes in parallel, so `degree` returns 4 on that machine. If you call
 /// [`hash_many`] with only three inputs, that's not enough to use the AVX2
 /// implementation, and your average throughput will be lower. Likewise if you
 /// call it with five inputs of equal length, the first four will be hashed in
 /// parallel with AVX2, but the last one will have to be hashed by itself, and
 /// again your average throughput will be lower.
 ///
 /// As noted in the module level docs, performance is more complicated if your
 /// inputs are of different lengths. When parallelizing long and short inputs
 /// together, the longer ones will have bytes left over, and the implementation
 /// will try to parallelize those leftover bytes with subsequent inputs. The
 /// more inputs available in that case, the more the implementation will be
 /// able to parallelize.
 ///
 /// If you need a constant batch size, for example to collect inputs in an
 /// array, see [`MAX_DEGREE`].
 ///
 /// [`hash_many`]: fn.hash_many.html
 /// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
 pub fn degree() -> usize {
     guts::Implementation::detect().degree()
 }

 type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>;

 #[inline(always)]
 fn fill_jobs_vec<'a, 'b>(
     jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
     vec: &mut JobsVec<'a, 'b>,
     target_len: usize,
 ) {
     while vec.len() < target_len {
         if let Some(job) = jobs_iter.next() {
             vec.push(job);
         } else {
             break;
         }
     }
 }

 #[inline(always)]
 fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
     // Iterate backwards so that removal doesn't cause an out-of-bounds panic.
     for i in (0..num_jobs).rev() {
         // Note that is_empty() is only valid because we know all these jobs
         // have been run at least once. Otherwise we could confuse the empty
         // input for a finished job, which would be incorrect.
         //
         // Avoid a panic branch here in release mode.
         debug_assert!(vec.len() > i);
         if vec.len() > i && vec[i].input.is_empty() {
             // Note that calling pop_at() repeatedly has some overhead, because
             // later elements need to be shifted up. However, the JobsVec is
             // small, and this approach guarantees that jobs are encountered in
             // order.
             vec.pop_at(i);
         }
     }
 }

 pub(crate) fn compress_many<'a, 'b, I>(
     jobs: I,
     imp: Implementation,
     finalize: Finalize,
     stride: Stride,
 ) where
     I: IntoIterator<Item = Job<'a, 'b>>,
 {
     // Fuse is important for correctness, since each of these blocks tries to
     // advance the iterator, even if a previous block emptied it.
     let mut jobs_iter = jobs.into_iter().fuse();
     let mut jobs_vec = JobsVec::new();

     if imp.degree() >= 4 {
         loop {
             fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
             if jobs_vec.len() < 4 {
                 break;
             }
             let jobs_array = array_mut_ref!(jobs_vec, 0, 4);
             imp.compress4_loop(jobs_array, finalize, stride);
             evict_finished(&mut jobs_vec, 4);
         }
     }

     if imp.degree() >= 2 {
         loop {
             fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
             if jobs_vec.len() < 2 {
                 break;
             }
             let jobs_array = array_mut_ref!(jobs_vec, 0, 2);
             imp.compress2_loop(jobs_array, finalize, stride);
             evict_finished(&mut jobs_vec, 2);
         }
     }

     for job in jobs_vec.into_iter().chain(jobs_iter) {
         let Job {
             input,
             words,
             count,
             last_node,
         } = job;
         imp.compress1_loop(input, words, count, last_node, finalize, stride);
     }
 }

 /// Update any number of `State` objects at once.
 ///
 /// # Example
 ///
 /// ```
 /// use blake2b_simd::{blake2b, State, many::update_many};
 ///
 /// let mut states = [
 ///     State::new(),
 ///     State::new(),
 ///     State::new(),
 ///     State::new(),
 /// ];
 ///
 /// let inputs = [
 ///     &b"foo"[..],
 ///     &b"bar"[..],
 ///     &b"baz"[..],
 ///     &b"bing"[..],
 /// ];
 ///
 /// update_many(states.iter_mut().zip(inputs.iter()));
 ///
 /// for (state, input) in states.iter_mut().zip(inputs.iter()) {
 ///     assert_eq!(blake2b(input), state.finalize());
 /// }
 /// ```
 pub fn update_many<'a, 'b, I, T>(pairs: I)
 where
     I: IntoIterator<Item = (&'a mut State, &'b T)>,
     T: 'b + AsRef<[u8]> + ?Sized,
 {
     // Get the guts::Implementation from the first state, if any.
     let mut peekable_pairs = pairs.into_iter().peekable();
     let implementation = if let Some((state, _)) = peekable_pairs.peek() {
         state.implementation
     } else {
         // No work items, just short circuit.
         return;
     };

     // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
     // where there's not actually any work to do (e.g. because there's not much
     // input and it's all just going in the State buffer).
     let jobs = peekable_pairs.flat_map(|(state, input_t)| {
         let mut input = input_t.as_ref();
         // For each pair, if the State has some input in its buffer, try to
         // finish that buffer. If there wasn't enough input to do that --
         // or if the input was empty to begin with -- skip this pair.
         state.compress_buffer_if_possible(&mut input);
         if input.is_empty() {
             return None;
         }
         // Now we know the buffer is empty and there's more input. Make sure we
         // buffer the final block, because update() doesn't finalize.
         let mut last_block_start = input.len() - 1;
         last_block_start -= last_block_start % BLOCKBYTES;
         let (blocks, last_block) = input.split_at(last_block_start);
         state.buf[..last_block.len()].copy_from_slice(last_block);
         state.buflen = last_block.len() as u8;
         // Finally, if the full blocks slice is non-empty, prepare that job for
         // compression, and bump the State count.
         if blocks.is_empty() {
             None
         } else {
             let count = state.count;
             state.count = state.count.wrapping_add(blocks.len() as Count);
             Some(Job {
                 input: blocks,
                 words: &mut state.words,
                 count,
                 last_node: state.last_node,
             })
         }
     });

     // Run all the Jobs in the iterator.
     compress_many(jobs, implementation, Finalize::No, Stride::Serial);
 }

 /// A job for the [`hash_many`] function. After calling [`hash_many`] on a
 /// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
 /// to get the result.
 ///
 /// [`hash_many`]: fn.hash_many.html
 /// [`to_hash`]: struct.HashManyJob.html#method.to_hash
 #[derive(Clone)]
 pub struct HashManyJob<'a> {
     words: [Word; 8],
     count: Count,
     last_node: LastNode,
     hash_length: u8,
     input: &'a [u8],
     finished: bool,
     implementation: guts::Implementation,
 }

 impl<'a> HashManyJob<'a> {
     /// Construct a new `HashManyJob` from a set of hashing parameters and an
     /// input.
     #[inline]
     pub fn new(params: &Params, input: &'a [u8]) -> Self {
         let mut words = params.to_words();
         let mut count = 0;
         let mut finished = false;
         // If we have key bytes, compress them into the state words. If there's
         // no additional input, this compression needs to finalize and set
         // finished=true.
         if params.key_length > 0 {
             let mut finalization = Finalize::No;
             if input.is_empty() {
                 finalization = Finalize::Yes;
                 finished = true;
             }
             params.implementation.compress1_loop(
                 &params.key_block,
                 &mut words,
                 0,
                 params.last_node,
                 finalization,
                 Stride::Serial,
             );
             count = BLOCKBYTES as Count;
         }
         Self {
             words,
             count,
             last_node: params.last_node,
             hash_length: params.hash_length,
             input,
             finished,
             implementation: params.implementation,
         }
     }

     /// Get the hash from a finished job. If you call this before calling
     /// [`hash_many`], it will panic in debug mode.
     ///
     /// [`hash_many`]: fn.hash_many.html
     #[inline]
     pub fn to_hash(&self) -> Hash {
         debug_assert!(self.finished, "job hasn't been run yet");
         Hash {
             bytes: state_words_to_bytes(&self.words),
             len: self.hash_length,
         }
     }
 }

 impl<'a> fmt::Debug for HashManyJob<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // NB: Don't print the words. Leaking them would allow length extension.
         write!(
             f,
             "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
             self.count,
             self.hash_length,
             self.last_node.yes(),
             self.input.len(),
         )
     }
 }

 /// Hash any number of complete inputs all at once.
 ///
 /// This is slightly more efficient than using `update_many` with `State`
 /// objects, because it doesn't need to do any buffering.
 ///
 /// Running `hash_many` on the same `HashManyJob` object more than once has no
 /// effect.
 ///
 /// # Example
 ///
 /// ```
 /// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
 ///
 /// let inputs = [
 ///     &b"foo"[..],
 ///     &b"bar"[..],
 ///     &b"baz"[..],
 ///     &b"bing"[..],
 /// ];
 ///
 /// let mut params = Params::new();
 /// params.hash_length(16);
 ///
 /// let mut jobs = [
 ///     HashManyJob::new(&params, inputs[0]),
 ///     HashManyJob::new(&params, inputs[1]),
 ///     HashManyJob::new(&params, inputs[2]),
 ///     HashManyJob::new(&params, inputs[3]),
 /// ];
 ///
 /// hash_many(jobs.iter_mut());
 ///
 /// for (input, job) in inputs.iter().zip(jobs.iter()) {
 ///     let expected = params.hash(input);
 ///     assert_eq!(expected, job.to_hash());
 /// }
 /// ```
 pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
 where
     'b: 'a,
     I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
 {
     // Get the guts::Implementation from the first job, if any.
     let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
     let implementation = if let Some(job) = peekable_jobs.peek() {
         job.implementation
     } else {
         // No work items, just short circuit.
         return;
     };

     // In the jobs iterator, skip HashManyJobs that have already been run. This
     // is less because we actually expect callers to call hash_many twice
     // (though they're allowed to if they want), and more because
     // HashManyJob::new might need to finalize if there are key bytes but no
     // input. Tying the job lifetime to the Params reference is an alternative,
     // but I've found it too constraining in practice. We could also put key
     // bytes in every HashManyJob, but that would add unnecessary storage and
     // zeroing for all callers.
     let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished);
     let jobs = unfinished_jobs.map(|j| {
         j.finished = true;
         Job {
             input: j.input,
             words: &mut j.words,
             count: j.count,
             last_node: j.last_node,
         }
     });
     compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
 }

 #[cfg(test)]
 mod test {
     use super::*;
     use crate::guts;
     use crate::paint_test_input;
     use crate::BLOCKBYTES;
     use arrayvec::ArrayVec;

     #[test]
     fn test_degree() {
         assert!(degree() <= MAX_DEGREE);

         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         #[cfg(feature = "std")]
         {
             if is_x86_feature_detected!("avx2") {
                 assert!(degree() >= 4);
             }
             if is_x86_feature_detected!("sse4.1") {
                 assert!(degree() >= 2);
             }
         }
     }

     #[test]
     fn test_hash_many() {
         // Use a length of inputs that will exercise all of the power-of-two loops.
         const LEN: usize = 2 * guts::MAX_DEGREE - 1;

         // Rerun LEN inputs LEN different times, with the empty input starting in a
         // different spot each time.
         let mut input = [0; LEN * BLOCKBYTES];
         paint_test_input(&mut input);
         for start_offset in 0..LEN {
             let mut inputs: [&[u8]; LEN] = [&[]; LEN];
             for i in 0..LEN {
                 let chunks = (i + start_offset) % LEN;
                 inputs[i] = &input[..chunks * BLOCKBYTES];
             }

             let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
             for i in 0..LEN {
                 let mut p = Params::new();
                 p.node_offset(i as u64);
                 if i % 2 == 1 {
                     p.last_node(true);
                     p.key(b"foo");
                 }
                 params.push(p);
             }

             let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new();
             for i in 0..LEN {
                 jobs.push(HashManyJob::new(&params[i], inputs[i]));
             }

             hash_many(&mut jobs);

             // Check the outputs.
             for i in 0..LEN {
                 let expected = params[i].hash(inputs[i]);
                 assert_eq!(expected, jobs[i].to_hash());
             }
         }
     }

     #[test]
     fn test_update_many() {
         // Use a length of inputs that will exercise all of the power-of-two loops.
         const LEN: usize = 2 * guts::MAX_DEGREE - 1;

         // Rerun LEN inputs LEN different times, with the empty input starting in a
         // different spot each time.
         let mut input = [0; LEN * BLOCKBYTES];
         paint_test_input(&mut input);
         for start_offset in 0..LEN {
             let mut inputs: [&[u8]; LEN] = [&[]; LEN];
             for i in 0..LEN {
                 let chunks = (i + start_offset) % LEN;
                 inputs[i] = &input[..chunks * BLOCKBYTES];
             }

             let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
             for i in 0..LEN {
                 let mut p = Params::new();
                 p.node_offset(i as u64);
                 if i % 2 == 1 {
                     p.last_node(true);
                     p.key(b"foo");
                 }
                 params.push(p);
             }

             let mut states: ArrayVec<[State; LEN]> = ArrayVec::new();
             for i in 0..LEN {
                 states.push(params[i].to_state());
             }

             // Run each input twice through, to exercise buffering.
             update_many(states.iter_mut().zip(inputs.iter()));
             update_many(states.iter_mut().zip(inputs.iter()));

             // Check the outputs.
             for i in 0..LEN {
                 let mut reference_state = params[i].to_state();
                 // Again, run the input twice.
                 reference_state.update(inputs[i]);
                 reference_state.update(inputs[i]);
                 assert_eq!(reference_state.finalize(), states[i].finalize());
                 assert_eq!(2 * inputs[i].len() as Count, states[i].count());
             }
         }
     }
 }
	//! Interfaces for hashing multiple inputs at once, using SIMD more
	//! efficiently.
	//!
	//! The throughput of these interfaces is comparable to BLAKE2bp, about twice
	//! the throughput of regular BLAKE2b when AVX2 is available.
	//!
	//! These interfaces can accept any number of inputs, and the implementation
	//! does its best to parallelize them. In general, the more inputs you can pass
	//! in at once the better. If you need to batch your inputs in smaller groups,
	//! see the [`degree`](fn.degree.html) function for a good batch size.
	//!
	//! The implementation keeps working in parallel even when inputs are of
	//! different lengths, by managing a working set of jobs whose input isn't yet
	//! exhausted. However, if one or two inputs are much longer than the others,
	//! and they're encountered only at the end, there might not be any remaining
	//! work to parallelize them with. In this case, sorting the inputs
	//! longest-first can improve parallelism.
	//!
	//! # Example
	//!
	//! ```
	//! use blake2b_simd::{blake2b, State, many::update_many};
	//!
	//! let mut states = [
	//! State::new(),
	//! State::new(),
	//! State::new(),
	//! State::new(),
	//! ];
	//!
	//! let inputs = [
	//! &b"foo"[..],
	//! &b"bar"[..],
	//! &b"baz"[..],
	//! &b"bing"[..],
	//! ];
	//!
	//! update_many(states.iter_mut().zip(inputs.iter()));
	//!
	//! for (state, input) in states.iter_mut().zip(inputs.iter()) {
	//! assert_eq!(blake2b(input), state.finalize());
	//! }
	//! ```

	use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
	use crate::state_words_to_bytes;
	use crate::Count;
	use crate::Hash;
	use crate::Params;
	use crate::State;
	use crate::Word;
	use crate::BLOCKBYTES;
	use arrayref::array_mut_ref;
	use arrayvec::ArrayVec;
	use core::fmt;

	/// The largest possible value of [`degree`](fn.degree.html) on the target
	/// platform.
	///
	/// Note that this constant reflects the parallelism degree supported by this
	/// crate, so it will change over time as support is added or removed. For
	/// example, when Rust stabilizes AVX-512 support and this crate adds an
	/// AVX-512 implementation, this constant will double on x86 targets. If that
	/// implementation is an optional feature (e.g. because it's nightly-only), the
	/// value of this constant will depend on that optional feature also.
	pub const MAX_DEGREE: usize = guts::MAX_DEGREE;

	/// The parallelism degree of the implementation, detected at runtime. If you
	/// hash your inputs in small batches, making the batch size a multiple of
	/// `degree` will generally give good performance.
	///
	/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
	/// hashes in parallel, so `degree` returns 4 on that machine. If you call
	/// [`hash_many`] with only three inputs, that's not enough to use the AVX2
	/// implementation, and your average throughput will be lower. Likewise if you
	/// call it with five inputs of equal length, the first four will be hashed in
	/// parallel with AVX2, but the last one will have to be hashed by itself, and
	/// again your average throughput will be lower.
	///
	/// As noted in the module level docs, performance is more complicated if your
	/// inputs are of different lengths. When parallelizing long and short inputs
	/// together, the longer ones will have bytes left over, and the implementation
	/// will try to parallelize those leftover bytes with subsequent inputs. The
	/// more inputs available in that case, the more the implementation will be
	/// able to parallelize.
	///
	/// If you need a constant batch size, for example to collect inputs in an
	/// array, see [`MAX_DEGREE`].
	///
	/// [`hash_many`]: fn.hash_many.html
	/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
	pub fn degree() -> usize {
	guts::Implementation::detect().degree()
	}

	type JobsVec<'a, 'b> = ArrayVec<[Job<'a, 'b>; guts::MAX_DEGREE]>;

	#[inline(always)]
	fn fill_jobs_vec<'a, 'b>(
	jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
	vec: &mut JobsVec<'a, 'b>,
	target_len: usize,
	) {
	while vec.len() < target_len {
	if let Some(job) = jobs_iter.next() {
	vec.push(job);
	} else {
	break;
	}
	}
	}

	#[inline(always)]
	fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
	// Iterate backwards so that removal doesn't cause an out-of-bounds panic.
	for i in (0..num_jobs).rev() {
	// Note that is_empty() is only valid because we know all these jobs
	// have been run at least once. Otherwise we could confuse the empty
	// input for a finished job, which would be incorrect.
	//
	// Avoid a panic branch here in release mode.
	debug_assert!(vec.len() > i);
	if vec.len() > i && vec[i].input.is_empty() {
	// Note that calling pop_at() repeatedly has some overhead, because
	// later elements need to be shifted up. However, the JobsVec is
	// small, and this approach guarantees that jobs are encountered in
	// order.
	vec.pop_at(i);
	}
	}
	}

	pub(crate) fn compress_many<'a, 'b, I>(
	jobs: I,
	imp: Implementation,
	finalize: Finalize,
	stride: Stride,
	) where
	I: IntoIterator<Item = Job<'a, 'b>>,
	{
	// Fuse is important for correctness, since each of these blocks tries to
	// advance the iterator, even if a previous block emptied it.
	let mut jobs_iter = jobs.into_iter().fuse();
	let mut jobs_vec = JobsVec::new();

	if imp.degree() >= 4 {
	loop {
	fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
	if jobs_vec.len() < 4 {
	break;
	}
	let jobs_array = array_mut_ref!(jobs_vec, 0, 4);
	imp.compress4_loop(jobs_array, finalize, stride);
	evict_finished(&mut jobs_vec, 4);
	}
	}

	if imp.degree() >= 2 {
	loop {
	fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
	if jobs_vec.len() < 2 {
	break;
	}
	let jobs_array = array_mut_ref!(jobs_vec, 0, 2);
	imp.compress2_loop(jobs_array, finalize, stride);
	evict_finished(&mut jobs_vec, 2);
	}
	}

	for job in jobs_vec.into_iter().chain(jobs_iter) {
	let Job {
	input,
	words,
	count,
	last_node,
	} = job;
	imp.compress1_loop(input, words, count, last_node, finalize, stride);
	}
	}

	/// Update any number of `State` objects at once.
	///
	/// # Example
	///
	/// ```
	/// use blake2b_simd::{blake2b, State, many::update_many};
	///
	/// let mut states = [
	/// State::new(),
	/// State::new(),
	/// State::new(),
	/// State::new(),
	/// ];
	///
	/// let inputs = [
	/// &b"foo"[..],
	/// &b"bar"[..],
	/// &b"baz"[..],
	/// &b"bing"[..],
	/// ];
	///
	/// update_many(states.iter_mut().zip(inputs.iter()));
	///
	/// for (state, input) in states.iter_mut().zip(inputs.iter()) {
	/// assert_eq!(blake2b(input), state.finalize());
	/// }
	/// ```
	pub fn update_many<'a, 'b, I, T>(pairs: I)
	where
	I: IntoIterator<Item = (&'a mut State, &'b T)>,
	T: 'b + AsRef<[u8]> + ?Sized,
	{
	// Get the guts::Implementation from the first state, if any.
	let mut peekable_pairs = pairs.into_iter().peekable();
	let implementation = if let Some((state, _)) = peekable_pairs.peek() {
	state.implementation
	} else {
	// No work items, just short circuit.
	return;
	};

	// Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
	// where there's not actually any work to do (e.g. because there's not much
	// input and it's all just going in the State buffer).
	let jobs = peekable_pairs.flat_map(\|(state, input_t)\| {
	let mut input = input_t.as_ref();
	// For each pair, if the State has some input in its buffer, try to
	// finish that buffer. If there wasn't enough input to do that --
	// or if the input was empty to begin with -- skip this pair.
	state.compress_buffer_if_possible(&mut input);
	if input.is_empty() {
	return None;
	}
	// Now we know the buffer is empty and there's more input. Make sure we
	// buffer the final block, because update() doesn't finalize.
	let mut last_block_start = input.len() - 1;
	last_block_start -= last_block_start % BLOCKBYTES;
	let (blocks, last_block) = input.split_at(last_block_start);
	state.buf[..last_block.len()].copy_from_slice(last_block);
	state.buflen = last_block.len() as u8;
	// Finally, if the full blocks slice is non-empty, prepare that job for
	// compression, and bump the State count.
	if blocks.is_empty() {
	None
	} else {
	let count = state.count;
	state.count = state.count.wrapping_add(blocks.len() as Count);
	Some(Job {
	input: blocks,
	words: &mut state.words,
	count,
	last_node: state.last_node,
	})
	}
	});

	// Run all the Jobs in the iterator.
	compress_many(jobs, implementation, Finalize::No, Stride::Serial);
	}

	/// A job for the [`hash_many`] function. After calling [`hash_many`] on a
	/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
	/// to get the result.
	///
	/// [`hash_many`]: fn.hash_many.html
	/// [`to_hash`]: struct.HashManyJob.html#method.to_hash
	#[derive(Clone)]
	pub struct HashManyJob<'a> {
	words: [Word; 8],
	count: Count,
	last_node: LastNode,
	hash_length: u8,
	input: &'a [u8],
	finished: bool,
	implementation: guts::Implementation,
	}

	impl<'a> HashManyJob<'a> {
	/// Construct a new `HashManyJob` from a set of hashing parameters and an
	/// input.
	#[inline]
	pub fn new(params: &Params, input: &'a [u8]) -> Self {
	let mut words = params.to_words();
	let mut count = 0;
	let mut finished = false;
	// If we have key bytes, compress them into the state words. If there's
	// no additional input, this compression needs to finalize and set
	// finished=true.
	if params.key_length > 0 {
	let mut finalization = Finalize::No;
	if input.is_empty() {
	finalization = Finalize::Yes;
	finished = true;
	}
	params.implementation.compress1_loop(
	&params.key_block,
	&mut words,
	0,
	params.last_node,
	finalization,
	Stride::Serial,
	);
	count = BLOCKBYTES as Count;
	}
	Self {
	words,
	count,
	last_node: params.last_node,
	hash_length: params.hash_length,
	input,
	finished,
	implementation: params.implementation,
	}
	}

	/// Get the hash from a finished job. If you call this before calling
	/// [`hash_many`], it will panic in debug mode.
	///
	/// [`hash_many`]: fn.hash_many.html
	#[inline]
	pub fn to_hash(&self) -> Hash {
	debug_assert!(self.finished, "job hasn't been run yet");
	Hash {
	bytes: state_words_to_bytes(&self.words),
	len: self.hash_length,
	}
	}
	}

	impl<'a> fmt::Debug for HashManyJob<'a> {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	// NB: Don't print the words. Leaking them would allow length extension.
	write!(
	f,
	"HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
	self.count,
	self.hash_length,
	self.last_node.yes(),
	self.input.len(),
	)
	}
	}

	/// Hash any number of complete inputs all at once.
	///
	/// This is slightly more efficient than using `update_many` with `State`
	/// objects, because it doesn't need to do any buffering.
	///
	/// Running `hash_many` on the same `HashManyJob` object more than once has no
	/// effect.
	///
	/// # Example
	///
	/// ```
	/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
	///
	/// let inputs = [
	/// &b"foo"[..],
	/// &b"bar"[..],
	/// &b"baz"[..],
	/// &b"bing"[..],
	/// ];
	///
	/// let mut params = Params::new();
	/// params.hash_length(16);
	///
	/// let mut jobs = [
	/// HashManyJob::new(&params, inputs[0]),
	/// HashManyJob::new(&params, inputs[1]),
	/// HashManyJob::new(&params, inputs[2]),
	/// HashManyJob::new(&params, inputs[3]),
	/// ];
	///
	/// hash_many(jobs.iter_mut());
	///
	/// for (input, job) in inputs.iter().zip(jobs.iter()) {
	/// let expected = params.hash(input);
	/// assert_eq!(expected, job.to_hash());
	/// }
	/// ```
	pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
	where
	'b: 'a,
	I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
	{
	// Get the guts::Implementation from the first job, if any.
	let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
	let implementation = if let Some(job) = peekable_jobs.peek() {
	job.implementation
	} else {
	// No work items, just short circuit.
	return;
	};

	// In the jobs iterator, skip HashManyJobs that have already been run. This
	// is less because we actually expect callers to call hash_many twice
	// (though they're allowed to if they want), and more because
	// HashManyJob::new might need to finalize if there are key bytes but no
	// input. Tying the job lifetime to the Params reference is an alternative,
	// but I've found it too constraining in practice. We could also put key
	// bytes in every HashManyJob, but that would add unnecessary storage and
	// zeroing for all callers.
	let unfinished_jobs = peekable_jobs.into_iter().filter(\|j\| !j.finished);
	let jobs = unfinished_jobs.map(\|j\| {
	j.finished = true;
	Job {
	input: j.input,
	words: &mut j.words,
	count: j.count,
	last_node: j.last_node,
	}
	});
	compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
	}

	#[cfg(test)]
	mod test {
	use super::*;
	use crate::guts;
	use crate::paint_test_input;
	use crate::BLOCKBYTES;
	use arrayvec::ArrayVec;

	#[test]
	fn test_degree() {
	assert!(degree() <= MAX_DEGREE);

	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	#[cfg(feature = "std")]
	{
	if is_x86_feature_detected!("avx2") {
	assert!(degree() >= 4);
	}
	if is_x86_feature_detected!("sse4.1") {
	assert!(degree() >= 2);
	}
	}
	}

	#[test]
	fn test_hash_many() {
	// Use a length of inputs that will exercise all of the power-of-two loops.
	const LEN: usize = 2 * guts::MAX_DEGREE - 1;

	// Rerun LEN inputs LEN different times, with the empty input starting in a
	// different spot each time.
	let mut input = [0; LEN * BLOCKBYTES];
	paint_test_input(&mut input);
	for start_offset in 0..LEN {
	let mut inputs: [&[u8]; LEN] = [&[]; LEN];
	for i in 0..LEN {
	let chunks = (i + start_offset) % LEN;
	inputs[i] = &input[..chunks * BLOCKBYTES];
	}

	let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
	for i in 0..LEN {
	let mut p = Params::new();
	p.node_offset(i as u64);
	if i % 2 == 1 {
	p.last_node(true);
	p.key(b"foo");
	}
	params.push(p);
	}

	let mut jobs: ArrayVec<[HashManyJob; LEN]> = ArrayVec::new();
	for i in 0..LEN {
	jobs.push(HashManyJob::new(&params[i], inputs[i]));
	}

	hash_many(&mut jobs);

	// Check the outputs.
	for i in 0..LEN {
	let expected = params[i].hash(inputs[i]);
	assert_eq!(expected, jobs[i].to_hash());
	}
	}
	}

	#[test]
	fn test_update_many() {
	// Use a length of inputs that will exercise all of the power-of-two loops.
	const LEN: usize = 2 * guts::MAX_DEGREE - 1;

	// Rerun LEN inputs LEN different times, with the empty input starting in a
	// different spot each time.
	let mut input = [0; LEN * BLOCKBYTES];
	paint_test_input(&mut input);
	for start_offset in 0..LEN {
	let mut inputs: [&[u8]; LEN] = [&[]; LEN];
	for i in 0..LEN {
	let chunks = (i + start_offset) % LEN;
	inputs[i] = &input[..chunks * BLOCKBYTES];
	}

	let mut params: ArrayVec<[Params; LEN]> = ArrayVec::new();
	for i in 0..LEN {
	let mut p = Params::new();
	p.node_offset(i as u64);
	if i % 2 == 1 {
	p.last_node(true);
	p.key(b"foo");
	}
	params.push(p);
	}

	let mut states: ArrayVec<[State; LEN]> = ArrayVec::new();
	for i in 0..LEN {
	states.push(params[i].to_state());
	}

	// Run each input twice through, to exercise buffering.
	update_many(states.iter_mut().zip(inputs.iter()));
	update_many(states.iter_mut().zip(inputs.iter()));

	// Check the outputs.
	for i in 0..LEN {
	let mut reference_state = params[i].to_state();
	// Again, run the input twice.
	reference_state.update(inputs[i]);
	reference_state.update(inputs[i]);
	assert_eq!(reference_state.finalize(), states[i].finalize());
	assert_eq!(2 * inputs[i].len() as Count, states[i].count());
	}
	}
	}
	}