use crate::*;
use arrayref::array_ref;
use core::cmp;
use core::mem::size_of;

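// MAX_DEGREE is the widest parallelism any backend in this module provides,
// i.e. the largest number of inputs that a single compressN_loop call can
// work on at once. The AVX2 backend compresses four inputs in parallel;
// without x86 vector extensions, only the one-input portable backend exists.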
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub const MAX_DEGREE: usize = 4;

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
pub const MAX_DEGREE: usize = 1;

// Variants other than Portable are unreachable in no_std, unless CPU features
// are explicitly enabled for the build with e.g. RUSTFLAGS="-C target-feature=+avx2".
// This might change in the future if is_x86_feature_detected moves into libcore.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Platform {
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
}

#[derive(Clone, Copy, Debug)]
pub struct Implementation(Platform);

impl Implementation {
    pub fn detect() -> Self {
        // Try the different implementations in order of how fast/modern they
        // are. Currently on non-x86, everything just uses portable.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if let Some(avx2_impl) = Self::avx2_if_supported() {
                return avx2_impl;
            }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if let Some(sse41_impl) = Self::sse41_if_supported() {
                return sse41_impl;
            }
        }
        Self::portable()
    }

    pub fn portable() -> Self {
        Implementation(Platform::Portable)
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unreachable_code)]
    pub fn sse41_if_supported() -> Option<Self> {
        // Check whether SSE4.1 support is assumed by the build.
        #[cfg(target_feature = "sse4.1")]
        {
            return Some(Implementation(Platform::SSE41));
        }
        // Otherwise dynamically check for support if we can.
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("sse4.1") {
                return Some(Implementation(Platform::SSE41));
            }
        }
        None
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unreachable_code)]
    pub fn avx2_if_supported() -> Option<Self> {
        // Check whether AVX2 support is assumed by the build.
        #[cfg(target_feature = "avx2")]
        {
            return Some(Implementation(Platform::AVX2));
        }
        // Otherwise dynamically check for support if we can.
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("avx2") {
                return Some(Implementation(Platform::AVX2));
            }
        }
        None
    }

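    // The number of inputs this implementation can compress in parallel, and
    // therefore the widest compressN_loop that callers may invoke on it.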
    pub fn degree(&self) -> usize {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => avx2::DEGREE,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => sse41::DEGREE,
            Platform::Portable => 1,
        }
    }

    pub fn compress1_loop(
        &self,
        input: &[u8],
        words: &mut [Word; 8],
        count: Count,
        last_node: LastNode,
        finalize: Finalize,
        stride: Stride,
    ) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe {
                avx2::compress1_loop(input, words, count, last_node, finalize, stride);
            },
            // Note that there's an SSE version of compress1 in the official C
            // implementation, but I haven't ported it yet.
            _ => {
                portable::compress1_loop(input, words, count, last_node, finalize, stride);
            }
        }
    }

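    // Callers are expected to check degree() before invoking the multi-input
    // loops below; there is no portable multi-input kernel, so platforms
    // without the required vector extensions panic rather than falling back.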
    pub fn compress2_loop(&self, jobs: &mut [Job; 2], finalize: Finalize, stride: Stride) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 | Platform::SSE41 => unsafe {
                sse41::compress2_loop(jobs, finalize, stride)
            },
            _ => panic!("unsupported"),
        }
    }

    pub fn compress4_loop(&self, jobs: &mut [Job; 4], finalize: Finalize, stride: Stride) {
        match self.0 {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe { avx2::compress4_loop(jobs, finalize, stride) },
            _ => panic!("unsupported"),
        }
    }
}
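
// A minimal usage sketch of this module's API (illustrative only; the real
// callers are the hashing loops elsewhere in the crate). `input` here stands
// for any byte slice:
//
//     let imp = Implementation::detect();
//     let mut words = Params::new().to_words();
//     imp.compress1_loop(input, &mut words, 0, LastNode::No, Finalize::Yes, Stride::Serial);
//
// After a finalizing call, `words` contains the final state words.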

pub struct Job<'a, 'b> {
    pub input: &'a [u8],
    pub words: &'b mut [Word; 8],
    pub count: Count,
    pub last_node: LastNode,
}

impl<'a, 'b> core::fmt::Debug for Job<'a, 'b> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // NB: Don't print the words. Leaking them would allow length extension.
        write!(
            f,
            "Job {{ input_len: {}, count: {}, last_node: {} }}",
            self.input.len(),
            self.count,
            self.last_node.yes(),
        )
    }
}

// Finalize could just be a bool, but this is easier to read at callsites.
#[derive(Clone, Copy, Debug)]
pub enum Finalize {
    Yes,
    No,
}

impl Finalize {
    pub fn yes(&self) -> bool {
        match self {
            Finalize::Yes => true,
            Finalize::No => false,
        }
    }
}

// Like Finalize, this is easier to read at callsites.
#[derive(Clone, Copy, Debug)]
pub enum LastNode {
    Yes,
    No,
}

impl LastNode {
    pub fn yes(&self) -> bool {
        match self {
            LastNode::Yes => true,
            LastNode::No => false,
        }
    }
}

#[derive(Clone, Copy, Debug)]
pub enum Stride {
    Serial,   // BLAKE2b/BLAKE2s
    Parallel, // BLAKE2bp/BLAKE2sp
}

impl Stride {
    pub fn padded_blockbytes(&self) -> usize {
        match self {
            Stride::Serial => BLOCKBYTES,
            Stride::Parallel => blake2bp::DEGREE * BLOCKBYTES,
        }
    }
}
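
// For example, BLAKE2bp interleaves its leaves through one shared input:
// each leaf consumes a BLOCKBYTES block and then skips ahead by
// blake2bp::DEGREE * BLOCKBYTES, which is exactly padded_blockbytes() for
// Stride::Parallel.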
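// BLAKE2 tracks the input offset in a double-width counter (t0/t1 in the
// spec). Count is that double-width integer; the helpers below split it into
// its low and high words and reassemble it from them.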
pub(crate) fn count_low(count: Count) -> Word {
    count as Word
}

pub(crate) fn count_high(count: Count) -> Word {
    (count >> (8 * size_of::<Word>())) as Word
}

pub(crate) fn assemble_count(low: Word, high: Word) -> Count {
    low as Count + ((high as Count) << (8 * size_of::<Word>()))
}

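// BLAKE2's finalization flags (f0 for the last block, f1 for the last node
// in tree mode) are all-ones words when set and zero otherwise.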
pub(crate) fn flag_word(flag: bool) -> Word {
    if flag {
        !0
    } else {
        0
    }
}

// Pull an array reference at the given offset straight from the input, if
// there's a full block of input available. If there's only a partial block,
// copy it into the provided buffer, and return an array reference to that
// buffer. Along with the array, return the number of bytes of real input, and
// whether the input can be finalized (i.e. whether there aren't any more
// bytes after this block). Note that this is written so that the optimizer
// can elide bounds checks, see: https://godbolt.org/z/0hH2bC
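// For example, with BLAKE2b's 128-byte BLOCKBYTES and Stride::Serial, a
// 200-byte input yields the first 128 bytes straight from `input` at offset
// 0 (not final), and then the remaining 72 bytes zero-padded in `buffer` at
// offset 128 (final).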
pub fn final_block<'a>(
    input: &'a [u8],
    offset: usize,
    buffer: &'a mut [u8; BLOCKBYTES],
    stride: Stride,
) -> (&'a [u8; BLOCKBYTES], usize, bool) {
    let capped_offset = cmp::min(offset, input.len());
    let offset_slice = &input[capped_offset..];
    if offset_slice.len() >= BLOCKBYTES {
        let block = array_ref!(offset_slice, 0, BLOCKBYTES);
        let should_finalize = offset_slice.len() <= stride.padded_blockbytes();
        (block, BLOCKBYTES, should_finalize)
    } else {
        // Copy the final block to the front of the block buffer. The rest of
        // the buffer is assumed to be initialized to zero.
        buffer[..offset_slice.len()].copy_from_slice(offset_slice);
        (buffer, offset_slice.len(), true)
    }
}

pub fn input_debug_asserts(input: &[u8], finalize: Finalize) {
    // If we're not finalizing, the input must not be empty, and it must be an
    // even multiple of the block size.
    if !finalize.yes() {
        debug_assert!(!input.is_empty());
        debug_assert_eq!(0, input.len() % BLOCKBYTES);
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use arrayvec::ArrayVec;
    use core::mem::size_of;

    #[test]
    fn test_detection() {
        assert_eq!(Platform::Portable, Implementation::portable().0);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "std")]
        {
            if is_x86_feature_detected!("avx2") {
                assert_eq!(Platform::AVX2, Implementation::detect().0);
                assert_eq!(
                    Platform::AVX2,
                    Implementation::avx2_if_supported().unwrap().0
                );
                assert_eq!(
                    Platform::SSE41,
                    Implementation::sse41_if_supported().unwrap().0
                );
            } else if is_x86_feature_detected!("sse4.1") {
                assert_eq!(Platform::SSE41, Implementation::detect().0);
                assert!(Implementation::avx2_if_supported().is_none());
                assert_eq!(
                    Platform::SSE41,
                    Implementation::sse41_if_supported().unwrap().0
                );
            } else {
                assert_eq!(Platform::Portable, Implementation::detect().0);
                assert!(Implementation::avx2_if_supported().is_none());
                assert!(Implementation::sse41_if_supported().is_none());
            }
        }
    }

    // TODO: Move all of these case tests into the implementation files.
    fn exercise_cases<F>(mut f: F)
    where
        F: FnMut(Stride, usize, LastNode, Finalize, Count),
    {
        // Choose counts to hit the relevant overflow cases.
        let counts = &[
            (0 as Count),
            ((1 as Count) << (8 * size_of::<Word>())) - BLOCKBYTES as Count,
            (0 as Count).wrapping_sub(BLOCKBYTES as Count),
        ];
        for &stride in &[Stride::Serial, Stride::Parallel] {
            let lengths = [
                0,
                1,
                BLOCKBYTES - 1,
                BLOCKBYTES,
                BLOCKBYTES + 1,
                2 * BLOCKBYTES - 1,
                2 * BLOCKBYTES,
                2 * BLOCKBYTES + 1,
                stride.padded_blockbytes() - 1,
                stride.padded_blockbytes(),
                stride.padded_blockbytes() + 1,
                2 * stride.padded_blockbytes() - 1,
                2 * stride.padded_blockbytes(),
                2 * stride.padded_blockbytes() + 1,
            ];
            for &length in &lengths {
                for &last_node in &[LastNode::No, LastNode::Yes] {
                    for &finalize in &[Finalize::No, Finalize::Yes] {
                        if !finalize.yes() && (length == 0 || length % BLOCKBYTES != 0) {
                            // Skip these cases, they're invalid.
                            continue;
                        }
                        for &count in counts {
                            // eprintln!("\ncase -----");
                            // dbg!(stride);
                            // dbg!(length);
                            // dbg!(last_node);
                            // dbg!(finalize);
                            // dbg!(count);

                            f(stride, length, last_node, finalize, count);
                        }
                    }
                }
            }
        }
    }

    fn initial_test_words(input_index: usize) -> [Word; 8] {
        crate::Params::new()
            .node_offset(input_index as u64)
            .to_words()
    }

    // Use the portable implementation, one block at a time, to compute the
    // final state words expected for a given test case.
    fn reference_compression(
        input: &[u8],
        stride: Stride,
        last_node: LastNode,
        finalize: Finalize,
        mut count: Count,
        input_index: usize,
    ) -> [Word; 8] {
        let mut words = initial_test_words(input_index);
        let mut offset = 0;
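        // Even an empty input goes through the loop body once, so that the
        // final (possibly empty, zero-padded) block still gets compressed;
        // hence the `offset == 0` clause below.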
        while offset == 0 || offset < input.len() {
            let block_size = cmp::min(BLOCKBYTES, input.len() - offset);
            let maybe_finalize = if offset + stride.padded_blockbytes() < input.len() {
                Finalize::No
            } else {
                finalize
            };
            portable::compress1_loop(
                &input[offset..][..block_size],
                &mut words,
                count,
                last_node,
                maybe_finalize,
                Stride::Serial,
            );
            offset += stride.padded_blockbytes();
            count = count.wrapping_add(BLOCKBYTES as Count);
        }
        words
    }

    // For various loop lengths and finalization parameters, make sure that the
    // implementation gives the same answer as the portable implementation does
    // when invoked one block at a time. (So even the portable implementation
    // itself is being tested here, to make sure its loop is correct.) Note
    // that this doesn't include any fixed test vectors; those are taken from
    // the blake2-kat.json file (copied from upstream) and tested elsewhere.
    fn exercise_compress1_loop(implementation: Implementation) {
        let mut input = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input);

        exercise_cases(|stride, length, last_node, finalize, count| {
            let reference_words =
                reference_compression(&input[..length], stride, last_node, finalize, count, 0);

            let mut test_words = initial_test_words(0);
            implementation.compress1_loop(
                &input[..length],
                &mut test_words,
                count,
                last_node,
                finalize,
                stride,
            );
            assert_eq!(reference_words, test_words);
        });
    }

    #[test]
    fn test_compress1_loop_portable() {
        exercise_compress1_loop(Implementation::portable());
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress1_loop_sse41() {
        // Currently this just falls back to portable, but we test it anyway.
        if let Some(imp) = Implementation::sse41_if_supported() {
            exercise_compress1_loop(imp);
        }
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress1_loop_avx2() {
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress1_loop(imp);
        }
    }

    // I use ArrayVec everywhere in here because currently these tests pass
    // under no_std. I might decide that's not worth maintaining at some point,
    // since really all we care about with no_std is that the library builds,
    // but for now it's here. Everything is keyed off of this N constant so
    // that it's easy to copy the code to exercise_compress4_loop.
    fn exercise_compress2_loop(implementation: Implementation) {
        const N: usize = 2;

        let mut input_buffer = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input_buffer);
        let mut inputs = ArrayVec::<[_; N]>::new();
        for i in 0..N {
            inputs.push(&input_buffer[i..]);
        }

        exercise_cases(|stride, length, last_node, finalize, count| {
            let mut reference_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                let words = reference_compression(
                    &inputs[i][..length],
                    stride,
                    last_node,
                    finalize,
                    count.wrapping_add((i * BLOCKBYTES) as Count),
                    i,
                );
                reference_words.push(words);
            }

            let mut test_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                test_words.push(initial_test_words(i));
            }
            let mut jobs = ArrayVec::<[_; N]>::new();
            for (i, words) in test_words.iter_mut().enumerate() {
                jobs.push(Job {
                    input: &inputs[i][..length],
                    words,
                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
                    last_node,
                });
            }
            let mut jobs = jobs.into_inner().expect("full");
            implementation.compress2_loop(&mut jobs, finalize, stride);

            for i in 0..N {
                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
            }
        });
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress2_loop_sse41() {
        if let Some(imp) = Implementation::sse41_if_supported() {
            exercise_compress2_loop(imp);
        }
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress2_loop_avx2() {
        // Currently this just falls back to SSE4.1, but we test it anyway.
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress2_loop(imp);
        }
    }

    // Copied from exercise_compress2_loop, with a different value of N and an
    // interior call to compress4_loop.
    fn exercise_compress4_loop(implementation: Implementation) {
        const N: usize = 4;

        let mut input_buffer = [0; 100 * BLOCKBYTES];
        paint_test_input(&mut input_buffer);
        let mut inputs = ArrayVec::<[_; N]>::new();
        for i in 0..N {
            inputs.push(&input_buffer[i..]);
        }

        exercise_cases(|stride, length, last_node, finalize, count| {
            let mut reference_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                let words = reference_compression(
                    &inputs[i][..length],
                    stride,
                    last_node,
                    finalize,
                    count.wrapping_add((i * BLOCKBYTES) as Count),
                    i,
                );
                reference_words.push(words);
            }

            let mut test_words = ArrayVec::<[_; N]>::new();
            for i in 0..N {
                test_words.push(initial_test_words(i));
            }
            let mut jobs = ArrayVec::<[_; N]>::new();
            for (i, words) in test_words.iter_mut().enumerate() {
                jobs.push(Job {
                    input: &inputs[i][..length],
                    words,
                    count: count.wrapping_add((i * BLOCKBYTES) as Count),
                    last_node,
                });
            }
            let mut jobs = jobs.into_inner().expect("full");
            implementation.compress4_loop(&mut jobs, finalize, stride);

            for i in 0..N {
                assert_eq!(reference_words[i], test_words[i], "words {} unequal", i);
            }
        });
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_compress4_loop_avx2() {
        if let Some(imp) = Implementation::avx2_if_supported() {
            exercise_compress4_loop(imp);
        }
    }

    #[test]
    fn sanity_check_count_size() {
        assert_eq!(size_of::<Count>(), 2 * size_of::<Word>());
    }
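
    // Sanity check that splitting a Count into low and high words and
    // reassembling it round-trips, including across the low-word overflow
    // boundary.
    #[test]
    fn sanity_check_count_split_roundtrip() {
        let max_low = !(0 as Word) as Count;
        let cases: &[Count] = &[0, 1, max_low, max_low + 1, !(0 as Count)];
        for &count in cases {
            assert_eq!(count, assemble_count(count_low(count), count_high(count)));
        }
    }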
}