| //! The compression algorithm. |
| //! |
//! We make use of hash tables to find duplicates. This gives a reasonable compression ratio with
//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it
//! less memory hungry.
| |
| use crate::block::hashtable::HashTable; |
| use crate::block::END_OFFSET; |
| use crate::block::LZ4_MIN_LENGTH; |
| use crate::block::MAX_DISTANCE; |
| use crate::block::MFLIMIT; |
| use crate::block::MINMATCH; |
| #[cfg(not(feature = "safe-encode"))] |
| use crate::sink::PtrSink; |
| use crate::sink::Sink; |
| use crate::sink::SliceSink; |
| #[allow(unused_imports)] |
| use alloc::vec; |
| use alloc::vec::Vec; |
| |
| #[cfg(feature = "safe-encode")] |
| use core::convert::TryInto; |
| |
| use super::hashtable::HashTable4K; |
| use super::hashtable::HashTable4KU16; |
| use super::{CompressError, WINDOW_SIZE}; |
| |
/// Increase the step size after `1 << INCREASE_STEPSIZE_BITSHIFT` non-matches.
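/// (i.e. the first 32 search probes advance the cursor 1 byte at a time, the next 32 advance
/// 2 bytes at a time, and so on).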
| const INCREASE_STEPSIZE_BITSHIFT: usize = 5; |
| |
| /// Read a 4-byte "batch" from some position. |
| /// |
| /// This will read a native-endian 4-byte integer from some position. |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| pub(super) fn get_batch(input: &[u8], n: usize) -> u32 { |
| unsafe { read_u32_ptr(input.as_ptr().add(n)) } |
| } |
| |
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| pub(super) fn get_batch(input: &[u8], n: usize) -> u32 { |
| u32::from_ne_bytes(input[n..n + 4].try_into().unwrap()) |
| } |
| |
/// Read a usize-sized "batch" from some position.
| /// |
| /// This will read a native-endian usize from some position. |
| #[inline] |
| #[allow(dead_code)] |
| #[cfg(not(feature = "safe-encode"))] |
| pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize { |
| unsafe { read_usize_ptr(input.as_ptr().add(n)) } |
| } |
| |
| #[inline] |
| #[allow(dead_code)] |
| #[cfg(feature = "safe-encode")] |
| pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize { |
| const USIZE_SIZE: usize = core::mem::size_of::<usize>(); |
| let arr: &[u8; USIZE_SIZE] = input[n..n + USIZE_SIZE].try_into().unwrap(); |
| usize::from_ne_bytes(*arr) |
| } |
| |
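/// Builds the upper nibble (literal length) of a sequence token.
///
/// Worked example: `lit_len = 3` yields `0x30`; any `lit_len >= 15` saturates to `0xF0` and the
/// remainder is written later via `write_integer`.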
| #[inline] |
| fn token_from_literal(lit_len: usize) -> u8 { |
| if lit_len < 0xF { |
| // Since we can fit the literals length into it, there is no need for saturation. |
| (lit_len as u8) << 4 |
| } else { |
| // We were unable to fit the literals into it, so we saturate to 0xF. We will later |
| // write the extensional value. |
| 0xF0 |
| } |
| } |
| |
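/// Builds a full sequence token: literal length in the high nibble, match length in the low
/// nibble.
///
/// Worked example: `lit_len = 2` and `duplicate_length = 5` yield `0x25`; either field saturates
/// at `0xF`, with the excess written later via `write_integer`.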
| #[inline] |
| fn token_from_literal_and_match_length(lit_len: usize, duplicate_length: usize) -> u8 { |
| let mut token = if lit_len < 0xF { |
| // Since we can fit the literals length into it, there is no need for saturation. |
| (lit_len as u8) << 4 |
| } else { |
| // We were unable to fit the literals into it, so we saturate to 0xF. We will later |
| // write the extensional value. |
| 0xF0 |
| }; |
| |
| token |= if duplicate_length < 0xF { |
| // We could fit it in. |
| duplicate_length as u8 |
| } else { |
| // We were unable to fit it in, so we default to 0xF, which will later be extended. |
| 0xF |
| }; |
| |
| token |
| } |
| |
| /// Counts the number of same bytes in two byte streams. |
/// `input` is the complete input.
/// `cur` is the current position in the input; it will be incremented by the number of matched
/// bytes.
/// `source` is either the same as `input` or an external slice.
/// `candidate` is the candidate position in `source`.
| /// |
| /// The function ignores the last END_OFFSET bytes in input as those should be literals. |
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize { |
| const USIZE_SIZE: usize = core::mem::size_of::<usize>(); |
| let cur_slice = &input[*cur..input.len() - END_OFFSET]; |
| let cand_slice = &source[candidate..]; |
| |
| let mut num = 0; |
| for (block1, block2) in cur_slice |
| .chunks_exact(USIZE_SIZE) |
| .zip(cand_slice.chunks_exact(USIZE_SIZE)) |
| { |
| let input_block = usize::from_ne_bytes(block1.try_into().unwrap()); |
| let match_block = usize::from_ne_bytes(block2.try_into().unwrap()); |
| |
| if input_block == match_block { |
| num += USIZE_SIZE; |
| } else { |
| let diff = input_block ^ match_block; |
| num += (diff.to_le().trailing_zeros() / 8) as usize; |
| *cur += num; |
| return num; |
| } |
| } |
| |
// If we're here we may have 1 to 7 bytes left to check, close to the end of the input
// or source slices. Since this is a rare occurrence we mark it cold, which gives
// ~5% better performance.
| #[cold] |
| fn count_same_bytes_tail(a: &[u8], b: &[u8], offset: usize) -> usize { |
| a.iter() |
| .zip(b) |
| .skip(offset) |
| .take_while(|(a, b)| a == b) |
| .count() |
| } |
| num += count_same_bytes_tail(cur_slice, cand_slice, num); |
| |
| *cur += num; |
| num |
| } |
| |
| /// Counts the number of same bytes in two byte streams. |
/// `input` is the complete input.
/// `cur` is the current position in the input; it will be incremented by the number of matched
/// bytes.
/// `source` is either the same as `input` or an external slice.
/// `candidate` is the candidate position in `source`.
| /// |
| /// The function ignores the last END_OFFSET bytes in input as those should be literals. |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize { |
| let max_input_match = input.len().saturating_sub(*cur + END_OFFSET); |
| let max_candidate_match = source.len() - candidate; |
// Considering both limits, calculate how far we may match in input.
| let input_end = *cur + max_input_match.min(max_candidate_match); |
| |
| let start = *cur; |
| let mut source_ptr = unsafe { source.as_ptr().add(candidate) }; |
| |
| // compare 4/8 bytes blocks depending on the arch |
| const STEP_SIZE: usize = core::mem::size_of::<usize>(); |
| while *cur + STEP_SIZE <= input_end { |
| let diff = read_usize_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_usize_ptr(source_ptr); |
| |
| if diff == 0 { |
| *cur += STEP_SIZE; |
| unsafe { |
| source_ptr = source_ptr.add(STEP_SIZE); |
| } |
| } else { |
| *cur += (diff.to_le().trailing_zeros() / 8) as usize; |
| return *cur - start; |
| } |
| } |
| |
| // compare 4 bytes block |
| #[cfg(target_pointer_width = "64")] |
| { |
| if input_end - *cur >= 4 { |
| let diff = read_u32_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_u32_ptr(source_ptr); |
| |
| if diff == 0 { |
| *cur += 4; |
| unsafe { |
| source_ptr = source_ptr.add(4); |
| } |
| } else { |
| *cur += (diff.to_le().trailing_zeros() / 8) as usize; |
| return *cur - start; |
| } |
| } |
| } |
| |
| // compare 2 bytes block |
| if input_end - *cur >= 2 |
| && unsafe { read_u16_ptr(input.as_ptr().add(*cur)) == read_u16_ptr(source_ptr) } |
| { |
| *cur += 2; |
| unsafe { |
| source_ptr = source_ptr.add(2); |
| } |
| } |
| |
| if *cur < input_end |
| && unsafe { input.as_ptr().add(*cur).read() } == unsafe { source_ptr.read() } |
| { |
| *cur += 1; |
| } |
| |
| *cur - start |
| } |
| |
| /// Write an integer to the output. |
| /// |
/// Each additional byte then represents a value from 0 to 255, which is added to the previous
/// value to produce a total length. When the byte value is 255, another byte must be read and
/// added, and so on. There can be any number of bytes of value "255" following the token.
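///
/// Worked example: a value of 300 is written as the two bytes `[0xFF, 0x2D]` (255 + 45).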
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| fn write_integer(output: &mut impl Sink, mut n: usize) { |
| // Note: Since `n` is usually < 0xFF and writing multiple bytes to the output |
| // requires 2 branches of bound check (due to the possibility of add overflows) |
| // the simple byte at a time implementation below is faster in most cases. |
| while n >= 0xFF { |
| n -= 0xFF; |
| push_byte(output, 0xFF); |
| } |
| push_byte(output, n as u8); |
| } |
| |
| /// Write an integer to the output. |
| /// |
/// Each additional byte then represents a value from 0 to 255, which is added to the previous
/// value to produce a total length. When the byte value is 255, another byte must be read and
/// added, and so on. There can be any number of bytes of value "255" following the token.
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn write_integer(output: &mut impl Sink, mut n: usize) { |
| // Write the 0xFF bytes as long as the integer is higher than said value. |
| if n >= 4 * 0xFF { |
// In this unlikely branch we use a fill instead of a loop,
| // otherwise rustc may output a large unrolled/vectorized loop. |
| let bulk = n / (4 * 0xFF); |
| n %= 4 * 0xFF; |
| unsafe { |
| core::ptr::write_bytes(output.pos_mut_ptr(), 0xFF, 4 * bulk); |
| output.set_pos(output.pos() + 4 * bulk); |
| } |
| } |
| |
| // Handle last 1 to 4 bytes |
| push_u32(output, 0xFFFFFFFF); |
| // Updating output len for the remainder |
| unsafe { |
| output.set_pos(output.pos() - 4 + 1 + n / 255); |
| // Write the remaining byte. |
| *output.pos_mut_ptr().sub(1) = (n % 255) as u8; |
| } |
| } |
| |
| /// Handle the last bytes from the input as literals |
| #[cold] |
| fn handle_last_literals(output: &mut impl Sink, input: &[u8], start: usize) { |
| let lit_len = input.len() - start; |
| |
| let token = token_from_literal(lit_len); |
| push_byte(output, token); |
| if lit_len >= 0xF { |
| write_integer(output, lit_len - 0xF); |
| } |
| // Now, write the actual literals. |
| output.extend_from_slice(&input[start..]); |
| } |
| |
| /// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate |
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| fn backtrack_match( |
| input: &[u8], |
| cur: &mut usize, |
| literal_start: usize, |
| source: &[u8], |
| candidate: &mut usize, |
| ) { |
// Note: Even if the iterator version of this loop has fewer branches inside the loop, it has
// more branches before the loop. In practice that seems to make it slower than the while
// version below. TODO: It should be possible to remove all bounds checks, since we are
// walking backwards.
| while *candidate > 0 && *cur > literal_start && input[*cur - 1] == source[*candidate - 1] { |
| *cur -= 1; |
| *candidate -= 1; |
| } |
| } |
| |
| /// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn backtrack_match( |
| input: &[u8], |
| cur: &mut usize, |
| literal_start: usize, |
| source: &[u8], |
| candidate: &mut usize, |
| ) { |
| while unsafe { |
| *candidate > 0 |
| && *cur > literal_start |
| && input.get_unchecked(*cur - 1) == source.get_unchecked(*candidate - 1) |
| } { |
| *cur -= 1; |
| *candidate -= 1; |
| } |
| } |
| |
| /// Compress all bytes of `input[input_pos..]` into `output`. |
| /// |
| /// Bytes in `input[..input_pos]` are treated as a preamble and can be used for lookback. |
| /// This part is known as the compressor "prefix". |
| /// Bytes in `ext_dict` logically precede the bytes in `input` and can also be used for lookback. |
| /// |
/// `input_stream_offset` is the logical position of the first byte of `input`. This allows the
/// same `dict` to be used for many calls to `compress_internal`, as we can "readdress" the first
/// byte of `input` to be something other than 0.
| /// |
| /// `dict` is the dictionary of previously encoded sequences. |
| /// |
| /// This is used to find duplicates in the stream so they are not written multiple times. |
| /// |
/// Every four bytes are hashed, and the position in the input buffer is placed in the dict at the
/// resulting slot. This way we can easily look up candidates for back references.
| /// |
| /// Returns the number of bytes written (compressed) into `output`. |
| /// |
| /// # Const parameters |
/// `USE_DICT`: When `false`, disables usage of `ext_dict` (the function will panic if a non-empty
/// slice is passed). In other words, this generates more optimized code when an external
/// dictionary isn't used.
| /// |
/// A similar const argument could be used to disable the prefix mode (e.g. `USE_PREFIX`),
/// which would impose `input_pos == 0 && input_stream_offset == 0`. Experiments didn't
/// show a significant improvement, though.
| // Intentionally avoid inlining. |
| // Empirical tests revealed it to be rarely better but often significantly detrimental. |
| #[inline(never)] |
| pub(crate) fn compress_internal<T: HashTable, const USE_DICT: bool, S: Sink>( |
| input: &[u8], |
| input_pos: usize, |
| output: &mut S, |
| dict: &mut T, |
| ext_dict: &[u8], |
| input_stream_offset: usize, |
| ) -> Result<usize, CompressError> { |
| assert!(input_pos <= input.len()); |
| if USE_DICT { |
| assert!(ext_dict.len() <= super::WINDOW_SIZE); |
| assert!(ext_dict.len() <= input_stream_offset); |
| // Check for overflow hazard when using ext_dict |
| assert!(input_stream_offset |
| .checked_add(input.len()) |
| .and_then(|i| i.checked_add(ext_dict.len())) |
| .map_or(false, |i| i <= isize::MAX as usize)); |
| } else { |
| assert!(ext_dict.is_empty()); |
| } |
| if output.capacity() - output.pos() < get_maximum_output_size(input.len() - input_pos) { |
| return Err(CompressError::OutputTooSmall); |
| } |
| |
| let output_start_pos = output.pos(); |
| if input.len() - input_pos < LZ4_MIN_LENGTH { |
| handle_last_literals(output, input, input_pos); |
| return Ok(output.pos() - output_start_pos); |
| } |
| |
| let ext_dict_stream_offset = input_stream_offset - ext_dict.len(); |
| let end_pos_check = input.len() - MFLIMIT; |
| let mut literal_start = input_pos; |
| let mut cur = input_pos; |
| |
| if cur == 0 && input_stream_offset == 0 { |
| // According to the spec we can't start with a match, |
| // except when referencing another block. |
| let hash = T::get_hash_at(input, 0); |
| dict.put_at(hash, 0); |
| cur = 1; |
| } |
| |
| loop { |
| // Read the next block into two sections, the literals and the duplicates. |
| let mut step_size; |
| let mut candidate; |
| let mut candidate_source; |
| let mut offset; |
| let mut non_match_count = 1 << INCREASE_STEPSIZE_BITSHIFT; |
| // The number of bytes before our cursor, where the duplicate starts. |
| let mut next_cur = cur; |
| |
// In this loop we search for duplicates via the hash table. 4 or 8 bytes are hashed and
// compared.
| loop { |
| step_size = non_match_count >> INCREASE_STEPSIZE_BITSHIFT; |
| non_match_count += 1; |
| |
| cur = next_cur; |
| next_cur += step_size; |
| |
| // Same as cur + MFLIMIT > input.len() |
| if cur > end_pos_check { |
| handle_last_literals(output, input, literal_start); |
| return Ok(output.pos() - output_start_pos); |
| } |
| // Find a candidate in the dictionary with the hash of the current four bytes. |
// Unchecked is safe as long as the values from the hash function don't exceed the size
// of the table. This is ensured by right-shifting the hash values
// (`dict_bitshift`) to fit them in the table.
| |
| // [Bounds Check]: Can be elided due to `end_pos_check` above |
| let hash = T::get_hash_at(input, cur); |
| candidate = dict.get_at(hash); |
| dict.put_at(hash, cur + input_stream_offset); |
| |
| // Sanity check: Matches can't be ahead of `cur`. |
| debug_assert!(candidate <= input_stream_offset + cur); |
| |
// Two requirements for the candidate exist:
// - We should not return a position which is merely a hash collision, so the candidate must
//   actually match what we search for.
// - We can address offsets of up to 16 bits, hence we are only able to address the candidate
//   if its offset is less than or equal to 0xFFFF.
| if input_stream_offset + cur - candidate > MAX_DISTANCE { |
| continue; |
| } |
| |
| if candidate >= input_stream_offset { |
| // match within input |
| offset = (input_stream_offset + cur - candidate) as u16; |
| candidate -= input_stream_offset; |
| candidate_source = input; |
| } else if USE_DICT { |
| // Sanity check, which may fail if we lost history beyond MAX_DISTANCE |
| debug_assert!( |
| candidate >= ext_dict_stream_offset, |
| "Lost history in ext dict mode" |
| ); |
| // match within ext dict |
| offset = (input_stream_offset + cur - candidate) as u16; |
| candidate -= ext_dict_stream_offset; |
| candidate_source = ext_dict; |
| } else { |
// The match is not reachable anymore,
// e.g. when compressing an independent-block frame without clearing
// the match table, only increasing input_stream_offset.
| // Sanity check |
| debug_assert!(input_pos == 0, "Lost history in prefix mode"); |
| continue; |
| } |
// [Bounds Check]: The candidate comes from the hash table. It can't be out of bounds, but
// that is impossible for the compiler to prove, so the bounds checks can't be removed.
| let cand_bytes: u32 = get_batch(candidate_source, candidate); |
| // [Bounds Check]: Should be able to be elided due to `end_pos_check`. |
| let curr_bytes: u32 = get_batch(input, cur); |
| |
| if cand_bytes == curr_bytes { |
| break; |
| } |
| } |
| |
| // Extend the match backwards if we can |
| backtrack_match( |
| input, |
| &mut cur, |
| literal_start, |
| candidate_source, |
| &mut candidate, |
| ); |
| |
| // The length (in bytes) of the literals section. |
| let lit_len = cur - literal_start; |
| |
| // Generate the higher half of the token. |
| cur += MINMATCH; |
| candidate += MINMATCH; |
| let duplicate_length = count_same_bytes(input, &mut cur, candidate_source, candidate); |
| |
| // Note: The `- 2` offset was copied from the reference implementation, it could be |
| // arbitrary. |
| let hash = T::get_hash_at(input, cur - 2); |
| dict.put_at(hash, cur - 2 + input_stream_offset); |
| |
| let token = token_from_literal_and_match_length(lit_len, duplicate_length); |
| |
| // Push the token to the output stream. |
| push_byte(output, token); |
| // If we were unable to fit the literals length into the token, write the extensional |
| // part. |
| if lit_len >= 0xF { |
| write_integer(output, lit_len - 0xF); |
| } |
| |
| // Now, write the actual literals. |
| // |
// The unsafe version copies blocks of 8 bytes, and therefore may copy up to 7 bytes more than
// needed. This is safe, because the last 12 bytes (MFLIMIT) are handled in
// handle_last_literals.
| copy_literals_wild(output, input, literal_start, lit_len); |
| // write the offset in little endian. |
| push_u16(output, offset); |
| |
| // If we were unable to fit the duplicates length into the token, write the |
| // extensional part. |
| if duplicate_length >= 0xF { |
| write_integer(output, duplicate_length - 0xF); |
| } |
| literal_start = cur; |
| } |
| } |
| |
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| fn push_byte(output: &mut impl Sink, el: u8) { |
| output.push(el); |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn push_byte(output: &mut impl Sink, el: u8) { |
| unsafe { |
| core::ptr::write(output.pos_mut_ptr(), el); |
| output.set_pos(output.pos() + 1); |
| } |
| } |
| |
| #[inline] |
| #[cfg(feature = "safe-encode")] |
| fn push_u16(output: &mut impl Sink, el: u16) { |
| output.extend_from_slice(&el.to_le_bytes()); |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn push_u16(output: &mut impl Sink, el: u16) { |
| unsafe { |
| core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 2); |
| output.set_pos(output.pos() + 2); |
| } |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn push_u32(output: &mut impl Sink, el: u32) { |
| unsafe { |
| core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 4); |
| output.set_pos(output.pos() + 4); |
| } |
| } |
| |
| #[inline(always)] // (always) necessary otherwise compiler fails to inline it |
| #[cfg(feature = "safe-encode")] |
| fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { |
| output.extend_from_slice_wild(&input[input_start..input_start + len], len) |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { |
| debug_assert!(input_start + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= input.len()); |
| debug_assert!(output.pos() + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= output.capacity()); |
| unsafe { |
| // Note: This used to be a wild copy loop of 8 bytes, but the compiler consistently |
// transformed it into a call to memcpy, which hurts performance significantly for
| // small copies, which are common. |
| let start_ptr = input.as_ptr().add(input_start); |
| match len { |
| 0..=8 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 8), |
| 9..=16 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 16), |
| 17..=24 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 24), |
| _ => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), len), |
| } |
| output.set_pos(output.pos() + len); |
| } |
| } |
| |
| /// Compress all bytes of `input` into `output`. |
/// The method chooses an appropriate hash table to look up duplicates.
/// `output` should be preallocated with a size of
/// `get_maximum_output_size`.
///
/// Returns the number of bytes written (compressed) into `output`.
| #[inline] |
| pub(crate) fn compress_into_sink_with_dict<const USE_DICT: bool>( |
| input: &[u8], |
| output: &mut impl Sink, |
| mut dict_data: &[u8], |
| ) -> Result<usize, CompressError> { |
| if dict_data.len() + input.len() < u16::MAX as usize { |
| let mut dict = HashTable4KU16::new(); |
| init_dict(&mut dict, &mut dict_data); |
| compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len()) |
| } else { |
| let mut dict = HashTable4K::new(); |
| init_dict(&mut dict, &mut dict_data); |
| compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len()) |
| } |
| } |
| |
| #[inline] |
| fn init_dict<T: HashTable>(dict: &mut T, dict_data: &mut &[u8]) { |
| if dict_data.len() > WINDOW_SIZE { |
| *dict_data = &dict_data[dict_data.len() - WINDOW_SIZE..]; |
| } |
| let mut i = 0usize; |
| while i + core::mem::size_of::<usize>() <= dict_data.len() { |
| let hash = T::get_hash_at(dict_data, i); |
| dict.put_at(hash, i); |
| // Note: The 3 byte step was copied from the reference implementation, it could be |
| // arbitrary. |
| i += 3; |
| } |
| } |
| |
| /// Returns the maximum output size of the compressed data. |
/// Can be used to preallocate capacity on the output vector.
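///
/// For example, with the current formula an input of 1024 bytes yields a bound of
/// 16 + 4 + 1126 = 1146 bytes.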
| #[inline] |
| pub fn get_maximum_output_size(input_len: usize) -> usize { |
| 16 + 4 + (input_len as f64 * 1.1) as usize |
| } |
| |
| /// Compress all bytes of `input` into `output`. |
/// The method chooses an appropriate hash table to look up duplicates.
/// `output` should be preallocated with a size of
/// `get_maximum_output_size`.
| /// |
| /// Returns the number of bytes written (compressed) into `output`. |
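///
/// A minimal usage sketch (assuming the usual `lz4_flex::block` re-exports of `compress_into` and
/// `get_maximum_output_size`):
/// ```
/// use lz4_flex::block::{compress_into, get_maximum_output_size};
///
/// let input = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
/// // Preallocate the worst-case output size, then compress and truncate.
/// let mut buf = vec![0u8; get_maximum_output_size(input.len())];
/// let written = compress_into(input, &mut buf).unwrap();
/// buf.truncate(written);
/// ```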
| #[inline] |
| pub fn compress_into(input: &[u8], output: &mut [u8]) -> Result<usize, CompressError> { |
| compress_into_sink_with_dict::<false>(input, &mut SliceSink::new(output, 0), b"") |
| } |
| |
| /// Compress all bytes of `input` into `output`. |
/// The method chooses an appropriate hash table to look up duplicates.
/// `output` should be preallocated with a size of
/// `get_maximum_output_size`.
| /// |
| /// Returns the number of bytes written (compressed) into `output`. |
| #[inline] |
| pub fn compress_into_with_dict( |
| input: &[u8], |
| output: &mut [u8], |
| dict_data: &[u8], |
| ) -> Result<usize, CompressError> { |
| compress_into_sink_with_dict::<true>(input, &mut SliceSink::new(output, 0), dict_data) |
| } |
| |
| #[inline] |
| fn compress_into_vec_with_dict<const USE_DICT: bool>( |
| input: &[u8], |
| prepend_size: bool, |
| mut dict_data: &[u8], |
| ) -> Vec<u8> { |
| let prepend_size_num_bytes = if prepend_size { 4 } else { 0 }; |
| let max_compressed_size = get_maximum_output_size(input.len()) + prepend_size_num_bytes; |
| if dict_data.len() <= 3 { |
| dict_data = b""; |
| } |
| #[cfg(feature = "safe-encode")] |
| let mut compressed = { |
| let mut compressed: Vec<u8> = vec![0u8; max_compressed_size]; |
| let out = if prepend_size { |
| compressed[..4].copy_from_slice(&(input.len() as u32).to_le_bytes()); |
| &mut compressed[4..] |
| } else { |
| &mut compressed |
| }; |
| let compressed_len = |
| compress_into_sink_with_dict::<USE_DICT>(input, &mut SliceSink::new(out, 0), dict_data) |
| .unwrap(); |
| |
| compressed.truncate(prepend_size_num_bytes + compressed_len); |
| compressed |
| }; |
| #[cfg(not(feature = "safe-encode"))] |
| let mut compressed = { |
| let mut vec = Vec::with_capacity(max_compressed_size); |
| let start_pos = if prepend_size { |
| vec.extend_from_slice(&(input.len() as u32).to_le_bytes()); |
| 4 |
| } else { |
| 0 |
| }; |
| let compressed_len = compress_into_sink_with_dict::<USE_DICT>( |
| input, |
| &mut PtrSink::from_vec(&mut vec, start_pos), |
| dict_data, |
| ) |
| .unwrap(); |
| unsafe { |
| vec.set_len(prepend_size_num_bytes + compressed_len); |
| } |
| vec |
| }; |
| |
| compressed.shrink_to_fit(); |
| compressed |
| } |
| |
| /// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little |
| /// endian u32. Can be used in conjunction with `decompress_size_prepended` |
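///
/// A minimal round-trip sketch (assuming the usual `lz4_flex::block` re-exports):
/// ```
/// use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
///
/// let input: &[u8] = b"Hello people, what's up?";
/// let compressed = compress_prepend_size(input);
/// let uncompressed = decompress_size_prepended(&compressed).unwrap();
/// assert_eq!(input, uncompressed);
/// ```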
| #[inline] |
| pub fn compress_prepend_size(input: &[u8]) -> Vec<u8> { |
| compress_into_vec_with_dict::<false>(input, true, b"") |
| } |
| |
| /// Compress all bytes of `input`. |
| #[inline] |
| pub fn compress(input: &[u8]) -> Vec<u8> { |
| compress_into_vec_with_dict::<false>(input, false, b"") |
| } |
| |
| /// Compress all bytes of `input` with an external dictionary. |
| #[inline] |
| pub fn compress_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec<u8> { |
| compress_into_vec_with_dict::<true>(input, false, ext_dict) |
| } |
| |
| /// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little |
| /// endian u32. Can be used in conjunction with `decompress_size_prepended_with_dict` |
| #[inline] |
| pub fn compress_prepend_size_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec<u8> { |
| compress_into_vec_with_dict::<true>(input, true, ext_dict) |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn read_u16_ptr(input: *const u8) -> u16 { |
| let mut num: u16 = 0; |
| unsafe { |
| core::ptr::copy_nonoverlapping(input, &mut num as *mut u16 as *mut u8, 2); |
| } |
| num |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn read_u32_ptr(input: *const u8) -> u32 { |
| let mut num: u32 = 0; |
| unsafe { |
| core::ptr::copy_nonoverlapping(input, &mut num as *mut u32 as *mut u8, 4); |
| } |
| num |
| } |
| |
| #[inline] |
| #[cfg(not(feature = "safe-encode"))] |
| fn read_usize_ptr(input: *const u8) -> usize { |
| let mut num: usize = 0; |
| unsafe { |
| core::ptr::copy_nonoverlapping( |
| input, |
| &mut num as *mut usize as *mut u8, |
| core::mem::size_of::<usize>(), |
| ); |
| } |
| num |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[test] |
| fn test_count_same_bytes() { |
// 8-byte aligned block; zeros and ones are appended because of the END_OFFSET.
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 16); |
| |
| // 4byte aligned block |
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 20); |
| |
| // 2byte aligned block |
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); |
| |
| // 1byte aligned block |
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 23); |
| |
| // 1byte aligned block - last byte different |
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); |
| |
| // 1byte aligned block |
| let first: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 9, 5, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, |
| ]; |
| let second: &[u8] = &[ |
| 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 1, |
| ]; |
| assert_eq!(count_same_bytes(first, &mut 0, second, 0), 21); |
| |
| for diff_idx in 8..100 { |
| let first: Vec<u8> = (0u8..255).cycle().take(100 + 12).collect(); |
| let mut second = first.clone(); |
| second[diff_idx] = 255; |
| for start in 0..=diff_idx { |
| let same_bytes = count_same_bytes(&first, &mut start.clone(), &second, start); |
| assert_eq!(same_bytes, diff_idx - start); |
| } |
| } |
| } |
| |
| #[test] |
| fn test_bug() { |
| let input: &[u8] = &[ |
| 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, |
| ]; |
| let _out = compress(input); |
| } |
| |
| #[test] |
| fn test_dict() { |
| let input: &[u8] = &[ |
| 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, |
| ]; |
| let dict = input; |
| let compressed = compress_with_dict(input, dict); |
| assert_lt!(compressed.len(), compress(input).len()); |
| |
| let mut uncompressed = vec![0u8; input.len()]; |
| let uncomp_size = crate::block::decompress::decompress_into_with_dict( |
| &compressed, |
| &mut uncompressed, |
| dict, |
| ) |
| .unwrap(); |
| uncompressed.truncate(uncomp_size); |
| assert_eq!(input, uncompressed); |
| } |
| |
| #[test] |
| fn test_dict_no_panic() { |
| let input: &[u8] = &[ |
| 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, |
| ]; |
| let dict = &[10, 12, 14]; |
| let _compressed = compress_with_dict(input, dict); |
| } |
| |
| #[test] |
| fn test_dict_match_crossing() { |
| let input: &[u8] = &[ |
| 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, |
| ]; |
| let dict = input; |
| let compressed = compress_with_dict(input, dict); |
| assert_lt!(compressed.len(), compress(input).len()); |
| |
| let mut uncompressed = vec![0u8; input.len() * 2]; |
// Copy the second half of the dict into the start of the output.
| let dict_cutoff = dict.len() / 2; |
| let output_start = dict.len() - dict_cutoff; |
| uncompressed[..output_start].copy_from_slice(&dict[dict_cutoff..]); |
| let uncomp_len = { |
| let mut sink = SliceSink::new(&mut uncompressed[..], output_start); |
| crate::block::decompress::decompress_internal::<true, _>( |
| &compressed, |
| &mut sink, |
| &dict[..dict_cutoff], |
| ) |
| .unwrap() |
| }; |
| assert_eq!(input.len(), uncomp_len); |
| assert_eq!( |
| input, |
| &uncompressed[output_start..output_start + uncomp_len] |
| ); |
| } |
| |
| #[test] |
| fn test_conformant_last_block() { |
| // From the spec: |
| // The last match must start at least 12 bytes before the end of block. |
| // The last match is part of the penultimate sequence. It is followed by the last sequence, |
| // which contains only literals. Note that, as a consequence, an independent block < |
| // 13 bytes cannot be compressed, because the match must copy "something", |
| // so it needs at least one prior byte. |
| // When a block can reference data from another block, it can start immediately with a match |
| // and no literal, so a block of 12 bytes can be compressed. |
| let aaas: &[u8] = b"aaaaaaaaaaaaaaa"; |
| |
| // uncompressible |
| let out = compress(&aaas[..12]); |
| assert_gt!(out.len(), 12); |
| // compressible |
| let out = compress(&aaas[..13]); |
| assert_le!(out.len(), 13); |
| let out = compress(&aaas[..14]); |
| assert_le!(out.len(), 14); |
| let out = compress(&aaas[..15]); |
| assert_le!(out.len(), 15); |
| |
| // dict uncompressible |
| let out = compress_with_dict(&aaas[..11], aaas); |
| assert_gt!(out.len(), 11); |
| // compressible |
| let out = compress_with_dict(&aaas[..12], aaas); |
// According to the spec this _could_ compress, but it doesn't in this lib
| // as it aborts compression for any input len < LZ4_MIN_LENGTH |
| assert_gt!(out.len(), 12); |
| let out = compress_with_dict(&aaas[..13], aaas); |
| assert_le!(out.len(), 13); |
| let out = compress_with_dict(&aaas[..14], aaas); |
| assert_le!(out.len(), 14); |
| let out = compress_with_dict(&aaas[..15], aaas); |
| assert_le!(out.len(), 15); |
| } |
| |
| #[test] |
| fn test_dict_size() { |
| let dict = vec![b'a'; 1024 * 1024]; |
| let input = &b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"[..]; |
| let compressed = compress_prepend_size_with_dict(input, &dict); |
| let decompressed = |
| crate::block::decompress_size_prepended_with_dict(&compressed, &dict).unwrap(); |
| assert_eq!(decompressed, input); |
| } |
| } |