//! An example showing runtime dispatch to an architecture-optimized
//! implementation.
//!
//! This program hex-encodes a byte slice into a caller-provided destination
//! buffer using several instruction sets. The most capable implementation
//! supported by the CPU is selected at runtime, so no target-specific
//! compilation flags are required.
//!
//! You can test out this program via:
//!
//!     echo test | cargo +nightly run --release --example hex -p stdarch
//!
//! and you should see `746573740a` get printed out.

#![allow(internal_features)]
#![feature(wasm_target_feature)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature(stdarch_internal)
)]
#![allow(
    clippy::unwrap_used,
    clippy::print_stdout,
    clippy::shadow_reuse,
    clippy::cast_possible_wrap,
    clippy::cast_ptr_alignment,
    clippy::cast_sign_loss,
    clippy::missing_docs_in_private_items
)]

use std::{
    io::{self, Read},
    str,
};

#[cfg(target_arch = "x86")]
use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected};
#[cfg(target_arch = "x86_64")]
use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected};

fn main() {
    let mut input = Vec::new();
    io::stdin().read_to_end(&mut input).unwrap();
    let mut dst = vec![0; 2 * input.len()];
    let s = hex_encode(&input, &mut dst).unwrap();
    println!("{s}");
}

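/// Hex-encodes `src` into `dst` and returns the encoded string.
///
/// Returns `Err(n)`, where `n` is the required destination length, if `dst`
/// is shorter than `2 * src.len()`.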
fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    let len = src.len().checked_mul(2).unwrap();
    if dst.len() < len {
        return Err(len);
    }
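
    // Dispatch at runtime, trying the widest instruction set first.
    // `is_x86_feature_detected!` caches its result, so repeated checks are
    // cheap after the first call.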
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { hex_encode_avx2(src, dst) };
        }
        if is_x86_feature_detected!("sse4.1") {
            return unsafe { hex_encode_sse41(src, dst) };
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
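        // The `if true` prevents an unreachable-code warning for the scalar
        // fallback below on wasm32.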
        if true {
            return unsafe { hex_encode_simd128(src, dst) };
        }
    }

    hex_encode_fallback(src, dst)
}

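/// # Safety
///
/// Callers must ensure the CPU supports AVX2, e.g. by checking
/// `is_x86_feature_detected!("avx2")` first.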
| #[target_feature(enable = "avx2")] |
| #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { |
| let ascii_zero = _mm256_set1_epi8(b'0' as i8); |
| let nines = _mm256_set1_epi8(9); |
| let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8); |
| let and4bits = _mm256_set1_epi8(0xf); |
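    // For a nibble `v` the encoded byte is `v + b'0'` when `v <= 9`, and
    // `v + (b'a' - 10)` when `v > 9`; `ascii_a` above is exactly that second
    // offset.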

    let mut i = 0_usize;
    while src.len() >= 32 {
        let invec = _mm256_loadu_si256(src.as_ptr() as *const _);

        let masked1 = _mm256_and_si256(invec, and4bits);
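        // There is no 8-bit SIMD shift, so extract the high nibbles by
        // shifting within 64-bit lanes; the 4-bit mask clears the bits that
        // cross in from neighboring bytes.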
        let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits);

        // yields 0xff for the elements > 9 and 0x00 otherwise
        let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
        let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2));

        // interleave masked1 and masked2 bytes
        let res1 = _mm256_unpacklo_epi8(masked2, masked1);
        let res2 = _mm256_unpackhi_epi8(masked2, masked1);
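        // On AVX2, unpacklo/unpackhi interleave within each 128-bit lane, so
        // the four 128-bit halves are stored individually below to land in
        // source order.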

        // Store everything into the right destination now
        let base = dst.as_mut_ptr().add(i * 2);
        let base1 = base.add(0) as *mut _;
        let base2 = base.add(16) as *mut _;
        let base3 = base.add(32) as *mut _;
        let base4 = base.add(48) as *mut _;
        _mm256_storeu2_m128i(base3, base1, res1);
        _mm256_storeu2_m128i(base4, base2, res2);
        src = &src[32..];
        i += 32;
    }

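    // Encode the remaining tail (fewer than 32 bytes) via the SSE4.1 path,
    // which in turn hands its own tail to the scalar fallback.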
    let _ = hex_encode_sse41(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}

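/// # Safety
///
/// Callers must ensure the CPU supports SSE4.1, e.g. by checking
/// `is_x86_feature_detected!("sse4.1")` first.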
// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    let ascii_zero = _mm_set1_epi8(b'0' as i8);
    let nines = _mm_set1_epi8(9);
    let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
    let and4bits = _mm_set1_epi8(0xf);

    let mut i = 0_usize;
    while src.len() >= 16 {
        let invec = _mm_loadu_si128(src.as_ptr() as *const _);

        let masked1 = _mm_and_si128(invec, and4bits);
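        // Same 64-bit shift plus 4-bit mask trick as in the AVX2 path above.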
        let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

        // yields 0xff for the elements > 9 and 0x00 otherwise
        let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
        let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2));

        // interleave masked1 and masked2 bytes
        let res1 = _mm_unpacklo_epi8(masked2, masked1);
        let res2 = _mm_unpackhi_epi8(masked2, masked1);

        _mm_storeu_si128(dst.as_mut_ptr().add(i * 2) as *mut _, res1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i * 2 + 16) as *mut _, res2);
        src = &src[16..];
        i += 16;
    }

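    // Encode the remaining tail (fewer than 16 bytes) with the scalar
    // fallback.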
    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}

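/// # Safety
///
/// Callers must ensure the target supports the wasm `simd128` feature.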
| #[cfg(target_arch = "wasm32")] |
| #[target_feature(enable = "simd128")] |
| unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { |
| use core_arch::arch::wasm32::*; |
| |
| let ascii_zero = u8x16_splat(b'0'); |
| let nines = u8x16_splat(9); |
| let ascii_a = u8x16_splat(b'a' - 9 - 1); |
| let and4bits = u8x16_splat(0xf); |
| |
| let mut i = 0_usize; |
| while src.len() >= 16 { |
| let invec = v128_load(src.as_ptr() as *const _); |
| |
| let masked1 = v128_and(invec, and4bits); |
| let masked2 = v128_and(u8x16_shr(invec, 4), and4bits); |
| |
| // return 0xff corresponding to the elements > 9, or 0x00 otherwise |
| let cmpmask1 = u8x16_gt(masked1, nines); |
| let cmpmask2 = u8x16_gt(masked2, nines); |
| |
| // add '0' or the offset depending on the masks |
| let masked1 = u8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1)); |
| let masked2 = u8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2)); |
| |
| // Next we need to shuffle around masked{1,2} to get back to the |
| // original source text order. The first element (res1) we'll store uses |
| // all the low bytes from the 2 masks and the second element (res2) uses |
| // all the upper bytes. |
| let res1 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( |
| masked2, masked1, |
| ); |
| let res2 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( |
| masked2, masked1, |
| ); |
| |
| v128_store(dst.as_mut_ptr().add(i * 2) as *mut _, res1); |
| v128_store(dst.as_mut_ptr().add(i * 2 + 16) as *mut _, res2); |
| src = &src[16..]; |
| i += 16; |
| } |
| |
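    // As in the SSE4.1 path, finish the tail with the scalar fallback.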
    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}

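/// Portable scalar implementation, used both as the fallback on targets
/// without SIMD support and to encode the trailing bytes of the SIMD paths.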
fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
        TABLE[byte as usize]
    }

    for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
        slots[0] = hex((*byte >> 4) & 0xf);
        slots[1] = hex(*byte & 0xf);
    }

    unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
}

// Run these with `cargo +nightly test --example hex -p stdarch`
#[cfg(test)]
mod tests {
    use super::*;

    fn test(input: &[u8], output: &str) {
        let tmp = || vec![0; input.len() * 2];

        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
        assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        unsafe {
            if self::is_x86_feature_detected!("avx2") {
                assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output);
            }
            if self::is_x86_feature_detected!("sse4.1") {
                assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output);
            }
        }
    }

    #[test]
    fn empty() {
        test(b"", "");
    }
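
    // `hex_encode` reports the required destination length when `dst` is too
    // short; this exercises that error path.
    #[test]
    fn dst_too_small() {
        let mut dst = [0u8; 3];
        assert_eq!(hex_encode(b"ab", &mut dst), Err(4));
    }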

    #[test]
    fn big() {
        test(&[0; 1024], &"0".repeat(2048));
    }

    #[test]
    fn odd() {
        test(&[0; 313], &"0".repeat(313 * 2));
    }

    #[test]
    fn avx_works() {
        let mut input = [0; 33];
        input[4] = 3;
        input[16] = 3;
        input[17] = 0x30;
        input[21] = 1;
        input[31] = 0x24;
        test(
            &input,
            "\
            0000000003000000\
            0000000000000000\
            0330000000010000\
            0000000000000024\
            00\
            ",
        );
    }

    quickcheck::quickcheck! {
        fn encode_equals_fallback(input: Vec<u8>) -> bool {
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = hex_encode(&input, &mut space1).unwrap();
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn avx_equals_fallback(input: Vec<u8>) -> bool {
            if !self::is_x86_feature_detected!("avx2") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn sse41_equals_fallback(input: Vec<u8>) -> bool {
            if !self::is_x86_feature_detected!("sse4.1") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }
    }
}

// Run these with `cargo +nightly bench --example hex -p stdarch`
#[cfg(test)]
mod benches {
    extern crate rand;
    extern crate test;

    use self::rand::Rng;

    use super::*;

    const SMALL_LEN: usize = 117;
    const LARGE_LEN: usize = 1024 * 1024;

    fn doit(
        b: &mut test::Bencher,
        len: usize,
        f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>,
    ) {
        let mut rng = rand::thread_rng();
        let input = (0..len).map(|_| rng.gen::<u8>()).collect::<Vec<_>>();
        let mut dst = vec![0; input.len() * 2];
        b.bytes = len as u64;
        b.iter(|| unsafe {
            f(&input, &mut dst).unwrap();
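            // Return a byte of the output so the encode isn't optimized away.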
            dst[0]
        });
    }

    #[bench]
    fn small_default(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode);
    }

    #[bench]
    fn small_fallback(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode_fallback);
    }

    #[bench]
    fn large_default(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode);
    }

    #[bench]
    fn large_fallback(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode_fallback);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    mod x86 {
        use super::*;

        #[bench]
        fn small_avx2(b: &mut test::Bencher) {
            if self::is_x86_feature_detected!("avx2") {
                doit(b, SMALL_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn small_sse41(b: &mut test::Bencher) {
            if self::is_x86_feature_detected!("sse4.1") {
                doit(b, SMALL_LEN, hex_encode_sse41);
            }
        }

        #[bench]
        fn large_avx2(b: &mut test::Bencher) {
            if self::is_x86_feature_detected!("avx2") {
                doit(b, LARGE_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn large_sse41(b: &mut test::Bencher) {
            if self::is_x86_feature_detected!("sse4.1") {
                doit(b, LARGE_LEN, hex_encode_sse41);
            }
        }
    }
}