vendor/packed_simd/src/lib.rs - toolchain/rustc - Git at Google

 //! # Portable packed SIMD vectors
 //!
 //! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:
 //! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .
 //!
 //! The examples available in the
 //! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)
 //! sub-directory of the crate showcase how to use the library in practice.
 //!
 //! ## Table of contents
 //!
 //! - [Introduction](#introduction)
 //! - [Vector types](#vector-types)
 //! - [Conditional operations](#conditional-operations)
 //! - [Conversions](#conversions)
 //! - [Performance
 //!   guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)
 //!
 //! ## Introduction
 //!
 //! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`
 //! elements of type `T` as well as many type aliases for this type: for
 //! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.
 //!
 //! The operations on packed vectors are, by default, "vertical", that is, they
 //! are applied to each vector lane in isolation of the others:
 //!
 //! ```
 //! # use packed_simd::*;
 //! let a = i32x4::new(1, 2, 3, 4);
 //! let b = i32x4::new(5, 6, 7, 8);
 //! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));
 //! ```
 //!
 //! Many "horizontal" operations are also provided:
 //!
 //! ```
 //! # use packed_simd::*;
 //! # let a = i32x4::new(1, 2, 3, 4);
 //! assert_eq!(a.wrapping_sum(), 10);
 //! ```
 //!
 //! In virtually all architectures vertical operations are fast, while
 //! horizontal operations are, by comparison, much slower. Therefore, the
 //! most portably-efficient way of performing a reduction over a slice
 //! is to collect the results into a vector using vertical operations,
 //! and to perform a single horizontal operation at the end:
 //!
 //! ```
 //! # use packed_simd::*;
 //! fn reduce(x: &[i32]) -> i32 {
 //!     assert!(x.len() % 4 == 0);
 //!     let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
 //!     for i in (0..x.len()).step_by(4) {
 //!         sum += i32x4::from_slice_unaligned(&x[i..]);
 //!     }
 //!     sum.wrapping_sum()
 //! }
 //!
 //! let x = [0, 1, 2, 3, 4, 5, 6, 7];
 //! assert_eq!(reduce(&x), 28);
 //! ```
 //!
 //! ## Vector types
 //!
 //! The vector type aliases are named according to the following scheme:
 //!
 //! > `{element_type}x{number_of_lanes} == Simd<[element_type;
 //! number_of_lanes]>`
 //!
 //! where the following element types are supported:
 //!
 //! * `i{element_width}`: signed integer
 //! * `u{element_width}`: unsigned integer
 //! * `f{element_width}`: float
 //! * `m{element_width}`: mask (see below)
 //! * `*{const,mut} T`: `const` and `mut` pointers
 //!
 //! ## Basic operations
 //!
 //! ```
 //! # use packed_simd::*;
 //! // Sets all elements to `0`:
 //! let a = i32x4::splat(0);
 //!
 //! // Reads a vector from a slice:
 //! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];
 //! let b = i32x4::from_slice_unaligned(&arr);
 //!
 //! // Reads the 4-th element of a vector:
 //! assert_eq!(b.extract(3), 1);
 //!
 //! // Returns a new vector where the 4-th element is replaced with `1`:
 //! let a = a.replace(3, 1);
 //! assert_eq!(a, b);
 //!
 //! // Writes a vector to a slice:
 //! let a = a.replace(2, 1);
 //! a.write_to_slice_unaligned(&mut arr[4..]);
 //! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);
 //! ```
 //!
 //! ## Conditional operations
 //!
 //! One often needs to perform an operation on some lanes of the vector. Vector
 //! masks, like `m32x4`, allow selecting on which vector lanes an operation is
 //! to be performed:
 //!
 //! ```
 //! # use packed_simd::*;
 //! let a = i32x4::new(1, 1, 2, 2);
 //!
 //! // Add `1` to the first two lanes of the vector.
 //! let m = m16x4::new(true, true, false, false);
 //! let a = m.select(a + 1, a);
 //! assert_eq!(a, i32x4::splat(2));
 //! ```
 //!
 //! The elements of a vector mask are either `true` or `false`. Here `true`
 //! means that a lane is "selected", while `false` means that a lane is not
 //! selected.
 //!
 //! All vector masks implement a `mask.select(a: T, b: T) -> T` method that
 //! works on all vectors that have the same number of lanes as the mask. The
 //! resulting vector contains the elements of `a` for those lanes for which the
 //! mask is `true`, and the elements of `b` otherwise.
 //!
 //! The example constructs a mask with the first two lanes set to `true` and
 //! the last two lanes set to `false`. This selects the first two lanes of `a +
 //! 1` and the last two lanes of `a`, producing a vector where the first two
 //! lanes have been incremented by `1`.
 //!
 //! > note: mask `select` can be used on vector types that have the same number
 //! > of lanes as the mask. The example shows this by using [`m16x4`] instead
 //! > of [`m32x4`]. It is _typically_ more performant to use a mask element
 //! > width equal to the element width of the vectors being operated upon.
 //! > This is, however, not true for 512-bit wide vectors when targeting
 //! > AVX-512, where the most efficient masks use only 1-bit per element.
 //!
 //! All vertical comparison operations return masks:
 //!
 //! ```
 //! # use packed_simd::*;
 //! let a = i32x4::new(1, 1, 3, 3);
 //! let b = i32x4::new(2, 2, 0, 0);
 //!
 //! // ge: >= (Greater or Equal; see also lt, le, gt, eq, ne).
 //! let m = a.ge(i32x4::splat(2));
 //!
 //! if m.any() {
 //!     // all / any / none allow coherent control flow
 //!     let d = m.select(a, b);
 //!     assert_eq!(d, i32x4::new(2, 2, 3, 3));
 //! }
 //! ```
 //!
 //! ## Conversions
 //!
 //! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for
 //!   vectors with the same number of lanes when the conversion is value
 //!   preserving (same as in `std`).
 //!
 //! * **safe bitwise conversions**: The cargo feature `into_bits` provides the
 //!   `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise
 //!   `transmute`s when all bit patterns of the source type are valid bit
 //!   patterns of the target type and are also implemented for the
 //!   architecture-specific vector types of `std::arch`. For example, `let x:
 //!   u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit
 //!   patterns are valid `u8x8` bit patterns. However, the opposite is not
 //!   true: not all `u8x8` bit patterns are valid `m8x8` bit patterns, so this
 //!   operation cannot be performed safely using `x.into_bits()`; one needs to
 //!   use `unsafe { core::mem::transmute(x) }` for that, making sure that the
 //!   value in the `u8x8` is a valid bit pattern of `m8x8`.
 //!
 //! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]
 //!   (`x.cast()`), just like `as`:
 //!
 //!   * casting integer vectors whose lane types have the same size (e.g.
 //!     `i32xN` -> `u32xN`) is a **no-op**,
 //!
 //!   * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
 //!     `u8xN`) will **truncate**,
 //!
 //!   * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
 //!     `u32xN`) will:
 //!        * **zero-extend** if the source is unsigned, or
 //!        * **sign-extend** if the source is signed,
 //!
 //!   * casting from a float to an integer will **round the float towards
 //!     zero**,
 //!
 //!   * casting from an integer to float will produce the floating point
 //!     representation of the integer, **rounding to nearest, ties to even**,
 //!
 //!   * casting from an `f32` to an `f64` is perfect and lossless,
 //!
 //!   * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
 //!
 //!   Numeric casts are not very "precise": sometimes lossy, sometimes value
 //!   preserving, etc.

 #![feature(
     repr_simd,
     const_fn,
     platform_intrinsics,
     stdsimd,
     aarch64_target_feature,
     arm_target_feature,
     link_llvm_intrinsics,
     core_intrinsics,
     stmt_expr_attributes,
     align_offset,
     mmx_target_feature,
     crate_visibility_modifier,
     custom_inner_attributes
 )]
 #![allow(non_camel_case_types, non_snake_case)]
 #![cfg_attr(test, feature(hashmap_internals))]
 #![cfg_attr(
     feature = "cargo-clippy",
     allow(
         clippy::cast_possible_truncation,
         clippy::cast_lossless,
         clippy::cast_possible_wrap,
         clippy::cast_precision_loss,
         // This lint is currently broken for generic code
         // See https://github.com/rust-lang/rust-clippy/issues/3410
         clippy::use_self
     )
 )]
 #![cfg_attr(
     feature = "cargo-clippy",
     deny(clippy::missing_inline_in_public_items)
 )]
 #![deny(warnings, rust_2018_idioms)]
 #![no_std]

 use cfg_if::cfg_if;

 // Bring the name `arch` into scope: it refers to `core_arch` when the
 // `core_arch` cargo feature is enabled, and to `core::arch` otherwise.
 cfg_if! {
     if #[cfg(feature = "core_arch")] {
         #[allow(unused_imports)]
         use core_arch as arch;
     } else {
         #[allow(unused_imports)]
         use core::arch;
     }
 }

 #[cfg(all(target_arch = "wasm32", test))]
 use wasm_bindgen_test::*;

 #[allow(unused_imports)]
 use core::{
     /* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128,
     i16, i32, i64, i8, intrinsics, isize, iter, marker, mem, ops, ptr, slice,
     u128, u16, u32, u64, u8, usize,
 };

 #[macro_use]
 mod testing;
 #[macro_use]
 mod api;
 mod codegen;
 mod sealed;

 /// Packed SIMD vector type.
 ///
 /// `A` is the array type describing the vector (for example `[i32; 4]`); it
 /// must implement the sealed `SimdArray` trait. The wrapper is
 /// `#[repr(transparent)]` over the `Tuple` type associated with `A`.
 ///
 /// # Examples
 ///
 /// ```
 /// # use packed_simd::Simd;
 /// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);
 /// assert_eq!(v.extract(2), 2);
 /// ```
 #[repr(transparent)]
 #[derive(Copy, Clone)]
 pub struct Simd<A: sealed::SimdArray>(
     // FIXME: this type should be private,
     // but it currently must be public for the
     // `shuffle!` macro to work: it needs to
     // access the internal `repr(simd)` type
     // to call the shuffle intrinsics.
     #[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,
 );

 /// Wrapper over `T` implementing a lexicographical order via the `PartialOrd`
 /// and/or `Ord` traits.
 #[repr(transparent)]
 #[derive(Copy, Clone, Debug)]
 // NOTE(review): the crate denies `clippy::missing_inline_in_public_items`;
 // this exemption is presumably needed for the derived impls above - confirm.
 #[cfg_attr(
     feature = "cargo-clippy",
     allow(clippy::missing_inline_in_public_items)
 )]
 pub struct LexicographicallyOrdered<T>(T);

 mod masks;
 pub use self::masks::*;

 mod v16;
 pub use self::v16::*;

 mod v32;
 pub use self::v32::*;

 mod v64;
 pub use self::v64::*;

 mod v128;
 pub use self::v128::*;

 mod v256;
 pub use self::v256::*;

 mod v512;
 pub use self::v512::*;

 mod vSize;
 pub use self::vSize::*;

 mod vPtr;
 pub use self::vPtr::*;

 pub use self::api::cast::*;

 #[cfg(feature = "into_bits")]
 pub use self::api::into_bits::*;

 // Re-export the shuffle intrinsics required by the `shuffle!` macro.
 #[doc(hidden)]
 pub use self::codegen::llvm::{
     __shuffle_vector16, __shuffle_vector2, __shuffle_vector32,
     __shuffle_vector4, __shuffle_vector64, __shuffle_vector8,
 };

 // Crate-visible shorthand module: re-exports everything from
 // `codegen::llvm` so it is reachable as `crate::llvm::*`.
 crate mod llvm {
     crate use crate::codegen::llvm::*;
 }
	//! # Portable packed SIMD vectors
	//!
	//! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:
	//! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .
	//!
	//! The examples available in the
	//! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)
	//! sub-directory of the crate showcase how to use the library in practice.
	//!
	//! ## Table of contents
	//!
	//! - [Introduction](#introduction)
	//! - [Vector types](#vector-types)
	//! - [Conditional operations](#conditional-operations)
	//! - [Conversions](#conversions)
	//! - [Performance
	//! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)
	//!
	//! ## Introduction
	//!
	//! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`
	//! elements of type `T` as well as many type aliases for this type: for
	//! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.
	//!
	//! The operations on packed vectors are, by default, "vertical", that is, they
	//! are applied to each vector lane in isolation of the others:
	//!
	//! ```
	//! # use packed_simd::*;
	//! let a = i32x4::new(1, 2, 3, 4);
	//! let b = i32x4::new(5, 6, 7, 8);
	//! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));
	//! ```
	//!
	//! Many "horizontal" operations are also provided:
	//!
	//! ```
	//! # use packed_simd::*;
	//! # let a = i32x4::new(1, 2, 3, 4);
	//! assert_eq!(a.wrapping_sum(), 10);
	//! ```
	//!
	//! In virtually all architectures vertical operations are fast, while
	//! horizontal operations are, by comparison, much slower. Therefore, the
	//! most portably-efficient way of performing a reduction over a slice
	//! is to collect the results into a vector using vertical operations,
	//! and to perform a single horizontal operation at the end:
	//!
	//! ```
	//! # use packed_simd::*;
	//! fn reduce(x: &[i32]) -> i32 {
	//!     assert!(x.len() % 4 == 0);
	//!     let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
	//!     for i in (0..x.len()).step_by(4) {
	//!         sum += i32x4::from_slice_unaligned(&x[i..]);
	//!     }
	//!     sum.wrapping_sum()
	//! }
	//!
	//! let x = [0, 1, 2, 3, 4, 5, 6, 7];
	//! assert_eq!(reduce(&x), 28);
	//! ```
	//!
	//! ## Vector types
	//!
	//! The vector type aliases are named according to the following scheme:
	//!
	//! > `{element_type}x{number_of_lanes} == Simd<[element_type;
	//! number_of_lanes]>`
	//!
	//! where the following element types are supported:
	//!
	//! * `i{element_width}`: signed integer
	//! * `u{element_width}`: unsigned integer
	//! * `f{element_width}`: float
	//! * `m{element_width}`: mask (see below)
	//! * `*{const,mut} T`: `const` and `mut` pointers
	//!
	//! ## Basic operations
	//!
	//! ```
	//! # use packed_simd::*;
	//! // Sets all elements to `0`:
	//! let a = i32x4::splat(0);
	//!
	//! // Reads a vector from a slice:
	//! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];
	//! let b = i32x4::from_slice_unaligned(&arr);
	//!
	//! // Reads the 4-th element of a vector:
	//! assert_eq!(b.extract(3), 1);
	//!
	//! // Returns a new vector where the 4-th element is replaced with `1`:
	//! let a = a.replace(3, 1);
	//! assert_eq!(a, b);
	//!
	//! // Writes a vector to a slice:
	//! let a = a.replace(2, 1);
	//! a.write_to_slice_unaligned(&mut arr[4..]);
	//! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);
	//! ```
	//!
	//! ## Conditional operations
	//!
	//! One often needs to perform an operation on some lanes of the vector. Vector
	//! masks, like `m32x4`, allow selecting on which vector lanes an operation is
	//! to be performed:
	//!
	//! ```
	//! # use packed_simd::*;
	//! let a = i32x4::new(1, 1, 2, 2);
	//!
	//! // Add `1` to the first two lanes of the vector.
	//! let m = m16x4::new(true, true, false, false);
	//! let a = m.select(a + 1, a);
	//! assert_eq!(a, i32x4::splat(2));
	//! ```
	//!
	//! The elements of a vector mask are either `true` or `false`. Here `true`
	//! means that a lane is "selected", while `false` means that a lane is not
	//! selected.
	//!
	//! All vector masks implement a `mask.select(a: T, b: T) -> T` method that
	//! works on all vectors that have the same number of lanes as the mask. The
	//! resulting vector contains the elements of `a` for those lanes for which the
	//! mask is `true`, and the elements of `b` otherwise.
	//!
	//! The example constructs a mask with the first two lanes set to `true` and
	//! the last two lanes set to `false`. This selects the first two lanes of `a +
	//! 1` and the last two lanes of `a`, producing a vector where the first two
	//! lanes have been incremented by `1`.
	//!
	//! > note: mask `select` can be used on vector types that have the same number
	//! > of lanes as the mask. The example shows this by using [`m16x4`] instead
	//! > of [`m32x4`]. It is _typically_ more performant to use a mask element
	//! > width equal to the element width of the vectors being operated upon.
	//! > This is, however, not true for 512-bit wide vectors when targeting
	//! > AVX-512, where the most efficient masks use only 1-bit per element.
	//!
	//! All vertical comparison operations return masks:
	//!
	//! ```
	//! # use packed_simd::*;
	//! let a = i32x4::new(1, 1, 3, 3);
	//! let b = i32x4::new(2, 2, 0, 0);
	//!
	//! // ge: >= (Greater or Equal; see also lt, le, gt, eq, ne).
	//! let m = a.ge(i32x4::splat(2));
	//!
	//! if m.any() {
	//!     // all / any / none allow coherent control flow
	//!     let d = m.select(a, b);
	//!     assert_eq!(d, i32x4::new(2, 2, 3, 3));
	//! }
	//! ```
	//!
	//! ## Conversions
	//!
	//! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for
	//!   vectors with the same number of lanes when the conversion is value
	//!   preserving (same as in `std`).
	//!
	//! * **safe bitwise conversions**: The cargo feature `into_bits` provides the
	//!   `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise
	//!   `transmute`s when all bit patterns of the source type are valid bit
	//!   patterns of the target type and are also implemented for the
	//!   architecture-specific vector types of `std::arch`. For example, `let x:
	//!   u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit
	//!   patterns are valid `u8x8` bit patterns. However, the opposite is not
	//!   true: not all `u8x8` bit patterns are valid `m8x8` bit patterns, so this
	//!   operation cannot be performed safely using `x.into_bits()`; one needs to
	//!   use `unsafe { core::mem::transmute(x) }` for that, making sure that the
	//!   value in the `u8x8` is a valid bit pattern of `m8x8`.
	//!
	//! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]
	//!   (`x.cast()`), just like `as`:
	//!
	//!   * casting integer vectors whose lane types have the same size (e.g.
	//!     `i32xN` -> `u32xN`) is a **no-op**,
	//!
	//!   * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
	//!     `u8xN`) will **truncate**,
	//!
	//!   * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
	//!     `u32xN`) will:
	//!        * **zero-extend** if the source is unsigned, or
	//!        * **sign-extend** if the source is signed,
	//!
	//!   * casting from a float to an integer will **round the float towards
	//!     zero**,
	//!
	//!   * casting from an integer to float will produce the floating point
	//!     representation of the integer, **rounding to nearest, ties to even**,
	//!
	//!   * casting from an `f32` to an `f64` is perfect and lossless,
	//!
	//!   * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
	//!
	//!   Numeric casts are not very "precise": sometimes lossy, sometimes value
	//!   preserving, etc.

	#![feature(
	repr_simd,
	const_fn,
	platform_intrinsics,
	stdsimd,
	aarch64_target_feature,
	arm_target_feature,
	link_llvm_intrinsics,
	core_intrinsics,
	stmt_expr_attributes,
	align_offset,
	mmx_target_feature,
	crate_visibility_modifier,
	custom_inner_attributes
	)]
	#![allow(non_camel_case_types, non_snake_case)]
	#![cfg_attr(test, feature(hashmap_internals))]
	#![cfg_attr(
	feature = "cargo-clippy",
	allow(
	clippy::cast_possible_truncation,
	clippy::cast_lossless,
	clippy::cast_possible_wrap,
	clippy::cast_precision_loss,
	// This lint is currently broken for generic code
	// See https://github.com/rust-lang/rust-clippy/issues/3410
	clippy::use_self
	)
	)]
	#![cfg_attr(
	feature = "cargo-clippy",
	deny(clippy::missing_inline_in_public_items)
	)]
	#![deny(warnings, rust_2018_idioms)]
	#![no_std]

	use cfg_if::cfg_if;

	// Bring the name `arch` into scope: it refers to `core_arch` when the
	// `core_arch` cargo feature is enabled, and to `core::arch` otherwise.
	cfg_if! {
	if #[cfg(feature = "core_arch")] {
	#[allow(unused_imports)]
	use core_arch as arch;
	} else {
	#[allow(unused_imports)]
	use core::arch;
	}
	}

	#[cfg(all(target_arch = "wasm32", test))]
	use wasm_bindgen_test::*;

	#[allow(unused_imports)]
	use core::{
	/* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128,
	i16, i32, i64, i8, intrinsics, isize, iter, marker, mem, ops, ptr, slice,
	u128, u16, u32, u64, u8, usize,
	};

	#[macro_use]
	mod testing;
	#[macro_use]
	mod api;
	mod codegen;
	mod sealed;

	/// Packed SIMD vector type.
	///
	/// `A` is the array type describing the vector (for example `[i32; 4]`); it
	/// must implement the sealed `SimdArray` trait. The wrapper is
	/// `#[repr(transparent)]` over the `Tuple` type associated with `A`.
	///
	/// # Examples
	///
	/// ```
	/// # use packed_simd::Simd;
	/// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);
	/// assert_eq!(v.extract(2), 2);
	/// ```
	#[repr(transparent)]
	#[derive(Copy, Clone)]
	pub struct Simd<A: sealed::SimdArray>(
	// FIXME: this type should be private,
	// but it currently must be public for the
	// `shuffle!` macro to work: it needs to
	// access the internal `repr(simd)` type
	// to call the shuffle intrinsics.
	#[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,
	);

	/// Wrapper over `T` implementing a lexicographical order via the `PartialOrd`
	/// and/or `Ord` traits.
	#[repr(transparent)]
	#[derive(Copy, Clone, Debug)]
	// NOTE(review): the crate denies `clippy::missing_inline_in_public_items`;
	// this exemption is presumably needed for the derived impls above - confirm.
	#[cfg_attr(
	feature = "cargo-clippy",
	allow(clippy::missing_inline_in_public_items)
	)]
	pub struct LexicographicallyOrdered<T>(T);

	mod masks;
	pub use self::masks::*;

	mod v16;
	pub use self::v16::*;

	mod v32;
	pub use self::v32::*;

	mod v64;
	pub use self::v64::*;

	mod v128;
	pub use self::v128::*;

	mod v256;
	pub use self::v256::*;

	mod v512;
	pub use self::v512::*;

	mod vSize;
	pub use self::vSize::*;

	mod vPtr;
	pub use self::vPtr::*;

	pub use self::api::cast::*;

	#[cfg(feature = "into_bits")]
	pub use self::api::into_bits::*;

	// Re-export the shuffle intrinsics required by the `shuffle!` macro.
	#[doc(hidden)]
	pub use self::codegen::llvm::{
	__shuffle_vector16, __shuffle_vector2, __shuffle_vector32,
	__shuffle_vector4, __shuffle_vector64, __shuffle_vector8,
	};

	// Crate-visible shorthand module: re-exports everything from
	// `codegen::llvm` so it is reachable as `crate::llvm::*`.
	crate mod llvm {
	crate use crate::codegen::llvm::*;
	}