crates/glam/src/sse2.rs - platform/external/rust/android-crates-io - Git at Google

 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;

 #[repr(C)]
 union UnionCast {
     u32x4: [u32; 4],
     f32x4: [f32; 4],
     m128: __m128,
 }

 pub const fn m128_from_f32x4(f32x4: [f32; 4]) -> __m128 {
     unsafe { UnionCast { f32x4 }.m128 }
 }

 const fn m128_from_u32x4(u32x4: [u32; 4]) -> __m128 {
     unsafe { UnionCast { u32x4 }.m128 }
 }

 const PS_ABS_MASK: __m128 = m128_from_u32x4([0x7fffffff; 4]);
 const PS_INV_SIGN_MASK: __m128 = m128_from_u32x4([!0x8000_0000; 4]);
 const PS_SIGN_MASK: __m128 = m128_from_u32x4([0x8000_0000; 4]);
 const PS_NO_FRACTION: __m128 = m128_from_f32x4([8388608.0; 4]);
 const PS_NEGATIVE_ZERO: __m128 = m128_from_u32x4([0x8000_0000; 4]);
 const PS_PI: __m128 = m128_from_f32x4([core::f32::consts::PI; 4]);
 const PS_HALF_PI: __m128 = m128_from_f32x4([core::f32::consts::FRAC_PI_2; 4]);
 const PS_SIN_COEFFICIENTS0: __m128 =
     m128_from_f32x4([-0.16666667, 0.008_333_331, -0.00019840874, 2.752_556_2e-6]);
 const PS_SIN_COEFFICIENTS1: __m128 = m128_from_f32x4([
     -2.388_985_9e-8,
     -0.16665852,      /*Est1*/
     0.008_313_95,     /*Est2*/
     -0.000_185_246_7, /*Est3*/
 ]);
 const PS_ONE: __m128 = m128_from_f32x4([1.0; 4]);
 const PS_TWO_PI: __m128 = m128_from_f32x4([core::f32::consts::TAU; 4]);
 const PS_RECIPROCAL_TWO_PI: __m128 = m128_from_f32x4([0.159_154_94; 4]);

 /// Calculates the vector 3 dot product and returns answer in x lane of __m128.
 #[inline(always)]
 pub(crate) unsafe fn dot3_in_x(lhs: __m128, rhs: __m128) -> __m128 {
     let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
     let y2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_01);
     let z2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_10);
     let x2y2_0_0_0 = _mm_add_ss(x2_y2_z2_w2, y2_0_0_0);
     _mm_add_ss(x2y2_0_0_0, z2_0_0_0)
 }

 /// Calculates the vector 4 dot product and returns answer in x lane of __m128.
 #[inline(always)]
 pub(crate) unsafe fn dot4_in_x(lhs: __m128, rhs: __m128) -> __m128 {
     let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
     let z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_11_10);
     let x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0);
     let y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, 0b00_00_00_01);
     _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0)
 }

 #[inline]
 pub(crate) unsafe fn dot3(lhs: __m128, rhs: __m128) -> f32 {
     _mm_cvtss_f32(dot3_in_x(lhs, rhs))
 }

 #[inline]
 pub(crate) unsafe fn dot3_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
     let dot_in_x = dot3_in_x(lhs, rhs);
     _mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
 }

 #[inline]
 pub(crate) unsafe fn dot4(lhs: __m128, rhs: __m128) -> f32 {
     _mm_cvtss_f32(dot4_in_x(lhs, rhs))
 }

 #[inline]
 pub(crate) unsafe fn dot4_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
     let dot_in_x = dot4_in_x(lhs, rhs);
     _mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
 }

 #[inline]
 pub(crate) unsafe fn m128_floor(v: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorFloor`
     // To handle NAN, INF and numbers greater than 8388608, use masking
     let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
     let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
     // Truncate
     let vint = _mm_cvttps_epi32(v);
     let result = _mm_cvtepi32_ps(vint);
     let larger = _mm_cmpgt_ps(result, v);
     // 0 -> 0, 0xffffffff -> -1.0f
     let larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
     let result = _mm_add_ps(result, larger);
     // All numbers less than 8388608 will use the round to int
     let result = _mm_and_ps(result, _mm_castsi128_ps(test));
     // All others, use the ORIGINAL value
     let test = _mm_andnot_si128(test, _mm_castps_si128(v));
     _mm_or_ps(result, _mm_castsi128_ps(test))
 }

 #[inline]
 pub(crate) unsafe fn m128_ceil(v: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorCeil`
     // To handle NAN, INF and numbers greater than 8388608, use masking
     let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
     let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
     // Truncate
     let vint = _mm_cvttps_epi32(v);
     let result = _mm_cvtepi32_ps(vint);
     let smaller = _mm_cmplt_ps(result, v);
     // 0 -> 0, 0xffffffff -> -1.0f
     let smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
     let result = _mm_sub_ps(result, smaller);
     // All numbers less than 8388608 will use the round to int
     let result = _mm_and_ps(result, _mm_castsi128_ps(test));
     // All others, use the ORIGINAL value
     let test = _mm_andnot_si128(test, _mm_castps_si128(v));
     _mm_or_ps(result, _mm_castsi128_ps(test))
 }

 #[inline]
 pub(crate) unsafe fn m128_abs(v: __m128) -> __m128 {
     _mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7f_ff_ff_ff)))
 }

 #[inline(always)]
 pub(crate) unsafe fn m128_mul_add(a: __m128, b: __m128, c: __m128) -> __m128 {
     // Only enable fused multiply-adds here if "fast-math" is enabled and the
     // platform supports it. Otherwise this may break cross-platform determinism.
     #[cfg(all(feature = "fast-math", target_feature = "fma"))]
     {
         _mm_fmadd_ps(a, b, c)
     }

     #[cfg(any(not(feature = "fast-math"), not(target_feature = "fma")))]
     {
         _mm_add_ps(_mm_mul_ps(a, b), c)
     }
 }

 #[inline(always)]
 pub(crate) unsafe fn m128_neg_mul_sub(a: __m128, b: __m128, c: __m128) -> __m128 {
     _mm_sub_ps(c, _mm_mul_ps(a, b))
 }

 #[inline]
 pub(crate) unsafe fn m128_round(v: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorRound`
     let sign = _mm_and_ps(v, PS_SIGN_MASK);
     let s_magic = _mm_or_ps(PS_NO_FRACTION, sign);
     let r1 = _mm_add_ps(v, s_magic);
     let r1 = _mm_sub_ps(r1, s_magic);
     let r2 = _mm_and_ps(v, PS_INV_SIGN_MASK);
     let mask = _mm_cmple_ps(r2, PS_NO_FRACTION);
     let r2 = _mm_andnot_ps(mask, v);
     let r1 = _mm_and_ps(r1, mask);
     _mm_xor_ps(r1, r2)
 }

 #[inline]
 pub(crate) unsafe fn m128_trunc(v: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorTruncate`
     // To handle NAN, INF and numbers greater than 8388608, use masking
     // Get the abs value
     let mut vtest = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_ABS_MASK));
     // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
     vtest = _mm_cmplt_epi32(vtest, _mm_castps_si128(PS_NO_FRACTION));
     // Convert to int and back to float for rounding with truncation
     let vint = _mm_cvttps_epi32(v);
     // Convert back to floats
     let mut vresult = _mm_cvtepi32_ps(vint);
     // All numbers less than 8388608 will use the round to int
     vresult = _mm_and_ps(vresult, _mm_castsi128_ps(vtest));
     // All others, use the ORIGINAL value
     vtest = _mm_andnot_si128(vtest, _mm_castps_si128(v));
     _mm_or_ps(vresult, _mm_castsi128_ps(vtest))
 }

 /// Returns a vector whose components are the corresponding components of Angles modulo 2PI.
 #[inline]
 pub(crate) unsafe fn m128_mod_angles(angles: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorModAngles`
     let v = _mm_mul_ps(angles, PS_RECIPROCAL_TWO_PI);
     let v = m128_round(v);
     m128_neg_mul_sub(PS_TWO_PI, v, angles)
 }

 /// Computes the sine of the angle in each lane of `v`. Values outside
 /// the bounds of PI may produce an increasing error as the input angle
 /// drifts from `[-PI, PI]`.
 #[inline]
 pub(crate) unsafe fn m128_sin(v: __m128) -> __m128 {
     // Based on https://github.com/microsoft/DirectXMath `XMVectorSin`

     // 11-degree minimax approximation

     // Force the value within the bounds of pi
     let mut x = m128_mod_angles(v);

     // Map in [-pi/2,pi/2] with sin(y) = sin(x).
     let sign = _mm_and_ps(x, PS_NEGATIVE_ZERO);
     // pi when x >= 0, -pi when x < 0
     let c = _mm_or_ps(PS_PI, sign);
     // |x|
     let absx = _mm_andnot_ps(sign, x);
     let rflx = _mm_sub_ps(c, x);
     let comp = _mm_cmple_ps(absx, PS_HALF_PI);
     let select0 = _mm_and_ps(comp, x);
     let select1 = _mm_andnot_ps(comp, rflx);
     x = _mm_or_ps(select0, select1);

     let x2 = _mm_mul_ps(x, x);

     // Compute polynomial approximation
     const SC1: __m128 = PS_SIN_COEFFICIENTS1;
     let v_constants_b = _mm_shuffle_ps(SC1, SC1, 0b00_00_00_00);

     const SC0: __m128 = PS_SIN_COEFFICIENTS0;
     let mut v_constants = _mm_shuffle_ps(SC0, SC0, 0b11_11_11_11);
     let mut result = m128_mul_add(v_constants_b, x2, v_constants);

     v_constants = _mm_shuffle_ps(SC0, SC0, 0b10_10_10_10);
     result = m128_mul_add(result, x2, v_constants);

     v_constants = _mm_shuffle_ps(SC0, SC0, 0b01_01_01_01);
     result = m128_mul_add(result, x2, v_constants);

     v_constants = _mm_shuffle_ps(SC0, SC0, 0b00_00_00_00);
     result = m128_mul_add(result, x2, v_constants);

     result = m128_mul_add(result, x2, PS_ONE);
     result = _mm_mul_ps(result, x);

     result
 }

 #[test]
 fn test_sse2_m128_sin() {
     use crate::Vec4;
     use core::f32::consts::PI;

     fn test_sse2_m128_sin_angle(a: f32) {
         let v = unsafe { m128_sin(_mm_set_ps1(a)) };
         let v = Vec4(v);
         let a_sin = a.sin();
         // dbg!((a, a_sin, v));
         assert!(v.abs_diff_eq(Vec4::splat(a_sin), 1e-6));
     }

     let mut a = -PI;
     let end = PI;
     let step = PI / 8192.0;

     while a <= end {
         test_sse2_m128_sin_angle(a);
         a += step;
     }
 }
	#[cfg(target_arch = "x86")]
	use core::arch::x86::*;
	#[cfg(target_arch = "x86_64")]
	use core::arch::x86_64::*;

	#[repr(C)]
	union UnionCast {
	u32x4: [u32; 4],
	f32x4: [f32; 4],
	m128: __m128,
	}

	pub const fn m128_from_f32x4(f32x4: [f32; 4]) -> __m128 {
	unsafe { UnionCast { f32x4 }.m128 }
	}

	const fn m128_from_u32x4(u32x4: [u32; 4]) -> __m128 {
	unsafe { UnionCast { u32x4 }.m128 }
	}

	const PS_ABS_MASK: __m128 = m128_from_u32x4([0x7fffffff; 4]);
	const PS_INV_SIGN_MASK: __m128 = m128_from_u32x4([!0x8000_0000; 4]);
	const PS_SIGN_MASK: __m128 = m128_from_u32x4([0x8000_0000; 4]);
	const PS_NO_FRACTION: __m128 = m128_from_f32x4([8388608.0; 4]);
	const PS_NEGATIVE_ZERO: __m128 = m128_from_u32x4([0x8000_0000; 4]);
	const PS_PI: __m128 = m128_from_f32x4([core::f32::consts::PI; 4]);
	const PS_HALF_PI: __m128 = m128_from_f32x4([core::f32::consts::FRAC_PI_2; 4]);
	const PS_SIN_COEFFICIENTS0: __m128 =
	m128_from_f32x4([-0.16666667, 0.008_333_331, -0.00019840874, 2.752_556_2e-6]);
	const PS_SIN_COEFFICIENTS1: __m128 = m128_from_f32x4([
	-2.388_985_9e-8,
	-0.16665852, /Est1/
	0.008_313_95, /Est2/
	-0.000_185_246_7, /Est3/
	]);
	const PS_ONE: __m128 = m128_from_f32x4([1.0; 4]);
	const PS_TWO_PI: __m128 = m128_from_f32x4([core::f32::consts::TAU; 4]);
	const PS_RECIPROCAL_TWO_PI: __m128 = m128_from_f32x4([0.159_154_94; 4]);

	/// Calculates the vector 3 dot product and returns answer in x lane of __m128.
	#[inline(always)]
	pub(crate) unsafe fn dot3_in_x(lhs: __m128, rhs: __m128) -> __m128 {
	let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
	let y2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_01);
	let z2_0_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_00_10);
	let x2y2_0_0_0 = _mm_add_ss(x2_y2_z2_w2, y2_0_0_0);
	_mm_add_ss(x2y2_0_0_0, z2_0_0_0)
	}

	/// Calculates the vector 4 dot product and returns answer in x lane of __m128.
	#[inline(always)]
	pub(crate) unsafe fn dot4_in_x(lhs: __m128, rhs: __m128) -> __m128 {
	let x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
	let z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, 0b00_00_11_10);
	let x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0);
	let y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, 0b00_00_00_01);
	_mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0)
	}

	#[inline]
	pub(crate) unsafe fn dot3(lhs: __m128, rhs: __m128) -> f32 {
	_mm_cvtss_f32(dot3_in_x(lhs, rhs))
	}

	#[inline]
	pub(crate) unsafe fn dot3_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
	let dot_in_x = dot3_in_x(lhs, rhs);
	_mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
	}

	#[inline]
	pub(crate) unsafe fn dot4(lhs: __m128, rhs: __m128) -> f32 {
	_mm_cvtss_f32(dot4_in_x(lhs, rhs))
	}

	#[inline]
	pub(crate) unsafe fn dot4_into_m128(lhs: __m128, rhs: __m128) -> __m128 {
	let dot_in_x = dot4_in_x(lhs, rhs);
	_mm_shuffle_ps(dot_in_x, dot_in_x, 0b00_00_00_00)
	}

	#[inline]
	pub(crate) unsafe fn m128_floor(v: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorFloor`
	// To handle NAN, INF and numbers greater than 8388608, use masking
	let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
	let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
	// Truncate
	let vint = _mm_cvttps_epi32(v);
	let result = _mm_cvtepi32_ps(vint);
	let larger = _mm_cmpgt_ps(result, v);
	// 0 -> 0, 0xffffffff -> -1.0f
	let larger = _mm_cvtepi32_ps(_mm_castps_si128(larger));
	let result = _mm_add_ps(result, larger);
	// All numbers less than 8388608 will use the round to int
	let result = _mm_and_ps(result, _mm_castsi128_ps(test));
	// All others, use the ORIGINAL value
	let test = _mm_andnot_si128(test, _mm_castps_si128(v));
	_mm_or_ps(result, _mm_castsi128_ps(test))
	}

	#[inline]
	pub(crate) unsafe fn m128_ceil(v: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorCeil`
	// To handle NAN, INF and numbers greater than 8388608, use masking
	let test = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_INV_SIGN_MASK));
	let test = _mm_cmplt_epi32(test, _mm_castps_si128(PS_NO_FRACTION));
	// Truncate
	let vint = _mm_cvttps_epi32(v);
	let result = _mm_cvtepi32_ps(vint);
	let smaller = _mm_cmplt_ps(result, v);
	// 0 -> 0, 0xffffffff -> -1.0f
	let smaller = _mm_cvtepi32_ps(_mm_castps_si128(smaller));
	let result = _mm_sub_ps(result, smaller);
	// All numbers less than 8388608 will use the round to int
	let result = _mm_and_ps(result, _mm_castsi128_ps(test));
	// All others, use the ORIGINAL value
	let test = _mm_andnot_si128(test, _mm_castps_si128(v));
	_mm_or_ps(result, _mm_castsi128_ps(test))
	}

	#[inline]
	pub(crate) unsafe fn m128_abs(v: __m128) -> __m128 {
	_mm_and_ps(v, _mm_castsi128_ps(_mm_set1_epi32(0x7f_ff_ff_ff)))
	}

	#[inline(always)]
	pub(crate) unsafe fn m128_mul_add(a: __m128, b: __m128, c: __m128) -> __m128 {
	// Only enable fused multiply-adds here if "fast-math" is enabled and the
	// platform supports it. Otherwise this may break cross-platform determinism.
	#[cfg(all(feature = "fast-math", target_feature = "fma"))]
	{
	_mm_fmadd_ps(a, b, c)
	}

	#[cfg(any(not(feature = "fast-math"), not(target_feature = "fma")))]
	{
	_mm_add_ps(_mm_mul_ps(a, b), c)
	}
	}

	#[inline(always)]
	pub(crate) unsafe fn m128_neg_mul_sub(a: __m128, b: __m128, c: __m128) -> __m128 {
	_mm_sub_ps(c, _mm_mul_ps(a, b))
	}

	#[inline]
	pub(crate) unsafe fn m128_round(v: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorRound`
	let sign = _mm_and_ps(v, PS_SIGN_MASK);
	let s_magic = _mm_or_ps(PS_NO_FRACTION, sign);
	let r1 = _mm_add_ps(v, s_magic);
	let r1 = _mm_sub_ps(r1, s_magic);
	let r2 = _mm_and_ps(v, PS_INV_SIGN_MASK);
	let mask = _mm_cmple_ps(r2, PS_NO_FRACTION);
	let r2 = _mm_andnot_ps(mask, v);
	let r1 = _mm_and_ps(r1, mask);
	_mm_xor_ps(r1, r2)
	}

	#[inline]
	pub(crate) unsafe fn m128_trunc(v: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorTruncate`
	// To handle NAN, INF and numbers greater than 8388608, use masking
	// Get the abs value
	let mut vtest = _mm_and_si128(_mm_castps_si128(v), _mm_castps_si128(PS_ABS_MASK));
	// Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
	vtest = _mm_cmplt_epi32(vtest, _mm_castps_si128(PS_NO_FRACTION));
	// Convert to int and back to float for rounding with truncation
	let vint = _mm_cvttps_epi32(v);
	// Convert back to floats
	let mut vresult = _mm_cvtepi32_ps(vint);
	// All numbers less than 8388608 will use the round to int
	vresult = _mm_and_ps(vresult, _mm_castsi128_ps(vtest));
	// All others, use the ORIGINAL value
	vtest = _mm_andnot_si128(vtest, _mm_castps_si128(v));
	_mm_or_ps(vresult, _mm_castsi128_ps(vtest))
	}

	/// Returns a vector whose components are the corresponding components of Angles modulo 2PI.
	#[inline]
	pub(crate) unsafe fn m128_mod_angles(angles: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorModAngles`
	let v = _mm_mul_ps(angles, PS_RECIPROCAL_TWO_PI);
	let v = m128_round(v);
	m128_neg_mul_sub(PS_TWO_PI, v, angles)
	}

	/// Computes the sine of the angle in each lane of `v`. Values outside
	/// the bounds of PI may produce an increasing error as the input angle
	/// drifts from `[-PI, PI]`.
	#[inline]
	pub(crate) unsafe fn m128_sin(v: __m128) -> __m128 {
	// Based on https://github.com/microsoft/DirectXMath `XMVectorSin`

	// 11-degree minimax approximation

	// Force the value within the bounds of pi
	let mut x = m128_mod_angles(v);

	// Map in [-pi/2,pi/2] with sin(y) = sin(x).
	let sign = _mm_and_ps(x, PS_NEGATIVE_ZERO);
	// pi when x >= 0, -pi when x < 0
	let c = _mm_or_ps(PS_PI, sign);
	// \|x\|
	let absx = _mm_andnot_ps(sign, x);
	let rflx = _mm_sub_ps(c, x);
	let comp = _mm_cmple_ps(absx, PS_HALF_PI);
	let select0 = _mm_and_ps(comp, x);
	let select1 = _mm_andnot_ps(comp, rflx);
	x = _mm_or_ps(select0, select1);

	let x2 = _mm_mul_ps(x, x);

	// Compute polynomial approximation
	const SC1: __m128 = PS_SIN_COEFFICIENTS1;
	let v_constants_b = _mm_shuffle_ps(SC1, SC1, 0b00_00_00_00);

	const SC0: __m128 = PS_SIN_COEFFICIENTS0;
	let mut v_constants = _mm_shuffle_ps(SC0, SC0, 0b11_11_11_11);
	let mut result = m128_mul_add(v_constants_b, x2, v_constants);

	v_constants = _mm_shuffle_ps(SC0, SC0, 0b10_10_10_10);
	result = m128_mul_add(result, x2, v_constants);

	v_constants = _mm_shuffle_ps(SC0, SC0, 0b01_01_01_01);
	result = m128_mul_add(result, x2, v_constants);

	v_constants = _mm_shuffle_ps(SC0, SC0, 0b00_00_00_00);
	result = m128_mul_add(result, x2, v_constants);

	result = m128_mul_add(result, x2, PS_ONE);
	result = _mm_mul_ps(result, x);

	result
	}

	#[test]
	fn test_sse2_m128_sin() {
	use crate::Vec4;
	use core::f32::consts::PI;

	fn test_sse2_m128_sin_angle(a: f32) {
	let v = unsafe { m128_sin(_mm_set_ps1(a)) };
	let v = Vec4(v);
	let a_sin = a.sin();
	// dbg!((a, a_sin, v));
	assert!(v.abs_diff_eq(Vec4::splat(a_sin), 1e-6));
	}

	let mut a = -PI;
	let end = PI;
	let step = PI / 8192.0;

	while a <= end {
	test_sse2_m128_sin_angle(a);
	a += step;
	}
	}