#![allow(non_camel_case_types)]

use crate::soft::{x2, x4};
use crate::types::*;
use core::ops::*;
use zerocopy::{AsBytes, FromBytes, FromZeroes};

/// 128 bits of plain storage, viewable either as four `u32` words (`d`) or as
/// two `u64` words (`q`).
///
/// Both views are 16-byte integer arrays, so every bit pattern is valid for
/// either field.
#[repr(C)]
#[derive(Clone, Copy, FromBytes, AsBytes, FromZeroes)]
pub union vec128_storage {
    // View as four 32-bit words.
    d: [u32; 4],
    // View as two 64-bit words.
    q: [u64; 2],
}
impl From<[u32; 4]> for vec128_storage {
    #[inline(always)]
    fn from(d: [u32; 4]) -> Self {
        Self { d }
    }
}
impl From<vec128_storage> for [u32; 4] {
    /// Reads the storage back out through its four-word (`d`) view.
    #[inline(always)]
    fn from(d: vec128_storage) -> Self {
        // SAFETY: both union views are 16-byte arrays of plain integers, so
        // any stored bit pattern is a valid `[u32; 4]`.
        let words = unsafe { d.d };
        words
    }
}
impl From<[u64; 2]> for vec128_storage {
    #[inline(always)]
    fn from(q: [u64; 2]) -> Self {
        Self { q }
    }
}
impl From<vec128_storage> for [u64; 2] {
    /// Reads the storage back out through its two-word (`q`) view.
    #[inline(always)]
    fn from(q: vec128_storage) -> Self {
        // SAFETY: both union views are 16-byte arrays of plain integers, so
        // any stored bit pattern is a valid `[u64; 2]`.
        let words = unsafe { q.q };
        words
    }
}
impl Default for vec128_storage {
    #[inline(always)]
    fn default() -> Self {
        Self { q: [0, 0] }
    }
}
impl Eq for vec128_storage {}
impl PartialEq<vec128_storage> for vec128_storage {
    /// Two storages are equal when their two 64-bit words match, i.e. when all
    /// 128 bits are identical.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // SAFETY: the `q` view is a plain integer array covering the whole
        // union, so reading it is valid for any stored bit pattern.
        let (left, right) = unsafe { (self.q, rhs.q) };
        left == right
    }
}
/// 256 bits of plain storage, held as two 128-bit halves.
#[derive(Clone, Copy, PartialEq, Eq, Default)]
pub struct vec256_storage {
    // The two 128-bit halves, in index order.
    v128: [vec128_storage; 2],
}
impl vec256_storage {
    #[inline(always)]
    pub fn new128(v128: [vec128_storage; 2]) -> Self {
        Self { v128 }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 2] {
        self.v128
    }
}
impl From<vec256_storage> for [u64; 4] {
    /// Flattens the two 128-bit halves into four 64-bit words: the words of
    /// half 0 come first, followed by the words of half 1.
    #[inline(always)]
    fn from(q: vec256_storage) -> Self {
        let lo: [u64; 2] = q.v128[0].into();
        let hi: [u64; 2] = q.v128[1].into();
        [lo[0], lo[1], hi[0], hi[1]]
    }
}
impl From<[u64; 4]> for vec256_storage {
    #[inline(always)]
    fn from([a, b, c, d]: [u64; 4]) -> Self {
        Self {
            v128: [[a, b].into(), [c, d].into()],
        }
    }
}
/// 512 bits of plain storage, held as four 128-bit quarters.
#[derive(Clone, Copy, PartialEq, Eq, Default)]
pub struct vec512_storage {
    // The four 128-bit quarters, in index order.
    v128: [vec128_storage; 4],
}
impl vec512_storage {
    #[inline(always)]
    pub fn new128(v128: [vec128_storage; 4]) -> Self {
        Self { v128 }
    }
    #[inline(always)]
    pub fn split128(self) -> [vec128_storage; 4] {
        self.v128
    }
}

/// Applies `f` to each of the four 32-bit words of `t`, in index order, and
/// rebuilds a `T` from the results.
#[inline(always)]
fn dmap<T, F>(t: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u32) -> u32,
{
    let storage: vec128_storage = t.into();
    // SAFETY: the `d` view is a plain integer array covering the whole union.
    let [w0, w1, w2, w3] = unsafe { storage.d };
    let mapped = vec128_storage {
        d: [f(w0), f(w1), f(w2), f(w3)],
    };
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(mapped) }
}

/// Combines `a` and `b` word-by-word: applies `f` to each pair of matching
/// 32-bit words, in index order, and rebuilds a `T` from the four results.
///
/// Marked `#[inline(always)]` like the other lane-mapping helpers in this
/// file, so the closure is folded into the caller.
#[inline(always)]
fn dmap2<T, F>(a: T, b: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u32, u32) -> u32,
{
    let a: vec128_storage = a.into();
    let b: vec128_storage = b.into();
    // SAFETY: the `d` view is a plain integer array covering the whole union.
    let ao = unsafe { a.d };
    let bo = unsafe { b.d };
    let d = vec128_storage {
        d: [
            f(ao[0], bo[0]),
            f(ao[1], bo[1]),
            f(ao[2], bo[2]),
            f(ao[3], bo[3]),
        ],
    };
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(d) }
}

/// Applies `f` to each of the two 64-bit words of `t`, in index order, and
/// rebuilds a `T` from the results.
#[inline(always)]
fn qmap<T, F>(t: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u64) -> u64,
{
    let storage: vec128_storage = t.into();
    // SAFETY: the `q` view is a plain integer array covering the whole union.
    let [lo, hi] = unsafe { storage.q };
    let mapped = vec128_storage {
        q: [f(lo), f(hi)],
    };
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(mapped) }
}

/// Combines `a` and `b` word-by-word: applies `f` to each pair of matching
/// 64-bit words, in index order, and rebuilds a `T` from the two results.
#[inline(always)]
fn qmap2<T, F>(a: T, b: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u64, u64) -> u64,
{
    let lhs: vec128_storage = a.into();
    let rhs: vec128_storage = b.into();
    // SAFETY: the `q` view is a plain integer array covering the whole union.
    let [l0, l1] = unsafe { lhs.q };
    let [r0, r1] = unsafe { rhs.q };
    let mapped = vec128_storage {
        q: [f(l0, r0), f(l1, r1)],
    };
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(mapped) }
}

/// Joins two 64-bit words into one 128-bit value; `q[0]` becomes the low half
/// and `q[1]` the high half.
#[inline(always)]
fn o_of_q(q: [u64; 2]) -> u128 {
    let [lo, hi] = q;
    (u128::from(hi) << 64) | u128::from(lo)
}

/// Splits a 128-bit value into two 64-bit words: the low half first, then the
/// high half.
#[inline(always)]
fn q_of_o(o: u128) -> [u64; 2] {
    // The `as u64` casts deliberately keep only the low 64 bits.
    let lo = o as u64;
    let hi = (o >> 64) as u64;
    [lo, hi]
}

/// Applies `f` to the whole 128-bit value of `a` (low 64-bit word first) and
/// rebuilds a `T` from the result.
#[inline(always)]
fn omap<T, F>(a: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u128) -> u128,
{
    let storage: vec128_storage = a.into();
    // SAFETY: the `q` view is a plain integer array covering the whole union.
    let wide = o_of_q(unsafe { storage.q });
    let halves = q_of_o(f(wide));
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(vec128_storage { q: halves }) }
}

/// Applies `f` to the whole 128-bit values of `a` and `b` and rebuilds a `T`
/// from the result.
#[inline(always)]
fn omap2<T, F>(a: T, b: T, f: F) -> T
where
    T: Store<vec128_storage> + Into<vec128_storage>,
    F: Fn(u128, u128) -> u128,
{
    let lhs: vec128_storage = a.into();
    let rhs: vec128_storage = b.into();
    // SAFETY: the `q` view is a plain integer array covering the whole union.
    let lhs_wide = o_of_q(unsafe { lhs.q });
    let rhs_wide = o_of_q(unsafe { rhs.q });
    let halves = q_of_o(f(lhs_wide, rhs_wide));
    // NOTE(review): `Store::unpack`'s safety contract is declared outside this
    // file; presumably a fully initialised storage satisfies it — confirm.
    unsafe { T::unpack(vec128_storage { q: halves }) }
}

// Empty trait impls: these traits need no methods implemented here.
// NOTE(review): the traits are declared outside this file (presumably in
// `crate::types`); their supertrait requirements are assumed to be met by the
// operator and rotate impls elsewhere in this file — confirm.
impl RotateEachWord128 for u128x1_generic {}
impl BitOps128 for u128x1_generic {}
impl BitOps64 for u128x1_generic {}
impl BitOps64 for u64x2_generic {}
impl BitOps32 for u128x1_generic {}
impl BitOps32 for u64x2_generic {}
impl BitOps32 for u32x4_generic {}
impl BitOps0 for u128x1_generic {}
impl BitOps0 for u64x2_generic {}
impl BitOps0 for u32x4_generic {}

// Implements the bitwise operators (`Not`, `BitAnd`, `BitOr`, `BitXor`,
// `AndNot` and the `*Assign` forms) plus `Swap64` for a 128-bit vector type.
// The bitwise operators act on the whole 128-bit value via `omap`/`omap2`;
// the swaps act per 64-bit, 32-bit or 128-bit unit as noted below.
macro_rules! impl_bitops {
    ($vec:ident) => {
        impl Not for $vec {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                omap(self, |x| !x)
            }
        }
        impl BitAnd for $vec {
            type Output = Self;
            #[inline(always)]
            fn bitand(self, rhs: Self) -> Self::Output {
                omap2(self, rhs, |x, y| x & y)
            }
        }
        impl BitOr for $vec {
            type Output = Self;
            #[inline(always)]
            fn bitor(self, rhs: Self) -> Self::Output {
                omap2(self, rhs, |x, y| x | y)
            }
        }
        impl BitXor for $vec {
            type Output = Self;
            #[inline(always)]
            fn bitxor(self, rhs: Self) -> Self::Output {
                omap2(self, rhs, |x, y| x ^ y)
            }
        }
        impl AndNot for $vec {
            type Output = Self;
            // Computes `!self & rhs`: the complement is applied to the
            // receiver, not to `rhs`.
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self::Output {
                omap2(self, rhs, |x, y| !x & y)
            }
        }
        impl BitAndAssign for $vec {
            #[inline(always)]
            fn bitand_assign(&mut self, rhs: Self) {
                *self = *self & rhs
            }
        }
        impl BitOrAssign for $vec {
            #[inline(always)]
            fn bitor_assign(&mut self, rhs: Self) {
                *self = *self | rhs
            }
        }
        impl BitXorAssign for $vec {
            #[inline(always)]
            fn bitxor_assign(&mut self, rhs: Self) {
                *self = *self ^ rhs
            }
        }

        impl Swap64 for $vec {
            // Exchanges each pair of adjacent bits within every 64-bit word.
            #[inline(always)]
            fn swap1(self) -> Self {
                qmap(self, |x| {
                    ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
                })
            }
            // Exchanges each pair of adjacent 2-bit groups.
            #[inline(always)]
            fn swap2(self) -> Self {
                qmap(self, |x| {
                    ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
                })
            }
            // Exchanges the two nibbles of every byte.
            #[inline(always)]
            fn swap4(self) -> Self {
                qmap(self, |x| {
                    ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
                })
            }
            // Exchanges each pair of adjacent bytes.
            #[inline(always)]
            fn swap8(self) -> Self {
                qmap(self, |x| {
                    ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
                })
            }
            // Exchanges the 16-bit halves of every 32-bit word.
            #[inline(always)]
            fn swap16(self) -> Self {
                dmap(self, |x| x.rotate_left(16))
            }
            // Exchanges the 32-bit halves of every 64-bit word.
            #[inline(always)]
            fn swap32(self) -> Self {
                qmap(self, |x| x.rotate_left(32))
            }
            // Exchanges the two 64-bit halves of the 128-bit value.
            #[inline(always)]
            fn swap64(self) -> Self {
                omap(self, |x| (x << 64) | (x >> 64))
            }
        }
    };
}
// Instantiate the operators for all three 128-bit vector types.
impl_bitops!(u32x4_generic);
impl_bitops!(u64x2_generic);
impl_bitops!(u128x1_generic);

/// Fixed-amount right rotations applied independently to each 32-bit word.
impl RotateEachWord32 for u32x4_generic {
    #[inline(always)]
    fn rotate_each_word_right7(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(7))
    }
    #[inline(always)]
    fn rotate_each_word_right8(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(8))
    }
    #[inline(always)]
    fn rotate_each_word_right11(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(11))
    }
    #[inline(always)]
    fn rotate_each_word_right12(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(12))
    }
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(16))
    }
    #[inline(always)]
    fn rotate_each_word_right20(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(20))
    }
    #[inline(always)]
    fn rotate_each_word_right24(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(24))
    }
    #[inline(always)]
    fn rotate_each_word_right25(self) -> Self {
        dmap(self, |word: u32| word.rotate_right(25))
    }
}

/// Fixed-amount right rotations applied independently to each 64-bit word.
impl RotateEachWord32 for u64x2_generic {
    #[inline(always)]
    fn rotate_each_word_right7(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(7))
    }
    #[inline(always)]
    fn rotate_each_word_right8(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(8))
    }
    #[inline(always)]
    fn rotate_each_word_right11(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(11))
    }
    #[inline(always)]
    fn rotate_each_word_right12(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(12))
    }
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(16))
    }
    #[inline(always)]
    fn rotate_each_word_right20(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(20))
    }
    #[inline(always)]
    fn rotate_each_word_right24(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(24))
    }
    #[inline(always)]
    fn rotate_each_word_right25(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(25))
    }
}
/// 32-bit right rotation applied independently to each 64-bit word.
impl RotateEachWord64 for u64x2_generic {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        qmap(self, |word: u64| word.rotate_right(32))
    }
}

// workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
/// Rotates `x` right by `i` bits using plain shifts only.
///
/// The rotation amount is reduced modulo 128, so `i == 0` and `i >= 128` are
/// handled like `u128::rotate_right` instead of overflowing the shift (which
/// panics in debug builds).
#[inline(always)]
fn rotate_u128_right(x: u128, i: u32) -> u128 {
    let right = i % 128;
    // When `right` is 0 the complementary left shift would be 128; the second
    // `% 128` folds that to 0 so both shifts stay in range.
    let left = (128 - right) % 128;
    (x >> right) | (x << left)
}
/// Checks the shift-based rotation against the standard library rotation.
#[test]
fn test_rotate_u128() {
    let x: u128 = 0x0001_0203_0405_0607_0809_0a0b_0c0d_0e0f;
    let expected = x.rotate_right(17);
    let actual = rotate_u128_right(x, 17);
    assert_eq!(actual, expected);
}

impl RotateEachWord32 for u128x1_generic {
    #[inline(always)]
    fn rotate_each_word_right7(self) -> Self {
        Self([rotate_u128_right(self.0[0], 7)])
    }
    #[inline(always)]
    fn rotate_each_word_right8(self) -> Self {
        Self([rotate_u128_right(self.0[0], 8)])
    }
    #[inline(always)]
    fn rotate_each_word_right11(self) -> Self {
        Self([rotate_u128_right(self.0[0], 11)])
    }
    #[inline(always)]
    fn rotate_each_word_right12(self) -> Self {
        Self([rotate_u128_right(self.0[0], 12)])
    }
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self([rotate_u128_right(self.0[0], 16)])
    }
    #[inline(always)]
    fn rotate_each_word_right20(self) -> Self {
        Self([rotate_u128_right(self.0[0], 20)])
    }
    #[inline(always)]
    fn rotate_each_word_right24(self) -> Self {
        Self([rotate_u128_right(self.0[0], 24)])
    }
    #[inline(always)]
    fn rotate_each_word_right25(self) -> Self {
        Self([rotate_u128_right(self.0[0], 25)])
    }
}
impl RotateEachWord64 for u128x1_generic {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self([rotate_u128_right(self.0[0], 32)])
    }
}

/// Zero-sized `Machine` whose vector types are the portable `*_generic`
/// implementations defined in this file.
#[derive(Copy, Clone)]
pub struct GenericMachine;
impl Machine for GenericMachine {
    type u32x4 = u32x4_generic;
    type u64x2 = u64x2_generic;
    type u128x1 = u128x1_generic;
    type u32x4x2 = u32x4x2_generic;
    type u64x2x2 = u64x2x2_generic;
    type u64x4 = u64x4_generic;
    type u128x2 = u128x2_generic;
    type u32x4x4 = u32x4x4_generic;
    type u64x2x4 = u64x2x4_generic;
    type u128x4 = u128x4_generic;
    #[inline(always)]
    unsafe fn instance() -> Self {
        Self
    }
}

/// Portable 128-bit vector of four `u32` lanes.
#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
#[repr(transparent)]
pub struct u32x4_generic([u32; 4]);
/// Portable 128-bit vector of two `u64` lanes.
#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
#[repr(transparent)]
pub struct u64x2_generic([u64; 2]);
/// Portable 128-bit vector of a single `u128` lane.
#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
#[repr(transparent)]
pub struct u128x1_generic([u128; 1]);

impl From<u32x4_generic> for vec128_storage {
    #[inline(always)]
    fn from(d: u32x4_generic) -> Self {
        Self { d: d.0 }
    }
}
impl From<u64x2_generic> for vec128_storage {
    #[inline(always)]
    fn from(q: u64x2_generic) -> Self {
        Self { q: q.0 }
    }
}
impl From<u128x1_generic> for vec128_storage {
    #[inline(always)]
    fn from(o: u128x1_generic) -> Self {
        Self { q: q_of_o(o.0[0]) }
    }
}

impl Store<vec128_storage> for u32x4_generic {
    #[inline(always)]
    unsafe fn unpack(s: vec128_storage) -> Self {
        Self(s.d)
    }
}
impl Store<vec128_storage> for u64x2_generic {
    #[inline(always)]
    unsafe fn unpack(s: vec128_storage) -> Self {
        Self(s.q)
    }
}
impl Store<vec128_storage> for u128x1_generic {
    #[inline(always)]
    unsafe fn unpack(s: vec128_storage) -> Self {
        Self([o_of_q(s.q); 1])
    }
}

// Empty `ArithOps` impls for the three 128-bit vector types.
// NOTE(review): `ArithOps` is declared outside this file; presumably its
// requirements are the `Add`/`AddAssign` impls below — confirm.
impl ArithOps for u32x4_generic {}
impl ArithOps for u64x2_generic {}
impl ArithOps for u128x1_generic {}

/// Lane-wise wrapping addition of the four 32-bit lanes.
impl Add for u32x4_generic {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: Self) -> Self::Output {
        dmap2(self, rhs, u32::wrapping_add)
    }
}
/// Lane-wise wrapping addition of the two 64-bit lanes.
impl Add for u64x2_generic {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: Self) -> Self::Output {
        qmap2(self, rhs, u64::wrapping_add)
    }
}
/// Wrapping addition of the single 128-bit lane.
impl Add for u128x1_generic {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: Self) -> Self::Output {
        omap2(self, rhs, u128::wrapping_add)
    }
}
/// In-place form of the lane-wise wrapping addition.
impl AddAssign for u32x4_generic {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
        let sum = *self + rhs;
        *self = sum;
    }
}
/// In-place form of the lane-wise wrapping addition.
impl AddAssign for u64x2_generic {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
        let sum = *self + rhs;
        *self = sum;
    }
}
/// In-place form of the wrapping addition.
impl AddAssign for u128x1_generic {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
        let sum = *self + rhs;
        *self = sum;
    }
}
/// Reverses the byte order within each 32-bit lane.
impl BSwap for u32x4_generic {
    #[inline(always)]
    fn bswap(self) -> Self {
        dmap(self, u32::swap_bytes)
    }
}
/// Reverses the byte order within each 64-bit lane.
impl BSwap for u64x2_generic {
    #[inline(always)]
    fn bswap(self) -> Self {
        qmap(self, u64::swap_bytes)
    }
}
/// Reverses the byte order of the whole 128-bit lane.
impl BSwap for u128x1_generic {
    #[inline(always)]
    fn bswap(self) -> Self {
        omap(self, u128::swap_bytes)
    }
}
/// Byte-level load/store with an explicit per-lane byte order.
///
/// Every method panics if the underlying `read_from`/`write_to` call fails.
/// NOTE(review): presumably that happens when the slice length does not match
/// the vector size — confirm against the zerocopy version in use.
impl StoreBytes for u32x4_generic {
    #[inline(always)]
    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
        let raw = Self::read_from(input).unwrap();
        // The native<->little-endian conversion is the same operation in
        // either direction, so `to_le` also decodes.
        dmap(raw, u32::to_le)
    }
    #[inline(always)]
    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
        let raw = Self::read_from(input).unwrap();
        dmap(raw, u32::to_be)
    }
    #[inline(always)]
    fn write_le(self, out: &mut [u8]) {
        let encoded = dmap(self, u32::to_le);
        encoded.write_to(out).unwrap();
    }
    #[inline(always)]
    fn write_be(self, out: &mut [u8]) {
        let encoded = dmap(self, u32::to_be);
        encoded.write_to(out).unwrap();
    }
}
/// Byte-level load/store with an explicit per-lane byte order.
///
/// Every method panics if the underlying `read_from`/`write_to` call fails.
/// NOTE(review): presumably that happens when the slice length does not match
/// the vector size — confirm against the zerocopy version in use.
impl StoreBytes for u64x2_generic {
    #[inline(always)]
    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
        let raw = Self::read_from(input).unwrap();
        // The native<->little-endian conversion is the same operation in
        // either direction, so `to_le` also decodes.
        qmap(raw, u64::to_le)
    }
    #[inline(always)]
    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
        let raw = Self::read_from(input).unwrap();
        qmap(raw, u64::to_be)
    }
    #[inline(always)]
    fn write_le(self, out: &mut [u8]) {
        let encoded = qmap(self, u64::to_le);
        encoded.write_to(out).unwrap();
    }
    #[inline(always)]
    fn write_be(self, out: &mut [u8]) {
        let encoded = qmap(self, u64::to_be);
        encoded.write_to(out).unwrap();
    }
}

/// Tag type used as the second parameter of `x2`.
#[derive(Copy, Clone)]
pub struct G0;
/// Second tag type for `x2`; it makes `u64x4_generic` a distinct type from
/// `u64x2x2_generic` even though both wrap two `u64x2_generic` values.
#[derive(Copy, Clone)]
pub struct G1;
// 256-bit vectors: pairs of 128-bit vectors.
pub type u32x4x2_generic = x2<u32x4_generic, G0>;
pub type u64x2x2_generic = x2<u64x2_generic, G0>;
pub type u64x4_generic = x2<u64x2_generic, G1>;
pub type u128x2_generic = x2<u128x1_generic, G0>;
// 512-bit vectors: quadruples of 128-bit vectors.
pub type u32x4x4_generic = x4<u32x4_generic>;
pub type u64x2x4_generic = x4<u64x2_generic>;
pub type u128x4_generic = x4<u128x1_generic>;

impl Vector<[u32; 16]> for u32x4x4_generic {
    /// Flattens the four 128-bit parts into sixteen `u32` scalars, part by
    /// part and lane by lane.
    fn to_scalars(self) -> [u32; 16] {
        let [a, b, c, d] = self.0;
        let parts = [a.0, b.0, c.0, d.0];
        let mut scalars = [0u32; 16];
        for (dst, src) in scalars.chunks_exact_mut(4).zip(parts.iter()) {
            dst.copy_from_slice(src);
        }
        scalars
    }
}

impl MultiLane<[u32; 4]> for u32x4_generic {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        self.0
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        Self(xs)
    }
}
impl MultiLane<[u64; 2]> for u64x2_generic {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        self.0
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        Self(xs)
    }
}
impl MultiLane<[u64; 4]> for u64x4_generic {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_generic::from_lanes([xs[0], xs[1]]),
            u64x2_generic::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}
impl MultiLane<[u128; 1]> for u128x1_generic {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        self.0
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        Self(xs)
    }
}
/// Indexed lane access; both methods panic if `i` is not in `0..4`.
impl Vec4<u32> for u32x4_generic {
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        let lanes = self.0;
        lanes[i as usize]
    }
    #[inline(always)]
    fn insert(mut self, v: u32, i: u32) -> Self {
        let slot = &mut self.0[i as usize];
        *slot = v;
        self
    }
}
/// Indexed lane access for the 256-bit vector of four `u64` lanes; both
/// methods panic if `i` is not in `0..4`.
impl Vec4<u64> for u64x4_generic {
    /// Returns lane `i`.
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        let d: [u64; 4] = self.to_lanes();
        d[i as usize]
    }
    /// Returns a copy of the vector with lane `i` replaced by `v`.
    #[inline(always)]
    fn insert(mut self, v: u64, i: u32) -> Self {
        // Lane `i` lives in half `i / 2` at position `i % 2`. The inner
        // `insert` takes its receiver by value and returns the updated half,
        // so the result has to be written back into `self`.
        let half = (i / 2) as usize;
        self.0[half] = self.0[half].insert(v, i % 2);
        self
    }
}
/// Indexed lane access; both methods panic if `i` is not in `0..2`.
impl Vec2<u64> for u64x2_generic {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        let lanes = self.0;
        lanes[i as usize]
    }
    #[inline(always)]
    fn insert(mut self, v: u64, i: u32) -> Self {
        let slot = &mut self.0[i as usize];
        *slot = v;
        self
    }
}

impl Words4 for u32x4_generic {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        self.swap64()
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        let x = self.0;
        Self([x[3], x[0], x[1], x[2]])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        let x = self.0;
        Self([x[1], x[2], x[3], x[0]])
    }
}
impl LaneWords4 for u32x4_generic {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl Words4 for u64x4_generic {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([self.0[1], self.0[0]])
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unimplemented!()
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unimplemented!()
    }
}

// Empty impls declaring each portable vector type as the `GenericMachine`
// implementation of the correspondingly named vector trait.
// NOTE(review): these traits are declared outside this file; their supertrait
// requirements are assumed to be met by the impls above — confirm.
impl u32x4<GenericMachine> for u32x4_generic {}
impl u64x2<GenericMachine> for u64x2_generic {}
impl u128x1<GenericMachine> for u128x1_generic {}
impl u32x4x2<GenericMachine> for u32x4x2_generic {}
impl u64x2x2<GenericMachine> for u64x2x2_generic {}
impl u64x4<GenericMachine> for u64x4_generic {}
impl u128x2<GenericMachine> for u128x2_generic {}
impl u32x4x4<GenericMachine> for u32x4x4_generic {}
impl u64x2x4<GenericMachine> for u64x2x4_generic {}
impl u128x4<GenericMachine> for u128x4_generic {}

/// Generates a function `$name` whose body is written generically over a
/// `Machine` type `$MTy`, with the machine value bound to `$mach`.
///
/// In this portable backend the generated function always obtains a
/// `GenericMachine` and forwards its arguments to the inner generic function.
/// The second arm accepts functions without an explicit return type and
/// re-invokes the macro with `-> ()`.
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            fn_impl($mach, $($arg),*)
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        // `$crate::` makes the recursive call resolve even when the caller
        // imported this macro by path and has no `dispatch` in scope.
        $crate::dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
/// Generates a function `$name` whose body is written generically over a
/// `Machine` type `$MTy`, with the machine value bound to `$mach`.
///
/// In this portable backend the expansion is the same as `dispatch!`: the
/// generated function always obtains a `GenericMachine`. The second arm
/// accepts functions without an explicit return type and delegates to
/// `dispatch!` with `-> ()`.
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            fn_impl($mach, $($arg),*)
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        // `$crate::` makes the call resolve even when the caller imported only
        // this macro by path and has no `dispatch` in scope.
        $crate::dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
/// Generates a function `$name` whose body is written generically over a
/// `Machine` type `$MTy`, with the machine value bound to `$mach`.
///
/// In this portable backend the expansion is the same as `dispatch!`: the
/// generated function always obtains a `GenericMachine`. The second arm
/// accepts functions without an explicit return type and delegates to
/// `dispatch!` with `-> ()`.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            fn_impl($mach, $($arg),*)
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        // `$crate::` makes the call resolve even when the caller imported only
        // this macro by path and has no `dispatch` in scope.
        $crate::dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
/// Generates a function `$name` whose body is written generically over a
/// `Machine` type `$MTy`, with the machine value bound to `$mach`.
///
/// In this portable backend the expansion is the same as `dispatch!`: the
/// generated function always obtains a `GenericMachine`. The second arm
/// accepts functions without an explicit return type and delegates to
/// `dispatch!` with `-> ()`.
#[macro_export]
macro_rules! dispatch_light512 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            fn_impl($mach, $($arg),*)
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        // `$crate::` makes the call resolve even when the caller imported only
        // this macro by path and has no `dispatch` in scope.
        $crate::dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Byte-swapping each 32-bit lane must reverse the bytes of every word.
    #[test]
    fn test_bswap32() {
        let machine = unsafe { GenericMachine::instance() };

        let input = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let expected = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let loaded: <GenericMachine as Machine>::u32x4 = machine.vec(input);
        let swapped = loaded.bswap();

        let want = machine.vec(expected);
        assert_eq!(swapped, want);
    }
}
