| // ARM Neon intrinsic specification. |
| // |
// This file contains the specification for a number of
// intrinsics, which allows us to generate them along with
// their test cases.
| // |
// On the syntax of the file: it is not very intelligently parsed!
| // |
| // # Comments |
// Comments start with two slashes, or with four or more
// (exactly three starts a section), so // is a comment and /////// is one too.
| // |
| // # Sections |
| // Sections start with EXACTLY three slashes followed |
| // by AT LEAST one space. Sections are used for two things: |
| // |
// 1) they serve as the doc comment for the given intrinsics.
| // 2) they reset all variables (name, fn, etc.) |
| // |
| // # Variables |
| // |
// name - The prefix of the function; suffixes are auto-generated
//        from the type it gets passed.
| // |
| // fn - The function to call in rust-land. |
| // |
// aarch64 - The intrinsic to check on the aarch64 architecture.
//           If this is given but no arm intrinsic is provided,
//           the function will exclusively be generated for
//           aarch64.
//           This is used to generate both aarch64-specific and
//           shared intrinsics by first specifying only the aarch64
//           variant and then the arm variant.
| // |
// arm - The arm v7 intrinsic used to check arm code
//       generation. All neon functions available on arm are
//       also available on aarch64. If no aarch64 intrinsic was
//       set, they are assumed to be the same.
//       Intrinsics ending with a `.` will have a size suffix
//       added (such as `i8` or `i64`) that is not sign specific.
//       Intrinsics ending with a `.s` will have a size suffix
//       added (such as `s8` or `u64`) that is sign specific.
| // |
// a - First input for tests; it gets scaled to the size of
//     the type.
//
// b - Second input for tests; it gets scaled to the size of
//     the type.
| // |
// # Special values
| // |
// TRUE - 'true', all bits are set to 1
// FALSE - 'false', all bits are set to 0
// FF - same as 'true'
// MIN - minimal value (either 0 or the lowest negative number)
// MAX - maximal value (the highest representable number, prone to overflow)
| // |
| // # validate <values> |
// Validates the result of running the intrinsic on a and b
// against the expected values given here. The special values
// 'TRUE' and 'FALSE' can be used to represent the correct NEON
// representation of true or false values. They too get scaled
// to the type.
| // |
| // Validate needs to be called before generate as it sets |
| // up the rules for validation that get generated for each |
| // type. |
//
// # generate <types>
// The generate command generates the intrinsics; it uses the
// variables set so far and can be called multiple times while
// overwriting some of the variables.
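//
// As a rough, illustrative sketch (not verbatim generator output), an entry
// such as the `vand` one below expands into one Rust function per type listed
// in `generate`, along the lines of:
//
//     #[inline]
//     #[target_feature(enable = "neon")]
//     pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
//         simd_and(a, b)
//     }
//
// plus a test that feeds the `a`/`b` values through the function and compares
// the result against the `validate` values.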
| |
| /// Vector bitwise and |
| name = vand |
| fn = simd_and |
| arm = vand |
| aarch64 = and |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 |
| b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00 |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
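//
// For example (illustrative only, not part of the spec), the generated `vand`
// variants are the ones exposed as `core::arch::aarch64::vand_s8`,
// `vandq_u32`, etc., and can be used like this on a neon-capable target:
//
//     use core::arch::aarch64::{vand_s8, vdup_n_s8};
//     let r = unsafe { vand_s8(vdup_n_s8(0x0F), vdup_n_s8(0x55)) };
//     // every lane of `r` is 0x05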
| |
/// Vector bitwise or (vector, inclusive)
| name = vorr |
| fn = simd_or |
| arm = vorr |
| aarch64 = orr |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| |
| /// Vector bitwise exclusive or (vector) |
| name = veor |
| fn = simd_xor |
| arm = veor |
| aarch64 = eor |
| a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
| validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| //////////////////// |
| // Absolute difference between the arguments |
| //////////////////// |
| |
| /// Absolute difference between the arguments |
| name = vabd |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
| validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15 |
| |
| arm = vabd.s |
| aarch64 = sabd |
| link-arm = vabds._EXT_ |
| link-aarch64 = sabd._EXT_ |
| generate int*_t |
| |
| arm = vabd.s |
| aarch64 = uabd |
| link-arm = vabdu._EXT_ |
| link-aarch64 = uabd._EXT_ |
| generate uint*_t |
| |
/// Absolute difference between the arguments (floating-point)
| name = vabd |
| a = 1.0, 2.0, 5.0, -4.0 |
| b = 9.0, 3.0, 2.0, 8.0 |
| validate 8.0, 1.0, 3.0, 12.0 |
| |
| aarch64 = fabd |
| link-aarch64 = fabd._EXT_ |
| generate float64x*_t |
| |
| arm = vabd.s |
| aarch64 = fabd |
| link-arm = vabds._EXT_ |
| link-aarch64 = fabd._EXT_ |
| generate float*_t |
| |
| //////////////////// |
| // Absolute difference Long |
| //////////////////// |
| |
| /// Unsigned Absolute difference Long |
| name = vabdl |
| multi_fn = simd_cast, {vabd-unsigned-noext, a, b} |
| a = 1, 2, 3, 4, 4, 3, 2, 1 |
| b = 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 9, 8, 7, 6, 6, 7, 8, 9 |
| |
| arm = vabdl.s |
| aarch64 = uabdl |
| generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t |
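//
// Illustrative sketch (assumed shape, not verbatim generator output): the
// multi_fn chain above corresponds to generated code roughly like
//
//     pub unsafe fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
//         simd_cast(vabd_u8(a, b))
//     }
//
// i.e. the lane-wise absolute difference is computed at the narrow width and
// then widened with a cast.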
| |
| /// Signed Absolute difference Long |
| name = vabdl |
| multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b} |
| multi_fn = simd_cast, c |
| a = 1, 2, 3, 4, 4, 3, 2, 1 |
| b = 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 9, 8, 7, 6, 6, 7, 8, 9 |
| |
| arm = vabdl.s |
| aarch64 = sabdl |
| generate int8x8_t:int8x8_t:int16x8_t |
| |
| /// Signed Absolute difference Long |
| name = vabdl |
| multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b} |
| multi_fn = simd_cast, c |
| a = 1, 2, 11, 12 |
| b = 10, 10, 10, 10 |
| validate 9, 8, 1, 2 |
| |
| arm = vabdl.s |
| aarch64 = sabdl |
| generate int16x4_t:int16x4_t:int32x4_t |
| |
| /// Signed Absolute difference Long |
| name = vabdl |
| multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b} |
| multi_fn = simd_cast, c |
| a = 1, 11 |
| b = 10, 10 |
| validate 9, 1 |
| |
| arm = vabdl.s |
| aarch64 = sabdl |
| generate int32x2_t:int32x2_t:int64x2_t |
| |
| /// Unsigned Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, {vabd_u8, c, d} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 1, 0, 1, 2, 3, 4, 5, 6 |
| |
| aarch64 = uabdl |
| generate uint8x16_t:uint8x16_t:uint16x8_t |
| |
| /// Unsigned Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] |
| multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_cast, {vabd_u16, c, d} |
| a = 1, 2, 3, 4, 8, 9, 11, 12 |
| b = 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 2, 1, 1, 2 |
| |
| aarch64 = uabdl |
| generate uint16x8_t:uint16x8_t:uint32x4_t |
| |
| /// Unsigned Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] |
| multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] |
| multi_fn = simd_cast, {vabd_u32, c, d} |
| a = 1, 2, 3, 4 |
| b = 10, 10, 10, 10 |
| validate 7, 6 |
| |
| aarch64 = uabdl |
| generate uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Signed Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d} |
| multi_fn = simd_cast, e |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 1, 0, 1, 2, 3, 4, 5, 6 |
| |
| aarch64 = sabdl |
| generate int8x16_t:int8x16_t:int16x8_t |
| |
| /// Signed Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] |
| multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d} |
| multi_fn = simd_cast, e |
| a = 1, 2, 3, 4, 9, 10, 11, 12 |
| b = 10, 10, 10, 10, 10, 10, 10, 10 |
| validate 1, 0, 1, 2 |
| |
| aarch64 = sabdl |
| generate int16x8_t:int16x8_t:int32x4_t |
| |
| /// Signed Absolute difference Long |
| name = vabdl_high |
| no-q |
| multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] |
| multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] |
| multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d} |
| multi_fn = simd_cast, e |
| a = 1, 2, 3, 4 |
| b = 10, 10, 10, 10 |
| validate 7, 6 |
| |
| aarch64 = sabdl |
| generate int32x4_t:int32x4_t:int64x2_t |
| |
| //////////////////// |
| // equality |
| //////////////////// |
| |
| /// Compare bitwise Equal (vector) |
| name = vceq |
| fn = simd_eq |
| a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX |
| b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX |
| b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN |
| validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE |
| |
| aarch64 = cmeq |
| generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t |
| |
| arm = vceq. |
| generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t |
| |
| /// Floating-point compare equal |
| name = vceq |
| fn = simd_eq |
| a = 1.2, 3.4, 5.6, 7.8 |
| b = 1.2, 3.4, 5.6, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmeq |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vceq. |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Signed compare bitwise equal to zero |
| name = vceqz |
| fn = simd_eq |
| a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = cmeq |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t |
| |
| /// Unsigned compare bitwise equal to zero |
| name = vceqz |
| fn = simd_eq |
| a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = cmeq |
| generate uint*_t, uint64x*_t |
| |
| /// Floating-point compare bitwise equal to zero |
| name = vceqz |
| fn = simd_eq |
| a = 0.0, 1.2, 3.4, 5.6 |
| fixed = 0.0, 0.0, 0.0, 0.0 |
| validate TRUE, FALSE, FALSE, FALSE |
| |
| aarch64 = fcmeq |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Signed compare bitwise Test bits nonzero |
| name = vtst |
| multi_fn = simd_and, c:in_t, a, b |
| multi_fn = fixed, d:in_t |
| multi_fn = simd_ne, c, transmute(d) |
| a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmtst |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t |
| |
| arm = vtst |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t |
| |
| /// Unsigned compare bitwise Test bits nonzero |
| name = vtst |
| multi_fn = simd_and, c:in_t, a, b |
| multi_fn = fixed, d:in_t |
| multi_fn = simd_ne, c, transmute(d) |
| a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmtst |
| generate uint64x*_t |
| |
| arm = vtst |
| generate uint*_t |
| |
| //////////////////// |
| // Floating-point absolute value |
| //////////////////// |
| |
| /// Floating-point absolute value |
| name = vabs |
| fn = simd_fabs |
| a = -0.1, -2.2, -3.3, -6.6 |
| validate 0.1, 2.2, 3.3, 6.6 |
| aarch64 = fabs |
| generate float64x1_t:float64x1_t, float64x2_t:float64x2_t |
| |
| arm = vabs |
| generate float32x2_t:float32x2_t, float32x4_t:float32x4_t |
| |
| //////////////////// |
// greater than
| //////////////////// |
| |
| /// Compare signed greater than |
| name = vcgt |
| fn = simd_gt |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = cmgt |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
/// Compare unsigned higher
| name = vcgt |
| fn = simd_gt |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhi |
| generate uint64x*_t |
| |
| arm = vcgt.s |
| generate uint*_t |
| |
| /// Floating-point compare greater than |
| name = vcgt |
| fn = simd_gt |
| a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmgt |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
// less than
| //////////////////// |
| |
| /// Compare signed less than |
| name = vclt |
| fn = simd_lt |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = cmgt |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned less than |
| name = vclt |
| fn = simd_lt |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhi |
| generate uint64x*_t |
| |
| arm = vcgt.s |
| generate uint*_t |
| |
| /// Floating-point compare less than |
| name = vclt |
| fn = simd_lt |
| a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmgt |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcgt.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
// less than or equal
| //////////////////// |
| |
| /// Compare signed less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmge |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhs |
| generate uint64x*_t |
| |
| arm = vcge.s |
| generate uint*_t |
| |
| /// Floating-point compare less than or equal |
| name = vcle |
| fn = simd_le |
| a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| aarch64 = fcmge |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| arm = vcge.s |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| //////////////////// |
// greater than or equal
| //////////////////// |
| |
| /// Compare signed greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmge |
| generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t |
| |
| /// Compare unsigned greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmhs |
| generate uint64x*_t |
| |
| arm = vcge.s |
| generate uint*_t |
| |
| /// Floating-point compare greater than or equal |
| name = vcge |
| fn = simd_ge |
| a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9 |
| b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8 |
| validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmge |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcge.s |
| // we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Compare signed greater than or equal to zero |
| name = vcgez |
| fn = simd_ge |
| a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmge |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| /// Floating-point compare greater than or equal to zero |
| name = vcgez |
| fn = simd_ge |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 |
| validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmge |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Compare signed greater than zero |
| name = vcgtz |
| fn = simd_gt |
| a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = cmgt |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| /// Floating-point compare greater than zero |
| name = vcgtz |
| fn = simd_gt |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 |
| validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE |
| |
| aarch64 = fcmgt |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Compare signed less than or equal to zero |
| name = vclez |
| fn = simd_le |
| a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = cmgt |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| /// Floating-point compare less than or equal to zero |
| name = vclez |
| fn = simd_le |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 |
| validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = fcmle |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Compare signed less than zero |
| name = vcltz |
| fn = simd_lt |
| a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX |
| fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = sshr |
| generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t |
| |
| /// Floating-point compare less than zero |
| name = vcltz |
| fn = simd_lt |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 |
| validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE |
| |
| aarch64 = fcmlt |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Count leading sign bits |
| name = vcls |
| a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX |
| validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0 |
| |
| arm = vcls.s |
| aarch64 = cls |
| link-arm = vcls._EXT_ |
| link-aarch64 = cls._EXT_ |
| generate int*_t |
| |
/// Signed count leading zero bits
| name = vclz |
| multi_fn = self-signed-ext, a |
| a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX |
| validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1 |
| |
| arm = vclz. |
| aarch64 = clz |
| generate int*_t |
| |
/// Unsigned count leading zero bits
| name = vclz |
| multi_fn = transmute, {self-signed-ext, transmute(a)} |
| a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX |
| validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0 |
| |
| arm = vclz. |
| aarch64 = clz |
| generate uint*_t |
| |
| /// Floating-point absolute compare greater than |
| name = vcagt |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 |
| validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE |
| |
| aarch64 = facgt |
| link-aarch64 = facgt._EXT2_._EXT_ |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vacgt.s |
| link-arm = vacgt._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Floating-point absolute compare greater than or equal |
| name = vcage |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 |
| validate TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE |
| |
| aarch64 = facge |
| link-aarch64 = facge._EXT2_._EXT_ |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vacge.s |
| link-arm = vacge._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Floating-point absolute compare less than |
| name = vcalt |
| multi_fn = vcagt-self-noext, b, a |
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 |
| validate FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE |
| |
| aarch64 = facgt |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vacgt.s |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Floating-point absolute compare less than or equal |
| name = vcale |
multi_fn = vcage-self-noext, b, a
| a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7 |
| b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8 |
| validate FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE |
| |
| aarch64 = facge |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vacge.s |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 0:1 |
| validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = mov |
| generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t |
| generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t |
| generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} |
| a = 1., 2., 3., 4. |
| b = 0., 0.5, 0., 0. |
| n = 0:1 |
| validate 0.5, 2., 3., 4. |
| |
| aarch64 = mov |
| generate float32x2_t, float32x4_t, float64x2_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 0:1 |
| validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = mov |
| generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t |
| generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t |
| generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = simd_shuffle-in_len-!, a:in_t, a, a, {asc-0-in_len} |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in_len-LANE2} |
| a = 1., 2., 3., 4. |
| b = 0., 0.5, 0., 0. |
| n = 0:1 |
| validate 0.5, 2., 3., 4. |
| |
| aarch64 = mov |
| generate float32x2_t:float32x4_t:float32x2_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 0:1 |
| validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = mov |
| generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t |
| generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t |
| generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1:0 |
| validate 1, MAX |
| |
| aarch64 = zip1 |
| generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t |
| |
| /// Insert vector element from another vector element |
| name = vcopy |
| lane-suffixes |
| constn = LANE1:LANE2 |
| multi_fn = static_assert_imm-in0_exp_len-LANE1 |
| multi_fn = static_assert_imm-in_exp_len-LANE2 |
| multi_fn = simd_shuffle-in0_len-!, b:in_t0, b, b, {asc-0-in0_len} |
| multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle-out_len-!, a, b, {ins-in0_len-in0_len-LANE2} |
| a = 1., 2., 3., 4. |
| b = 0.5, 0., 0., 0. |
| n = 1:0 |
| validate 1., 0.5, 3., 4. |
| |
| aarch64 = mov |
| generate float32x4_t:float32x2_t:float32x4_t |
| aarch64 = zip1 |
| generate float64x2_t:float64x1_t:float64x2_t |
| |
/// Create a vector from a 64-bit pattern
| name = vcreate |
| out-suffix |
| multi_fn = transmute, a |
| a = 1 |
| validate 1, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = nop |
| arm = nop |
generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
| generate u64:poly8x8_t, u64:poly16x4_t |
| target = crypto |
| generate u64:poly64x1_t |
| |
/// Create a vector from a 64-bit pattern
| name = vcreate |
| out-suffix |
| multi_fn = transmute, a |
| a = 0 |
| validate 0., 0. |
| |
| aarch64 = nop |
| generate u64:float64x1_t |
| arm = nop |
| generate u64:float32x2_t |
| |
| /// Fixed-point convert to floating-point |
| name = vcvt |
| double-suffixes |
| fn = simd_cast |
| a = 1, 2, 3, 4 |
| validate 1., 2., 3., 4. |
| |
| aarch64 = scvtf |
| generate int64x1_t:float64x1_t, int64x2_t:float64x2_t |
| aarch64 = ucvtf |
| generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t |
| |
| arm = vcvt |
| aarch64 = scvtf |
| generate int32x2_t:float32x2_t, int32x4_t:float32x4_t |
| aarch64 = ucvtf |
| generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t |
| |
| /// Floating-point convert to higher precision long |
| name = vcvt |
| double-suffixes |
| fn = simd_cast |
| a = -1.2, 1.2 |
| validate -1.2f32 as f64, 1.2f32 as f64 |
| |
| aarch64 = fcvtl |
| generate float32x2_t:float64x2_t |
| |
| /// Floating-point convert to higher precision long |
| name = vcvt_high |
| noq-double-suffixes |
| multi_fn = simd_shuffle2!, b:float32x2_t, a, a, [2, 3] |
| multi_fn = simd_cast, b |
| a = -1.2, 1.2, 2.3, 3.4 |
| validate 2.3f32 as f64, 3.4f32 as f64 |
| |
| aarch64 = fcvtl |
| generate float32x4_t:float64x2_t |
| |
| /// Floating-point convert to lower precision narrow |
| name = vcvt |
| double-suffixes |
| fn = simd_cast |
| a = -1.2, 1.2 |
| validate -1.2f64 as f32, 1.2f64 as f32 |
| |
| aarch64 = fcvtn |
| generate float64x2_t:float32x2_t |
| |
| /// Floating-point convert to lower precision narrow |
| name = vcvt_high |
| noq-double-suffixes |
| multi_fn = simd_shuffle4!, a, {simd_cast, b}, [0, 1, 2, 3] |
| a = -1.2, 1.2 |
| b = -2.3, 3.4 |
| validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32 |
| |
| aarch64 = fcvtn |
| generate float32x2_t:float64x2_t:float32x4_t |
| |
| /// Floating-point convert to lower precision narrow, rounding to odd |
| name = vcvtx |
| double-suffixes |
| a = -1.0, 2.0 |
| validate -1.0, 2.0 |
| |
| aarch64 = fcvtxn |
| link-aarch64 = fcvtxn._EXT2_._EXT_ |
| generate float64x2_t:float32x2_t |
| |
| /// Floating-point convert to lower precision narrow, rounding to odd |
| name = vcvtx_high |
| noq-double-suffixes |
| multi_fn = simd_shuffle4!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3] |
| a = -1.0, 2.0 |
| b = -3.0, 4.0 |
| validate -1.0, 2.0, -3.0, 4.0 |
| |
| aarch64 = fcvtxn |
| generate float32x2_t:float64x2_t:float32x4_t |
| |
| /// Fixed-point convert to floating-point |
| name = vcvt |
| double-n-suffixes |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| a = 1, 2, 3, 4 |
| n = 2 |
| validate 0.25, 0.5, 0.75, 1. |
| |
| aarch64 = scvtf |
| link-aarch64 = vcvtfxs2fp._EXT2_._EXT_ |
| const-aarch64 = N |
| generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64 |
| |
| aarch64 = ucvtf |
| link-aarch64 = vcvtfxu2fp._EXT2_._EXT_ |
| const-aarch64 = N |
| generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64 |
| |
| aarch64 = scvtf |
| link-aarch64 = vcvtfxs2fp._EXT2_._EXT_ |
| arm = vcvt |
| link-arm = vcvtfxs2fp._EXT2_._EXT_ |
| const-arm = N:i32 |
| generate int32x2_t:float32x2_t, int32x4_t:float32x4_t |
| |
| aarch64 = ucvtf |
| link-aarch64 = vcvtfxu2fp._EXT2_._EXT_ |
| arm = vcvt |
| link-arm = vcvtfxu2fp._EXT2_._EXT_ |
| const-arm = N:i32 |
| generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t |
| |
| /// Floating-point convert to fixed-point, rounding toward zero |
| name = vcvt |
| double-n-suffixes |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| a = 0.25, 0.5, 0.75, 1. |
| n = 2 |
| validate 1, 2, 3, 4 |
| |
| aarch64 = fcvtzs |
| link-aarch64 = vcvtfp2fxs._EXT2_._EXT_ |
| const-aarch64 = N |
| generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 |
| |
| aarch64 = fcvtzu |
| link-aarch64 = vcvtfp2fxu._EXT2_._EXT_ |
| const-aarch64 = N |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 |
| |
| aarch64 = fcvtzs |
| link-aarch64 = vcvtfp2fxs._EXT2_._EXT_ |
| arm = vcvt |
| link-arm = vcvtfp2fxs._EXT2_._EXT_ |
| const-arm = N:i32 |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t |
| |
| aarch64 = fcvtzu |
| link-aarch64 = vcvtfp2fxu._EXT2_._EXT_ |
| arm = vcvt |
| link-arm = vcvtfp2fxu._EXT2_._EXT_ |
| const-arm = N:i32 |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Fixed-point convert to floating-point |
| name = vcvt |
| double-suffixes |
| multi_fn = a as out_t |
| a = 1 |
| validate 1. |
| |
| aarch64 = scvtf |
| generate i32:f32, i64:f64 |
| aarch64 = ucvtf |
| generate u32:f32, u64:f64 |
| |
| /// Fixed-point convert to floating-point |
| name = vcvt |
| double-suffixes |
| multi_fn = a as out_t |
| a = 1. |
| validate 1 |
| |
| aarch64 = fcvtzs |
| generate f32:i32, f64:i64 |
| aarch64 = fcvtzu |
| generate f32:u32, f64:u64 |
| |
| /// Floating-point convert to signed fixed-point, rounding toward zero |
| name = vcvt |
| double-suffixes |
| fn = simd_cast |
| a = -1.1, 2.1, -2.9, 3.9 |
| validate -1, 2, -2, 3 |
| |
| aarch64 = fcvtzs |
| generate float64x1_t:int64x1_t, float64x2_t:int64x2_t |
| |
| arm = vcvt |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t |
| |
| /// Floating-point convert to unsigned fixed-point, rounding toward zero |
| name = vcvt |
| double-suffixes |
| fn = simd_cast |
| a = 1.1, 2.1, 2.9, 3.9 |
| validate 1, 2, 2, 3 |
| |
| aarch64 = fcvtzu |
| generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| arm = vcvt |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t |
| |
| /// Floating-point convert to signed integer, rounding to nearest with ties to away |
| name = vcvta |
| double-suffixes |
| a = -1.1, 2.1, -2.9, 3.9 |
| validate -1, 2, -3, 4 |
| |
| aarch64 = fcvtas |
| link-aarch64 = fcvtas._EXT2_._EXT_ |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t |
| |
| /// Floating-point convert to integer, rounding to nearest with ties to away |
| name = vcvta |
| double-suffixes |
| a = 2.9 |
| validate 3 |
| |
| aarch64 = fcvtas |
| link-aarch64 = fcvtas._EXT2_._EXT_ |
| generate f32:i32, f64:i64 |
| |
| aarch64 = fcvtau |
| link-aarch64 = fcvtau._EXT2_._EXT_ |
| generate f32:u32, f64:u64 |
| |
| /// Floating-point convert to signed integer, rounding to nearest with ties to even |
| name = vcvtn |
| double-suffixes |
| a = -1.5, 2.1, -2.9, 3.9 |
| validate -2, 2, -3, 4 |
| |
| aarch64 = fcvtns |
| link-aarch64 = fcvtns._EXT2_._EXT_ |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 |
| |
| /// Floating-point convert to signed integer, rounding toward minus infinity |
| name = vcvtm |
| double-suffixes |
| a = -1.1, 2.1, -2.9, 3.9 |
| validate -2, 2, -3, 3 |
| |
| aarch64 = fcvtms |
| link-aarch64 = fcvtms._EXT2_._EXT_ |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 |
| |
| /// Floating-point convert to signed integer, rounding toward plus infinity |
| name = vcvtp |
| double-suffixes |
| a = -1.1, 2.1, -2.9, 3.9 |
| validate -1, 3, -2, 4 |
| |
| aarch64 = fcvtps |
| link-aarch64 = fcvtps._EXT2_._EXT_ |
| generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 |
| |
| /// Floating-point convert to unsigned integer, rounding to nearest with ties to away |
| name = vcvta |
| double-suffixes |
| a = 1.1, 2.1, 2.9, 3.9 |
| validate 1, 2, 3, 4 |
| |
| aarch64 = fcvtau |
| link-aarch64 = fcvtau._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t |
| |
| /// Floating-point convert to unsigned integer, rounding to nearest with ties to even |
| name = vcvtn |
| double-suffixes |
| a = 1.5, 2.1, 2.9, 3.9 |
| validate 2, 2, 3, 4 |
| |
| aarch64 = fcvtnu |
| link-aarch64 = fcvtnu._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 |
| |
| /// Floating-point convert to unsigned integer, rounding toward minus infinity |
| name = vcvtm |
| double-suffixes |
| a = 1.1, 2.1, 2.9, 3.9 |
| validate 1, 2, 2, 3 |
| |
| aarch64 = fcvtmu |
| link-aarch64 = fcvtmu._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 |
| |
| /// Floating-point convert to unsigned integer, rounding toward plus infinity |
| name = vcvtp |
| double-suffixes |
| a = 1.1, 2.1, 2.9, 3.9 |
| validate 2, 3, 3, 4 |
| |
| aarch64 = fcvtpu |
| link-aarch64 = fcvtpu._EXT2_._EXT_ |
| generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} |
| a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
| n = HFLEN |
| validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| |
| aarch64 = dup |
| generate poly64x2_t, poly64x1_t:poly64x2_t |
| |
| arm = vdup.l |
| generate int*_t |
| generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t |
| generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t |
| |
| generate uint*_t |
| generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t |
| generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t |
| |
| generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t |
| generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} |
| a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
| n = HFLEN |
| validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| |
| aarch64 = dup |
| arm = vmov |
| generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, a, a, {dup-out_len-N as u32} |
| a = 1., 1., 1., 4. |
| n = HFLEN |
| validate 1., 1., 1., 1. |
| |
| aarch64 = dup |
| generate float64x2_t, float64x1_t:float64x2_t |
| |
| arm = vdup.l |
| generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = a |
| a = 0 |
| n = HFLEN |
| validate 0 |
| |
| aarch64 = nop |
| generate poly64x1_t |
| |
| arm = nop |
| generate int64x1_t, uint64x1_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = a |
| a = 0. |
| n = HFLEN |
| validate 0. |
| |
| aarch64 = nop |
| generate float64x1_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32} |
| a = 0, 1 |
| n = HFLEN |
| validate 1 |
| |
| aarch64 = nop |
| generate poly64x2_t:poly64x1_t |
| |
| arm = vmov |
| generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = transmute--<element_t _>, {simd_extract, a, N as u32} |
| a = 0., 1. |
| n = HFLEN |
| validate 1. |
| |
| aarch64 = nop |
| generate float64x2_t:float64x1_t |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, a, N as u32 |
| a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
| n = HFLEN |
| validate 1 |
| |
| aarch64 = nop |
| generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64 |
| generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64 |
| generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16 |
| |
| /// Set all vector lanes to the same value |
| name = vdup |
| lane-suffixes |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, a, N as u32 |
| a = 1., 1., 1., 4. |
| n = HFLEN |
| validate 1. |
| |
| aarch64 = nop |
| generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64 |
| |
| /// Extract vector from pair of vectors |
| name = vext |
| constn = N |
| multi_fn = static_assert_imm-out_exp_len-N |
| multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} |
| a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 |
| b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 |
| n = HFLEN |
| validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 |
| |
| arm = "vext.8" |
| aarch64 = ext |
| generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| |
| /// Extract vector from pair of vectors |
| name = vext |
| constn = N |
| multi_fn = static_assert_imm-out_exp_len-N |
| multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} |
| a = 0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15 |
| b = 9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11 |
| n = HFLEN |
| validate 8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19 |
| |
| aarch64 = ext |
| generate poly64x2_t |
| |
| arm = vmov |
| generate int64x2_t, uint64x2_t |
| |
| /// Extract vector from pair of vectors |
| name = vext |
| constn = N |
| multi_fn = static_assert_imm-out_exp_len-N |
| multi_fn = matchn-out_exp_len-N, simd_shuffle-out_len-!, a, b, {asc-n-out_len} |
| a = 0., 2., 2., 3. |
b = 3., 4., 5., 6.
| n = HFLEN |
| validate 2., 3., 3., 4. |
| |
| aarch64 = ext |
| generate float64x2_t |
| |
| arm = "vext.8" |
| generate float*_t |
| |
| /// Multiply-add to accumulator |
| name = vmla |
| multi_fn = simd_add, a, {simd_mul, b, c} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| arm = vmla. |
| aarch64 = mla |
| generate int*_t, uint*_t |
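//
// For example (illustrative only, not part of the spec), the generated
// `vmla_s8` in core::arch computes `a + b * c` lane-wise:
//
//     use core::arch::aarch64::{vdup_n_s8, vget_lane_s8, vmla_s8};
//     let r = unsafe { vmla_s8(vdup_n_s8(1), vdup_n_s8(2), vdup_n_s8(3)) };
//     assert_eq!(unsafe { vget_lane_s8::<0>(r) }, 7); // 1 + 2 * 3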
| |
| /// Floating-point multiply-add to accumulator |
| name = vmla |
| multi_fn = simd_add, a, {simd_mul, b, c} |
| a = 0., 1., 2., 3. |
| b = 2., 2., 2., 2. |
| c = 3., 3., 3., 3. |
| validate 6., 7., 8., 9. |
| |
| aarch64 = fmul |
| generate float64x*_t |
| |
| arm = vmla. |
| generate float*_t |
| |
| /// Vector multiply accumulate with scalar |
| name = vmla |
| n-suffix |
| multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| aarch64 = mla |
| arm = vmla. |
| generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t |
| generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t |
| |
| /// Vector multiply accumulate with scalar |
| name = vmla |
| n-suffix |
| multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} |
| a = 0., 1., 2., 3. |
| b = 2., 2., 2., 2. |
| c = 3. |
| validate 6., 7., 8., 9. |
| |
| aarch64 = fmul |
| arm = vmla. |
| generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t |
| |
| /// Vector multiply accumulate with scalar |
| name = vmla |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| aarch64 = mla |
| arm = vmla. |
| generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t |
| generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t |
| |
| /// Vector multiply accumulate with scalar |
| name = vmla |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 0., 1., 2., 3. |
| b = 2., 2., 2., 2. |
| c = 0., 3., 0., 0. |
| n = 1 |
| validate 6., 7., 8., 9. |
| |
| aarch64 = fmul |
| arm = vmla. |
| generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| |
| /// Signed multiply-add long |
| name = vmlal |
| multi_fn = simd_add, a, {vmull-self-noext, b, c} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| arm = vmlal.s |
| aarch64 = smlal |
| generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Unsigned multiply-add long |
| name = vmlal |
| multi_fn = simd_add, a, {vmull-self-noext, b, c} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| arm = vmlal.s |
| aarch64 = umlal |
| generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t |
| |
| /// Vector widening multiply accumulate with scalar |
| name = vmlal |
| n-suffix |
| multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| arm = vmlal.s |
| aarch64 = smlal |
| generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t |
| aarch64 = umlal |
| generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t |
| |
| /// Vector widening multiply accumulate with scalar |
| name = vmlal_lane |
| in2-suffix |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| |
| arm = vmlal.s |
| aarch64 = smlal |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t |
| generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t |
| aarch64 = umlal |
| generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t |
| generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t |
| |
| /// Signed multiply-add long |
| name = vmlal_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} |
| multi_fn = vmlal-noqself-noext, a, b, c |
| a = 8, 7, 6, 5, 4, 3, 2, 1 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = smlal2 |
| generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Unsigned multiply-add long |
| name = vmlal_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} |
| multi_fn = vmlal-noqself-noext, a, b, c |
| a = 8, 7, 6, 5, 4, 3, 2, 1 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = umlal2 |
| generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Multiply-add long |
| name = vmlal_high_n |
| no-q |
| multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} |
| a = 8, 7, 6, 5, 4, 3, 2, 1 |
| b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| c = 2 |
| validate 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = smlal2 |
| generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t |
| aarch64 = umlal2 |
| generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t |
| |
| /// Multiply-add long |
| name = vmlal_high_lane |
| in2-suffix |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 8, 7, 6, 5, 4, 3, 2, 1 |
| b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = smlal2 |
| generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t |
| generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| aarch64 = umlal2 |
| generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t |
| generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Multiply-subtract from accumulator |
| name = vmls |
| multi_fn = simd_sub, a, {simd_mul, b, c} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| arm = vmls. |
| aarch64 = mls |
| generate int*_t, uint*_t |
| |
| /// Floating-point multiply-subtract from accumulator |
| name = vmls |
| multi_fn = simd_sub, a, {simd_mul, b, c} |
| a = 6., 7., 8., 9. |
| b = 2., 2., 2., 2. |
| c = 3., 3., 3., 3. |
| validate 0., 1., 2., 3. |
| |
| aarch64 = fmul |
| generate float64x*_t |
| |
| arm = vmls. |
| generate float*_t |
| |
| /// Vector multiply subtract with scalar |
| name = vmls |
| n-suffix |
| multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = mls |
| arm = vmls. |
| generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t |
| generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t |
| |
| /// Vector multiply subtract with scalar |
| name = vmls |
| n-suffix |
| multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} |
| a = 6., 7., 8., 9. |
| b = 2., 2., 2., 2. |
| c = 3. |
| validate 0., 1., 2., 3. |
| |
| aarch64 = fmul |
| arm = vmls. |
| generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t |
| |
| /// Vector multiply subtract with scalar |
| name = vmls |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = mls |
| arm = vmls. |
| generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t |
| generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t |
| |
| /// Vector multiply subtract with scalar |
| name = vmls |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 6., 7., 8., 9. |
| b = 2., 2., 2., 2. |
| c = 0., 3., 0., 0. |
| n = 1 |
| validate 0., 1., 2., 3. |
| |
| aarch64 = fmul |
| arm = vmls. |
| generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| |
| /// Signed multiply-subtract long |
| name = vmlsl |
| multi_fn = simd_sub, a, {vmull-self-noext, b, c} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| arm = vmlsl.s |
| aarch64 = smlsl |
| generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Unsigned multiply-subtract long |
| name = vmlsl |
| multi_fn = simd_sub, a, {vmull-self-noext, b, c} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| arm = vmlsl.s |
| aarch64 = umlsl |
| generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t |
| |
| /// Vector widening multiply subtract with scalar |
| name = vmlsl |
| n-suffix |
| multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| arm = vmlsl.s |
| aarch64 = smlsl |
| generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t |
| aarch64 = umlsl |
| generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t |
| |
| /// Vector widening multiply subtract with scalar |
| name = vmlsl_lane |
| in2-suffix |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| arm = vmlsl.s |
| aarch64 = smlsl |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t |
| generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t |
| aarch64 = umlsl |
| generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t |
| generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t |
| |
| /// Signed multiply-subtract long |
| name = vmlsl_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} |
| multi_fn = vmlsl-noqself-noext, a, b, c |
| a = 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 14, 13, 12, 11, 10, 9, 8, 7 |
| |
| aarch64 = smlsl2 |
| generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Unsigned multiply-subtract long |
| name = vmlsl_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, c:half, c, c, {fixed-half-right} |
| multi_fn = vmlsl-noqself-noext, a, b, c |
| a = 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 14, 13, 12, 11, 10, 9, 8, 7 |
| |
| aarch64 = umlsl2 |
| generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Multiply-subtract long |
| name = vmlsl_high_n |
| no-q |
| multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} |
| a = 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| c = 2 |
| validate 14, 13, 12, 11, 10, 9, 8, 7 |
| |
| aarch64 = smlsl2 |
| generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t |
| aarch64 = umlsl2 |
| generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t |
| |
| /// Multiply-subtract long |
| name = vmlsl_high_lane |
| in2-suffix |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-!, c, c, {dup-in_len-LANE as u32}} |
| a = 14, 15, 16, 17, 18, 19, 20, 21 |
| b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 |
| c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 14, 13, 12, 11, 10, 9, 8, 7 |
| |
| aarch64 = smlsl2 |
| generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t |
| generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| aarch64 = umlsl2 |
| generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t |
| generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Extract narrow |
| name = vmovn_high |
| no-q |
| multi_fn = simd_cast, c:in_t0, b |
| multi_fn = simd_shuffle-out_len-!, a, c, {asc-0-out_len} |
| a = 0, 1, 2, 3, 2, 3, 4, 5 |
| b = 2, 3, 4, 5, 12, 13, 14, 15 |
| validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15 |
| |
| aarch64 = xtn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Negate |
| name = vneg |
| fn = simd_neg |
| a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8 |
| validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8 |
| |
| aarch64 = neg |
| generate int64x*_t |
| |
| arm = vneg.s |
| generate int*_t |
| |
| /// Negate |
| name = vneg |
| fn = simd_neg |
| a = 0., 1., -1., 2., -2., 3., -3., 4. |
| validate 0., -1., 1., -2., 2., -3., 3., -4. |
| |
| aarch64 = fneg |
| generate float64x*_t |
| |
| arm = vneg.s |
| generate float*_t |
| |
| /// Signed saturating negate |
| name = vqneg |
| a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7 |
| validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7 |
| link-arm = vqneg._EXT_ |
| link-aarch64 = sqneg._EXT_ |
| |
| aarch64 = sqneg |
| generate int64x*_t |
| |
| arm = vqneg.s |
| generate int*_t |
| |
| /// Saturating subtract |
| name = vqsub |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 |
| |
| arm = vqsub.s |
| aarch64 = uqsub |
| link-arm = llvm.usub.sat._EXT_ |
| link-aarch64 = uqsub._EXT_ |
| generate uint*_t, uint64x*_t |
| |
| arm = vqsub.s |
| aarch64 = sqsub |
| link-arm = llvm.ssub.sat._EXT_ |
| link-aarch64 = sqsub._EXT_ |
| generate int*_t, int64x*_t |
| |
| /// Saturating subtract |
| name = vqsub |
| multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0 |
| a = 42 |
| b = 1 |
| validate 41 |
| |
| aarch64 = sqsub |
| generate i8, i16 |
| aarch64 = uqsub |
| generate u8, u16 |
| |
| /// Saturating subtract |
| name = vqsub |
| a = 42 |
| b = 1 |
| validate 41 |
| |
| aarch64 = uqsub |
| link-aarch64 = uqsub._EXT_ |
| generate u32, u64 |
| |
| aarch64 = sqsub |
| link-aarch64 = sqsub._EXT_ |
| generate i32, i64 |
| |
| /// Halving add |
| name = vhadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 |
| |
| arm = vhadd.s |
| aarch64 = uhadd |
| link-aarch64 = uhadd._EXT_ |
| link-arm = vhaddu._EXT_ |
| generate uint*_t |
| |
| arm = vhadd.s |
| aarch64 = shadd |
| link-aarch64 = shadd._EXT_ |
| link-arm = vhadds._EXT_ |
| generate int*_t |
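| // Halving add computes (a + b) >> 1 without losing the carry to |
| // intermediate overflow, e.g. (42 + 1) >> 1 = 21 for the first lane above. |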
| |
| /// Reverse bit order |
| name = vrbit |
| a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 |
| |
| aarch64 = rbit |
| link-aarch64 = rbit._EXT_ |
| |
| generate int8x8_t, int8x16_t |
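| // The bits within each byte are reversed, e.g. 2 = 0b0000_0010 becomes |
| // 0b0100_0000 = 64, which is the second expected lane above. |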
| |
| /// Reverse bit order |
| name = vrbit |
| multi_fn = transmute, {vrbit-signed-noext, transmute(a)} |
| a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 |
| |
| aarch64 = rbit |
| |
| generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t |
| |
| /// Rounding halving add |
| name = vrhadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 |
| |
| arm = vrhadd.s |
| aarch64 = urhadd |
| link-arm = vrhaddu._EXT_ |
| link-aarch64 = urhadd._EXT_ |
| generate uint*_t |
| |
| arm = vrhadd.s |
| aarch64 = srhadd |
| link-arm = vrhadds._EXT_ |
| link-aarch64 = srhadd._EXT_ |
| generate int*_t |
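| // The rounding variant computes (a + b + 1) >> 1, so the first lane is |
| // (42 + 1 + 1) >> 1 = 22 rather than the 21 produced by vhadd. |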
| |
| /// Floating-point round to integral exact, using current rounding mode |
| name = vrndx |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -2.0, 0.0, 2.0, 2.0 |
| |
| aarch64 = frintx |
| link-aarch64 = llvm.rint._EXT_ |
| generate float*_t, float64x*_t |
| |
| /// Floating-point round to integral, to nearest with ties to away |
| name = vrnda |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -2.0, 1.0, 2.0, 3.0 |
| |
| aarch64 = frinta |
| link-aarch64 = llvm.round._EXT_ |
| generate float*_t, float64x*_t |
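| // Ties round away from zero here: 0.5 -> 1.0 and 2.5 -> 3.0, unlike |
| // vrndn/vrndx which round ties to even (0.5 -> 0.0, 2.5 -> 2.0). |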
| |
| /// Floating-point round to integral, to nearest with ties to even |
| name = vrndn |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -2.0, 0.0, 2.0, 2.0 |
| |
| link-aarch64 = frintn._EXT_ |
| aarch64 = frintn |
| generate float64x*_t |
| |
| target = fp-armv8 |
| arm = vrintn |
| link-arm = vrintn._EXT_ |
| generate float*_t |
| |
| /// Floating-point round to integral, toward minus infinity |
| name = vrndm |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -2.0, 0.0, 1.0, 2.0 |
| |
| aarch64 = frintm |
| link-aarch64 = llvm.floor._EXT_ |
| generate float*_t, float64x*_t |
| |
| /// Floating-point round to integral, toward plus infinity |
| name = vrndp |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -1.0, 1.0, 2.0, 3.0 |
| |
| aarch64 = frintp |
| link-aarch64 = llvm.ceil._EXT_ |
| generate float*_t, float64x*_t |
| |
| /// Floating-point round to integral, toward zero |
| name = vrnd |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -1.0, 0.0, 1.0, 2.0 |
| |
| aarch64 = frintz |
| link-aarch64 = llvm.trunc._EXT_ |
| generate float*_t, float64x*_t |
| |
| /// Floating-point round to integral, using current rounding mode |
| name = vrndi |
| a = -1.5, 0.5, 1.5, 2.5 |
| validate -2.0, 0.0, 2.0, 2.0 |
| |
| aarch64 = frinti |
| link-aarch64 = llvm.nearbyint._EXT_ |
| generate float*_t, float64x*_t |
| |
| /// Saturating add |
| name = vqadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 |
| |
| arm = vqadd.s |
| aarch64 = uqadd |
| link-arm = llvm.uadd.sat._EXT_ |
| link-aarch64 = uqadd._EXT_ |
| generate uint*_t, uint64x*_t |
| |
| arm = vqadd.s |
| aarch64 = sqadd |
| link-arm = llvm.sadd.sat._EXT_ |
| link-aarch64 = sqadd._EXT_ |
| generate int*_t, int64x*_t |
| |
| /// Saturating add |
| name = vqadd |
| multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0 |
| a = 42 |
| b = 1 |
| validate 43 |
| |
| aarch64 = sqadd |
| generate i8, i16 |
| aarch64 = uqadd |
| generate u8, u16 |
| |
| /// Saturating add |
| name = vqadd |
| a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 |
| |
| aarch64 = uqadd |
| link-aarch64 = uqadd._EXT_ |
| generate u32, u64 |
| |
| aarch64 = sqadd |
| link-aarch64 = sqadd._EXT_ |
| generate i32, i64 |
| |
| /// Multiply |
| name = vmul |
| a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 |
| arm = vmul. |
| aarch64 = mul |
| fn = simd_mul |
| generate int*_t, uint*_t |
| |
| /// Polynomial multiply |
| name = vmul |
| a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 |
| |
| aarch64 = pmul |
| link-aarch64 = pmul._EXT_ |
| arm = vmul |
| link-arm = vmulp._EXT_ |
| generate poly8x8_t, poly8x16_t |
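| // Polynomial multiplication is carry-less (lane additions are XORs), so |
| // e.g. 3 * 6 = 0b11 * 0b110 = 0b1010 = 10 rather than 18, as reflected in |
| // the expected values above. |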
| |
| /// Multiply |
| name = vmul |
| fn = simd_mul |
| a = 1.0, 2.0, 1.0, 2.0 |
| b = 2.0, 3.0, 4.0, 5.0 |
| validate 2.0, 6.0, 4.0, 10.0 |
| |
| aarch64 = fmul |
| generate float64x*_t |
| |
| arm = vmul. |
| generate float*_t |
| |
| /// Vector multiply by scalar |
| name = vmul |
| out-n-suffix |
| multi_fn = simd_mul, a, {vdup-nout-noext, b} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2 |
| validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 |
| |
| arm = vmul |
| aarch64 = mul |
| generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t |
| generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t |
| |
| /// Vector multiply by scalar |
| name = vmul |
| out-n-suffix |
| multi_fn = simd_mul, a, {vdup-nout-noext, b} |
| a = 1., 2., 3., 4. |
| b = 2. |
| validate 2., 4., 6., 8. |
| |
| aarch64 = fmul |
| generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t |
| |
| arm = vmul |
| generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t |
| |
| /// Multiply |
| name = vmul |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 |
| |
| aarch64 = mul |
| arm = vmul |
| generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t |
| generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t |
| |
| /// Floating-point multiply |
| name = vmul |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}} |
| a = 1., 2., 3., 4. |
| b = 2., 0., 0., 0. |
| n = 0 |
| validate 2., 4., 6., 8. |
| |
| aarch64 = fmul |
| generate float64x1_t, float64x1_t:float64x2_t:float64x1_t |
| |
| /// Floating-point multiply |
| name = vmul |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_mul, a, {simd_shuffle-out_len-!, b, b, {dup-out_len-LANE as u32}} |
| a = 1., 2., 3., 4. |
| b = 2., 0., 0., 0. |
| n = 0 |
| validate 2., 4., 6., 8. |
| |
| aarch64 = fmul |
| generate float64x2_t:float64x1_t:float64x2_t, float64x2_t |
| |
| arm = vmul |
| generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| |
| /// Floating-point multiply |
| name = vmuls_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_extract, b:f32, b, LANE as u32 |
| multi_fn = a * b |
| a = 1. |
| b = 2., 0., 0., 0. |
| n = 0 |
| validate 2. |
| aarch64 = fmul |
| generate f32:float32x2_t:f32, f32:float32x4_t:f32 |
| |
| /// Floating-point multiply |
| name = vmuld_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_extract, b:f64, b, LANE as u32 |
| multi_fn = a * b |
| a = 1. |
| b = 2., 0. |
| n = 0 |
| validate 2. |
| aarch64 = fmul |
| generate f64:float64x1_t:f64, f64:float64x2_t:f64 |
| |
| /// Signed multiply long |
| name = vmull |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 |
| |
| arm = vmull.s |
| aarch64 = smull |
| link-arm = vmulls._EXT_ |
| link-aarch64 = smull._EXT_ |
| generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t |
| |
| /// Signed multiply long |
| name = vmull_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = vmull-noqself-noext, a, b |
| a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 9, 20, 11, 24, 13, 28, 15, 32 |
| |
| aarch64 = smull2 |
| generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t |
| |
| /// Unsigned multiply long |
| name = vmull |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| b = 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 1, 4, 3, 8, 5, 12, 7, 16 |
| |
| arm = vmull.s |
| aarch64 = umull |
| link-arm = vmullu._EXT_ |
| link-aarch64 = umull._EXT_ |
| generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t |
| |
| /// Unsigned multiply long |
| name = vmull_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = vmull-noqself-noext, a, b |
| a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 9, 20, 11, 24, 13, 28, 15, 32 |
| |
| aarch64 = umull2 |
| generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Polynomial multiply long |
| name = vmull |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| b = 1, 3, 1, 3, 1, 3, 1, 3 |
| validate 1, 6, 3, 12, 5, 10, 7, 24 |
| |
| arm = vmull.s |
| aarch64 = pmull |
| link-arm = vmullp._EXT_ |
| link-aarch64 = pmull._EXT_ |
| generate poly8x8_t:poly8x8_t:poly16x8_t |
| |
| /// Polynomial multiply long |
| name = vmull |
| no-q |
| a = 15 |
| b = 3 |
| validate 17 |
| target = crypto |
| |
| aarch64 = pmull |
| link-aarch64 = pmull64:p64:p64:p64:int8x16_t |
| // Because of the current state of LLVM support, vmull_p64 is only available on aarch64 |
| // arm = vmull |
| // link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t |
| generate p64:p64:p128 |
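| // As a 64 x 64 -> 128 bit carry-less multiply this computes, for the test |
| // inputs above, 15 * 3 = 0b1111 * 0b11 = 0b10001 = 17. |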
| |
| /// Polynomial multiply long |
| name = vmull_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a:half, a, a, {fixed-half-right} |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {fixed-half-right} |
| multi_fn = vmull-noqself-noext, a, b |
| a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 |
| fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 9, 30, 11, 20, 13, 18, 15, 48 |
| |
| aarch64 = pmull |
| generate poly8x16_t:poly8x16_t:poly16x8_t |
| |
| /// Polynomial multiply long |
| name = vmull_high |
| no-q |
| multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1} |
| a = 1, 15 |
| b = 1, 3 |
| validate 17 |
| target = crypto |
| |
| aarch64 = pmull |
| generate poly64x2_t:poly64x2_t:p128 |
| |
| /// Vector long multiply with scalar |
| name = vmull |
| n-suffix |
| multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| b = 2 |
| validate 2, 4, 6, 8, 10, 12, 14, 16 |
| |
| arm = vmull |
| aarch64 = smull |
| generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t |
| aarch64 = umull |
| generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t |
| |
| /// Vector long multiply by scalar |
| name = vmull_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vmull-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 |
| |
| arm = vmull |
| aarch64 = smull |
| generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t |
| generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t |
| aarch64 = umull |
| generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t |
| generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t |
| |
| /// Multiply long |
| name = vmull_high_n |
| no-q |
| multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} |
| a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2 |
| validate 18, 20, 22, 24, 26, 28, 30, 32 |
| |
| aarch64 = smull2 |
| generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t |
| aarch64 = umull2 |
| generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t |
| |
| /// Multiply long |
| name = vmull_high_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vmull_high-noqself-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} |
| a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 18, 20, 22, 24, 26, 28, 30, 32 |
| |
| aarch64 = smull2 |
| generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t |
| generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t |
| aarch64 = umull2 |
| generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t |
| generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Floating-point multiply extended |
| name = vmulx |
| a = 1., 2., 3., 4. |
| b = 2., 2., 2., 2. |
| validate 2., 4., 6., 8. |
| |
| aarch64 = fmulx |
| link-aarch64 = fmulx._EXT_ |
| generate float*_t, float64x*_t |
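| // fmulx behaves like an ordinary multiply except that 0.0 * +/-infinity |
| // is defined to return +/-2.0 rather than NaN. |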
| |
| /// Floating-point multiply extended |
| name = vmulx |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract, b, LANE as u32}} |
| a = 1. |
| b = 2., 0. |
| n = 0 |
| validate 2. |
| |
| aarch64 = fmulx |
| generate float64x1_t, float64x1_t:float64x2_t:float64x1_t |
| |
| /// Floating-point multiply extended |
| name = vmulx |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vmulx-in0-noext, a, {simd_shuffle-in0_len-!, b, b, {dup-in0_len-LANE as u32}} |
| a = 1., 2., 3., 4. |
| b = 2., 0., 0., 0. |
| n = 0 |
| validate 2., 4., 6., 8. |
| |
| aarch64 = fmulx |
| generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| generate float64x2_t:float64x1_t:float64x2_t, float64x2_t |
| |
| /// Floating-point multiply extended |
| name = vmulx |
| a = 2. |
| b = 3. |
| validate 6. |
| |
| aarch64 = fmulx |
| link-aarch64 = fmulx._EXT_ |
| generate f32, f64 |
| |
| /// Floating-point multiply extended |
| name = vmulx |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32} |
| |
| a = 2. |
| b = 3., 0., 0., 0. |
| n = 0 |
| validate 6. |
| |
| aarch64 = fmulx |
| generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 |
| |
| /// Floating-point fused Multiply-Add to accumulator (vector) |
| name = vfma |
| multi_fn = vfma-self-_, b, c, a |
| a = 8.0, 18.0, 12.0, 10.0 |
| b = 6.0, 4.0, 7.0, 8.0 |
| c = 2.0, 3.0, 4.0, 5.0 |
| validate 20.0, 30.0, 40.0, 50.0 |
| |
| link-aarch64 = llvm.fma._EXT_ |
| aarch64 = fmadd |
| generate float64x1_t |
| aarch64 = fmla |
| generate float64x2_t |
| |
| target = fp-armv8 |
| arm = vfma |
| link-arm = llvm.fma._EXT_ |
| generate float*_t |
| |
| /// Floating-point fused Multiply-Add to accumulator (vector) |
| name = vfma |
| n-suffix |
| multi_fn = vfma-self-noext, a, b, {vdup-nself-noext, c} |
| a = 2.0, 3.0, 4.0, 5.0 |
| b = 6.0, 4.0, 7.0, 8.0 |
| c = 8.0 |
| validate 50.0, 35.0, 60.0, 69.0 |
| |
| aarch64 = fmadd |
| generate float64x1_t:float64x1_t:f64:float64x1_t |
| aarch64 = fmla |
| generate float64x2_t:float64x2_t:f64:float64x2_t |
| |
| target = fp-armv8 |
| arm = vfma |
| generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t |
| |
| /// Floating-point fused multiply-add to accumulator |
| name = vfma |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} |
| a = 2., 3., 4., 5. |
| b = 6., 4., 7., 8. |
| c = 2., 0., 0., 0. |
| n = 0 |
| validate 14., 11., 18., 21. |
| |
| aarch64 = fmla |
| generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| aarch64 = fmadd |
| generate float64x1_t |
| aarch64 = fmla |
| generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t |
| |
| /// Floating-point fused multiply-add to accumulator |
| name = vfma |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = simd_extract, c:out_t, c, LANE as u32 |
| multi_fn = vfma-in2lane-_, b, c, a |
| a = 2. |
| b = 6. |
| c = 3., 0., 0., 0. |
| n = 0 |
| validate 20. |
| |
| aarch64 = fmla |
| link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32 |
| generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 |
| link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64 |
| aarch64 = fmadd |
| generate f64:f64:float64x1_t:f64 |
| aarch64 = fmla |
| generate f64:f64:float64x2_t:f64 |
| |
| /// Floating-point fused multiply-subtract from accumulator |
| name = vfms |
| multi_fn = simd_neg, b:in_t, b |
| multi_fn = vfma-self-noext, a, b, c |
| a = 20.0, 30.0, 40.0, 50.0 |
| b = 6.0, 4.0, 7.0, 8.0 |
| c = 2.0, 3.0, 4.0, 5.0 |
| validate 8.0, 18.0, 12.0, 10.0 |
| |
| aarch64 = fmsub |
| generate float64x1_t |
| aarch64 = fmls |
| generate float64x2_t |
| |
| target = fp-armv8 |
| arm = vfms |
| generate float*_t |
| |
| /// Floating-point fused Multiply-subtract to accumulator (vector) |
| name = vfms |
| n-suffix |
| multi_fn = vfms-self-noext, a, b, {vdup-nself-noext, c} |
| a = 50.0, 35.0, 60.0, 69.0 |
| b = 6.0, 4.0, 7.0, 8.0 |
| c = 8.0 |
| validate 2.0, 3.0, 4.0, 5.0 |
| |
| aarch64 = fmsub |
| generate float64x1_t:float64x1_t:f64:float64x1_t |
| aarch64 = fmls |
| generate float64x2_t:float64x2_t:f64:float64x2_t |
| |
| target = fp-armv8 |
| arm = vfms |
| generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t |
| |
| /// Floating-point fused multiply-subtract to accumulator |
| name = vfms |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}} |
| a = 14., 11., 18., 21. |
| b = 6., 4., 7., 8. |
| c = 2., 0., 0., 0. |
| n = 0 |
| validate 2., 3., 4., 5. |
| |
| aarch64 = fmls |
| generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t |
| aarch64 = fmsub |
| generate float64x1_t |
| aarch64 = fmls |
| generate float64x1_t:float64x1_t:float64x2_t:float64x1_t, float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t |
| |
| /// Floating-point fused multiply-subtract to accumulator |
| name = vfms |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = vfma-in2lane-::<LANE>, a, -b, c |
| a = 14. |
| b = 6. |
| c = 2., 0., 0., 0. |
| n = 0 |
| validate 2. |
| |
| aarch64 = fmls |
| generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 |
| aarch64 = fmsub |
| generate f64:f64:float64x1_t:f64 |
| aarch64 = fmls |
| generate f64:f64:float64x2_t:f64 |
| |
| /// Divide |
| name = vdiv |
| fn = simd_div |
| a = 2.0, 6.0, 4.0, 10.0 |
| b = 1.0, 2.0, 1.0, 2.0 |
| validate 2.0, 3.0, 4.0, 5.0 |
| |
| aarch64 = fdiv |
| generate float*_t, float64x*_t |
| |
| /// Subtract |
| name = vsub |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| arm = vsub. |
| aarch64 = sub |
| fn = simd_sub |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| /// Subtract |
| name = vsub |
| fn = simd_sub |
| a = 1.0, 4.0, 3.0, 8.0 |
| b = 1.0, 2.0, 3.0, 4.0 |
| validate 0.0, 2.0, 0.0, 4.0 |
| |
| aarch64 = fsub |
| generate float64x*_t |
| |
| arm = vsub. |
| generate float*_t |
| |
| /// Signed Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4 |
| validate 10 |
| |
| aarch64 = saddlv |
| link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ |
| generate int16x4_t:i32 |
| |
| /// Signed Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| validate 36 |
| |
| aarch64 = saddlv |
| link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ |
| generate int16x8_t:i32 |
| |
| /// Signed Add Long across Vector |
| name = vaddlv |
| a = 1, 2 |
| validate 3 |
| |
| aarch64 = saddlp |
| link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ |
| generate int32x2_t:i64 |
| |
| /// Signed Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4 |
| validate 10 |
| |
| aarch64 = saddlv |
| link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ |
| generate int32x4_t:i64 |
| |
| /// Unsigned Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4 |
| validate 10 |
| |
| aarch64 = uaddlv |
| link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ |
| generate uint16x4_t:u32 |
| |
| /// Unsigned Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| validate 36 |
| |
| aarch64 = uaddlv |
| link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ |
| generate uint16x8_t:u32 |
| |
| /// Unsigned Add Long across Vector |
| name = vaddlv |
| a = 1, 2 |
| validate 3 |
| |
| aarch64 = uaddlp |
| link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ |
| generate uint32x2_t:u64 |
| |
| /// Unsigned Add Long across Vector |
| name = vaddlv |
| a = 1, 2, 3, 4 |
| validate 10 |
| |
| aarch64 = uaddlv |
| link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ |
| generate uint32x4_t:u64 |
| |
| /// Subtract returning high narrow |
| name = vsubhn |
| no-q |
| multi_fn = fixed, c:in_t |
| multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} |
| a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 |
| b = 1, 0, 0, 0, 1, 0, 0, 0 |
| fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS |
| validate MAX, MIN, 0, 0, MAX, MIN, 0, 0 |
| |
| arm = vsubhn |
| aarch64 = subhn |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
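| // Only the upper half of each difference is kept: the HFBITS entries in |
| // `fixed` shift right by half the element width, so for i16 -> i8 this is |
| // (a - b) >> 8, e.g. (MAX - 1) >> 8 = 0x7FFE >> 8 = 0x7F = MAX of i8. |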
| |
| /// Subtract returning high narrow |
| name = vsubhn_high |
| no-q |
| multi_fn = vsubhn-noqself-noext, d:in_t0, b, c |
| multi_fn = simd_shuffle-out_len-!, a, d, {asc-0-out_len} |
| a = MAX, 0, MAX, 0, MAX, 0, MAX, 0 |
| b = MAX, 1, MAX, 1, MAX, 1, MAX, 1 |
| c = 1, 0, 1, 0, 1, 0, 1, 0 |
| validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0 |
| |
| arm = vsubhn |
| aarch64 = subhn2 |
| generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t |
| generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t |
| |
| /// Halving subtract |
| name = vhsub |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 |
| validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 |
| |
| arm = vhsub.s |
| aarch64 = uhsub |
| link-arm = vhsubu._EXT_ |
| link-aarch64 = uhsub._EXT_ |
| generate uint*_t |
| |
| arm = vhsub.s |
| aarch64 = shsub |
| link-arm = vhsubs._EXT_ |
| link-aarch64 = shsub._EXT_ |
| generate int*_t |
| |
| /// Signed Subtract Wide |
| name = vsubw |
| no-q |
| multi_fn = simd_sub, a, {simd_cast, b} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| arm = vsubw |
| aarch64 = ssubw |
| generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t |
| |
| /// Unsigned Subtract Wide |
| name = vsubw |
| no-q |
| multi_fn = simd_sub, a, {simd_cast, b} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| arm = vsubw |
| aarch64 = usubw |
| generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t |
| |
| /// Signed Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle8!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9, 10, 12, 13, 14, 15, 16 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 |
| validate 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = ssubw |
| generate int16x8_t:int8x16_t:int16x8_t |
| |
| /// Signed Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle4!, c:int16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9, 10, 11 |
| b = 0, 1, 2, 3, 8, 9, 10, 11 |
| validate 0, 0, 0, 0 |
| |
| aarch64 = ssubw |
| generate int32x4_t:int16x8_t:int32x4_t |
| |
| /// Signed Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle2!, c:int32x2_t, b, b, [2, 3] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9 |
| b = 6, 7, 8, 9 |
| validate 0, 0 |
| |
| aarch64 = ssubw |
| generate int64x2_t:int32x4_t:int64x2_t |
| |
| /// Unsigned Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle8!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = usubw |
| generate uint16x8_t:uint8x16_t:uint16x8_t |
| |
| /// Unsigned Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle4!, c:uint16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9, 10, 11 |
| b = 0, 1, 2, 3, 8, 9, 10, 11 |
| validate 0, 0, 0, 0 |
| |
| aarch64 = usubw |
| generate uint32x4_t:uint16x8_t:uint32x4_t |
| |
| /// Unsigned Subtract Wide |
| name = vsubw_high |
| no-q |
| multi_fn = simd_shuffle2!, c:uint32x2_t, b, b, [2, 3] |
| multi_fn = simd_sub, a, {simd_cast, c} |
| a = 8, 9 |
| b = 6, 7, 8, 9 |
| validate 0, 0 |
| |
| aarch64 = usubw |
| generate uint64x2_t:uint32x4_t:uint64x2_t |
| |
| /// Signed Subtract Long |
| name = vsubl |
| no-q |
| multi_fn = simd_cast, c:out_t, a |
| multi_fn = simd_cast, d:out_t, b |
| multi_fn = simd_sub, c, d |
| |
| a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| arm = vsubl |
| aarch64 = ssubl |
| generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t |
| |
| /// Unsigned Subtract Long |
| name = vsubl |
| no-q |
| multi_fn = simd_cast, c:out_t, a |
| multi_fn = simd_cast, d:out_t, b |
| multi_fn = simd_sub, c, d |
| |
| a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| arm = vsubl |
| aarch64 = usubl |
| generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t |
| |
| /// Signed Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle8!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle8!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 6, 7, 8, 9, 10, 11, 12, 13 |
| |
| aarch64 = ssubl |
| generate int8x16_t:int8x16_t:int16x8_t |
| |
| /// Signed Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle4!, c:int16x4_t, a, a, [4, 5, 6, 7] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle4!, e:int16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 6, 6, 6, 6, 8, 8, 8, 8 |
| validate 4, 5, 6, 7 |
| |
| aarch64 = ssubl |
| generate int16x8_t:int16x8_t:int32x4_t |
| |
| /// Signed Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle2!, c:int32x2_t, a, a, [2, 3] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle2!, e:int32x2_t, b, b, [2, 3] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 12, 13, 14, 15 |
| b = 6, 6, 8, 8 |
| validate 6, 7 |
| |
| aarch64 = ssubl |
| generate int32x4_t:int32x4_t:int64x2_t |
| |
| /// Unsigned Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle8!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle8!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 6, 7, 8, 9, 10, 11, 12, 13 |
| |
| aarch64 = usubl |
| generate uint8x16_t:uint8x16_t:uint16x8_t |
| |
| /// Unsigned Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle4!, c:uint16x4_t, a, a, [4, 5, 6, 7] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle4!, e:uint16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 6, 6, 6, 6, 8, 8, 8, 8 |
| validate 4, 5, 6, 7 |
| |
| aarch64 = usubl |
| generate uint16x8_t:uint16x8_t:uint32x4_t |
| |
| /// Unsigned Subtract Long |
| name = vsubl_high |
| no-q |
| multi_fn = simd_shuffle2!, c:uint32x2_t, a, a, [2, 3] |
| multi_fn = simd_cast, d:out_t, c |
| multi_fn = simd_shuffle2!, e:uint32x2_t, b, b, [2, 3] |
| multi_fn = simd_cast, f:out_t, e |
| multi_fn = simd_sub, d, f |
| |
| a = 12, 13, 14, 15 |
| b = 6, 6, 8, 8 |
| validate 6, 7 |
| |
| aarch64 = usubl |
| generate uint32x4_t:uint32x4_t:uint64x2_t |
| |
| /// Maximum (vector) |
| name = vmax |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
| validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| arm = vmax |
| aarch64 = smax |
| link-arm = vmaxs._EXT_ |
| link-aarch64 = smax._EXT_ |
| generate int*_t |
| |
| arm = vmax |
| aarch64 = umax |
| link-arm = vmaxu._EXT_ |
| link-aarch64 = umax._EXT_ |
| generate uint*_t |
| |
| /// Maximum (vector) |
| name = vmax |
| a = 1.0, -2.0, 3.0, -4.0 |
| b = 0.0, 3.0, 2.0, 8.0 |
| validate 1.0, 3.0, 3.0, 8.0 |
| |
| aarch64 = fmax |
| link-aarch64 = fmax._EXT_ |
| generate float64x*_t |
| |
| arm = vmax |
| aarch64 = fmax |
| link-arm = vmaxs._EXT_ |
| link-aarch64 = fmax._EXT_ |
| generate float*_t |
| |
| /// Floating-point Maximum Number (vector) |
| name = vmaxnm |
| a = 1.0, 2.0, 3.0, -4.0 |
| b = 8.0, 16.0, -1.0, 6.0 |
| validate 8.0, 16.0, 3.0, 6.0 |
| |
| aarch64 = fmaxnm |
| link-aarch64 = fmaxnm._EXT_ |
| generate float64x*_t |
| |
| target = fp-armv8 |
| arm = vmaxnm |
| aarch64 = fmaxnm |
| link-arm = vmaxnm._EXT_ |
| link-aarch64 = fmaxnm._EXT_ |
| generate float*_t |
| |
| /// Floating-point Maximum Number Pairwise (vector). |
| name = vpmaxnm |
| a = 1.0, 2.0 |
| b = 6.0, -3.0 |
| validate 2.0, 6.0 |
| aarch64 = fmaxnmp |
| link-aarch64 = fmaxnmp._EXT_ |
| generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t |
| |
| /// Floating-point Maximum Number Pairwise (vector). |
| name = vpmaxnm |
| a = 1.0, 2.0, 3.0, -4.0 |
| b = 8.0, 16.0, -1.0, 6.0 |
| validate 2.0, 3.0, 16.0, 6.0 |
| aarch64 = fmaxnmp |
| link-aarch64 = fmaxnmp._EXT_ |
| generate float32x4_t:float32x4_t:float32x4_t |
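| // Pairwise: adjacent lanes of a are reduced first, then those of b, so the |
| // expected result above is [max(1, 2), max(3, -4), max(8, 16), max(-1, 6)] |
| // = [2.0, 3.0, 16.0, 6.0]. |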
| |
| /// Minimum (vector) |
| name = vmin |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 |
| |
| arm = vmin |
| aarch64 = smin |
| link-arm = vmins._EXT_ |
| link-aarch64 = smin._EXT_ |
| generate int*_t |
| |
| arm = vmin |
| aarch64 = umin |
| link-arm = vminu._EXT_ |
| link-aarch64 = umin._EXT_ |
| generate uint*_t |
| |
| /// Minimum (vector) |
| name = vmin |
| a = 1.0, -2.0, 3.0, -4.0 |
| b = 0.0, 3.0, 2.0, 8.0 |
| validate 0.0, -2.0, 2.0, -4.0 |
| |
| aarch64 = fmin |
| link-aarch64 = fmin._EXT_ |
| generate float64x*_t |
| |
| arm = vmin |
| aarch64 = fmin |
| link-arm = vmins._EXT_ |
| link-aarch64 = fmin._EXT_ |
| generate float*_t |
| |
| /// Floating-point Minimum Number (vector) |
| name = vminnm |
| a = 1.0, 2.0, 3.0, -4.0 |
| b = 8.0, 16.0, -1.0, 6.0 |
| validate 1.0, 2.0, -1.0, -4.0 |
| |
| aarch64 = fminnm |
| link-aarch64 = fminnm._EXT_ |
| generate float64x*_t |
| |
| target = fp-armv8 |
| arm = vminnm |
| aarch64 = fminnm |
| link-arm = vminnm._EXT_ |
| link-aarch64 = fminnm._EXT_ |
| generate float*_t |
| |
| /// Floating-point Minimum Number Pairwise (vector). |
| name = vpminnm |
| a = 1.0, 2.0 |
| b = 6.0, -3.0 |
| validate 1.0, -3.0 |
| aarch64 = fminnmp |
| link-aarch64 = fminnmp._EXT_ |
| generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t |
| |
| /// Floating-point Minimum Number Pairwise (vector). |
| name = vpminnm |
| a = 1.0, 2.0, 3.0, -4.0 |
| b = 8.0, 16.0, -1.0, 6.0 |
| validate 1.0, -4.0, 8.0, -1.0 |
| aarch64 = fminnmp |
| link-aarch64 = fminnmp._EXT_ |
| generate float32x4_t:float32x4_t:float32x4_t |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull |
| a = 0, 1, 2, 3, 4, 5, 6, 7 |
| b = 1, 2, 3, 4, 5, 6, 7, 8 |
| validate 0, 4, 12, 24, 40, 60, 84, 112 |
| |
| aarch64 = sqdmull |
| link-aarch64 = sqdmull._EXT2_ |
| arm = vqdmull |
| link-arm = vqdmull._EXT2_ |
| generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t |
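| // "Doubling" means the widened product is multiplied by two before |
| // saturating, i.e. each lane is sat(2 * a * b), e.g. 2 * 7 * 8 = 112 above. |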
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull |
| multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0 |
| a = 2 |
| b = 3 |
| validate 12 |
| |
| aarch64 = sqdmull |
| generate i16:i16:i32 |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull |
| a = 2 |
| b = 3 |
| validate 12 |
| |
| aarch64 = sqdmull |
| link-aarch64 = sqdmulls.scalar |
| generate i32:i32:i64 |
| |
| /// Vector saturating doubling long multiply with scalar |
| name = vqdmull_n |
| no-q |
| multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} |
| a = 2, 4, 6, 8 |
| b = 2 |
| validate 8, 16, 24, 32 |
| |
| aarch64 = sqdmull |
| arm = vqdmull |
| generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-halflen-halflen} |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {asc-halflen-halflen} |
| multi_fn = vqdmull-noqself-noext, a, b |
| a = 0, 1, 4, 5, 4, 5, 6, 7 |
| b = 1, 2, 5, 6, 5, 6, 7, 8 |
| validate 40, 60, 84, 112 |
| |
| aarch64 = sqdmull2 |
| generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull_high_n |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a:in_ntt, a, a, {asc-out_len-out_len} |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = vqdmull-in_ntt-noext, a, b |
| a = 0, 2, 8, 10, 8, 10, 12, 14 |
| b = 2 |
| validate 32, 40, 48, 56 |
| |
| aarch64 = sqdmull2 |
| generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t |
| |
| /// Vector saturating doubling long multiply by scalar |
| name = vqdmull_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, b:in_t0, b, b, {dup-out_len-N as u32} |
| multi_fn = vqdmull-noqself-noext, a, b |
| a = 1, 2, 3, 4 |
| b = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 4, 8, 12, 16 |
| |
| aarch64 = sqdmull |
| generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t |
| |
| arm = vqdmull |
| generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmullh_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, b:in_t0, b, N as u32 |
| multi_fn = vqdmullh-noqself-noext, a, b |
| a = 2 |
| b = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 8 |
| |
| aarch64 = sqdmull |
| generate i16:int16x4_t:i32, i16:int16x8_t:i32 |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmulls_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, b:in_t0, b, N as u32 |
| multi_fn = vqdmulls-noqself-noext, a, b |
| a = 2 |
| b = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 8 |
| |
| aarch64 = sqdmull |
| generate i32:int32x2_t:i64, i32:int32x4_t:i64 |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull_high_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, a:in_t, a, a, {asc-out_len-out_len} |
| multi_fn = simd_shuffle-out_len-!, b:in_t, b, b, {dup-out_len-N as u32} |
| multi_fn = vqdmull-self-noext, a, b |
| a = 0, 1, 4, 5, 4, 5, 6, 7 |
| b = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 16, 20, 24, 28 |
| |
| aarch64 = sqdmull2 |
| generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t |
| |
| /// Signed saturating doubling multiply long |
| name = vqdmull_high_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_shuffle-out_len-!, a:half, a, a, {asc-out_len-out_len} |
| multi_fn = simd_shuffle-out_len-!, b:half, b, b, {dup-out_len-N as u32} |
| multi_fn = vqdmull-noqself-noext, a, b |
| a = 0, 1, 4, 5, 4, 5, 6, 7 |
| b = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 16, 20, 24, 28 |
| |
| aarch64 = sqdmull2 |
| generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-add long |
| name = vqdmlal |
| multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} |
| a = 1, 1, 1, 1 |
| b = 1, 2, 3, 4 |
| c = 2, 2, 2, 2 |
| validate 5, 9, 13, 17 |
| |
| aarch64 = sqdmlal |
| arm = vqdmlal |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Vector widening saturating doubling multiply accumulate with scalar |
| name = vqdmlal |
| n-suffix |
| multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} |
| a = 1, 1, 1, 1 |
| b = 1, 2, 3, 4 |
| c = 2 |
| validate 5, 9, 13, 17 |
| |
| aarch64 = sqdmlal |
| arm = vqdmlal |
| generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t |
| |
| /// Signed saturating doubling multiply-add long |
| name = vqdmlal_high |
| no-q |
| multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} |
| a = 1, 2, 3, 4 |
| b = 0, 1, 4, 5, 4, 5, 6, 7 |
| c = 1, 2, 5, 6, 5, 6, 7, 8 |
| validate 41, 62, 87, 116 |
| |
| aarch64 = sqdmlal2 |
| generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-add long |
| name = vqdmlal_high_n |
| no-q |
| multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} |
| a = 1, 2, 3, 4 |
| b = 0, 2, 8, 10, 8, 10, 12, 14 |
| c = 2 |
| validate 33, 42, 51, 60 |
| |
| aarch64 = sqdmlal2 |
| generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t |
| |
| /// Vector widening saturating doubling multiply accumulate with scalar |
| name = vqdmlal_lane |
| in2-suffix |
| constn = N |
| multi_fn = static_assert_imm-in2_exp_len-N |
| multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} |
| a = 1, 2, 3, 4 |
| b = 1, 2, 3, 4 |
| c = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate 5, 10, 15, 20 |
| |
| aarch64 = sqdmlal |
| generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t |
| |
| arm = vqdmlal |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-add long |
| name = vqdmlal_high_lane |
| in2-suffix |
| constn = N |
| multi_fn = static_assert_imm-in2_exp_len-N |
| multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} |
| a = 1, 2, 3, 4 |
| b = 0, 1, 4, 5, 4, 5, 6, 7 |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 17, 22, 27, 32 |
| |
| aarch64 = sqdmlal2 |
| generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-subtract long |
| name = vqdmlsl |
| multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} |
| a = 3, 7, 11, 15 |
| b = 1, 2, 3, 4 |
| c = 2, 2, 2, 2 |
| validate -1, -1, -1, -1 |
| |
| aarch64 = sqdmlsl |
| arm = vqdmlsl |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Vector widening saturating doubling multiply subtract with scalar |
| name = vqdmlsl |
| n-suffix |
| multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} |
| a = 3, 7, 11, 15 |
| b = 1, 2, 3, 4 |
| c = 2 |
| validate -1, -1, -1, -1 |
| |
| aarch64 = sqdmlsl |
| arm = vqdmlsl |
| generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t |
| |
| /// Signed saturating doubling multiply-subtract long |
| name = vqdmlsl_high |
| no-q |
| multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} |
| a = 39, 58, 81, 108 |
| b = 0, 1, 4, 5, 4, 5, 6, 7 |
| c = 1, 2, 5, 6, 5, 6, 7, 8 |
| validate -1, -2, -3, -4 |
| |
| aarch64 = sqdmlsl2 |
| generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-subtract long |
| name = vqdmlsl_high_n |
| no-q |
| multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} |
| a = 31, 38, 45, 52 |
| b = 0, 2, 8, 10, 8, 10, 12, 14 |
| c = 2 |
| validate -1, -2, -3, -4 |
| |
| aarch64 = sqdmlsl2 |
| generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t |
| |
| /// Vector widening saturating doubling multiply subtract with scalar |
| name = vqdmlsl_lane |
| in2-suffix |
| constn = N |
| multi_fn = static_assert_imm-in2_exp_len-N |
| multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} |
| a = 3, 6, 9, 12 |
| b = 1, 2, 3, 4 |
| c = 0, 2, 2, 0, 2, 0, 0, 0 |
| n = HFLEN |
| validate -1, -2, -3, -4 |
| |
| aarch64 = sqdmlsl |
| generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t |
| |
| arm = vqdmlsl |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Signed saturating doubling multiply-subtract long |
| name = vqdmlsl_high_lane |
| in2-suffix |
| constn = N |
| multi_fn = static_assert_imm-in2_exp_len-N |
| multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} |
| a = 15, 18, 21, 24 |
| b = 0, 1, 4, 5, 4, 5, 6, 7 |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate -1, -2, -3, -4 |
| |
| aarch64 = sqdmlsl2 |
| generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| /// Signed saturating doubling multiply returning high half |
| name = vqdmulh |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 1, 1, 1, 1, 1, 1, 1, 1 |
| |
| aarch64 = sqdmulh |
| link-aarch64 = sqdmulh._EXT_ |
| arm = vqdmulh |
| link-arm = vqdmulh._EXT_ |
| generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
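| // Returns the high half of the doubled product, i.e. (2 * a * b) >> 16 for |
| // 16-bit lanes, so (2 * MAX * 2) >> 16 = 1 as validated above. |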
| |
| /// Signed saturating doubling multiply returning high half |
| name = vqdmulh |
| multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0 |
| a = 1 |
| b = 2 |
| validate 0 |
| |
| aarch64 = sqdmulh |
| generate i16, i32 |
| |
| /// Vector saturating doubling multiply high with scalar |
| name = vqdmulh_n |
| out-suffix |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = vqdmulh-out-noext, a, b |
| a = MAX, MAX, MAX, MAX |
| b = 2 |
| validate 1, 1, 1, 1 |
| |
| aarch64 = sqdmulh |
| arm = vqdmulh |
| generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t |
| |
| /// Vector saturating doubling multiply high with scalar |
| name = vqdmulhq_n |
| out-suffix |
| multi_fn = vdupq_n-in_ntt-noext, b:out_t, b |
| multi_fn = vqdmulh-out-noext, a, b |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = 2 |
| validate 1, 1, 1, 1, 1, 1, 1, 1 |
| |
| aarch64 = sqdmulh |
| arm = vqdmulh |
| generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t |
| |
| /// Signed saturating doubling multiply returning high half |
| name = vqdmulhh_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, b:in_t0, b, N as u32 |
| multi_fn = vqdmulhh-out_ntt-noext, a, b |
| a = 2 |
| b = 0, 0, MAX, 0, 0, 0, 0, 0 |
| n = 2 |
| validate 1 |
| |
| aarch64 = sqdmulh |
| generate i16:int16x4_t:i16, i16:int16x8_t:i16 |
| |
| /// Signed saturating doubling multiply returning high half |
| name = vqdmulhs_lane |
| constn = N |
| multi_fn = static_assert_imm-in_exp_len-N |
| multi_fn = simd_extract, b:in_t0, b, N as u32 |
| multi_fn = vqdmulhs-out_ntt-noext, a, b |
| a = 2 |
| b = 0, MAX, 0, 0 |
| n = 1 |
| validate 1 |
| |
| aarch64 = sqdmulh |
| generate i32:int32x2_t:i32, i32:int32x4_t:i32 |
| |
| /// Signed saturating extract narrow |
| name = vqmovn |
| no-q |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| |
| aarch64 = sqxtn |
| link-aarch64 = sqxtn._EXT2_ |
| arm = vqmovn |
| link-arm = vqmovns._EXT2_ |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
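| |
| // Sketch of one generated variant (usual core::arch::aarch64 names assumed): |
| // i16 lanes are narrowed to i8 with saturation, so i16::MAX becomes i8::MAX. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqmovn_s16(vdupq_n_s16(i16::MAX)); // all lanes i8::MAX |
| //   } |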
| |
| /// Unsigned saturating extract narrow |
| name = vqmovn |
| no-q |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| |
| aarch64 = uqxtn |
| link-aarch64 = uqxtn._EXT2_ |
| arm = vqmovn |
| link-arm = vqmovnu._EXT2_ |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
| |
| /// Saturating extract narrow |
| name = vqmovn |
| multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 |
| a = 1 |
| validate 1 |
| |
| aarch64 = sqxtn |
| generate i16:i8, i32:i16 |
| aarch64 = uqxtn |
| generate u16:u8, u32:u16 |
| |
| /// Saturating extract narrow |
| name = vqmovn |
| a = 1 |
| validate 1 |
| |
| aarch64 = sqxtn |
| link-aarch64 = scalar.sqxtn._EXT2_._EXT_ |
| generate i64:i32 |
| |
| aarch64 = uqxtn |
| link-aarch64 = scalar.uqxtn._EXT2_._EXT_ |
| generate u64:u32 |
| |
| /// Saturating extract narrow |
| name = vqmovn_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len} |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| |
| aarch64 = sqxtn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| aarch64 = uqxtn2 |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Signed saturating extract unsigned narrow |
| name = vqmovun |
| no-q |
| a = -1, -1, -1, -1, -1, -1, -1, -1 |
| validate 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = sqxtun |
| link-aarch64 = sqxtun._EXT2_ |
| arm = vqmovun |
| link-arm = vqmovnsu._EXT2_ |
| generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t |
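| |
| // Illustrative only, assuming the usual core::arch::aarch64 names: the signed input |
| // narrows to an unsigned type, so negative lanes clamp to 0 as in the vectors above. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqmovun_s16(vdupq_n_s16(-1)); // all lanes 0 |
| //   } |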
| |
| /// Signed saturating extract unsigned narrow |
| name = vqmovun |
| multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 |
| a = 1 |
| validate 1 |
| |
| aarch64 = sqxtun |
| generate i16:u8, i32:u16, i64:u32 |
| |
| /// Signed saturating extract unsigned narrow |
| name = vqmovun_high |
| no-q |
| multi_fn = simd_shuffle-out_len-!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len} |
| a = 0, 0, 0, 0, 0, 0, 0, 0 |
| b = -1, -1, -1, -1, -1, -1, -1, -1 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = sqxtun2 |
| generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t |
| |
| /// Signed saturating rounding doubling multiply returning high half |
| name = vqrdmulh |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 2, 2, 2, 2, 2, 2, 2, 2 |
| |
| aarch64 = sqrdmulh |
| link-aarch64 = sqrdmulh._EXT_ |
| arm = vqrdmulh |
| link-arm = vqrdmulh._EXT_ |
| generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
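| |
| // Hedged sketch (assumed core::arch::aarch64 names): like vqdmulh but with rounding, |
| // so the same MAX * 2 inputs round up to 2 per lane instead of 1. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqrdmulhq_s16(vdupq_n_s16(i16::MAX), vdupq_n_s16(2)); // all lanes 2 |
| //   } |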
| |
| /// Signed saturating rounding doubling multiply returning high half |
| name = vqrdmulh |
| multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 |
| a = 1 |
| b = 2 |
| validate 0 |
| |
| aarch64 = sqrdmulh |
| generate i16, i32 |
| |
| /// Vector saturating rounding doubling multiply high with scalar |
| name = vqrdmulh |
| out-n-suffix |
| multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b} |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = 2 |
| validate 2, 2, 2, 2, 2, 2, 2, 2 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t |
| |
| /// Vector rounding saturating doubling multiply high by scalar |
| name = vqrdmulh |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_shuffle-out_len-!, b:out_t, b, b, {dup-out_len-LANE as u32} |
| multi_fn = vqrdmulh-out-noext, a, b |
| a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| b = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 2, 2, 2, 2, 2, 2, 2, 2 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| |
| /// Signed saturating rounding doubling multiply returning high half |
| name = vqrdmulh |
| lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32} |
| a = 1 |
| b = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 0 |
| |
| aarch64 = sqrdmulh |
| generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32 |
| |
| /// Signed saturating rounding doubling multiply accumulate returning high half |
| name = vqrdmlah |
| multi_fn = vqadd-out-noext, a, {vqrdmulh-out-noext, b, c} |
| a = 1, 1, 1, 1, 1, 1, 1, 1 |
| b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| c = 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 3, 3, 3, 3, 3, 3, 3, 3 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
| |
| /// Signed saturating rounding doubling multiply accumulate returning high half |
| name = vqrdmlah |
| multi_fn = vqadd-self-noext, a, {vqrdmulh-self-noext, b, c} |
| a = 1 |
| b = 1 |
| c = 2 |
| validate 1 |
| |
| aarch64 = sqrdmulh |
| generate i16, i32 |
| |
| /// Signed saturating rounding doubling multiply accumulate returning high half |
| name = vqrdmlah |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c} |
| a = 1, 1, 1, 1, 1, 1, 1, 1 |
| b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 3, 3, 3, 3, 3, 3, 3, 3 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| |
| /// Signed saturating rounding doubling multiply accumulate returning high half |
| name = vqrdmlah |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c} |
| a = 1 |
| b = 1 |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 1 |
| |
| aarch64 = sqrdmulh |
| generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 |
| |
| /// Signed saturating rounding doubling multiply subtract returning high half |
| name = vqrdmlsh |
| multi_fn = vqsub-out-noext, a, {vqrdmulh-out-noext, b, c} |
| a = 1, 1, 1, 1, 1, 1, 1, 1 |
| b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| c = 2, 2, 2, 2, 2, 2, 2, 2 |
| validate -1, -1, -1, -1, -1, -1, -1, -1 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
| |
| /// Signed saturating rounding doubling multiply subtract returning high half |
| name = vqrdmlsh |
| multi_fn = vqsub-self-noext, a, {vqrdmulh-self-noext, b, c} |
| a = 1 |
| b = 1 |
| c = 2 |
| validate 1 |
| |
| aarch64 = sqrdmulh |
| generate i16, i32 |
| |
| /// Signed saturating rounding doubling multiply subtract returning high half |
| name = vqrdmlsh |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c} |
| a = 1, 1, 1, 1, 1, 1, 1, 1 |
| b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate -1, -1, -1, -1, -1, -1, -1, -1 |
| |
| aarch64 = sqrdmulh |
| arm = vqrdmulh |
| generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
| generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t |
| |
| /// Signed saturating rounding doubling multiply subtract returning high half |
| name = vqrdmlsh |
| in2-lane-suffixes |
| constn = LANE |
| multi_fn = static_assert_imm-in2_exp_len-LANE |
| multi_fn = vqsub-self-noext, a, {vqrdmulh-in2lane-::<LANE>, b, c} |
| a = 1 |
| b = 1 |
| c = 0, 2, 0, 0, 0, 0, 0, 0 |
| n = 1 |
| validate 1 |
| |
| aarch64 = sqrdmulh |
| generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 |
| |
| /// Signed saturating rounding shift left |
| name = vqrshl |
| a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = sqrshl |
| link-aarch64 = sqrshl._EXT_ |
| generate i32, i64 |
| |
| arm = vqrshl |
| link-arm = vqrshifts._EXT_ |
| generate int*_t, int64x*_t |
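| |
| // Usage sketch of one generated variant (names assumed to match core::arch::aarch64): |
| // each lane of a is shifted left by the signed per-lane amount in b, with rounding on |
| // right shifts and saturation on overflow, so a MAX lane stays at MAX. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqrshlq_s16(vdupq_n_s16(i16::MAX), vdupq_n_s16(2)); // saturates at MAX |
| //   } |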
| |
| /// Signed saturating rounding shift left |
| name = vqrshl |
| multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0 |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = sqrshl |
| generate i8, i16 |
| |
| /// Unsigned saturating rounding shift left |
| name = vqrshl |
| out-suffix |
| a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = uqrshl |
| link-aarch64 = uqrshl._EXT_ |
| generate u32:i32:u32, u64:i64:u64 |
| |
| arm = vqrshl |
| link-arm = vqrshiftu._EXT_ |
| generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t |
| generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t |
| |
| /// Unsigned saturating rounding shift left |
| name = vqrshl |
| out-suffix |
| multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a |
| multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
| multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0 |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = uqrshl |
| generate u8:i8:u8, u16:i16:u16 |
| |
| /// Signed saturating rounded shift right narrow |
| name = vqrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = MIN, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate MIN, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = sqrshrn |
| link-aarch64 = sqrshrn._EXT2_ |
| const-aarch64 = N |
| |
| arm = vqrshrn |
| link-arm = vqrshiftns._EXT2_ |
| const-arm = -N as ttn |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
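| |
| // Sketch (assumed core::arch::aarch64 names): a rounded right shift by the const N |
| // followed by a saturating narrow, so an input lane of 4 yields 1 when N = 2. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqrshrn_n_s16::<2>(vdupq_n_s16(4)); // int8x8_t with all lanes 1 |
| //   } |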
| |
| /// Signed saturating rounded shift right narrow |
| name = vqrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a |
| multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = sqrshrn |
| generate i16:i8, i32:i16, i64:i32 |
| |
| /// Signed saturating rounded shift right narrow |
| name = vqrshrn_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 2, 3, 2, 3, 6, 7 |
| b = 8, 12, 24, 28, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 |
| |
| aarch64 = sqrshrn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| |
| /// Unsigned saturating rounded shift right narrow |
| name = vqrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = MIN, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = uqrshrn |
| link-aarch64 = uqrshrn._EXT2_ |
| const-aarch64 = N |
| |
| arm = vqrshrn |
| link-arm = vqrshiftnu._EXT2_ |
| const-arm = -N as ttn |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
| |
| /// Unsigned saturating rounded shift right narrow |
| name = vqrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a |
| multi_fn = simd_extract, {vqrshrn_n-in_ntt-::<N>, a}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = uqrshrn |
| generate u16:u8, u32:u16, u64:u32 |
| |
| /// Unsigned saturating rounded shift right narrow |
| name = vqrshrn_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 2, 3, 2, 3, 6, 7 |
| b = 8, 12, 24, 28, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 |
| |
| aarch64 = uqrshrn2 |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Signed saturating rounded shift right unsigned narrow |
| name = vqrshrun |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = 0, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = sqrshrun |
| link-aarch64 = sqrshrun._EXT2_ |
| const-aarch64 = N |
| |
| arm = vqrshrun |
| link-arm = vqrshiftnsu._EXT2_ |
| const-arm = -N as ttn |
| generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t |
| |
| /// Signed saturating rounded shift right unsigned narrow |
| name = vqrshrun |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a |
| multi_fn = simd_extract, {vqrshrun_n-in_ntt-::<N>, a}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = sqrshrun |
| generate i16:u8, i32:u16, i64:u32 |
| |
| /// Signed saturating rounded shift right unsigned narrow |
| name = vqrshrun_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 2, 3, 2, 3, 6, 7 |
| b = 8, 12, 24, 28, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 |
| |
| aarch64 = sqrshrun2 |
| generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t |
| |
| /// Signed saturating shift left |
| name = vqshl |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = sqshl |
| link-aarch64 = sqshl._EXT_ |
| generate i64 |
| |
| arm = vqshl |
| link-arm = vqshifts._EXT_ |
| generate int*_t, int64x*_t |
| |
| /// Signed saturating shift left |
| name = vqshl |
| multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b} |
| multi_fn = simd_extract, c, 0 |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = sqshl |
| generate i8, i16, i32 |
| |
| /// Unsigned saturating shift left |
| name = vqshl |
| out-suffix |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = uqshl |
| link-aarch64 = uqshl._EXT_ |
| generate u64:i64:u64 |
| |
| arm = vqshl |
| link-arm = vqshiftu._EXT_ |
| generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t |
| generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t |
| |
| /// Unsigned saturating shift left |
| name = vqshl |
| out-suffix |
| multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b} |
| multi_fn = simd_extract, c, 0 |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = uqshl |
| generate u8:i8:u8, u16:i16:u16, u32:i32:u32 |
| |
| /// Signed saturating shift left |
| name = vqshl |
| n-suffix |
| constn = N |
| multi_fn = static_assert_imm-out_bits_exp_len-N |
| multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N.try_into().unwrap()} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| n = 2 |
| validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = sqshl |
| arm = vqshl |
| generate int*_t, int64x*_t |
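| |
| // Hedged sketch (usual core::arch::aarch64 names assumed): an immediate saturating |
| // left shift, equivalent to the vdup-then-vqshl expansion in the multi_fn above. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqshl_n_s8::<2>(vdup_n_s8(1)); // all lanes 4 |
| //   } |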
| |
| /// Signed saturating shift left |
| name = vqshl |
| n-suffix |
| constn = N |
| multi_fn = static_assert_imm-out_bits_exp_len-N |
| multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 |
| a = 1 |
| n = 2 |
| validate 4 |
| |
| aarch64 = sqshl |
| generate i8, i16, i32, i64 |
| |
| /// Unsigned saturating shift left |
| name = vqshl |
| n-suffix |
| constn = N |
| multi_fn = static_assert_imm-out_bits_exp_len-N |
| multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N.try_into().unwrap()} |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| n = 2 |
| validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 |
| |
| aarch64 = uqshl |
| arm = vqshl |
| generate uint*_t, uint64x*_t |
| |
| /// Unsigned saturating shift left |
| name = vqshl |
| n-suffix |
| constn = N |
| multi_fn = static_assert_imm-out_bits_exp_len-N |
| multi_fn = simd_extract, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 |
| a = 1 |
| n = 2 |
| validate 4 |
| |
| aarch64 = uqshl |
| generate u8, u16, u32, u64 |
| |
| /// Signed saturating shift right narrow |
| name = vqshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = 0, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = sqshrn |
| link-aarch64 = sqshrn._EXT2_ |
| const-aarch64 = N |
| generate i64:i32 |
| |
| arm = vqshrn |
| link-arm = vqshiftns._EXT2_ |
| const-arm = -N as ttn |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
| |
| /// Signed saturating shift right narrow |
| name = vqshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = sqshrn |
| generate i16:i8, i32:i16 |
| |
| /// Signed saturating shift right narrow |
| name = vqshrn_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 8, 9, 8, 9, 10, 11 |
| b = 32, 36, 40, 44, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = sqshrn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| |
| /// Unsigned saturating shift right narrow |
| name = vqshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = 0, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = uqshrn |
| link-aarch64 = uqshrn._EXT2_ |
| const-aarch64 = N |
| generate u64:u32 |
| |
| arm = vqshrn |
| link-arm = vqshiftnu._EXT2_ |
| const-arm = -N as ttn |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
| |
| /// Unsigned saturating shift right narrow |
| name = vqshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_extract, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = uqshrn |
| generate u16:u8, u32:u16 |
| |
| /// Unsigned saturating shift right narrow |
| name = vqshrn_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 8, 9, 8, 9, 10, 11 |
| b = 32, 36, 40, 44, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = uqshrn2 |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Signed saturating shift right unsigned narrow |
| name = vqshrun |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = 0, 4, 8, 12, 16, 20, 24, 28 |
| n = 2 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = sqshrun |
| link-aarch64 = sqshrun._EXT2_ |
| const-aarch64 = N |
| |
| arm = vqshrun |
| link-arm = vqshiftnsu._EXT2_ |
| const-arm = -N as ttn |
| generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t |
| |
| /// Signed saturating shift right unsigned narrow |
| name = vqshrun |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_extract, {vqshrun_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
| a = 4 |
| n = 2 |
| validate 1 |
| |
| aarch64 = sqshrun |
| generate i16:u8, i32:u16, i64:u32 |
| |
| /// Signed saturating shift right unsigned narrow |
| name = vqshrun_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 8, 9, 8, 9, 10, 11 |
| b = 32, 36, 40, 44, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = sqshrun2 |
| generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t |
| |
| /// Calculates the square root of each lane. |
| name = vsqrt |
| fn = simd_fsqrt |
| a = 4.0, 9.0, 16.0, 25.0 |
| validate 2.0, 3.0, 4.0, 5.0 |
| |
| aarch64 = fsqrt |
| generate float*_t, float64x*_t |
| |
| /// Reciprocal square-root estimate. |
| name = vrsqrte |
| a = 1.0, 2.0, 3.0, 4.0 |
| validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375 |
| |
| aarch64 = frsqrte |
| link-aarch64 = frsqrte._EXT_ |
| generate float64x*_t |
| |
| arm = vrsqrte |
| link-arm = vrsqrte._EXT_ |
| generate float*_t |
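| |
| // Sketch (assumed core::arch::aarch64 names): the result is only an estimate, which |
| // is why the expected values above are near, but not exactly, 1/sqrt(a). |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vrsqrteq_f32(vdupq_n_f32(1.0)); // lanes approximately 0.998046875 |
| //   } |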
| |
| /// Reciprocal estimate. |
| name = vrecpe |
| a = 4.0, 3.0, 2.0, 1.0 |
| validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875 |
| |
| aarch64 = frecpe |
| link-aarch64 = frecpe._EXT_ |
| generate float64x*_t |
| |
| arm = vrecpe |
| link-arm = vrecpe._EXT_ |
| generate float*_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = str |
| generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t |
| generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t |
| |
| arm = str |
| generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t |
| generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t |
| generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t |
| generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t |
| generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t |
| generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t |
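| |
| // Usage sketch of one generated variant (assumed core::arch::aarch64 names): a |
| // reinterpret only relabels the bit pattern, so same-width casts leave lanes intact. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vreinterpret_s8_u8(vdup_n_u8(0x80)); // every lane reads as -128_i8 |
| //   } |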
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 1, 2, 3, 4, 5, 6, 7 |
| validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 |
| |
| aarch64 = str |
| generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t |
| generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t |
| |
| arm = str |
| generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t |
| generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t |
| generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t |
| generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t |
| generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t |
| generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 |
| validate 0, 1, 2, 3, 4, 5, 6, 7 |
| |
| aarch64 = str |
| generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t |
| generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t |
| |
| arm = str |
| generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t |
| generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t |
| generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t |
| generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t |
| generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t |
| generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 1, 2, 3 |
| validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 |
| |
| aarch64 = str |
| generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t |
| generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t |
| |
| arm = str |
| generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t |
| generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t |
| generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t |
| generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t |
| generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t |
| generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 |
| validate 0, 1, 2, 3 |
| |
| aarch64 = str |
| generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t |
| generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t |
| |
| arm = str |
| generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t |
| generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t |
| generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t |
| generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 1 |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = str |
| generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t |
| generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t |
| |
| arm = str |
| generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t |
| generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 |
| validate 0, 1 |
| |
| aarch64 = str |
| generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t |
| generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t |
| |
| arm = str |
| generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t |
| generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0., 0., 0., 0., 0., 0., 0., 0. |
| validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| |
| aarch64 = str |
| generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t |
| generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t |
| generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t |
| generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t |
| generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t |
| generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t |
| |
| arm = str |
| generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t |
| generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t |
| generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t |
| generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t |
| generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t |
| generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| validate 0., 0., 0., 0., 0., 0., 0., 0. |
| |
| aarch64 = str |
| generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t |
| generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t |
| generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t |
| generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t |
| generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t |
| generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t |
| |
| arm = str |
| generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t |
| generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t |
| generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t |
| generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t |
| generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t |
| generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t |
| |
| /// Vector reinterpret cast operation |
| name = vreinterpret |
| double-suffixes |
| fn = transmute |
| a = 0., 0., 0., 0., 0., 0., 0., 0. |
| validate 0., 0., 0., 0., 0., 0., 0., 0. |
| |
| aarch64 = str |
| generate float32x2_t:float64x1_t, float64x1_t:float32x2_t |
| generate float32x4_t:float64x2_t, float64x2_t:float32x4_t |
| |
| /// Signed rounding shift left |
| name = vrshl |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| |
| aarch64 = srshl |
| link-aarch64 = srshl._EXT_ |
| generate i64 |
| |
| arm = vrshl |
| link-arm = vrshifts._EXT_ |
| generate int*_t, int64x*_t |
| |
| /// Unsigned rounding shift left |
| name = vrshl |
| out-suffix |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| |
| aarch64 = urshl |
| link-aarch64 = urshl._EXT_ |
| generate u64:i64:u64 |
| |
| arm = vrshl |
| link-arm = vrshiftu._EXT_ |
| generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t |
| generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t |
| |
| /// Signed rounding shift right |
| name = vrshr |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N).try_into().unwrap()} |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = srshr |
| arm = vrshr |
| generate int*_t, int64x*_t |
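| |
| // Sketch (names assumed to match core::arch::aarch64): a rounding right shift by an |
| // immediate, expanded above as vrshl with a negated, duplicated shift count. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vrshrq_n_s16::<2>(vdupq_n_s16(4)); // all lanes 1 |
| //   } |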
| |
| /// Signed rounding shift right |
| name = vrshr |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshl-self-noext, a, -N as i64 |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = srshr |
| generate i64 |
| |
| /// Unsigned rounding shift right |
| name = vrshr |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N).try_into().unwrap()} |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = urshr |
| arm = vrshr |
| generate uint*_t, uint64x*_t |
| |
| /// Unsigned rounding shift right |
| name = vrshr |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshl-self-noext, a, -N as i64 |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = urshr |
| generate u64 |
| |
| /// Rounding shift right narrow |
| name = vrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = rshrn |
| link-aarch64 = rshrn._EXT2_ |
| const-aarch64 = N |
| |
| arm = vrshrn |
| link-arm = vrshiftn._EXT2_ |
| const-arm = -N as ttn |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
| |
| /// Rounding shift right narrow |
| name = vrshrn |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = transmute, {vrshrn_n-noqsigned-::<N>, transmute(a)} |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = rshrn |
| arm = vrshrn |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
| |
| /// Rounding shift right narrow |
| name = vrshrn_high |
| noq-n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 0, 1, 8, 9, 8, 9, 10, 11 |
| b = 32, 36, 40, 44, 48, 52, 56, 60 |
| n = 2 |
| validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = rshrn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Signed rounding shift right and accumulate |
| name = vrsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = simd_add, a, {vrshr-nself-::<N>, b} |
| a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 |
| |
| aarch64 = srsra |
| arm = vrsra |
| generate int*_t, int64x*_t |
| |
| /// Unsigned rounding shift right and accumulate |
| name = vrsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = simd_add, a, {vrshr-nself-::<N>, b} |
| a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 |
| |
| aarch64 = ursra |
| arm = vrsra |
| generate uint*_t, uint64x*_t |
| |
| /// Signed rounding shift right and accumulate. |
| name = vrsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshr-nself-::<N>, b:in_t, b |
| multi_fn = a + b |
| a = 1 |
| b = 4 |
| n = 2 |
| validate 2 |
| |
| aarch64 = srsra |
| generate i64 |
| |
| /// Unsigned rounding shift right and accumulate. |
| name = vrsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = vrshr-nself-::<N>, b:in_t, b |
| multi_fn = a + b |
| a = 1 |
| b = 4 |
| n = 2 |
| validate 2 |
| |
| aarch64 = ursra |
| generate u64 |
| |
| /// Insert vector element from another vector element |
| name = vset_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_insert, b, LANE as u32, a |
| a = 1 |
| b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| n = 0 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = nop |
| arm = nop |
| generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t |
| generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t |
| generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t |
| generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t |
| generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t |
| |
| target = crypto |
| generate p64:poly64x1_t:poly64x1_t |
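| |
| // Hedged sketch (usual core::arch::aarch64 names assumed): only the lane selected by |
| // the const generic is replaced; every other lane of b passes through unchanged. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let v = vdup_n_s16(7); |
| //       let r = vset_lane_s16::<0>(1, v); // lanes: 1, 7, 7, 7 |
| //   } |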
| |
| /// Insert vector element from another vector element |
| name = vsetq_lane |
| no-q |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_insert, b, LANE as u32, a |
| a = 1 |
| b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| n = 0 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| aarch64 = nop |
| arm = nop |
| generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t |
| generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t |
| generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t |
| generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t |
| generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t |
| |
| target = crypto |
| generate p64:poly64x2_t:poly64x2_t |
| |
| /// Insert vector element from another vector element |
| name = vset_lane |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_insert, b, LANE as u32, a |
| a = 1. |
| b = 0., 2., 3., 4. |
| n = 0 |
| validate 1., 2., 3., 4. |
| |
| aarch64 = nop |
| generate f64:float64x1_t:float64x1_t |
| |
| arm = nop |
| generate f32:float32x2_t:float32x2_t |
| |
| /// Insert vector element from another vector element |
| name = vsetq_lane |
| no-q |
| constn = LANE |
| multi_fn = static_assert_imm-in_exp_len-LANE |
| multi_fn = simd_insert, b, LANE as u32, a |
| a = 1. |
| b = 0., 2., 3., 4. |
| n = 0 |
| validate 1., 2., 3., 4. |
| |
| aarch64 = nop |
| generate f64:float64x2_t:float64x2_t |
| |
| arm = nop |
| generate f32:float32x4_t:float32x4_t |
| |
| /// Signed Shift left |
| name = vshl |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| |
| aarch64 = sshl |
| link-aarch64 = sshl._EXT_ |
| arm = vshl |
| link-arm = vshifts._EXT_ |
| generate int*_t, int64x*_t |
| |
| /// Signed Shift left |
| name = vshl |
| multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)} |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = sshl |
| generate i64 |
| |
| /// Unsigned Shift left |
| name = vshl |
| out-suffix |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| |
| aarch64 = ushl |
| link-aarch64 = ushl._EXT_ |
| arm = vshl |
| link-arm = vshiftu._EXT_ |
| generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t |
| generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t |
| |
| /// Unsigned Shift left |
| out-suffix |
| name = vshl |
| multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)} |
| a = 1 |
| b = 2 |
| validate 4 |
| |
| aarch64 = ushl |
| generate u64:i64:u64 |
| |
| /// Shift left |
| name = vshl |
| n-suffix |
| constn = N |
| multi_fn = static_assert_imm-out_bits_exp_len-N |
| multi_fn = simd_shl, a, {vdup-nself-noext, N.try_into().unwrap()} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| n = 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| |
| arm = vshl |
| aarch64 = shl |
| generate int*_t, uint*_t, int64x*_t, uint64x*_t |
| |
| /// Shift left long |
| name = vshll |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-0-bits |
| multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N.try_into().unwrap()} |
| a = 1, 2, 3, 4, 5, 6, 7, 8 |
| n = 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32 |
| |
| arm = vshll.s |
| aarch64 = sshll |
| generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t |
| aarch64 = ushll |
| generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t |
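| |
| // Sketch of one generated variant (assumed core::arch::aarch64 names): each lane is |
| // widened to twice the width and then shifted left, so 1 << 2 gives 4 in i16 lanes. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vshll_n_s8::<2>(vdup_n_s8(1)); // int16x8_t with all lanes 4 |
| //   } |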
| |
| /// Shift left long |
| name = vshll_high_n |
| no-q |
| constn = N |
| multi_fn = static_assert-N-0-bits |
| multi_fn = simd_shuffle-out_len-!, b:half, a, a, {asc-halflen-halflen} |
| multi_fn = vshll_n-noqself-::<N>, b |
| a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8 |
| n = 2 |
| validate 4, 8, 12, 16, 20, 24, 28, 32 |
| |
| aarch64 = sshll2 |
| generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t |
| aarch64 = ushll2 |
| generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t |
| |
| /// Shift right |
| name = vshr |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()} |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| arm = vshr.s |
| aarch64 = sshr |
| generate int*_t, int64x*_t |
| aarch64 = ushr |
| generate uint*_t, uint64x*_t |
| |
| /// Shift right narrow |
| name = vshrn_n |
| no-q |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N.try_into().unwrap()}} |
| a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| |
| arm = vshrn. |
| aarch64 = shrn |
| generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
| generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
| |
| /// Shift right narrow |
| name = vshrn_high_n |
| no-q |
| constn = N |
| multi_fn = static_assert-N-1-halfbits |
| multi_fn = simd_shuffle-out_len-!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
| a = 1, 2, 5, 6, 5, 6, 7, 8 |
| b = 20, 24, 28, 32, 52, 56, 60, 64 |
| n = 2 |
| validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16 |
| |
| aarch64 = shrn2 |
| generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t |
| generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t |
| |
| /// Signed shift right and accumulate |
| name = vsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = simd_add, a, {vshr-nself-::<N>, b} |
| a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 |
| |
| aarch64 = ssra |
| arm = vsra |
| generate int*_t, int64x*_t |
| |
| /// Unsigned shift right and accumulate |
| name = vsra |
| n-suffix |
| constn = N |
| multi_fn = static_assert-N-1-bits |
| multi_fn = simd_add, a, {vshr-nself-::<N>, b} |
| a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
| b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
| n = 2 |
| validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 |
| |
| aarch64 = usra |
| arm = vsra |
| generate uint*_t, uint64x*_t |
| |
| /// Transpose vectors |
| name = vtrn1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} |
| a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 |
| |
| aarch64 = trn1 |
| generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| |
| aarch64 = zip1 |
| generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t |
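| |
| // Usage sketch (names assumed to match core::arch::aarch64): trn1 interleaves the |
| // even-numbered lanes of a and b, which is what the a/b/validate triples above show. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let a = vld1_s8([0, 2, 4, 6, 8, 10, 12, 14].as_ptr()); |
| //       let b = vld1_s8([1, 3, 5, 7, 9, 11, 13, 15].as_ptr()); |
| //       let r = vtrn1_s8(a, b); // lanes: 0, 1, 4, 5, 8, 9, 12, 13 |
| //   } |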
| |
| /// Transpose vectors |
| name = vtrn1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {transpose-1-in_len} |
| a = 0., 2., 4., 6., 8., 10., 12., 14. |
| b = 1., 3., 5., 7., 9., 11., 13., 15. |
| validate 0., 1., 4., 5., 8., 9., 12., 13. |
| |
| aarch64 = trn1 |
| generate float32x4_t |
| |
| aarch64 = zip1 |
| generate float32x2_t, float64x2_t |
| |
| /// Transpose vectors |
| name = vtrn2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} |
| a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 |
| |
| aarch64 = trn2 |
| generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| |
| aarch64 = zip2 |
| generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t |
| |
| /// Transpose vectors |
| name = vtrn2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {transpose-2-in_len} |
| a = 0., 2., 4., 6., 8., 10., 12., 14. |
| b = 1., 3., 5., 7., 9., 11., 13., 15. |
| validate 2., 3., 6., 7., 10., 11., 14., 15. |
| |
| aarch64 = trn2 |
| generate float32x4_t |
| |
| aarch64 = zip2 |
| generate float32x2_t, float64x2_t |
| |
| /// Zip vectors |
| name = vzip1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} |
| a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 |
| b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 |
| validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| |
| aarch64 = zip1 |
| generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t |
| |
| /// Zip vectors |
| name = vzip1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {zip-1-in_len} |
| a = 0., 2., 4., 6., 8., 10., 12., 14. |
| b = 1., 3., 5., 7., 9., 11., 13., 15. |
| validate 0., 1., 2., 3., 4., 5., 6., 7. |
| |
| aarch64 = zip1 |
| generate float32x2_t, float32x4_t, float64x2_t |
| |
| /// Zip vectors |
| name = vzip2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} |
| a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30 |
| b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31 |
| validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 |
| |
| aarch64 = zip2 |
| generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t |
| |
| /// Zip vectors |
| name = vzip2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {zip-2-in_len} |
| a = 0., 8., 8., 10., 8., 10., 12., 14. |
| b = 1., 9., 9., 11., 9., 11., 13., 15. |
| validate 8., 9., 10., 11., 12., 13., 14., 15. |
| |
| aarch64 = zip2 |
| generate float32x2_t, float32x4_t, float64x2_t |
| |
| /// Unzip vectors |
| name = vuzp1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} |
| a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0 |
| b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0 |
| validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16 |
| |
| aarch64 = uzp1 |
| generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| |
| aarch64 = zip1 |
| generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t |
| |
| /// Unzip vectors |
| name = vuzp1 |
| multi_fn = simd_shuffle-in_len-!, a, b, {unzip-1-in_len} |
| a = 0., 8., 1., 9., 4., 12., 5., 13. |
| b = 1., 10., 3., 11., 6., 14., 7., 15. |
| validate 0., 1., 1., 3., 4., 5., 6., 7. |
| |
| aarch64 = uzp1 |
| generate float32x4_t |
| |
| aarch64 = zip1 |
| generate float32x2_t, float64x2_t |
| |
| /// Unzip vectors |
| name = vuzp2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} |
| a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24 |
| b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32 |
| validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32 |
| |
| aarch64 = uzp2 |
| generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t |
| |
| aarch64 = zip2 |
| generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t |
| |
| /// Unzip vectors |
| name = vuzp2 |
| multi_fn = simd_shuffle-in_len-!, a, b, {unzip-2-in_len} |
| a = 0., 8., 1., 9., 4., 12., 5., 13. |
| b = 2., 9., 3., 11., 6., 14., 7., 15. |
| validate 8., 9., 9., 11., 12., 13., 14., 15. |
| |
| aarch64 = uzp2 |
| generate float32x4_t |
| |
| aarch64 = zip2 |
| generate float32x2_t, float64x2_t |
| |
| //////////////////// |
| // Unsigned Absolute difference and Accumulate Long |
| //////////////////// |
| |
| /// Unsigned Absolute difference and Accumulate Long |
| name = vabal |
| multi_fn = vabd-unsigned-noext, b, c, d:in_t |
| multi_fn = simd_add, a, {simd_cast, d} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| arm = vabal.s |
| aarch64 = uabal |
| generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t |
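| |
| // Hedged sketch (assumed core::arch::aarch64 names): the absolute difference of b |
| // and c is widened and accumulated into a, exactly the multi_fn expansion above. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let acc = vdupq_n_u16(1); |
| //       let r = vabal_u8(acc, vdup_n_u8(2), vdup_n_u8(12)); // all lanes 1 + |2 - 12| = 11 |
| //   } |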
| |
| /// Unsigned Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle8!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_shuffle8!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = vabd_u8, d, e, f:uint8x8_t |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| aarch64 = uabal |
| generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t |
| |
| /// Unsigned Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle4!, d:uint16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_shuffle4!, e:uint16x4_t, c, c, [4, 5, 6, 7] |
| multi_fn = vabd_u16, d, e, f:uint16x4_t |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 9, 10, 11, 12 |
| b = 1, 2, 3, 4, 9, 10, 11, 12 |
| c = 10, 10, 10, 10, 20, 0, 2, 4 |
| validate 20, 20, 20, 20 |
| |
| aarch64 = uabal |
| generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t |
| |
| /// Unsigned Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle2!, d:uint32x2_t, b, b, [2, 3] |
| multi_fn = simd_shuffle2!, e:uint32x2_t, c, c, [2, 3] |
| multi_fn = vabd_u32, d, e, f:uint32x2_t |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 15, 16 |
| b = 1, 2, 15, 16 |
| c = 10, 10, 10, 12 |
| validate 20, 20 |
| |
| aarch64 = uabal |
| generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t |
| |
| //////////////////// |
| // Signed Absolute difference and Accumulate Long |
| //////////////////// |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal |
| multi_fn = vabd-signed-noext, b, c, d:int8x8_t |
| multi_fn = simd_cast, e:uint8x8_t, d |
| multi_fn = simd_add, a, {simd_cast, e} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| arm = vabal.s |
| aarch64 = sabal |
| generate int16x8_t:int8x8_t:int8x8_t:int16x8_t |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal |
| multi_fn = vabd-signed-noext, b, c, d:int16x4_t |
| multi_fn = simd_cast, e:uint16x4_t, d |
| multi_fn = simd_add, a, {simd_cast, e} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| arm = vabal.s |
| aarch64 = sabal |
| generate int32x4_t:int16x4_t:int16x4_t:int32x4_t |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal |
| multi_fn = vabd-signed-noext, b, c, d:int32x2_t |
| multi_fn = simd_cast, e:uint32x2_t, d |
| multi_fn = simd_add, a, {simd_cast, e} |
| a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| arm = vabal.s |
| aarch64 = sabal |
| generate int64x2_t:int32x2_t:int32x2_t:int64x2_t |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle8!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = simd_shuffle8!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15] |
| multi_fn = vabd_s8, d, e, f:int8x8_t |
| multi_fn = simd_cast, f:uint8x8_t, f |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 9, 10, 11, 12, 13, 14, 15, 16 |
| b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
| c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12 |
| validate 20, 20, 20, 20, 20, 20, 20, 20 |
| |
| aarch64 = sabal |
| generate int16x8_t:int8x16_t:int8x16_t:int16x8_t |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle4!, d:int16x4_t, b, b, [4, 5, 6, 7] |
| multi_fn = simd_shuffle4!, e:int16x4_t, c, c, [4, 5, 6, 7] |
| multi_fn = vabd_s16, d, e, f:int16x4_t |
| multi_fn = simd_cast, f:uint16x4_t, f |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 9, 10, 11, 12 |
| b = 1, 2, 3, 4, 9, 10, 11, 12 |
| c = 10, 10, 10, 10, 20, 0, 2, 4 |
| validate 20, 20, 20, 20 |
| |
| aarch64 = sabal |
| generate int32x4_t:int16x8_t:int16x8_t:int32x4_t |
| |
| /// Signed Absolute difference and Accumulate Long |
| name = vabal_high |
| no-q |
| multi_fn = simd_shuffle2!, d:int32x2_t, b, b, [2, 3] |
| multi_fn = simd_shuffle2!, e:int32x2_t, c, c, [2, 3] |
| multi_fn = vabd_s32, d, e, f:int32x2_t |
| multi_fn = simd_cast, f:uint32x2_t, f |
| multi_fn = simd_add, a, {simd_cast, f} |
| a = 15, 16 |
| b = 1, 2, 15, 16 |
| c = 10, 10, 10, 12 |
| validate 20, 20 |
| |
| aarch64 = sabal |
| generate int64x2_t:int32x4_t:int32x4_t:int64x2_t |
| |
| //////////////////// |
| // Signed saturating Absolute value |
| //////////////////// |
| |
| /// Signed saturating Absolute value |
| name = vqabs |
| a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5 |
| validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5 |
| |
| arm = vqabs.s |
| aarch64 = sqabs |
| link-arm = vqabs._EXT_ |
| link-aarch64 = sqabs._EXT_ |
| generate int*_t |
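| |
| // Sketch of one generated variant (usual core::arch::aarch64 names assumed): an |
| // ordinary absolute value would wrap on MIN; the saturating form clamps it to MAX, |
| // as the first validate lane above shows. |
| //   unsafe { |
| //       use core::arch::aarch64::*; |
| //       let r = vqabsq_s8(vdupq_n_s8(i8::MIN)); // all lanes i8::MAX |
| //   } |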
| |
| /// Signed saturating Absolute value |
| name = vqabs |
| a = MIN, -7 |
| validate MAX, 7 |
| |
| aarch64 = sqabs |
| link-aarch64 = sqabs._EXT_ |
| generate int64x*_t |