| // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ |
| #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ |
| |
| #ifdef GEMMLOWP_NEON_64 |
| |
| #include <cassert> |
| #include <cstdint> |
| |
| namespace gemmlowp { |
| namespace meta { |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v1.2s}, [%x[lhs]], #8\n" |
| "ld1 {v2.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v3.8h, v2.8b, v1.8b\n" |
| "uadalp v0.4s, v3.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.b}[0], [%x[result]], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s}, [%x[lhs]], #8\n" |
| "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v3.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v2.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s}, [%x[lhs]], #8\n" |
| "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v4.8b, v3.8b\n" |
| "umull v8.8h, v5.8b, v3.8b\n" |
| "umull v9.8h, v6.8b, v3.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| "st1 {v0.b}[2], [%x[result]], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", |
| "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s}, [%x[lhs]], #8\n" |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v9.8h, v5.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v4.8b\n" |
| "umull v11.8h, v7.8b, v4.8b\n" |
| "umull v12.8h, v8.8b, v4.8b\n" |
| "uadalp v0.4s, v9.8h\n" |
| "uadalp v1.4s, v10.8h\n" |
| "uadalp v2.4s, v11.8h\n" |
| "uadalp v3.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "ld1 {v9.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "umull v11.8h, v6.8b, v9.8b\n" |
| "umull v12.8h, v7.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| "ld1 {v5.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v10.8h\n" |
| "uadalp v1.4s, v11.8h\n" |
| "uadalp v2.4s, v12.8h\n" |
| "uadalp v3.4s, v13.8h\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "uadalp v4.4s, v10.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v5.4s}, [%x[lhs]], #16\n" |
| "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" |
| "dup v8.4s, %w[multiplicative_offset]\n" |
| "dup v9.4s, %w[rounding_offset]\n" |
| "dup v10.4s, %w[shift]\n" |
| "dup v5.4s, v5.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "mul v0.4s, v0.4s, v8.4s\n" |
| "mul v1.4s, v1.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v1.4s, v1.4s, v9.4s\n" |
| "sshl v0.4s, v0.4s, v10.4s\n" |
| "sshl v1.4s, v1.4s, v10.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn2 v0.8h, v1.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v0.b}[4], [%x[result]], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" |
| "ld1 {v10.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v9.8b, v10.8b\n" |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "uadalp v4.4s, v11.8h\n" |
| "uadalp v5.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" |
| "dup v9.4s, %w[multiplicative_offset]\n" |
| "dup v10.4s, %w[rounding_offset]\n" |
| "dup v11.4s, %w[shift]\n" |
| "dup v6.4s, v6.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| "mul v0.4s, v0.4s, v9.4s\n" |
| "mul v1.4s, v1.4s, v9.4s\n" |
| "add v0.4s, v0.4s, v10.4s\n" |
| "add v1.4s, v1.4s, v10.4s\n" |
| "sshl v0.4s, v0.4s, v11.4s\n" |
| "sshl v1.4s, v1.4s, v11.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn2 v0.8h, v1.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v0.h}[2], [%x[result]], #2\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" |
| "ld1 {v11.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v10.8b, v11.8b\n" |
| "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v12.8h\n" |
| "uadalp v1.4s, v13.8h\n" |
| "uadalp v2.4s, v14.8h\n" |
| "uadalp v3.4s, v15.8h\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "uadalp v4.4s, v12.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v7.4s}, [%x[lhs]], #16\n" |
| "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" |
| "dup v10.4s, %w[multiplicative_offset]\n" |
| "dup v11.4s, %w[rounding_offset]\n" |
| "dup v12.4s, %w[shift]\n" |
| "dup v7.4s, v7.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v6.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v9.4s\n" |
| "mul v0.4s, v0.4s, v10.4s\n" |
| "mul v1.4s, v1.4s, v10.4s\n" |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v1.4s, v1.4s, v11.4s\n" |
| "sshl v0.4s, v0.4s, v12.4s\n" |
| "sshl v1.4s, v1.4s, v12.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn2 v0.8h, v1.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v0.h}[2], [%x[result]], #2\n" |
| "st1 {v0.b}[6], [%x[result]], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 1x8 lanes loop. |
| "1:" |
| |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "ld1 {v8.2s}, [%x[lhs]], #8\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "uadalp v0.4s, v13.8h\n" |
| "uadalp v1.4s, v14.8h\n" |
| "uadalp v2.4s, v15.8h\n" |
| "uadalp v3.4s, v16.8h\n" |
| "prfm pldl1keep, [%x[rhs], #256]\n" |
| "umull v17.8h, v8.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v8.8b, v11.8b\n" |
| "umull v15.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #32]\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v4.4s, v17.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| "uadalp v7.4s, v15.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" |
| "dup v11.4s, %w[multiplicative_offset]\n" |
| "dup v12.4s, %w[rounding_offset]\n" |
| "dup v13.4s, %w[shift]\n" |
| "dup v8.4s, v8.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v1.4s, v1.4s, v10.4s\n" |
| "mul v0.4s, v0.4s, v11.4s\n" |
| "mul v1.4s, v1.4s, v11.4s\n" |
| "add v0.4s, v0.4s, v12.4s\n" |
| "add v1.4s, v1.4s, v12.4s\n" |
| "sshl v0.4s, v0.4s, v13.4s\n" |
| "sshl v1.4s, v1.4s, v13.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn2 v0.8h, v1.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" |
| "ld1 {v4.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v4.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v3.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v2.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v2.4s\n" |
| "add v1.4s, v1.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "mul v1.4s, v1.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sshl v1.4s, v1.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v1.4h, v1.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v1.8b, v1.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.b}[0], [%x[result]], #1\n" |
| "st1 {v1.b}[0], [x0], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", |
| "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v7.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v5.8b\n" |
| "umull v11.8h, v7.8b, v5.8b\n" |
| "uadalp v0.4s, v8.8h\n" |
| "uadalp v1.4s, v9.8h\n" |
| "uadalp v2.4s, v10.8h\n" |
| "uadalp v3.4s, v11.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v9.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "mul v2.4s, v2.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sshl v2.4s, v2.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v2.4h, v2.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v2.8b, v2.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| "st1 {v2.h}[0], [x0], #2\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "cc", "memory"); |
| } |
| |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" |
| "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v8.8b, v6.8b\n" |
| "umull v12.8h, v9.8b, v6.8b\n" |
| "umull v13.8h, v10.8b, v6.8b\n" |
| "umull v14.8h, v8.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v7.8b\n" |
| "umull v16.8h, v10.8b, v7.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, %w[multiplicative_offset]\n" |
| "dup v9.4s, %w[rounding_offset]\n" |
| "dup v10.4s, %w[shift]\n" |
| "dup v11.4s, v6.s[0]\n" |
| "dup v6.4s, v6.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v3.4s, v3.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v3.4s, v3.4s, v7.4s\n" |
| "mul v0.4s, v0.4s, v8.4s\n" |
| "mul v3.4s, v3.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v3.4s, v3.4s, v9.4s\n" |
| "sshl v0.4s, v0.4s, v10.4s\n" |
| "sshl v3.4s, v3.4s, v10.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v3.4h, v3.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v3.8b, v3.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| "st1 {v0.b}[2], [%x[result]], #1\n" |
| "st1 {v3.h}[0], [x0], #2\n" |
| "st1 {v3.b}[2], [x0], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), |
| [stride] "r"(params.output_stream.stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| // Quantized uint8 kernel for a tile of 2 lhs rows by 4 rhs columns that |
| // consumes the depth 8 bytes at a time. |
| // |
| // For every row/column pair the uint8 products are summed in 32-bit lanes. |
| // Each sum is then combined with the 32-bit terms stored right after the |
| // multiplied bytes in the lhs stream (one term per row) and in the rhs stream |
| // (one term per column), multiplied by multiplicative_offset, increased by |
| // rounding_offset, shifted with sshl by shift (a negative amount shifts |
| // right) and narrowed with saturation to uint8. |
| // |
| // lhs: per loop pass 8 bytes for row 0, then 8 bytes for row 1; 16 bytes of |
| // 32-bit terms are read after the last pass. |
| // rhs: per loop pass 4 groups of 8 bytes, one group per output column; 16 |
| // bytes of 32-bit terms are read after the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // output_stream.stride is added to the result address to reach row 1. |
| // result: receives 4 bytes for row 0 and 4 bytes at result + stride. |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. The counter |
| // and the stride are widened to 64 bits because the template reads both |
| // through the %x (64-bit register) modifier. |
| std::int64_t count = params.kernel.count; |
| const std::int64_t stride = params.output_stream.stride; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v3 collect row 0, v4-v7 collect row 1. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 2x4 lanes loop: v8/v9 hold the two lhs rows, v10-v13 the rhs columns. |
| "1:" |
| |
| "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" |
| "ld1 {v8.8b}, [%x[lhs]], #8\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v17.8h, v8.8b, v13.8b\n" |
| "umull v18.8h, v9.8b, v10.8b\n" |
| "uadalp v0.4s, v14.8h\n" |
| "uadalp v1.4s, v15.8h\n" |
| "uadalp v2.4s, v16.8h\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v3.4s, v17.8h\n" |
| "uadalp v4.4s, v18.8h\n" |
| "uadalp v5.4s, v14.8h\n" |
| "uadalp v6.4s, v15.8h\n" |
| "uadalp v7.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s}, [%x[rhs]], #16\n" |
| "dup v10.4s, %w[multiplicative_offset]\n" |
| "dup v11.4s, %w[rounding_offset]\n" |
| "dup v12.4s, %w[shift]\n" |
| "dup v13.4s, v8.s[0]\n" |
| "dup v8.4s, v8.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators: v0 and v4 end up with one total per column. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v4.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v13.4s\n" |
| "add v4.4s, v4.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v4.4s, v4.4s, v9.4s\n" |
| "mul v0.4s, v0.4s, v10.4s\n" |
| "mul v4.4s, v4.4s, v10.4s\n" |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v4.4s, v4.4s, v11.4s\n" |
| "sshl v0.4s, v0.4s, v12.4s\n" |
| "sshl v4.4s, v4.4s, v12.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v4.4h, v4.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v4.8b, v4.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v4.s}[0], [x0], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), [stride] "r"(stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); |
| } |
| |
| // Quantized uint8 kernel for a tile of 3 lhs rows by 1 rhs column that |
| // consumes the depth 8 bytes at a time. |
| // |
| // Each row's uint8 products are summed in 32-bit lanes. Every sum is then |
| // combined with that row's 32-bit term read after the multiplied bytes of the |
| // lhs stream and with the 32-bit rhs term read after the multiplied bytes of |
| // the rhs stream, multiplied by multiplicative_offset, increased by |
| // rounding_offset, shifted with sshl by shift (a negative amount shifts |
| // right) and narrowed with saturation to uint8. |
| // |
| // lhs: per loop pass 3 groups of 8 bytes, one per row; 16 bytes of 32-bit |
| // terms are read after the last pass (lanes 0-2 are used). |
| // rhs: per loop pass 8 bytes; 16 bytes of 32-bit terms are read after the |
| // last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // output_stream.stride is the address step between output rows. |
| // result: one byte is stored at result, result + stride and |
| // result + 2 * stride. |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. The counter |
| // and the stride are widened to 64 bits because the template reads both |
| // through the %x (64-bit register) modifier. |
| std::int64_t count = params.kernel.count; |
| const std::int64_t stride = params.output_stream.stride; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0, v1 and v2 collect rows 0, 1 and 2. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" |
| "ld1 {v6.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v6.8b, v3.8b\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v6.8b, v5.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[multiplicative_offset]\n" |
| "dup v7.4s, %w[rounding_offset]\n" |
| "dup v8.4s, %w[shift]\n" |
| "dup v3.4s, v4.s[0]\n" |
| "dup v9.4s, v4.s[1]\n" |
| "dup v4.4s, v4.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators: two pairwise adds leave each row total in lane 0. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v3.4s\n" |
| "add v1.4s, v1.4s, v9.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| "mul v0.4s, v0.4s, v6.4s\n" |
| "mul v1.4s, v1.4s, v6.4s\n" |
| "mul v2.4s, v2.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v7.4s\n" |
| "sshl v0.4s, v0.4s, v8.4s\n" |
| "sshl v1.4s, v1.4s, v8.4s\n" |
| "sshl v2.4s, v2.4s, v8.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v1.4h, v1.4s\n" |
| "sqxtn v2.4h, v2.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v1.8b, v1.8h\n" |
| "sqxtun v2.8b, v2.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.b}[0], [%x[result]], #1\n" |
| "st1 {v1.b}[0], [x0], #1\n" |
| "st1 {v2.b}[0], [x1], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), [stride] "r"(stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "cc", "memory"); |
| } |
| |
| // Quantized uint8 kernel for a tile of 3 lhs rows by 2 rhs columns that |
| // consumes the depth 8 bytes at a time. |
| // |
| // For every row/column pair the uint8 products are summed in 32-bit lanes. |
| // Each sum is then combined with the 32-bit terms stored right after the |
| // multiplied bytes in the lhs stream (one term per row) and in the rhs stream |
| // (one term per column), multiplied by multiplicative_offset, increased by |
| // rounding_offset, shifted with sshl by shift (a negative amount shifts |
| // right) and narrowed with saturation to uint8. |
| // |
| // lhs: per loop pass 3 groups of 8 bytes, one per row; 16 bytes of 32-bit |
| // terms are read after the last pass (lanes 0-2 are used). |
| // rhs: per loop pass 2 groups of 8 bytes, one per column; 16 bytes of 32-bit |
| // terms are read after the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // output_stream.stride is the address step between output rows. |
| // result: two bytes are stored at result, result + stride and |
| // result + 2 * stride. |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. The counter |
| // and the stride are widened to 64 bits because the template reads both |
| // through the %x (64-bit register) modifier. |
| std::int64_t count = params.kernel.count; |
| const std::int64_t stride = params.output_stream.stride; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0/v1 row 0, v2/v3 row 1, v4/v5 row 2. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop: v6-v8 hold the lhs rows, v9/v10 the rhs columns. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" |
| "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v9.8b, v6.8b\n" |
| "umull v12.8h, v10.8b, v6.8b\n" |
| "umull v13.8h, v9.8b, v7.8b\n" |
| "umull v14.8h, v10.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v8.8b\n" |
| "umull v16.8h, v10.8b, v8.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, %w[multiplicative_offset]\n" |
| "dup v9.4s, %w[rounding_offset]\n" |
| "dup v10.4s, %w[shift]\n" |
| "dup v11.4s, v6.s[0]\n" |
| "dup v12.4s, v6.s[1]\n" |
| "dup v6.4s, v6.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators: lanes 0 and 1 of v0, v2 and v4 get the column totals. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v2.4s, v2.4s, v12.4s\n" |
| "add v4.4s, v4.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v7.4s\n" |
| "add v4.4s, v4.4s, v7.4s\n" |
| "mul v0.4s, v0.4s, v8.4s\n" |
| "mul v2.4s, v2.4s, v8.4s\n" |
| "mul v4.4s, v4.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v2.4s, v2.4s, v9.4s\n" |
| "add v4.4s, v4.4s, v9.4s\n" |
| "sshl v0.4s, v0.4s, v10.4s\n" |
| "sshl v2.4s, v2.4s, v10.4s\n" |
| "sshl v4.4s, v4.4s, v10.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v2.4h, v2.4s\n" |
| "sqxtn v4.4h, v4.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v2.8b, v2.8h\n" |
| "sqxtun v4.8b, v4.8h\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| "st1 {v2.h}[0], [x0], #2\n" |
| "st1 {v4.h}[0], [x1], #2\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), [stride] "r"(stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| // Quantized uint8 kernel for a tile of 3 lhs rows by 3 rhs columns that |
| // consumes the depth 8 bytes at a time. |
| // |
| // For every row/column pair the uint8 products are summed in 32-bit lanes. |
| // Each sum is then combined with the 32-bit terms stored right after the |
| // multiplied bytes in the lhs stream (one term per row) and in the rhs stream |
| // (one term per column), multiplied by multiplicative_offset, increased by |
| // rounding_offset, shifted with sshl by shift (a negative amount shifts |
| // right) and narrowed with saturation to uint8. |
| // |
| // lhs: per loop pass 3 groups of 8 bytes, one per row; 16 bytes of 32-bit |
| // terms are read after the last pass (lanes 0-2 are used). |
| // rhs: per loop pass 3 groups of 8 bytes, one per column; 16 bytes of 32-bit |
| // terms are read after the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // output_stream.stride is the address step between output rows. |
| // result: three bytes are stored at result, result + stride and |
| // result + 2 * stride. |
| template <> |
| inline void |
| MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessed, |
| RowMajor>& params, |
| uint8_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, " |
| "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. The counter |
| // and the stride are widened to 64 bits because the template reads both |
| // through the %x (64-bit register) modifier. |
| std::int64_t count = params.kernel.count; |
| const std::int64_t stride = params.output_stream.stride; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v2 row 0, v3-v5 row 1, v6-v8 row 2. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| "mov v8.16b, v5.16b\n" |
| |
| // 3x3 lanes loop: v9-v11 hold the lhs rows, v12-v14 the rhs columns. |
| "1:" |
| |
| "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "ld1 {v10.8b}, [%x[lhs]], #8\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| "ld1 {v11.8b}, [%x[lhs]], #8\n" |
| "umull v17.8h, v9.8b, v14.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v18.8h, v10.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "uadalp v0.4s, v15.8h\n" |
| "uadalp v1.4s, v16.8h\n" |
| "uadalp v2.4s, v17.8h\n" |
| "uadalp v3.4s, v18.8h\n" |
| "umull v15.8h, v10.8b, v13.8b\n" |
| "umull v16.8h, v10.8b, v14.8b\n" |
| "umull v17.8h, v11.8b, v12.8b\n" |
| "umull v18.8h, v11.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| // v9 (row 0 input) is no longer needed here and is reused as a temporary. |
| "umull v9.8h, v11.8b, v14.8b\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| "uadalp v6.4s, v17.8h\n" |
| "uadalp v7.4s, v18.8h\n" |
| "uadalp v8.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantization::Prepare |
| "ld1 {v9.4s}, [%x[lhs]], #16\n" |
| "ld1 {v10.4s}, [%x[rhs]], #16\n" |
| "dup v11.4s, %w[multiplicative_offset]\n" |
| "dup v12.4s, %w[rounding_offset]\n" |
| "dup v13.4s, %w[shift]\n" |
| "dup v14.4s, v9.s[0]\n" |
| "dup v15.4s, v9.s[1]\n" |
| "dup v9.4s, v9.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators: lanes 0-2 of v0, v3 and v6 get the column totals. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v8.4s, v8.4s, v8.4s\n" |
| "addp v6.4s, v6.4s, v8.4s\n" |
| |
| // StaticQuantization::Transform |
| "add v0.4s, v0.4s, v14.4s\n" |
| "add v3.4s, v3.4s, v15.4s\n" |
| "add v6.4s, v6.4s, v9.4s\n" |
| "add v0.4s, v0.4s, v10.4s\n" |
| "add v3.4s, v3.4s, v10.4s\n" |
| "add v6.4s, v6.4s, v10.4s\n" |
| "mul v0.4s, v0.4s, v11.4s\n" |
| "mul v3.4s, v3.4s, v11.4s\n" |
| "mul v6.4s, v6.4s, v11.4s\n" |
| "add v0.4s, v0.4s, v12.4s\n" |
| "add v3.4s, v3.4s, v12.4s\n" |
| "add v6.4s, v6.4s, v12.4s\n" |
| "sshl v0.4s, v0.4s, v13.4s\n" |
| "sshl v3.4s, v3.4s, v13.4s\n" |
| "sshl v6.4s, v6.4s, v13.4s\n" |
| "sqxtn v0.4h, v0.4s\n" |
| "sqxtn v3.4h, v3.4s\n" |
| "sqxtn v6.4h, v6.4s\n" |
| "sqxtun v0.8b, v0.8h\n" |
| "sqxtun v3.8b, v3.8h\n" |
| "sqxtun v6.8b, v6.8h\n" |
| |
| // RowMajorOutput::Output: each row is written as 2 bytes plus 1 byte. |
| "st1 {v0.h}[0], [%x[result]], #2\n" |
| "st1 {v0.b}[2], [%x[result]], #1\n" |
| "st1 {v3.h}[0], [x0], #2\n" |
| "st1 {v3.b}[2], [x0], #1\n" |
| "st1 {v6.h}[0], [x1], #2\n" |
| "st1 {v6.b}[2], [x1], #1\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [multiplicative_offset] "r"(params.kernel.multiplicative_offset), |
| [shift] "r"(params.kernel.shift), [stride] "r"(stride), |
| [rounding_offset] "r"(params.kernel.rounding_offset) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", |
| "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 1 rhs column that consumes the depth 8 |
| // bytes at a time and emits a 32-bit result. |
| // |
| // The uint8 products are summed in 32-bit lanes; the total is then increased |
| // by the first 32-bit term read after the multiplied bytes of the lhs stream |
| // and by the 32-bit term read after the multiplied bytes of the rhs stream. |
| // No scaling, shifting or narrowing is applied. |
| // |
| // lhs/rhs: 8 bytes each per loop pass; 16 bytes of 32-bit terms are read |
| // from each stream after the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives one 32-bit value. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v1.2s}, [%x[lhs]], #8\n" |
| "ld1 {v2.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v3.8h, v2.8b, v1.8b\n" |
| "uadalp v0.4s, v3.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: two pairwise adds leave the total in every lane. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 2 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 2 groups of 8 bytes per loop pass, one per column; 16 bytes of 32-bit |
| // terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives two consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0 and v1 collect columns 0 and 1. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s}, [%x[lhs]], #8\n" |
| "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v3.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v2.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: lanes 0 and 1 of v0 get the two column totals. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 3 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 3 groups of 8 bytes per loop pass, one per column; 16 bytes of 32-bit |
| // terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives three consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0, v1 and v2 collect columns 0, 1 and 2. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s}, [%x[lhs]], #8\n" |
| "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v4.8b, v3.8b\n" |
| "umull v8.8h, v5.8b, v3.8b\n" |
| "umull v9.8h, v6.8b, v3.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: lanes 0-2 of v0 get the three column totals. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output: written as 2 values plus 1 value. |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", |
| "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 4 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 4 groups of 8 bytes per loop pass, one per column; 16 bytes of 32-bit |
| // terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives four consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v3 collect columns 0-3. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s}, [%x[lhs]], #8\n" |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v9.8h, v5.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v4.8b\n" |
| "umull v11.8h, v7.8b, v4.8b\n" |
| "umull v12.8h, v8.8b, v4.8b\n" |
| "uadalp v0.4s, v9.8h\n" |
| "uadalp v1.4s, v10.8h\n" |
| "uadalp v2.4s, v11.8h\n" |
| "uadalp v3.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: v0 ends up with one total per column. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 5 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 5 groups of 8 bytes per loop pass (loaded as 4 + 1), one per column; |
| // 32 bytes of 32-bit terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives five consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v4 collect columns 0-4. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "ld1 {v9.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "umull v11.8h, v6.8b, v9.8b\n" |
| "umull v12.8h, v7.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| // Fifth column reuses v5 once its first product has been formed. |
| "ld1 {v5.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v10.8h\n" |
| "uadalp v1.4s, v11.8h\n" |
| "uadalp v2.4s, v12.8h\n" |
| "uadalp v3.4s, v13.8h\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "uadalp v4.4s, v10.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v5.4s}, [%x[lhs]], #16\n" |
| "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" |
| "dup v5.4s, v5.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: v0 gets columns 0-3, lane 0 of v1 gets column 4. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| |
| // RowMajorOutput::Output: written as 4 values plus 1 value. |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.s}[0], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 6 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 6 groups of 8 bytes per loop pass (loaded as 4 + 2), one per column; |
| // 32 bytes of 32-bit terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives six consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v5 collect columns 0-5. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" |
| "ld1 {v10.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v9.8b, v10.8b\n" |
| // Columns 4 and 5 reuse v6/v7 once their first products have been formed. |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "uadalp v4.4s, v11.8h\n" |
| "uadalp v5.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" |
| "dup v6.4s, v6.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: v0 gets columns 0-3, lanes 0-1 of v1 get columns 4-5. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| |
| // RowMajorOutput::Output: written as 4 values plus 2 values. |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.2s}, [%x[result]], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 7 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 7 groups of 8 bytes per loop pass (loaded as 4 + 3), one per column; |
| // 32 bytes of 32-bit terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives seven consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v6 collect columns 0-6. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" |
| "ld1 {v11.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v10.8b, v11.8b\n" |
| // Columns 4-6 reuse v7-v9 once their first products have been formed. |
| "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v12.8h\n" |
| "uadalp v1.4s, v13.8h\n" |
| "uadalp v2.4s, v14.8h\n" |
| "uadalp v3.4s, v15.8h\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "uadalp v4.4s, v12.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v7.4s}, [%x[lhs]], #16\n" |
| "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" |
| "dup v7.4s, v7.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: v0 gets columns 0-3, lanes 0-2 of v1 get columns 4-6. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v6.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v9.4s\n" |
| |
| // RowMajorOutput::Output: written as 4 values, 2 values and 1 value. |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.2s}, [%x[result]], #8\n" |
| "st1 {v1.s}[2], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "cc", "memory"); |
| } |
| |
| // Kernel for a tile of 1 lhs row by 8 rhs columns that consumes the depth 8 |
| // bytes at a time and emits 32-bit results. |
| // |
| // For each column the uint8 products are summed in 32-bit lanes; each total |
| // is then increased by the first 32-bit term read after the multiplied bytes |
| // of the lhs stream and by that column's 32-bit term read after the |
| // multiplied bytes of the rhs stream. No scaling, shifting or narrowing is |
| // applied. |
| // |
| // lhs: 8 bytes per loop pass; 16 bytes of 32-bit terms follow the last pass. |
| // rhs: 8 groups of 8 bytes per loop pass (loaded as 4 + 4), one per column; |
| // 32 bytes of 32-bit terms follow the last pass. |
| // params: kernel.count is reduced by 8 per pass and the loop repeats while |
| // the remainder is positive, so at least one pass always runs. |
| // result: receives eight consecutive 32-bit values. |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the counter, so it is bound as a read-write operand on |
| // a local copy: extended asm must not modify input-only operands. It is |
| // widened to 64 bits because the template reads it through the %x modifier. |
| std::int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators: v0-v7 collect columns 0-7. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 1x8 lanes loop: v8 holds the lhs row, v9-v12 four rhs columns at a time. |
| "1:" |
| |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "ld1 {v8.2s}, [%x[lhs]], #8\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| // Columns 4-7 reuse v9-v12 once their first products have been formed. |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "uadalp v0.4s, v13.8h\n" |
| "uadalp v1.4s, v14.8h\n" |
| "uadalp v2.4s, v15.8h\n" |
| "uadalp v3.4s, v16.8h\n" |
| "prfm pldl1keep, [%x[rhs], #256]\n" |
| "umull v17.8h, v8.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v8.8b, v11.8b\n" |
| "umull v15.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #32]\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v4.4s, v17.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| "uadalp v7.4s, v15.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" |
| "dup v8.4s, v8.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators: v0 gets columns 0-3 and v1 gets columns 4-7. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v1.4s, v1.4s, v10.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s, v1.4s}, [%x[result]], #32\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| // The stride operand is not referenced by this single-row template. |
| : [stride] "r"(params.output_stream.stride) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 2 LHS rows x 1 RHS column, int32 output. |
| // |
| // Each loop iteration loads 8 consecutive bytes for each of the two LHS rows |
| // (16 bytes) and 8 bytes of the RHS column, multiplies them as unsigned |
| // bytes (umull) and pairwise-accumulates the products into 32-bit lanes |
| // (uadalp). The depth counter starts at params.kernel.count, is decreased by |
| // 8 per iteration and the loop repeats while it stays positive, so the body |
| // always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0 and 1 are broadcast and added to rows 0 and 1, the RHS vector |
| // is added lane-wise. One int32 per row is stored: row 0 at `result`, row 1 |
| // at `result` + params.output_stream.stride (the stride is added to the |
| // pointer register unscaled, i.e. it is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" |
| "ld1 {v4.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v4.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v3.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v2.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v2.4s\n" |
| "add v1.4s, v1.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v1.s}[0], [x0], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 2 LHS rows x 2 RHS columns, int32 output. |
| // |
| // Each loop iteration loads 8 consecutive bytes for each of the two LHS rows |
| // and for each of the two RHS columns (16 bytes per stream), multiplies |
| // every row/column pair as unsigned bytes (umull) and pairwise-accumulates |
| // the products into one 32-bit accumulator per pair (uadalp). The depth |
| // counter starts at params.kernel.count, is decreased by 8 per iteration and |
| // the loop repeats while it stays positive, so the body always runs at |
| // least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0 and 1 are broadcast and added to rows 0 and 1, the RHS vector |
| // is added lane-wise. Two int32 per row are stored: row 0 at `result`, |
| // row 1 at `result` + params.output_stream.stride (the stride is added to |
| // the pointer register unscaled, i.e. it is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v7.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v5.8b\n" |
| "umull v11.8h, v7.8b, v5.8b\n" |
| "uadalp v0.4s, v8.8h\n" |
| "uadalp v1.4s, v9.8h\n" |
| "uadalp v2.4s, v10.8h\n" |
| "uadalp v3.4s, v11.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v2.2s}, [x0], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 2 LHS rows x 3 RHS columns, int32 output. |
| // |
| // Each loop iteration loads 8 consecutive bytes for each of the two LHS rows |
| // (16 bytes) and for each of the three RHS columns (24 bytes), multiplies |
| // every row/column pair as unsigned bytes (umull) and pairwise-accumulates |
| // the products into one 32-bit accumulator per pair (uadalp). The depth |
| // counter starts at params.kernel.count, is decreased by 8 per iteration and |
| // the loop repeats while it stays positive, so the body always runs at |
| // least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0 and 1 are broadcast and added to rows 0 and 1, the RHS vector |
| // is added lane-wise. Three int32 per row are stored: row 0 at `result`, |
| // row 1 at `result` + params.output_stream.stride (the stride is added to |
| // the pointer register unscaled, i.e. it is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" |
| "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v8.8b, v6.8b\n" |
| "umull v12.8h, v9.8b, v6.8b\n" |
| "umull v13.8h, v10.8b, v6.8b\n" |
| "umull v14.8h, v8.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v7.8b\n" |
| "umull v16.8h, v10.8b, v7.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, v6.s[0]\n" |
| "dup v6.4s, v6.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v3.4s, v3.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v3.4s, v3.4s, v7.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| "st1 {v3.2s}, [x0], #8\n" |
| "st1 {v3.s}[2], [x0], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 2 LHS rows x 4 RHS columns, int32 output. |
| // |
| // Each loop iteration loads 8 bytes for each of the two LHS rows (two |
| // separate 8-byte loads) and 8 consecutive bytes for each of the four RHS |
| // columns (32 bytes), multiplies every row/column pair as unsigned bytes |
| // (umull) and pairwise-accumulates the products into one 32-bit accumulator |
| // per pair (uadalp); loads, multiplies and accumulations are interleaved |
| // and temporaries v14-v16 are reused within an iteration. The depth counter |
| // starts at params.kernel.count, is decreased by 8 per iteration and the |
| // loop repeats while it stays positive, so the body always runs at least |
| // once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0 and 1 are broadcast and added to rows 0 and 1, the RHS vector |
| // is added lane-wise. Four int32 per row are stored: row 0 at `result`, |
| // row 1 at `result` + params.output_stream.stride (the stride is added to |
| // the pointer register unscaled, i.e. it is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 2x4 lanes loop. |
| "1:" |
| |
| "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" |
| "ld1 {v8.8b}, [%x[lhs]], #8\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v17.8h, v8.8b, v13.8b\n" |
| "umull v18.8h, v9.8b, v10.8b\n" |
| "uadalp v0.4s, v14.8h\n" |
| "uadalp v1.4s, v15.8h\n" |
| "uadalp v2.4s, v16.8h\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v3.4s, v17.8h\n" |
| "uadalp v4.4s, v18.8h\n" |
| "uadalp v5.4s, v14.8h\n" |
| "uadalp v6.4s, v15.8h\n" |
| "uadalp v7.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s}, [%x[rhs]], #16\n" |
| "dup v10.4s, v8.s[0]\n" |
| "dup v8.4s, v8.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v4.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v10.4s\n" |
| "add v4.4s, v4.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v4.4s, v4.4s, v9.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v4.4s}, [x0], #16\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 3 LHS rows x 1 RHS column, int32 output. |
| // |
| // Each loop iteration loads 8 consecutive bytes for each of the three LHS |
| // rows (24 bytes) and 8 bytes of the RHS column, multiplies them as |
| // unsigned bytes (umull) and pairwise-accumulates the products into one |
| // 32-bit accumulator per row (uadalp). The depth counter starts at |
| // params.kernel.count, is decreased by 8 per iteration and the loop repeats |
| // while it stays positive, so the body always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0, 1 and 2 are broadcast and added to rows 0, 1 and 2, the RHS |
| // vector is added lane-wise. One int32 per row is stored: row 0 at |
| // `result`, rows 1 and 2 at one and two times params.output_stream.stride |
| // past it (the stride is added to the pointer register unscaled, i.e. it |
| // is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" |
| "ld1 {v6.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v6.8b, v3.8b\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v6.8b, v5.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v3.4s, v4.s[0]\n" |
| "dup v6.4s, v4.s[1]\n" |
| "dup v4.4s, v4.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v3.4s\n" |
| "add v1.4s, v1.4s, v6.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v1.s}[0], [x0], #4\n" |
| "st1 {v2.s}[0], [x1], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 3 LHS rows x 2 RHS columns, int32 output. |
| // |
| // Each loop iteration loads 8 consecutive bytes for each of the three LHS |
| // rows (24 bytes) and for each of the two RHS columns (16 bytes), multiplies |
| // every row/column pair as unsigned bytes (umull) and pairwise-accumulates |
| // the products into one 32-bit accumulator per pair (uadalp). The depth |
| // counter starts at params.kernel.count, is decreased by 8 per iteration and |
| // the loop repeats while it stays positive, so the body always runs at |
| // least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0, 1 and 2 are broadcast and added to rows 0, 1 and 2, the RHS |
| // vector is added lane-wise. Two int32 per row are stored: row 0 at |
| // `result`, rows 1 and 2 at one and two times params.output_stream.stride |
| // past it (the stride is added to the pointer register unscaled, i.e. it |
| // is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" |
| "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v9.8b, v6.8b\n" |
| "umull v12.8h, v10.8b, v6.8b\n" |
| "umull v13.8h, v9.8b, v7.8b\n" |
| "umull v14.8h, v10.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v8.8b\n" |
| "umull v16.8h, v10.8b, v8.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, v6.s[0]\n" |
| "dup v9.4s, v6.s[1]\n" |
| "dup v6.4s, v6.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v2.4s, v2.4s, v9.4s\n" |
| "add v4.4s, v4.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v7.4s\n" |
| "add v4.4s, v4.4s, v7.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v2.2s}, [x0], #8\n" |
| "st1 {v4.2s}, [x1], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 3 LHS rows x 3 RHS columns, int32 output. |
| // |
| // Each loop iteration loads 8 bytes for each of the three LHS rows (three |
| // separate 8-byte loads) and 8 consecutive bytes for each of the three RHS |
| // columns (24 bytes), multiplies every row/column pair as unsigned bytes |
| // (umull) and pairwise-accumulates the products into one 32-bit accumulator |
| // per pair (uadalp); loads, multiplies and accumulations are interleaved |
| // and v9 and v15-v18 are reused as temporaries within an iteration. The |
| // depth counter starts at params.kernel.count, is decreased by 8 per |
| // iteration and the loop repeats while it stays positive, so the body |
| // always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lanes 0, 1 and 2 are broadcast and added to rows 0, 1 and 2, the RHS |
| // vector is added lane-wise. Three int32 per row are stored: row 0 at |
| // `result`, rows 1 and 2 at one and two times params.output_stream.stride |
| // past it (the stride is added to the pointer register unscaled, i.e. it |
| // is used as a byte offset). |
| template <> |
| inline void MulKernel< |
| uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsInt32, |
| RowMajor>& params, |
| int32_t* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, " |
| "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| "mov v8.16b, v5.16b\n" |
| |
| // 3x3 lanes loop. |
| "1:" |
| |
| "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "ld1 {v10.8b}, [%x[lhs]], #8\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| "ld1 {v11.8b}, [%x[lhs]], #8\n" |
| "umull v17.8h, v9.8b, v14.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v18.8h, v10.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "uadalp v0.4s, v15.8h\n" |
| "uadalp v1.4s, v16.8h\n" |
| "uadalp v2.4s, v17.8h\n" |
| "uadalp v3.4s, v18.8h\n" |
| "umull v15.8h, v10.8b, v13.8b\n" |
| "umull v16.8h, v10.8b, v14.8b\n" |
| "umull v17.8h, v11.8b, v12.8b\n" |
| "umull v18.8h, v11.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "umull v9.8h, v11.8b, v14.8b\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| "uadalp v6.4s, v17.8h\n" |
| "uadalp v7.4s, v18.8h\n" |
| "uadalp v8.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationInt32::Prepare |
| "ld1 {v9.4s}, [%x[lhs]], #16\n" |
| "ld1 {v10.4s}, [%x[rhs]], #16\n" |
| "dup v11.4s, v9.s[0]\n" |
| "dup v12.4s, v9.s[1]\n" |
| "dup v9.4s, v9.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v8.4s, v8.4s, v8.4s\n" |
| "addp v6.4s, v6.4s, v8.4s\n" |
| |
| // StaticQuantizationInt32::Transform |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v3.4s, v3.4s, v12.4s\n" |
| "add v6.4s, v6.4s, v9.4s\n" |
| "add v0.4s, v0.4s, v10.4s\n" |
| "add v3.4s, v3.4s, v10.4s\n" |
| "add v6.4s, v6.4s, v10.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| "st1 {v3.2s}, [x0], #8\n" |
| "st1 {v3.s}[2], [x0], #4\n" |
| "st1 {v6.2s}, [x1], #8\n" |
| "st1 {v6.s}[2], [x1], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", |
| "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 1 RHS column, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 bytes from the |
| // RHS column, multiplies them as unsigned bytes (umull) and |
| // pairwise-accumulates the products into 32-bit lanes (uadalp). The depth |
| // counter starts at params.kernel.count, is decreased by 8 per iteration and |
| // the loop repeats while it stays positive, so the body always runs at |
| // least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lane 0 is broadcast and added to the reduced sum, the RHS vector is |
| // added lane-wise. The integer result is converted with scvtf, multiplied |
| // by params.kernel.scale (passed through a general-purpose register and |
| // broadcast to all lanes) and one float is stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v1.2s}, [%x[lhs]], #8\n" |
| "ld1 {v2.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v3.8h, v2.8b, v1.8b\n" |
| "uadalp v0.4s, v3.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 2 RHS columns, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 consecutive bytes |
| // for each of the two RHS columns (16 bytes), multiplies them as unsigned |
| // bytes (umull) and pairwise-accumulates the products into one 32-bit |
| // accumulator per column (uadalp). The depth counter starts at |
| // params.kernel.count, is decreased by 8 per iteration and the loop repeats |
| // while it stays positive, so the body always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lane 0 is broadcast and added to the reduced sums, the RHS vector is |
| // added lane-wise. The integer results are converted with scvtf, multiplied |
| // by params.kernel.scale (passed through a general-purpose register and |
| // broadcast to all lanes) and two floats are stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s}, [%x[lhs]], #8\n" |
| "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v3.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v2.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 3 RHS columns, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 consecutive bytes |
| // for each of the three RHS columns (24 bytes), multiplies them as unsigned |
| // bytes (umull) and pairwise-accumulates the products into one 32-bit |
| // accumulator per column (uadalp). The depth counter starts at |
| // params.kernel.count, is decreased by 8 per iteration and the loop repeats |
| // while it stays positive, so the body always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lane 0 is broadcast and added to the reduced sums, the RHS vector is |
| // added lane-wise. The integer results are converted with scvtf, multiplied |
| // by params.kernel.scale (passed through a general-purpose register and |
| // broadcast to all lanes) and three floats are stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s}, [%x[lhs]], #8\n" |
| "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v4.8b, v3.8b\n" |
| "umull v8.8h, v5.8b, v3.8b\n" |
| "umull v9.8h, v6.8b, v3.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", |
| "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 4 RHS columns, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 consecutive bytes |
| // for each of the four RHS columns (32 bytes), multiplies them as unsigned |
| // bytes (umull) and pairwise-accumulates the products into one 32-bit |
| // accumulator per column (uadalp). The depth counter starts at |
| // params.kernel.count, is decreased by 8 per iteration and the loop repeats |
| // while it stays positive, so the body always runs at least once. |
| // Afterwards 16 more bytes are read from each stream as four 32-bit values: |
| // LHS lane 0 is broadcast and added to the reduced sums, the RHS vector is |
| // added lane-wise. The integer results are converted with scvtf, multiplied |
| // by params.kernel.scale (passed through a general-purpose register and |
| // broadcast to all lanes) and four floats are stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s}, [%x[lhs]], #8\n" |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v9.8h, v5.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v4.8b\n" |
| "umull v11.8h, v7.8b, v4.8b\n" |
| "umull v12.8h, v8.8b, v4.8b\n" |
| "uadalp v0.4s, v9.8h\n" |
| "uadalp v1.4s, v10.8h\n" |
| "uadalp v2.4s, v11.8h\n" |
| "uadalp v3.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v4.4s, v4.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 5 RHS columns, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 consecutive bytes |
| // for each of the five RHS columns (a 32-byte load followed by an 8-byte |
| // load that reuses v5), multiplies them as unsigned bytes (umull) and |
| // pairwise-accumulates the products into one 32-bit accumulator per column |
| // (uadalp). The depth counter starts at params.kernel.count, is decreased by |
| // 8 per iteration and the loop repeats while it stays positive, so the body |
| // always runs at least once. |
| // Afterwards 16 bytes are read from the LHS stream and 32 bytes from the RHS |
| // stream as 32-bit values: LHS lane 0 is broadcast and added to every |
| // reduced sum, the two RHS vectors are added lane-wise to columns 0-3 and |
| // to the vector holding column 4. The integer results are converted with |
| // scvtf, multiplied by params.kernel.scale (passed through a |
| // general-purpose register and broadcast to all lanes) and five floats are |
| // stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n" |
| "ld1 {v9.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "umull v11.8h, v6.8b, v9.8b\n" |
| "umull v12.8h, v7.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| "ld1 {v5.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v10.8h\n" |
| "uadalp v1.4s, v11.8h\n" |
| "uadalp v2.4s, v12.8h\n" |
| "uadalp v3.4s, v13.8h\n" |
| "umull v10.8h, v5.8b, v9.8b\n" |
| "uadalp v4.4s, v10.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v5.4s}, [%x[lhs]], #16\n" |
| "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n" |
| "dup v8.4s, %w[scale]\n" |
| "dup v5.4s, v5.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "fmul v0.4s, v0.4s, v8.4s\n" |
| "fmul v1.4s, v1.4s, v8.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.s}[0], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "cc", "memory"); |
| } |
| |
| // Multiply() specialization: 1 LHS row x 6 RHS columns, float output. |
| // |
| // Each loop iteration loads 8 bytes from the LHS row and 8 consecutive bytes |
| // for each of the six RHS columns (a 32-byte load followed by a 16-byte |
| // load that reuses v6/v7), multiplies them as unsigned bytes (umull) and |
| // pairwise-accumulates the products into one 32-bit accumulator per column |
| // (uadalp). The depth counter starts at params.kernel.count, is decreased by |
| // 8 per iteration and the loop repeats while it stays positive, so the body |
| // always runs at least once. |
| // Afterwards 16 bytes are read from the LHS stream and 32 bytes from the RHS |
| // stream as 32-bit values: LHS lane 0 is broadcast and added to every |
| // reduced sum, the two RHS vectors are added lane-wise to columns 0-3 and |
| // to the vector holding columns 4-5. The integer results are converted with |
| // scvtf, multiplied by params.kernel.scale (passed through a |
| // general-purpose register and broadcast to all lanes) and six floats are |
| // stored at `result`. |
| // The [stride] operand is not referenced by this kernel's asm. |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| // The loop decrements the depth counter in its register, so it is passed |
| // as a read-write ("+r") operand on a local copy. Modifying an input-only |
| // operand is not allowed and could corrupt another input that the compiler |
| // placed in the same register. int64_t matches the %x[count] (64-bit |
| // register) references in the asm. |
| int64_t count = params.kernel.count; |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n" |
| "ld1 {v10.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v9.8b, v10.8b\n" |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "umull v11.8h, v6.8b, v10.8b\n" |
| "umull v12.8h, v7.8b, v10.8b\n" |
| "uadalp v4.4s, v11.8h\n" |
| "uadalp v5.4s, v12.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n" |
| "dup v9.4s, %w[scale]\n" |
| "dup v6.4s, v6.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v6.4s\n" |
| "add v1.4s, v1.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "fmul v0.4s, v0.4s, v9.4s\n" |
| "fmul v1.4s, v1.4s, v9.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.2s}, [%x[result]], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result), |
| [count] "+r"(count) |
| : [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| |
| // General 1xM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n" |
| "ld1 {v11.2s}, [%x[lhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v10.8b, v11.8b\n" |
| "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[rhs], #128]\n" |
| "uadalp v0.4s, v12.8h\n" |
| "uadalp v1.4s, v13.8h\n" |
| "uadalp v2.4s, v14.8h\n" |
| "uadalp v3.4s, v15.8h\n" |
| "umull v12.8h, v7.8b, v11.8b\n" |
| "umull v13.8h, v8.8b, v11.8b\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "uadalp v4.4s, v12.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v7.4s}, [%x[lhs]], #16\n" |
| "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n" |
| "dup v10.4s, %w[scale]\n" |
| "dup v7.4s, v7.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v6.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v9.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "fmul v0.4s, v0.4s, v10.4s\n" |
| "fmul v1.4s, v1.4s, v10.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v1.2s}, [%x[result]], #8\n" |
| "st1 {v1.s}[2], [%x[result]], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 1x8 lanes loop. |
| "1:" |
| |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "ld1 {v8.2s}, [%x[lhs]], #8\n" |
| "umull v13.8h, v8.8b, v9.8b\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n" |
| "uadalp v0.4s, v13.8h\n" |
| "uadalp v1.4s, v14.8h\n" |
| "uadalp v2.4s, v15.8h\n" |
| "uadalp v3.4s, v16.8h\n" |
| "prfm pldl1keep, [%x[rhs], #256]\n" |
| "umull v17.8h, v8.8b, v9.8b\n" |
| "umull v13.8h, v8.8b, v10.8b\n" |
| "umull v14.8h, v8.8b, v11.8b\n" |
| "umull v15.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #32]\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v4.4s, v17.8h\n" |
| "uadalp v5.4s, v13.8h\n" |
| "uadalp v6.4s, v14.8h\n" |
| "uadalp v7.4s, v15.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n" |
| "dup v11.4s, %w[scale]\n" |
| "dup v8.4s, v8.s[0]\n" |
| |
| // RowMajorOutput::Prepare |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v1.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v8.4s\n" |
| "add v1.4s, v1.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v1.4s, v1.4s, v10.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "fmul v0.4s, v0.4s, v11.4s\n" |
| "fmul v1.4s, v1.4s, v11.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s, v1.4s}, [%x[result]], #32\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n" |
| "ld1 {v4.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v5.8h, v4.8b, v2.8b\n" |
| "umull v6.8h, v4.8b, v3.8b\n" |
| "uadalp v0.4s, v5.8h\n" |
| "uadalp v1.4s, v6.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v2.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v2.4s\n" |
| "add v1.4s, v1.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| "fmul v1.4s, v1.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v1.s}[0], [x0], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n" |
| "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v7.8b, v4.8b\n" |
| "umull v10.8h, v6.8b, v5.8b\n" |
| "umull v11.8h, v7.8b, v5.8b\n" |
| "uadalp v0.4s, v8.8h\n" |
| "uadalp v1.4s, v9.8h\n" |
| "uadalp v2.4s, v10.8h\n" |
| "uadalp v3.4s, v11.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v7.4s, v4.s[0]\n" |
| "dup v4.4s, v4.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v2.4s, v2.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| "fmul v2.4s, v2.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v2.2s}, [x0], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n" |
| "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v8.8b, v6.8b\n" |
| "umull v12.8h, v9.8b, v6.8b\n" |
| "umull v13.8h, v10.8b, v6.8b\n" |
| "umull v14.8h, v8.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v7.8b\n" |
| "umull v16.8h, v10.8b, v7.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, %w[scale]\n" |
| "dup v9.4s, v6.s[0]\n" |
| "dup v6.4s, v6.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v3.4s, v3.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v3.4s, v3.4s, v7.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v3.4s, v3.4s\n" |
| "fmul v0.4s, v0.4s, v8.4s\n" |
| "fmul v3.4s, v3.4s, v8.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| "st1 {v3.2s}, [x0], #8\n" |
| "st1 {v3.s}[2], [x0], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| |
| // 2x4 lanes loop. |
| "1:" |
| |
| "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n" |
| "ld1 {v8.8b}, [%x[lhs]], #8\n" |
| "umull v14.8h, v8.8b, v10.8b\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v8.8b, v11.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v16.8h, v8.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v17.8h, v8.8b, v13.8b\n" |
| "umull v18.8h, v9.8b, v10.8b\n" |
| "uadalp v0.4s, v14.8h\n" |
| "uadalp v1.4s, v15.8h\n" |
| "uadalp v2.4s, v16.8h\n" |
| "umull v14.8h, v9.8b, v11.8b\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "uadalp v3.4s, v17.8h\n" |
| "uadalp v4.4s, v18.8h\n" |
| "uadalp v5.4s, v14.8h\n" |
| "uadalp v6.4s, v15.8h\n" |
| "uadalp v7.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v8.4s}, [%x[lhs]], #16\n" |
| "ld1 {v9.4s}, [%x[rhs]], #16\n" |
| "dup v10.4s, %w[scale]\n" |
| "dup v11.4s, v8.s[0]\n" |
| "dup v8.4s, v8.s[1]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v4.4s, v4.4s, v6.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v11.4s\n" |
| "add v4.4s, v4.4s, v8.4s\n" |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v4.4s, v4.4s, v9.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v4.4s, v4.4s\n" |
| "fmul v0.4s, v0.4s, v10.4s\n" |
| "fmul v4.4s, v4.4s, v10.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.4s}, [%x[result]], #16\n" |
| "st1 {v4.4s}, [x0], #16\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", |
| "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n" |
| "ld1 {v6.2s}, [%x[rhs]], #8\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v7.8h, v6.8b, v3.8b\n" |
| "umull v8.8h, v6.8b, v4.8b\n" |
| "umull v9.8h, v6.8b, v5.8b\n" |
| "uadalp v0.4s, v7.8h\n" |
| "uadalp v1.4s, v8.8h\n" |
| "uadalp v2.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v4.4s}, [%x[lhs]], #16\n" |
| "ld1 {v5.4s}, [%x[rhs]], #16\n" |
| "dup v6.4s, %w[scale]\n" |
| "dup v3.4s, v4.s[0]\n" |
| "dup v7.4s, v4.s[1]\n" |
| "dup v4.4s, v4.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v1.4s, v1.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v3.4s\n" |
| "add v1.4s, v1.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v4.4s\n" |
| "add v0.4s, v0.4s, v5.4s\n" |
| "add v1.4s, v1.4s, v5.4s\n" |
| "add v2.4s, v2.4s, v5.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v1.4s, v1.4s\n" |
| "scvtf v2.4s, v2.4s\n" |
| "fmul v0.4s, v0.4s, v6.4s\n" |
| "fmul v1.4s, v1.4s, v6.4s\n" |
| "fmul v2.4s, v2.4s, v6.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.s}[0], [%x[result]], #4\n" |
| "st1 {v1.s}[0], [x0], #4\n" |
| "st1 {v2.s}[0], [x1], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| |
| // General NxM lanes loop. |
| "1:" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n" |
| "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "umull v11.8h, v9.8b, v6.8b\n" |
| "umull v12.8h, v10.8b, v6.8b\n" |
| "umull v13.8h, v9.8b, v7.8b\n" |
| "umull v14.8h, v10.8b, v7.8b\n" |
| "umull v15.8h, v9.8b, v8.8b\n" |
| "umull v16.8h, v10.8b, v8.8b\n" |
| "uadalp v0.4s, v11.8h\n" |
| "uadalp v1.4s, v12.8h\n" |
| "uadalp v2.4s, v13.8h\n" |
| "uadalp v3.4s, v14.8h\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v6.4s}, [%x[lhs]], #16\n" |
| "ld1 {v7.4s}, [%x[rhs]], #16\n" |
| "dup v8.4s, %w[scale]\n" |
| "dup v9.4s, v6.s[0]\n" |
| "dup v10.4s, v6.s[1]\n" |
| "dup v6.4s, v6.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v0.4s, v0.4s, v0.4s\n" |
| "addp v2.4s, v2.4s, v3.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v4.4s, v4.4s, v5.4s\n" |
| "addp v4.4s, v4.4s, v4.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v9.4s\n" |
| "add v2.4s, v2.4s, v10.4s\n" |
| "add v4.4s, v4.4s, v6.4s\n" |
| "add v0.4s, v0.4s, v7.4s\n" |
| "add v2.4s, v2.4s, v7.4s\n" |
| "add v4.4s, v4.4s, v7.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v2.4s, v2.4s\n" |
| "scvtf v4.4s, v4.4s\n" |
| "fmul v0.4s, v0.4s, v8.4s\n" |
| "fmul v2.4s, v2.4s, v8.4s\n" |
| "fmul v4.4s, v4.4s, v8.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v2.2s}, [x0], #8\n" |
| "st1 {v4.2s}, [x1], #8\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory"); |
| } |
| |
| template <> |
| inline void MulKernel< |
| uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, |
| 8>::Multiply(const uint8_t* lhs, const uint8_t* rhs, |
| const FusedKernelParams<QuantizedStaticPreprocessedAsFloat, |
| RowMajor>& params, |
| float* result) { |
| #ifdef DEBUG |
| #ifdef DEBUG_METAGEMM_VERBOSE |
| std::cout << __FILE__ << "(" << __LINE__ |
| << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, " |
| "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, " |
| "8>::Multiply()" |
| << std::endl |
| << std::flush; |
| #endif |
| #endif |
| asm volatile( |
| "prfm pldl1keep, [%x[lhs]]\n" |
| "prfm pldl1keep, [%x[rhs]]\n" |
| |
| // Clear aggregators. |
| "movi v0.4s, #0\n" |
| "movi v1.4s, #0\n" |
| "movi v2.4s, #0\n" |
| "mov v3.16b, v0.16b\n" |
| "mov v4.16b, v1.16b\n" |
| "mov v5.16b, v2.16b\n" |
| "mov v6.16b, v3.16b\n" |
| "mov v7.16b, v4.16b\n" |
| "mov v8.16b, v5.16b\n" |
| |
| // 3x3 lanes loop. |
| "1:" |
| |
| "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n" |
| "ld1 {v9.8b}, [%x[lhs]], #8\n" |
| "umull v15.8h, v9.8b, v12.8b\n" |
| "ld1 {v10.8b}, [%x[lhs]], #8\n" |
| "umull v16.8h, v9.8b, v13.8b\n" |
| "ld1 {v11.8b}, [%x[lhs]], #8\n" |
| "umull v17.8h, v9.8b, v14.8b\n" |
| "prfm pldl1keep, [%x[lhs], #64]\n" |
| "umull v18.8h, v10.8b, v12.8b\n" |
| "prfm pldl1keep, [%x[rhs], #64]\n" |
| "uadalp v0.4s, v15.8h\n" |
| "uadalp v1.4s, v16.8h\n" |
| "uadalp v2.4s, v17.8h\n" |
| "uadalp v3.4s, v18.8h\n" |
| "umull v15.8h, v10.8b, v13.8b\n" |
| "umull v16.8h, v10.8b, v14.8b\n" |
| "umull v17.8h, v11.8b, v12.8b\n" |
| "umull v18.8h, v11.8b, v13.8b\n" |
| |
| // Subtract counter. |
| "subs %x[count], %x[count], #8\n" |
| |
| "umull v9.8h, v11.8b, v14.8b\n" |
| "uadalp v4.4s, v15.8h\n" |
| "uadalp v5.4s, v16.8h\n" |
| "uadalp v6.4s, v17.8h\n" |
| "uadalp v7.4s, v18.8h\n" |
| "uadalp v8.4s, v9.8h\n" |
| |
| // Loop break. |
| "bgt 1b\n" |
| |
| // StaticQuantizationFloat::Prepare |
| "ld1 {v9.4s}, [%x[lhs]], #16\n" |
| "ld1 {v10.4s}, [%x[rhs]], #16\n" |
| "dup v11.4s, %w[scale]\n" |
| "dup v12.4s, v9.s[0]\n" |
| "dup v13.4s, v9.s[1]\n" |
| "dup v9.4s, v9.s[2]\n" |
| |
| // RowMajorOutput::Prepare |
| "add x0, %x[result], %x[stride]\n" |
| "add x1, x0, %x[stride]\n" |
| |
| // Reduce aggregators. |
| "addp v0.4s, v0.4s, v1.4s\n" |
| "addp v2.4s, v2.4s, v2.4s\n" |
| "addp v0.4s, v0.4s, v2.4s\n" |
| "addp v3.4s, v3.4s, v4.4s\n" |
| "addp v5.4s, v5.4s, v5.4s\n" |
| "addp v3.4s, v3.4s, v5.4s\n" |
| "addp v6.4s, v6.4s, v7.4s\n" |
| "addp v8.4s, v8.4s, v8.4s\n" |
| "addp v6.4s, v6.4s, v8.4s\n" |
| |
| // StaticQuantizationFloat::Transform |
| "add v0.4s, v0.4s, v12.4s\n" |
| "add v3.4s, v3.4s, v13.4s\n" |
| "add v6.4s, v6.4s, v9.4s\n" |
| "add v0.4s, v0.4s, v10.4s\n" |
| "add v3.4s, v3.4s, v10.4s\n" |
| "add v6.4s, v6.4s, v10.4s\n" |
| "scvtf v0.4s, v0.4s\n" |
| "scvtf v3.4s, v3.4s\n" |
| "scvtf v6.4s, v6.4s\n" |
| "fmul v0.4s, v0.4s, v11.4s\n" |
| "fmul v3.4s, v3.4s, v11.4s\n" |
| "fmul v6.4s, v6.4s, v11.4s\n" |
| |
| // RowMajorOutput::Output |
| "st1 {v0.2s}, [%x[result]], #8\n" |
| "st1 {v0.s}[2], [%x[result]], #4\n" |
| "st1 {v3.2s}, [x0], #8\n" |
| "st1 {v3.s}[2], [x0], #4\n" |
| "st1 {v6.2s}, [x1], #8\n" |
| "st1 {v6.s}[2], [x1], #4\n" |
| : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result) |
| : [count] "r"(params.kernel.count), |
| [stride] "r"(params.output_stream.stride), |
| [scale] "r"(params.kernel.scale) |
| : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", |
| "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", |
| "memory"); |
| } |
| |
| } // namespace meta |
| } // namespace gemmlowp |
| |
| #else |
| #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!" |
| #endif |
| |
| #endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_ |