// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.

#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_

#include "internal/unpack.h"
#include <arm_neon.h>

namespace gemmlowp {

template <typename ResultBlockType, typename PackedResult,
typename KernelLhsFormat, typename KernelRhsFormat>
void UnpackResultImplNEON(ResultBlockType* dst, const PackedResult& src,
const PackedSideBlock<KernelLhsFormat>& packed_lhs,
const PackedSideBlock<KernelRhsFormat>& packed_rhs,
int depth, std::int32_t result_offset,
std::int32_t result_mult_int,
std::int32_t result_shift) {
ScopedProfilingLabel label("optimized kernel");
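// Each destination entry is computed as:
//   acc = src(r, c) + packed_lhs.rank_one_update()[r]
//         + packed_rhs.rank_one_update()[c] + rank0update + result_offset
//   dst(r, c) = saturate_to_uint8(
//       (acc * result_mult_int + (1 << (result_shift - 1))) >> result_shift)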
std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() *
packed_rhs.rank_one_update_multiplier() * depth;
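// Bias added before the final right shift so that the shift rounds
// instead of truncating.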
std::int32_t preshift_offset = 1 << (result_shift - 1);
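// NEON shifts by a register amount are left shifts; shifting left by
// -result_shift gives the desired right shift.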
int32x4_t shift_reg = vdupq_n_s32(-result_shift);
for (int c = 0; c < dst->cols(); c++) {
std::uint8_t* dst_ptr = dst->data(0, c);
const std::int32_t* src_ptr = src.data(0, c);
const std::int32_t* rank_one_update_ptr = packed_lhs.rank_one_update();
std::int32_t rank1update = packed_rhs.rank_one_update()[c];
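// Fold the per-column RHS rank one update, the rank-zero update and the
// result offset into a single per-column addend.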
std::int32_t constant_offset = rank1update + rank0update + result_offset;
int dst_rows_aligned4 = RoundDown<4>(dst->rows());
int dst_rows_aligned16 = RoundDown<16>(dst->rows());
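// First, handle as many groups of 16 entries at once as possible,
// using inline assembly.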
if (dst_rows_aligned16) {
std::uint8_t* dst_end_ptr = dst_ptr + dst_rows_aligned16;
asm volatile(
"vdup.32 q12, %[constant_offset]\n"
"vdup.32 q13, %[preshift_offset]\n"
"neg r3, %[result_shift]\n"
"vdup.32 q14, r3\n"
"vdup.32 q15, %[result_mult_int]\n"
"loop_UnpackResultImplNEON_%=:\n"
// Load a 16x1 block of the packed result matrix
// (so 16 contiguous entries in one column).
"vld1.32 {d0, d1, d2, d3}, [%[src_ptr]]!\n"
"vld1.32 {d4, d5, d6, d7}, [%[src_ptr]]!\n"
// Load entries of the LHS rank one update vector.
"vld1.32 {d8, d9, d10, d11}, "
"[%[rank_one_update_ptr]:256]!\n"
"vld1.32 {d12, d13, d14, d15}, "
"[%[rank_one_update_ptr]:256]!\n"
// Apply the LHS rank one update.
"vadd.s32 q0, q0, q4\n"
"vadd.s32 q1, q1, q5\n"
"vadd.s32 q2, q2, q6\n"
"vadd.s32 q3, q3, q7\n"
// Add the constant offset
// (which includes the RHS rank one update, see above).
"vadd.s32 q0, q0, q12\n"
"vadd.s32 q1, q1, q12\n"
"vadd.s32 q2, q2, q12\n"
"vadd.s32 q3, q3, q12\n"
// Multiply by the result multiplier
"vmul.s32 q0, q0, q15\n"
"vmul.s32 q1, q1, q15\n"
"vmul.s32 q2, q2, q15\n"
"vmul.s32 q3, q3, q15\n"
// Add the pre-shift offset (so that the shift is rounding)
"vadd.s32 q0, q0, q13\n"
"vadd.s32 q1, q1, q13\n"
"vadd.s32 q2, q2, q13\n"
"vadd.s32 q3, q3, q13\n"
// Shift right (shift left by negative offset).
"vshl.s32 q0, q0, q14\n"
"vshl.s32 q1, q1, q14\n"
"vshl.s32 q2, q2, q14\n"
"vshl.s32 q3, q3, q14\n"
// So far we had signed 32bit values; now narrow them down to
// signed 16bit and then to unsigned 8bit, saturating at each step.
"vqmovn.s32 d8, q0\n"
"vqmovn.s32 d9, q1\n"
"vqmovn.s32 d10, q2\n"
"vqmovn.s32 d11, q3\n"
"vqmovun.s16 d0, q4\n"
"vqmovun.s16 d1, q5\n"
// Store result into the destination matrix.
"vst1.8 {d0, d1}, [%[dst_ptr]]!\n"
// End of the loop.
"cmp %[dst_ptr], %[dst_end_ptr]\n"
"bne loop_UnpackResultImplNEON_%=\n"
: // outputs
[dst_ptr] "+r"(dst_ptr), [src_ptr] "+r"(src_ptr),
[rank_one_update_ptr] "+r"(rank_one_update_ptr)
: // inputs
[dst_end_ptr] "r"(dst_end_ptr),
[constant_offset] "r"(constant_offset),
[result_mult_int] "r"(result_mult_int),
[preshift_offset] "r"(preshift_offset),
[result_shift] "r"(result_shift)
: // clobbers
"cc", "memory", "r3",
// Note: quad (q) registers are reportedly not supported in the
// clobber list, so we list the corresponding d registers instead.
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
"d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
"d31");
}
// We have finished handling groups of 16 entries at once; now
// try to handle 4 entries at once.
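// This mirrors the arithmetic of the assembly loop above, using intrinsics.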
for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
int32x4_t q = vld1q_s32(src_ptr);
q = vaddq_s32(q, vld1q_s32(rank_one_update_ptr));
q = vaddq_s32(q, vdupq_n_s32(constant_offset));
q = vmulq_n_s32(q, result_mult_int);
q = vaddq_s32(q, vdupq_n_s32(preshift_offset));
q = vshlq_s32(q, shift_reg);
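// Narrow to signed 16bit, then saturate down to unsigned 8bit. The 4
// values are duplicated into both halves of q_8; only the low 4 bytes
// get stored.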
int16x4_t q_16 = vqmovn_s32(q);
uint8x8_t q_8 = vqmovun_s16(vcombine_s16(q_16, q_16));
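// Store the 4 resulting bytes as a single 32bit lane.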
vst1_lane_u32(reinterpret_cast<std::uint32_t*>(dst_ptr),
vreinterpret_u32_u8(q_8), 0);
dst_ptr += 4;
src_ptr += 4;
rank_one_update_ptr += 4;
}
// We have finished handling 4 entries at once; now handle
// remaining entries one by one.
for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
std::int32_t q = src(r, c);
q += packed_lhs.rank_one_update()[r] + rank1update + rank0update;
q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >>
result_shift;
(*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q;
}
}
}
} // namespace gemmlowp
#endif // GEMMLOWP_INTERNAL_UNPACK_NEON_H_