// internal/kernel.h - platform/external/gemmlowp - Git at Google

 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // kernel.h: general definitions for kernels.

 #ifndef GEMMLOWP_INTERNAL_KERNEL_H_
 #define GEMMLOWP_INTERNAL_KERNEL_H_

 #include "../public/bit_depth.h"
 #include "common.h"

 namespace gemmlowp {

 // Explanation of general gemmlowp terminology
 // ===========================================
 //
 // We use the following abbreviations:
 // LHS = "left-hand side"
 // RHS = "right-hand side"
 // Sometimes when referring to either LHS or RHS, we just say a "Side".
 //
 // In a matrix product of a MxK matrix times a KxN matrix,
 // we call K the 'depth'. Note that M is the number of rows
 // of the result (and of the LHS), and N is the number of columns
 // of the result (and of the RHS).
 //
 // In each of the LHS and RHS matrices, we call 'width' the
 // other dimension, besides the depth. So in the LHS, 'width'
 // is the number of rows, while in the RHS, 'width' is the number
 // of columns.
 //
 // So in the LHS MxK matrix, the depth is K and the width is M.
 // And in the RHS KxN matrix, the depth is K and the width is N.
 //
 // This is illustrated in this picture:
 //
 //                             RHS width
 //                        <----------------->
 //                        +-----------------+ ^
 //                        |       RHS       | | Depth
 //                        +-----------------+ v
 //                 ^ +--+ +-----------------+
 //                 | |L | |                 |
 //       LHS width | |H | |      Result     |
 //                 | |S | |                 |
 //                 v +--+ +-----------------+
 //                   <-->
 //                   Depth

 // Explanation of gemmlowp kernel formats and "cells"
 // ==================================================
 //
 // Kernels operate on small LHS and RHS blocks that fit in registers.
 // These blocks are stored contiguously in memory, but not always
 // in a traditional column-major or row-major order; instead,
 // they consist of a number of sub-blocks, which we call "cells",
 // that are stored in column-major or row-major order. However,
 // what really matters to us is not so much rows vs columns, but
 // rather width vs depth. So we refer to "width-major" and "depth-major"
 // storage orders. In the LHS, width-major means row-major,
 // while in the RHS, width-major means column-major.
 // There is also a third possibility, "diagonal order",
 // which is unused at the moment.
 //
 // We aim to treat both sides, LHS and RHS, on an equal footing,
 // so we call them both 'sides'. A KernelFormat thus is just a pair
 // of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
 // contains a CellFormat and a number of cells; cells are only ever
 // stacked in the width dimension, which means stacked vertically in the
 // LHS and stacked horizontally in the RHS.
 //
 // Example
 // =======
 //
 // Let's work out the data layout expected by a kernel having the
 // following format (the struct names here are defined below in this file):
 //
 // KernelFormat<
 //   KernelSideFormat<CellFormat<3, 4>, 3>,
 //   KernelSideFormat<CellFormat<5, 4>, 2>
 // >
 //
 // The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
 // 3 cells, each cell having dimensions (width=3, depth=4), laid out in
 // DepthMajor order (the default value, see CellFormat). In the LHS,
 // DepthMajor means column-major, so the LHS cells are of size 3x4 in
 // column-major order, so the LHS layout is:
 //
 // 0  3  6  9
 // 1  4  7  10
 // 2  5  8  11
 // 12 15 18 21
 // 13 16 19 22
 // 14 17 20 23
 // 24 27 30 33
 // 25 28 31 34
 // 26 29 32 35
 //
 // The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
 // 2 cells each having dimensions (width=5, depth=4), laid out in
 // DepthMajor order (the default value, see CellFormat). In the RHS,
 // DepthMajor means row-major, so the RHS cells are of size 4x5 in
 // row-major order, so the RHS layout is:
 //
 // 0  1  2  3  4  20 21 22 23 24
 // 5  6  7  8  9  25 26 27 28 29
 // 10 11 12 13 14 30 31 32 33 34
 // 15 16 17 18 19 35 36 37 38 39

 // CellOrder enumerates the possible storage orders (=layouts) for
 // a cell (see explanation above).
 enum class CellOrder {
   // Coefficient (w, d) is stored at offset w + d * kWidth within the cell.
   DepthMajor,
   // Coefficient (w, d) is stored at offset d + w * kDepth within the cell.
   WidthMajor,
   // Diagonal layout; OffsetIntoCell asserts that such cells are square.
   Diagonal
 };

 // CellFormat describes how data is laid
 // out in a cell. That is, a CellOrder together with actual dimensions.
 template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
 struct CellFormat {
   // These members are constexpr rather than plain const: under C++17 a
   // static constexpr data member is implicitly inline, so odr-using it
   // (e.g. binding it to a const reference) needs no out-of-class definition.

   // Extent of the cell along the width dimension.
   static constexpr int kWidth = tWidth;
   // Extent of the cell along the depth dimension.
   static constexpr int kDepth = tDepth;
   // Storage order of the coefficients inside the cell; DepthMajor unless
   // the third template argument says otherwise.
   static constexpr CellOrder kOrder = tOrder;

   // Total number of coefficients held by one cell.
   static constexpr int kSize = kWidth * kDepth;
 };

 // KernelSideFormat describes how data is laid out in a kernel side
 // (i.e. LHS or RHS). That is, a CellFormat together with a number of
 // cells. These cells are always stacked in the Width dimension.
 // For example, in the LHS case, the Width dimension is the rows dimension,
 // so we're saying that in the LHS, cells are stacked vertically.
 // We never stack cells in the Depth dimension.
 template <typename tCellFormat, int tCells>
 struct KernelSideFormat {
   // Format of each individual cell making up this side.
   using Cell = tCellFormat;

   // The integer members are constexpr rather than plain const: under C++17
   // they are implicitly inline, so odr-using them (e.g. binding one to a
   // const reference) needs no out-of-class definition.

   // Number of cells stacked along the width dimension.
   static constexpr int kCells = tCells;
   // Total width of the side: kCells cells of Cell::kWidth each.
   static constexpr int kWidth = kCells * Cell::kWidth;
   // Depth of the side; identical to the depth of a single cell.
   static constexpr int kDepth = Cell::kDepth;

   using Scalar = std::uint8_t;       // The scalar type of the Format.
   using InputScalar = std::uint8_t;  // The scalar type of the original input.
 };

 // KernelSideFormat for int8 fast kernel trick. The original input is uint8, but
 // packing converts it to int8.
 template <typename tCellFormat, int tCells>
 struct KernelSideFormatInt8 : public KernelSideFormat<tCellFormat, tCells> {
   // Scalar type of the format itself: int8, hiding the base's uint8 Scalar.
   using Scalar = std::int8_t;
   // Scalar type of the original input: still uint8.
   using InputScalar = std::uint8_t;
 };

 // KernelSideFormat for int8 inputs, enabling int8 fast kernel trick without
 // pack conversion.
 template <typename tCellFormat, int tCells>
 struct KernelSideFormatInt8Inputs
     : public KernelSideFormat<tCellFormat, tCells> {
   // Scalar type of the format itself: int8, hiding the base's uint8 Scalar.
   using Scalar = std::int8_t;
   // Scalar type of the original input: int8 as well, hiding the base's
   // uint8 InputScalar.
   using InputScalar = std::int8_t;
 };

 // KernelFormat describes fully the input data layout that a kernel expects.
 // It consists of two KernelSideFormat's, one for LHS and one for RHS.
 template <typename tLhs, typename tRhs>
 struct KernelFormat {
   // Side format of the left-hand side operand.
   using Lhs = tLhs;
   // Side format of the right-hand side operand.
   using Rhs = tRhs;

   // Both sides share the depth dimension, so their cells must agree on it.
   static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth,
                 "KernelFormat: LHS and RHS cells must have the same depth");

   // The members below are constexpr rather than plain const: under C++17
   // they are implicitly inline, so odr-using them needs no out-of-class
   // definition.

   // Common depth of the LHS and RHS cells.
   static constexpr int kDepth = Lhs::Cell::kDepth;
   // Total LHS width, i.e. the number of result rows covered by the kernel.
   static constexpr int kRows = Lhs::Cell::kWidth * Lhs::kCells;
   // Total RHS width, i.e. the number of result columns covered by the kernel.
   static constexpr int kCols = Rhs::Cell::kWidth * Rhs::kCells;
 };

 // Returns the enumerator name of `o` as a string literal. A value that is
 // not one of the three enumerators trips an assert; when asserts are
 // compiled out, nullptr is returned instead.
 inline const char* CellOrderName(CellOrder o) {
   if (o == CellOrder::DepthMajor) {
     return "DepthMajor";
   }
   if (o == CellOrder::WidthMajor) {
     return "WidthMajor";
   }
   if (o == CellOrder::Diagonal) {
     return "Diagonal";
   }
   // Not a known enumerator.
   assert(false);
   return nullptr;
 }

 // Returns the offset into a cell, at which a given coefficient is stored.
 template <typename CellFormat>
 inline int OffsetIntoCell(int w, int d) {
   const int size = CellFormat::kWidth;
   switch (CellFormat::kOrder) {
     case CellOrder::DepthMajor:
       return w + d * CellFormat::kWidth;
     case CellOrder::WidthMajor:
       return d + w * CellFormat::kDepth;
     case CellOrder::Diagonal:
       assert(CellFormat::kWidth == CellFormat::kDepth);
       return ((size + w - d) * size + d) % (size * size);
     default:
       assert(false);
       return 0;
   }
 }

 // KernelBase is the virtual base class below all kernels.
 // The idea is that we don't need to templatize all our code on the exact
 // kernel type; we only need to templatize on kernel format. Kernels
 // sharing the same format can thus share the same packing/unpacking code.
 struct KernelBase {
   // Name of the concrete kernel, supplied by the derived class.
   virtual const char* Name() const = 0;

   // The kernel implementation proper. Across gemmlowp, 'run' denotes an
   // inner loop whose body is supplied by a separate optimized function.
   // NOTE(review): the parameter meanings are not spelled out here; judging
   // by the names, dst_* describe the int32 destination and its strides,
   // lhs_ptr/rhs_ptr the two operand blocks, and start_depth/run_depth the
   // depth range to process -- confirm against a concrete kernel.
   virtual void Run(std::int32_t* dst_ptr,
                    std::size_t dst_row_stride,
                    std::size_t dst_col_stride,
                    const std::uint8_t* lhs_ptr,
                    const std::uint8_t* rhs_ptr,
                    std::size_t start_depth,
                    std::size_t run_depth) const = 0;

   // Virtual, so that deleting a kernel through a KernelBase pointer is safe.
   virtual ~KernelBase() = default;
 };

 // Maps a pair (input scalar type, kernel scalar type) to a constant kValue.
 // The primary template is deliberately empty: only the pairs specialized
 // below provide kValue, so any other pair fails to compile when kValue is
 // used.
 // NOTE(review): presumably kValue is the input-side value standing for
 // zero in the kernel's scalar type -- confirm against the packing code.
 template <class InputScalarT, class KernelScalarT>
 struct ZeroPointInputValue {};

 // uint8 input feeding a uint8 kernel.
 template <>
 struct ZeroPointInputValue<std::uint8_t, std::uint8_t> {
   static constexpr std::uint8_t kValue = 0;
 };

 // int8 input feeding an int8 kernel.
 template <>
 struct ZeroPointInputValue<std::int8_t, std::int8_t> {
   static constexpr std::uint8_t kValue = 0;
 };

 // uint8 input feeding an int8 kernel: the only pair with a nonzero value.
 template <>
 struct ZeroPointInputValue<std::uint8_t, std::int8_t> {
   static constexpr std::uint8_t kValue = 128;
 };

 }  // namespace gemmlowp

 #endif  // GEMMLOWP_INTERNAL_KERNEL_H_
	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// kernel.h: general definitions for kernels.

	#ifndef GEMMLOWP_INTERNAL_KERNEL_H_
	#define GEMMLOWP_INTERNAL_KERNEL_H_

	#include "../public/bit_depth.h"
	#include "common.h"

	namespace gemmlowp {

	// Explanation of general gemmlowp terminology
	// ===========================================
	//
	// We use the following abbreviations:
	// LHS = "left-hand side"
	// RHS = "right-hand side"
	// Sometimes when referring to either LHS or RHS, we just say a "Side".
	//
	// In a matrix product of a MxK matrix times a KxN matrix,
	// we call K the 'depth'. Note that M is the number of rows
	// of the result (and of the LHS), and N is the number of columns
	// of the result (and of the RHS).
	//
	// In each of the LHS and RHS matrices, we call 'width' the
	// other dimension, besides the depth. So in the LHS, 'width'
	// is the number of rows, while in the RHS, 'width' is the number
	// of columns.
	//
	// So in the LHS MxK matrix, the depth is K and the width is M.
	// And in the RHS KxN matrix, the depth is K and the width is N.
	//
	// This is illustrated in this picture:
	//
	//                             RHS width
	//                        <----------------->
	//                        +-----------------+ ^
	//                        |       RHS       | | Depth
	//                        +-----------------+ v
	//                 ^ +--+ +-----------------+
	//                 | |L | |                 |
	//       LHS width | |H | |      Result     |
	//                 | |S | |                 |
	//                 v +--+ +-----------------+
	//                   <-->
	//                   Depth

	// Explanation of gemmlowp kernel formats and "cells"
	// ==================================================
	//
	// Kernels operate on small LHS and RHS blocks that fit in registers.
	// These blocks are stored contiguously in memory, but not always
	// in a traditional column-major or row-major order; instead,
	// they consist of a number of sub-blocks, which we call "cells",
	// that are stored in column-major or row-major order. However,
	// what really matters to us is not so much rows vs columns, but
	// rather width vs depth. So we refer to "width-major" and "depth-major"
	// storage orders. In the LHS, width-major means row-major,
	// while in the RHS, width-major means column-major.
	// There is also a third possibility, "diagonal order",
	// which is unused at the moment.
	//
	// We aim to treat both sides, LHS and RHS, on an equal footing,
	// so we call them both 'sides'. A KernelFormat thus is just a pair
	// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
	// contains a CellFormat and a number of cells; cells are only ever
	// stacked in the width dimension, which means stacked vertically in the
	// LHS and stacked horizontally in the RHS.
	//
	// Example
	// =======
	//
	// Let's work out the data layout expected by a kernel having the
	// following format (the struct names here are defined below in this file):
	//
	// KernelFormat<
	//   KernelSideFormat<CellFormat<3, 4>, 3>,
	//   KernelSideFormat<CellFormat<5, 4>, 2>
	// >
	//
	// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
	// 3 cells, each cell having dimensions (width=3, depth=4), laid out in
	// DepthMajor order (the default value, see CellFormat). In the LHS,
	// DepthMajor means column-major, so the LHS cells are of size 3x4 in
	// column-major order, so the LHS layout is:
	//
	// 0  3  6  9
	// 1  4  7  10
	// 2  5  8  11
	// 12 15 18 21
	// 13 16 19 22
	// 14 17 20 23
	// 24 27 30 33
	// 25 28 31 34
	// 26 29 32 35
	//
	// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
	// 2 cells each having dimensions (width=5, depth=4), laid out in
	// DepthMajor order (the default value, see CellFormat). In the RHS,
	// DepthMajor means row-major, so the RHS cells are of size 4x5 in
	// row-major order, so the RHS layout is:
	//
	// 0  1  2  3  4  20 21 22 23 24
	// 5  6  7  8  9  25 26 27 28 29
	// 10 11 12 13 14 30 31 32 33 34
	// 15 16 17 18 19 35 36 37 38 39

	// CellOrder enumerates the possible storage orders (=layouts) for
	// a cell (see explanation above).
	enum class CellOrder {
	  // Coefficient (w, d) is stored at offset w + d * kWidth within the cell.
	  DepthMajor,
	  // Coefficient (w, d) is stored at offset d + w * kDepth within the cell.
	  WidthMajor,
	  // Diagonal layout; OffsetIntoCell asserts that such cells are square.
	  Diagonal
	};

	// CellFormat describes how data is laid
	// out in a cell. That is, a CellOrder together with actual dimensions.
	template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
	struct CellFormat {
	  // These members are constexpr rather than plain const: under C++17 a
	  // static constexpr data member is implicitly inline, so odr-using it
	  // (e.g. binding it to a const reference) needs no out-of-class definition.

	  // Extent of the cell along the width dimension.
	  static constexpr int kWidth = tWidth;
	  // Extent of the cell along the depth dimension.
	  static constexpr int kDepth = tDepth;
	  // Storage order of the coefficients inside the cell; DepthMajor unless
	  // the third template argument says otherwise.
	  static constexpr CellOrder kOrder = tOrder;

	  // Total number of coefficients held by one cell.
	  static constexpr int kSize = kWidth * kDepth;
	};

	// KernelSideFormat describes how data is laid out in a kernel side
	// (i.e. LHS or RHS). That is, a CellFormat together with a number of
	// cells. These cells are always stacked in the Width dimension.
	// For example, in the LHS case, the Width dimension is the rows dimension,
	// so we're saying that in the LHS, cells are stacked vertically.
	// We never stack cells in the Depth dimension.
	template <typename tCellFormat, int tCells>
	struct KernelSideFormat {
	  // Format of each individual cell making up this side.
	  using Cell = tCellFormat;

	  // The integer members are constexpr rather than plain const: under C++17
	  // they are implicitly inline, so odr-using them (e.g. binding one to a
	  // const reference) needs no out-of-class definition.

	  // Number of cells stacked along the width dimension.
	  static constexpr int kCells = tCells;
	  // Total width of the side: kCells cells of Cell::kWidth each.
	  static constexpr int kWidth = kCells * Cell::kWidth;
	  // Depth of the side; identical to the depth of a single cell.
	  static constexpr int kDepth = Cell::kDepth;

	  using Scalar = std::uint8_t;       // The scalar type of the Format.
	  using InputScalar = std::uint8_t;  // The scalar type of the original input.
	};

	// KernelSideFormat for int8 fast kernel trick. The original input is uint8, but
	// packing converts it to int8.
	template <typename tCellFormat, int tCells>
	struct KernelSideFormatInt8 : public KernelSideFormat<tCellFormat, tCells> {
	  // Scalar type of the format itself: int8, hiding the base's uint8 Scalar.
	  using Scalar = std::int8_t;
	  // Scalar type of the original input: still uint8.
	  using InputScalar = std::uint8_t;
	};

	// KernelSideFormat for int8 inputs, enabling int8 fast kernel trick without
	// pack conversion.
	template <typename tCellFormat, int tCells>
	struct KernelSideFormatInt8Inputs
	    : public KernelSideFormat<tCellFormat, tCells> {
	  // Scalar type of the format itself: int8, hiding the base's uint8 Scalar.
	  using Scalar = std::int8_t;
	  // Scalar type of the original input: int8 as well, hiding the base's
	  // uint8 InputScalar.
	  using InputScalar = std::int8_t;
	};

	// KernelFormat describes fully the input data layout that a kernel expects.
	// It consists of two KernelSideFormat's, one for LHS and one for RHS.
	template <typename tLhs, typename tRhs>
	struct KernelFormat {
	  // Side format of the left-hand side operand.
	  using Lhs = tLhs;
	  // Side format of the right-hand side operand.
	  using Rhs = tRhs;

	  // Both sides share the depth dimension, so their cells must agree on it.
	  static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth,
	                "KernelFormat: LHS and RHS cells must have the same depth");

	  // The members below are constexpr rather than plain const: under C++17
	  // they are implicitly inline, so odr-using them needs no out-of-class
	  // definition.

	  // Common depth of the LHS and RHS cells.
	  static constexpr int kDepth = Lhs::Cell::kDepth;
	  // Total LHS width, i.e. the number of result rows covered by the kernel.
	  static constexpr int kRows = Lhs::Cell::kWidth * Lhs::kCells;
	  // Total RHS width, i.e. the number of result columns covered by the kernel.
	  static constexpr int kCols = Rhs::Cell::kWidth * Rhs::kCells;
	};

	// Returns the enumerator name of `o` as a string literal. A value that is
	// not one of the three enumerators trips an assert; when asserts are
	// compiled out, nullptr is returned instead.
	inline const char* CellOrderName(CellOrder o) {
	  if (o == CellOrder::DepthMajor) {
	    return "DepthMajor";
	  }
	  if (o == CellOrder::WidthMajor) {
	    return "WidthMajor";
	  }
	  if (o == CellOrder::Diagonal) {
	    return "Diagonal";
	  }
	  // Not a known enumerator.
	  assert(false);
	  return nullptr;
	}

	// Returns the offset into a cell, at which a given coefficient is stored.
	template <typename CellFormat>
	inline int OffsetIntoCell(int w, int d) {
	const int size = CellFormat::kWidth;
	switch (CellFormat::kOrder) {
	case CellOrder::DepthMajor:
	return w + d * CellFormat::kWidth;
	case CellOrder::WidthMajor:
	return d + w * CellFormat::kDepth;
	case CellOrder::Diagonal:
	assert(CellFormat::kWidth == CellFormat::kDepth);
	return ((size + w - d) * size + d) % (size * size);
	default:
	assert(false);
	return 0;
	}
	}

	// KernelBase is the virtual base class below all kernels.
	// The idea is that we don't need to templatize all our code on the exact
	// kernel type; we only need to templatize on kernel format. Kernels
	// sharing the same format can thus share the same packing/unpacking code.
	struct KernelBase {
	  // Name of the concrete kernel, supplied by the derived class.
	  virtual const char* Name() const = 0;

	  // The kernel implementation proper. Across gemmlowp, 'run' denotes an
	  // inner loop whose body is supplied by a separate optimized function.
	  // NOTE(review): the parameter meanings are not spelled out here; judging
	  // by the names, dst_* describe the int32 destination and its strides,
	  // lhs_ptr/rhs_ptr the two operand blocks, and start_depth/run_depth the
	  // depth range to process -- confirm against a concrete kernel.
	  virtual void Run(std::int32_t* dst_ptr,
	                   std::size_t dst_row_stride,
	                   std::size_t dst_col_stride,
	                   const std::uint8_t* lhs_ptr,
	                   const std::uint8_t* rhs_ptr,
	                   std::size_t start_depth,
	                   std::size_t run_depth) const = 0;

	  // Virtual, so that deleting a kernel through a KernelBase pointer is safe.
	  virtual ~KernelBase() = default;
	};

	// Maps a pair (input scalar type, kernel scalar type) to a constant kValue.
	// The primary template is deliberately empty: only the pairs specialized
	// below provide kValue, so any other pair fails to compile when kValue is
	// used.
	// NOTE(review): presumably kValue is the input-side value standing for
	// zero in the kernel's scalar type -- confirm against the packing code.
	template <class InputScalarT, class KernelScalarT>
	struct ZeroPointInputValue {};

	// uint8 input feeding a uint8 kernel.
	template <>
	struct ZeroPointInputValue<std::uint8_t, std::uint8_t> {
	  static constexpr std::uint8_t kValue = 0;
	};

	// int8 input feeding an int8 kernel.
	template <>
	struct ZeroPointInputValue<std::int8_t, std::int8_t> {
	  static constexpr std::uint8_t kValue = 0;
	};

	// uint8 input feeding an int8 kernel: the only pair with a nonzero value.
	template <>
	struct ZeroPointInputValue<std::uint8_t, std::int8_t> {
	  static constexpr std::uint8_t kValue = 128;
	};

	} // namespace gemmlowp

	#endif // GEMMLOWP_INTERNAL_KERNEL_H_