blob: e04f0737ad9c8210a56bdab2868759828d07ecbd [file]
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "LSTM.h"
#include "CpuExecutor.h"
#include "HalInterfaces.h"
namespace android {
namespace nn {
// TODO: move the kernels to a separate file as soon as we have the
// optimized version ready.
namespace {
// Reads the first element of an operand's buffer, interpreting the buffer as an
// array of T. The caller is responsible for picking a T that matches the
// operand's actual storage.
template <typename T>
T getScalarData(RunTimeOperandInfo& info) {
    return *reinterpret_cast<T*>(info.buffer);
}
// Clamps f into the range [-abs_limit, +abs_limit]: the upper bound is applied
// first, then the lower bound is applied to that intermediate value.
inline float Clip(float f, float abs_limit) {
    float clamped = f;
    if (abs_limit < f) {
        clamped = abs_limit;
    }
    if (-abs_limit > clamped) {
        clamped = -abs_limit;
    }
    return clamped;
}
// Multiplies a row-major m_rows x m_cols matrix by each of the n_batch input
// vectors (m_cols floats apiece, stored back to back) and adds the products
// into result, which holds m_rows floats per batch. Because this accumulates,
// result must already contain valid values (e.g. the bias) on entry.
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
    for (int batch = 0; batch < n_batch; ++batch) {
        float* output_slice = result + batch * m_rows;
        const float* weights_row = matrix;
        for (int row = 0; row < m_rows; ++row) {
            const float* input_slice = vector + batch * m_cols;
            for (int col = 0; col < m_cols; ++col) {
                output_slice[row] += weights_row[col] * input_slice[col];
            }
            // Step to the next matrix row.
            weights_row += m_cols;
        }
    }
}
// Element-wise product: result[i] = vector1[i] * vector2[i] for the first
// v_size elements. Each output element is written only after the matching
// inputs are read, so result may be the same array as either input.
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result) {
    for (int i = 0; i < v_size; ++i) {
        result[i] = vector1[i] * vector2[i];
    }
}
// Element-wise multiply-accumulate: result[i] += vector1[i] * vector2[i] for
// the first v_size elements. As a MAC operation it requires result to hold
// valid values on entry.
void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result) {
    for (int i = 0; i < v_size; ++i) {
        result[i] += vector1[i] * vector2[i];
    }
}
// Multiply-accumulates a single vector against every vector of a batch:
// for each batch b and element v,
//   result[b * v_size + v] += vector[v] * batch_vector[b * v_size + v].
// As a MAC operation it requires result to hold valid values on entry.
void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
                                             const float* batch_vector,
                                             int n_batch, float* result) {
    const float* batch_slice = batch_vector;
    float* result_slice = result;
    for (int batch = 0; batch < n_batch; ++batch) {
        for (int i = 0; i < v_size; ++i) {
            result_slice[i] += vector[i] * batch_slice[i];
        }
        // Move both cursors on to the next batch entry.
        batch_slice += v_size;
        result_slice += v_size;
    }
}
// Fills a batch vector by copying the same v_size-element vector into each of
// its n_batch consecutive slots.
void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
                             float* batch_vector) {
    const auto bytes_per_slot = v_size * sizeof(float);
    float* slot = batch_vector;
    for (int batch = 0; batch < n_batch; ++batch, slot += v_size) {
        memcpy(slot, vector, bytes_per_slot);
    }
}
// Apply sigmoid to elements of a vector.
void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
auto sigmoid_func = ActivationFunctor(kActivationSigmoid);
for (int v = 0; v < v_size; v++) {
*result++ = (sigmoid_func)(*vector++);
}
}
// Apply activation function to elements of a vector.
void ApplyActivationToVector(const float* vector, int v_size,
ActivationFn activation, float* result) {
auto activation_func = ActivationFunctor(activation);
for (int v = 0; v < v_size; v++) {
*result++ = (activation_func)(*vector++);
}
}
// Copies the first v_size floats of vector into result with memcpy, so the two
// ranges must not overlap.
inline void CopyVector(const float* vector, int v_size, float* result) {
    const auto byte_count = v_size * sizeof(float);
    memcpy(result, vector, byte_count);
}
// Writes result[i] = 1.0f - vector[i] for the first v_size elements (used by
// the CIFG variant). Works in place when result and vector are the same array.
void Sub1Vector(const float* vector, int v_size, float* result) {
    for (int i = 0; i < v_size; ++i) {
        result[i] = 1.0f - vector[i];
    }
}
// Clears the first v_size floats of vector by setting all of their bytes to
// zero, which leaves every element equal to 0.f.
void ZeroVector(float* vector, int v_size) {
    const auto byte_count = v_size * sizeof(float);
    memset(vector, 0, byte_count);
}
// Clamps each of the first v_size elements of vector to
// [-abs_limit, +abs_limit] via Clip() and stores it at the same index of
// result. Works in place when result and vector are the same array.
void ClipVector(const float* vector, int v_size, float abs_limit,
                float* result) {
    for (int i = 0; i < v_size; ++i) {
        result[i] = Clip(vector[i], abs_limit);
    }
}
// Returns the operand's buffer viewed as a mutable array of T. No type or size
// checking is performed.
template <typename T>
inline T* GetBuffer(RunTimeOperandInfo* operand) {
    auto* const raw = operand->buffer;
    return reinterpret_cast<T*>(raw);
}
// Returns the operand's buffer viewed as a read-only array of T. No type or
// size checking is performed.
template <typename T>
inline const T* GetBuffer(const RunTimeOperandInfo* operand) {
    auto* const raw = operand->buffer;
    return reinterpret_cast<const T*>(raw);
}
} // anonymous namespace
// Binds the cell to the operands referenced by `operation`: caches a pointer
// to every input and output tensor and reads the three scalar parameters
// (activation, cell clip, projection clip). No tensor data is copied; the
// cached pointers refer to elements of `operands`.
LSTMCell::LSTMCell(const Operation& operation,
                   std::vector<RunTimeOperandInfo>& operands) {
    // Looks up the operand wired to input slot `slot`. Yields nullptr when the
    // stored operand index is negative once converted to int.
    const auto inputAt = [&](uint32_t slot) -> const RunTimeOperandInfo* {
        const int operandIndex = operation.inputs[slot];
        return (operandIndex < 0) ? nullptr : &operands[operandIndex];
    };
    // Looks up the operand wired to output slot `slot`; the index is expected
    // to be in range.
    const auto outputAt = [&](uint32_t slot) -> RunTimeOperandInfo* {
        const int operandIndex = operation.outputs[slot];
        return &operands[operandIndex];
    };
    // Looks up the operand that carries a scalar parameter.
    const auto scalarAt = [&](uint32_t slot) -> RunTimeOperandInfo& {
        return operands[operation.inputs[slot]];
    };

    input_ = inputAt(kInputTensor);

    // Input-to-gate weights.
    input_to_input_weights_ = inputAt(kInputToInputWeightsTensor);  // optional
    input_to_forget_weights_ = inputAt(kInputToForgetWeightsTensor);
    input_to_cell_weights_ = inputAt(kInputToCellWeightsTensor);
    input_to_output_weights_ = inputAt(kInputToOutputWeightsTensor);

    // Recurrent-to-gate weights.
    recurrent_to_input_weights_ =
            inputAt(kRecurrentToInputWeightsTensor);  // optional
    recurrent_to_forget_weights_ = inputAt(kRecurrentToForgetWeightsTensor);
    recurrent_to_cell_weights_ = inputAt(kRecurrentToCellWeightsTensor);
    recurrent_to_output_weights_ = inputAt(kRecurrentToOutputWeightsTensor);

    // Cell-to-gate weights.
    cell_to_input_weights_ = inputAt(kCellToInputWeightsTensor);    // optional
    cell_to_forget_weights_ = inputAt(kCellToForgetWeightsTensor);  // optional
    cell_to_output_weights_ = inputAt(kCellToOutputWeightsTensor);  // optional

    // Gate biases.
    input_gate_bias_ = inputAt(kInputGateBiasTensor);
    forget_gate_bias_ = inputAt(kForgetGateBiasTensor);
    cell_bias_ = inputAt(kCellGateBiasTensor);
    output_gate_bias_ = inputAt(kOutputGateBiasTensor);

    // Projection layer.
    projection_weights_ = inputAt(kProjectionWeightsTensor);  // optional
    projection_bias_ = inputAt(kProjectionBiasTensor);        // optional

    // Scalar parameters.
    params_.activation_ = static_cast<ActivationFn>(
            getScalarData<int32_t>(scalarAt(kActivationParam)));
    params_.cell_clip_ = getScalarData<float>(scalarAt(kCellClipParam));
    params_.proj_clip_ = getScalarData<float>(scalarAt(kProjClipParam));

    // Outputs.
    output_state_ = outputAt(kOutputStateTensor);
    cell_state_ = outputAt(kCellStateTensor);
    output_ = outputAt(kOutputTensor);
    scratch_buffer_ = outputAt(kScratchBufferTensor);
}
bool LSTMCell::Eval() {
const uint32_t n_batch = input_->shape().dimensions[0];
const uint32_t n_input = input_->shape().dimensions[1];
// n_cell and n_output will be the same size when there is no projection.
const uint32_t n_cell = input_to_output_weights_->shape().dimensions[0];
const uint32_t n_output = recurrent_to_output_weights_->shape().dimensions[1];
// Since we have already checked that weights are all there or none, we can
// check the existence of only one to the get the condition.
const bool use_cifg = (input_to_input_weights_->buffer == nullptr);
const bool use_peephole = (cell_to_output_weights_->buffer != nullptr);
// Index the scratch buffers pointers to the global scratch buffer.
float* input_gate_scratch = nullptr;
float* cell_scratch = nullptr;
float* forget_gate_scratch = nullptr;
float* output_gate_scratch = nullptr;
if (use_cifg) {
cell_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
forget_gate_scratch = cell_scratch + n_cell * n_batch;
output_gate_scratch = cell_scratch + 2 * n_cell * n_batch;
} else {
input_gate_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
cell_scratch = input_gate_scratch + n_cell * n_batch;
forget_gate_scratch = input_gate_scratch + 2 * n_cell * n_batch;
output_gate_scratch = input_gate_scratch + 3 * n_cell * n_batch;
}
// Initialize scratch buffers with bias.
if (!use_cifg) {
VectorBatchVectorAssign(GetBuffer<float>(input_gate_bias_), n_cell, n_batch,
input_gate_scratch);
}
VectorBatchVectorAssign(GetBuffer<float>(forget_gate_bias_), n_cell, n_batch,
forget_gate_scratch);
VectorBatchVectorAssign(GetBuffer<float>(cell_bias_), n_cell, n_batch,
cell_scratch);
VectorBatchVectorAssign(GetBuffer<float>(output_gate_bias_), n_cell, n_batch,
output_gate_scratch);
// For each batch and cell: compute input_weight * input.
if (!use_cifg) {
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(input_to_input_weights_), n_cell, n_input,
GetBuffer<float>(input_), n_batch, input_gate_scratch);
}
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(input_to_forget_weights_), n_cell, n_input,
GetBuffer<float>(input_), n_batch, forget_gate_scratch);
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(input_to_cell_weights_), n_cell, n_input,
GetBuffer<float>(input_), n_batch, cell_scratch);
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(input_to_output_weights_), n_cell, n_input,
GetBuffer<float>(input_), n_batch, output_gate_scratch);
// For each batch and cell: compute recurrent_weight * output_state.
if (!use_cifg) {
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(recurrent_to_input_weights_), n_cell, n_output,
GetBuffer<float>(output_state_), n_batch, input_gate_scratch);
}
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(recurrent_to_forget_weights_), n_cell, n_output,
GetBuffer<float>(output_state_), n_batch, forget_gate_scratch);
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(recurrent_to_cell_weights_), n_cell, n_output,
GetBuffer<float>(output_state_), n_batch, cell_scratch);
MatrixBatchVectorMultiplyAccumulate(
GetBuffer<float>(recurrent_to_output_weights_), n_cell, n_output,
GetBuffer<float>(output_state_), n_batch, output_gate_scratch);
// For each batch and cell: update input gate.
if (!use_cifg) {
if (use_peephole) {
VectorBatchVectorCwiseProductAccumulate(
GetBuffer<float>(cell_to_input_weights_), n_cell,
GetBuffer<float>(cell_state_), n_batch, input_gate_scratch);
}
ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
input_gate_scratch);
}
// For each batch and cell: update forget gate.
if (use_peephole) {
VectorBatchVectorCwiseProductAccumulate(
GetBuffer<float>(cell_to_forget_weights_), n_cell,
GetBuffer<float>(cell_state_), n_batch, forget_gate_scratch);
}
ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
forget_gate_scratch);
// For each batch and cell: update the cell.
VectorVectorCwiseProduct(forget_gate_scratch, GetBuffer<float>(cell_state_),
n_batch * n_cell, GetBuffer<float>(cell_state_));
ApplyActivationToVector(cell_scratch, n_batch * n_cell, params_.activation_,
cell_scratch);
if (use_cifg) {
Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch);
VectorVectorCwiseProductAccumulate(cell_scratch, forget_gate_scratch,
n_batch * n_cell,
GetBuffer<float>(cell_state_));
} else {
VectorVectorCwiseProductAccumulate(cell_scratch, input_gate_scratch,
n_batch * n_cell,
GetBuffer<float>(cell_state_));
}
if (params_.cell_clip_ > 0.0) {
ClipVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
params_.cell_clip_, GetBuffer<float>(cell_state_));
}
// For each batch and cell: update the output gate.
if (use_peephole) {
VectorBatchVectorCwiseProductAccumulate(
GetBuffer<float>(cell_to_output_weights_), n_cell,
GetBuffer<float>(cell_state_), n_batch, output_gate_scratch);
}
ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
output_gate_scratch);
ApplyActivationToVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
params_.activation_, cell_scratch);
VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, n_batch * n_cell,
output_gate_scratch);
// For each batch: update the projection and output_state.
const bool use_projection_weight = (projection_weights_->buffer != nullptr);
const bool use_projection_bias = (projection_bias_->buffer != nullptr);
if (use_projection_weight) {
if (use_projection_bias) {
VectorBatchVectorAssign(GetBuffer<float>(projection_bias_), n_output,
n_batch, GetBuffer<float>(output_));
} else {
ZeroVector(GetBuffer<float>(output_), n_batch * n_output);
}
MatrixBatchVectorMultiplyAccumulate(GetBuffer<float>(projection_weights_),
n_output, n_cell, output_gate_scratch,
n_batch, GetBuffer<float>(output_));
if (params_.proj_clip_ > 0.0) {
ClipVector(GetBuffer<float>(output_), n_batch * n_output,
params_.proj_clip_, GetBuffer<float>(output_));
}
} else {
CopyVector(output_gate_scratch, n_batch * n_output,
GetBuffer<float>(output_));
}
CopyVector(GetBuffer<float>(output_), n_batch * n_output,
GetBuffer<float>(output_state_));
return true;
}
} // namespace nn
} // namespace android