Implement LSTM cell on CPU

Bug: 63905942

Adapted previous TF Lite implementation and unit tests.

Modified the API implementation to allow an input tensor to have a null
buffer pointer, indicating an optional input.
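
For example, the new LSTM code detects the CIFG and peephole variants by
checking the buffer pointers of the corresponding optional weight tensors:

    const bool use_cifg = (input_to_input_weights_->buffer == nullptr);
    const bool use_peephole = (cell_to_output_weights_->buffer != nullptr);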

Test: adb shell /data/nativetest64/lstm_test/lstm_test

Change-Id: Iab89fe6c7731dc68c8460f30630174c55e3b24df
diff --git a/common/LSTM.cpp b/common/LSTM.cpp
new file mode 100644
index 0000000..e04f073
--- /dev/null
+++ b/common/LSTM.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "LSTM.h"
+
+#include "CpuExecutor.h"
+#include "HalInterfaces.h"
+
+namespace android {
+namespace nn {
+
+// TODO: move the kernels to a separate file as soon as we have the
+// optimized version ready.
+namespace {
+
+template <typename T>
+T getScalarData(RunTimeOperandInfo& info) {
+    T * data = reinterpret_cast<T*>(info.buffer);
+    return data[0];
+}
+
+// Clamp a float input f to the range [-abs_limit, abs_limit].
+inline float Clip(float f, float abs_limit) {
+  float result = (abs_limit < f) ? abs_limit : f;
+  result = (-abs_limit > result) ? -abs_limit : result;
+  return result;
+}
+
+// Multiply a row-major [m_rows, m_cols] matrix by a batch of [m_cols] vectors
+// and accumulate the results into a [n_batch, m_rows] result array, which is
+// assumed to be initialized (e.g. with the bias).
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result) {
+  for (int b = 0; b < n_batch; b++) {
+    float* result_in_batch = result + b * m_rows;
+    const float* matrix_ptr = matrix;
+    for (int r = 0; r < m_rows; r++) {
+      const float* vector_in_batch = vector + b * m_cols;
+      for (int c = 0; c < m_cols; c++) {
+        *result_in_batch += *matrix_ptr++ * *vector_in_batch++;
+      }
+      result_in_batch++;
+    }
+  }
+}
+
+// Cwise product of two vectors.
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = *vector1++ * *vector2++;
+  }
+}
+
+// Cwise product and accumulation of two vectors. Since it's a MAC operation,
+// the result array is assumed to be initialized to valid values.
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ += *vector1++ * *vector2++;
+  }
+}
+
+// Cwise product and accumulation of a vector and a batch-vector. Since it's
+// a MAC operation, the result array is assumed to be initialized to valid
+// values.
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+                                             const float* batch_vector,
+                                             int n_batch, float* result) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < v_size; v++) {
+      *result++ += vector[v] * *batch_vector++;
+    }
+  }
+}
+
+// Batch vector initialization with another vector.
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+                             float* batch_vector) {
+  for (int b = 0; b < n_batch; b++) {
+    memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
+  }
+}
+
+// Apply sigmoid to elements of a vector.
+void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
+  auto sigmoid_func = ActivationFunctor(kActivationSigmoid);
+  for (int v = 0; v < v_size; v++) {
+    *result++ = (sigmoid_func)(*vector++);
+  }
+}
+
+// Apply activation function to elements of a vector.
+void ApplyActivationToVector(const float* vector, int v_size,
+                             ActivationFn activation, float* result) {
+  auto activation_func = ActivationFunctor(activation);
+  for (int v = 0; v < v_size; v++) {
+    *result++ = (activation_func)(*vector++);
+  }
+}
+
+// Copy vector to another vector.
+inline void CopyVector(const float* vector, int v_size, float* result) {
+  memcpy(result, vector, v_size * sizeof(float));
+}
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void Sub1Vector(const float* vector, int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = 1.0f - *vector++;
+  }
+}
+
+// Fill vector with 0.f.
+void ZeroVector(float* vector, int v_size) {
+  memset(vector, 0, v_size * sizeof(float));
+}
+
+// Clip elements of a vector to the range [-abs_limit, abs_limit].
+void ClipVector(const float* vector, int v_size, float abs_limit,
+                float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = Clip(*vector++, abs_limit);
+  }
+}
+
+template <typename T>
+inline T *GetBuffer(RunTimeOperandInfo* operand) {
+  return reinterpret_cast<T*>(operand->buffer);
+}
+
+template <typename T>
+inline const T *GetBuffer(const RunTimeOperandInfo* operand) {
+  return reinterpret_cast<const T*>(operand->buffer);
+}
+
+}  // anonymous namespace
+
+LSTMCell::LSTMCell(const Operation& operation,
+                   std::vector<RunTimeOperandInfo>& operands) {
+  auto GetInput = [&operation,
+                   &operands](uint32_t index) -> const RunTimeOperandInfo* {
+    const std::vector<uint32_t>& inputs = operation.inputs;
+    const int index_of_operand = inputs[index];
+    if (index_of_operand < 0) {
+      return nullptr;
+    }
+    return &operands[index_of_operand];
+  };
+
+  auto GetOutput = [&operation,
+                    &operands](uint32_t index) -> RunTimeOperandInfo* {
+    const std::vector<uint32_t>& outputs = operation.outputs;
+    const int index_of_operand = outputs[index];
+    // Expects index of operand in range.
+    return &operands[index_of_operand];
+  };
+
+  input_ = GetInput(kInputTensor);
+
+  input_to_input_weights_ = GetInput(kInputToInputWeightsTensor);  // optional
+  input_to_forget_weights_ = GetInput(kInputToForgetWeightsTensor);
+  input_to_cell_weights_ = GetInput(kInputToCellWeightsTensor);
+  input_to_output_weights_ = GetInput(kInputToOutputWeightsTensor);
+
+  recurrent_to_input_weights_ =
+      GetInput(kRecurrentToInputWeightsTensor);  // optional
+  recurrent_to_forget_weights_ = GetInput(kRecurrentToForgetWeightsTensor);
+  recurrent_to_cell_weights_ = GetInput(kRecurrentToCellWeightsTensor);
+  recurrent_to_output_weights_ = GetInput(kRecurrentToOutputWeightsTensor);
+
+  cell_to_input_weights_ = GetInput(kCellToInputWeightsTensor);    // optional
+  cell_to_forget_weights_ = GetInput(kCellToForgetWeightsTensor);  // optional
+  cell_to_output_weights_ = GetInput(kCellToOutputWeightsTensor);  // optional
+
+  input_gate_bias_ = GetInput(kInputGateBiasTensor);
+  forget_gate_bias_ = GetInput(kForgetGateBiasTensor);
+  cell_bias_ = GetInput(kCellGateBiasTensor);
+  output_gate_bias_ = GetInput(kOutputGateBiasTensor);
+
+  projection_weights_ = GetInput(kProjectionWeightsTensor);  // optional
+  projection_bias_ = GetInput(kProjectionBiasTensor);        // optional
+
+  params_.activation_ = static_cast<ActivationFn>(
+      getScalarData<int32_t>(operands[operation.inputs[kActivationParam]]));
+  params_.cell_clip_ =
+      getScalarData<float>(operands[operation.inputs[kCellClipParam]]);
+  params_.proj_clip_ =
+      getScalarData<float>(operands[operation.inputs[kProjClipParam]]);
+
+  output_state_ = GetOutput(kOutputStateTensor);
+  cell_state_ = GetOutput(kCellStateTensor);
+  output_ = GetOutput(kOutputTensor);
+
+  scratch_buffer_ = GetOutput(kScratchBufferTensor);
+}
+
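+// Eval() computes one step of the standard LSTM cell, optionally with CIFG,
+// peephole connections and a projection layer:
+//   i_t = sigmoid(W_xi * x_t + W_hi * h_{t-1} + w_ci .* c_{t-1} + b_i)
+//   f_t = sigmoid(W_xf * x_t + W_hf * h_{t-1} + w_cf .* c_{t-1} + b_f)
+//   g_t = act(W_xc * x_t + W_hc * h_{t-1} + b_c)
+//   c_t = clip(f_t .* c_{t-1} + i_t .* g_t, cell_clip)
+//   o_t = sigmoid(W_xo * x_t + W_ho * h_{t-1} + w_co .* c_t + b_o)
+//   h_t = o_t .* act(c_t)            (projected and clipped if enabled)
+// With CIFG the input gate is coupled to the forget gate: i_t = 1 - f_t.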
+bool LSTMCell::Eval() {
+  const uint32_t n_batch = input_->shape().dimensions[0];
+  const uint32_t n_input = input_->shape().dimensions[1];
+  // n_cell and n_output will be the same size when there is no projection.
+  const uint32_t n_cell = input_to_output_weights_->shape().dimensions[0];
+  const uint32_t n_output = recurrent_to_output_weights_->shape().dimensions[1];
+
+  // Since we have already checked that the weights are either all present or
+  // all absent, checking just one of them is enough to determine the case.
+  const bool use_cifg = (input_to_input_weights_->buffer == nullptr);
+  const bool use_peephole = (cell_to_output_weights_->buffer != nullptr);
+
+  // Set up the per-gate scratch pointers into the shared scratch buffer.
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
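+  // The scratch buffer is laid out as one [n_batch, n_cell] block per gate:
+  // [input, cell, forget, output], or [cell, forget, output] with CIFG
+  // (where the input gate is not computed).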
+  if (use_cifg) {
+    cell_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
+    forget_gate_scratch = cell_scratch + n_cell * n_batch;
+    output_gate_scratch = cell_scratch + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
+    cell_scratch = input_gate_scratch + n_cell * n_batch;
+    forget_gate_scratch = input_gate_scratch + 2 * n_cell * n_batch;
+    output_gate_scratch = input_gate_scratch + 3 * n_cell * n_batch;
+  }
+
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    VectorBatchVectorAssign(GetBuffer<float>(input_gate_bias_), n_cell, n_batch,
+                            input_gate_scratch);
+  }
+  VectorBatchVectorAssign(GetBuffer<float>(forget_gate_bias_), n_cell, n_batch,
+                          forget_gate_scratch);
+  VectorBatchVectorAssign(GetBuffer<float>(cell_bias_), n_cell, n_batch,
+                          cell_scratch);
+  VectorBatchVectorAssign(GetBuffer<float>(output_gate_bias_), n_cell, n_batch,
+                          output_gate_scratch);
+
+  // For each batch and cell: compute input_weight * input.
+  if (!use_cifg) {
+    MatrixBatchVectorMultiplyAccumulate(
+        GetBuffer<float>(input_to_input_weights_), n_cell, n_input,
+        GetBuffer<float>(input_), n_batch, input_gate_scratch);
+  }
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(input_to_forget_weights_), n_cell, n_input,
+      GetBuffer<float>(input_), n_batch, forget_gate_scratch);
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(input_to_cell_weights_), n_cell, n_input,
+      GetBuffer<float>(input_), n_batch, cell_scratch);
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(input_to_output_weights_), n_cell, n_input,
+      GetBuffer<float>(input_), n_batch, output_gate_scratch);
+
+  // For each batch and cell: compute recurrent_weight * output_state.
+  if (!use_cifg) {
+    MatrixBatchVectorMultiplyAccumulate(
+        GetBuffer<float>(recurrent_to_input_weights_), n_cell, n_output,
+        GetBuffer<float>(output_state_), n_batch, input_gate_scratch);
+  }
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(recurrent_to_forget_weights_), n_cell, n_output,
+      GetBuffer<float>(output_state_), n_batch, forget_gate_scratch);
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(recurrent_to_cell_weights_), n_cell, n_output,
+      GetBuffer<float>(output_state_), n_batch, cell_scratch);
+  MatrixBatchVectorMultiplyAccumulate(
+      GetBuffer<float>(recurrent_to_output_weights_), n_cell, n_output,
+      GetBuffer<float>(output_state_), n_batch, output_gate_scratch);
+
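+  // Each gate scratch now holds its pre-activation value,
+  // W_x * x_t + W_h * h_{t-1} + bias. The steps below add the peephole
+  // contributions (if any) and apply the gate activations.
+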
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole) {
+      VectorBatchVectorCwiseProductAccumulate(
+          GetBuffer<float>(cell_to_input_weights_), n_cell,
+          GetBuffer<float>(cell_state_), n_batch, input_gate_scratch);
+    }
+    ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                         input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole) {
+    VectorBatchVectorCwiseProductAccumulate(
+        GetBuffer<float>(cell_to_forget_weights_), n_cell,
+        GetBuffer<float>(cell_state_), n_batch, forget_gate_scratch);
+  }
+  ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                       forget_gate_scratch);
+
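+  // With CIFG the input gate is coupled to the forget gate, i_t = 1 - f_t,
+  // so the inverted forget gate takes the place of the input gate in the
+  // cell update below.
+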
+  // For each batch and cell: update the cell.
+  VectorVectorCwiseProduct(forget_gate_scratch, GetBuffer<float>(cell_state_),
+                           n_batch * n_cell, GetBuffer<float>(cell_state_));
+  ApplyActivationToVector(cell_scratch, n_batch * n_cell, params_.activation_,
+                          cell_scratch);
+  if (use_cifg) {
+    Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch);
+    VectorVectorCwiseProductAccumulate(cell_scratch, forget_gate_scratch,
+                                       n_batch * n_cell,
+                                       GetBuffer<float>(cell_state_));
+  } else {
+    VectorVectorCwiseProductAccumulate(cell_scratch, input_gate_scratch,
+                                       n_batch * n_cell,
+                                       GetBuffer<float>(cell_state_));
+  }
+  if (params_.cell_clip_ > 0.0) {
+    ClipVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
+               params_.cell_clip_, GetBuffer<float>(cell_state_));
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole) {
+    VectorBatchVectorCwiseProductAccumulate(
+        GetBuffer<float>(cell_to_output_weights_), n_cell,
+        GetBuffer<float>(cell_state_), n_batch, output_gate_scratch);
+  }
+  ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                       output_gate_scratch);
+  ApplyActivationToVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
+                          params_.activation_, cell_scratch);
+  VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, n_batch * n_cell,
+                           output_gate_scratch);
+
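+  // output_gate_scratch now holds o_t .* act(c_t). With projection enabled,
+  // the final output is h_t = W_proj * (o_t .* act(c_t)) + b_proj, optionally
+  // clipped; otherwise it is used directly as h_t. output_state_ is then
+  // updated to h_t for the next time step.
+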
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights_->buffer != nullptr);
+  const bool use_projection_bias = (projection_bias_->buffer != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      VectorBatchVectorAssign(GetBuffer<float>(projection_bias_), n_output,
+                              n_batch, GetBuffer<float>(output_));
+    } else {
+      ZeroVector(GetBuffer<float>(output_), n_batch * n_output);
+    }
+    MatrixBatchVectorMultiplyAccumulate(GetBuffer<float>(projection_weights_),
+                                        n_output, n_cell, output_gate_scratch,
+                                        n_batch, GetBuffer<float>(output_));
+    if (params_.proj_clip_ > 0.0) {
+      ClipVector(GetBuffer<float>(output_), n_batch * n_output,
+                 params_.proj_clip_, GetBuffer<float>(output_));
+    }
+  } else {
+    CopyVector(output_gate_scratch, n_batch * n_output,
+               GetBuffer<float>(output_));
+  }
+  CopyVector(GetBuffer<float>(output_), n_batch * n_output,
+             GetBuffer<float>(output_state_));
+
+  return true;
+}
+
+}  // namespace nn
+}  // namespace android