common/LSTM.cpp - platform/packages/modules/NeuralNetworks - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "LSTM.h"

 #include "CpuExecutor.h"
 #include "HalInterfaces.h"

 namespace android {
 namespace nn {

 // TODO: move the kernels to a separate file as soon as we have the
 // optimized version ready.
 namespace {

 // Returns the first element of the operand's buffer, reinterpreting the
 // buffer as an array of T.
 template <typename T>
 T getScalarData(RunTimeOperandInfo& info) {
     return *reinterpret_cast<T*>(info.buffer);
 }

 // Clamps f to the closed interval [-abs_limit, +abs_limit]. The upper bound
 // is applied first, then the lower bound.
 inline float Clip(float f, float abs_limit) {
   float clipped = f;
   if (abs_limit < f) {
     clipped = abs_limit;
   }
   if (-abs_limit > clipped) {
     clipped = -abs_limit;
   }
   return clipped;
 }

 // Accumulates a matrix-times-vector product for every batch entry.
 //
 // `matrix` is read as m_rows x m_cols floats in row-major order, `vector` as
 // n_batch consecutive vectors of m_cols floats, and `result` as n_batch
 // consecutive vectors of m_rows floats. Each dot product is added to the
 // value already stored in `result`, so the caller must initialize it.
 void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                          int m_cols, const float* vector,
                                          int n_batch, float* result) {
   for (int batch = 0; batch < n_batch; ++batch) {
     float* batch_output = result + batch * m_rows;
     // The same matrix is applied to every batch, so restart at its beginning.
     const float* weight = matrix;
     for (int row = 0; row < m_rows; ++row) {
       const float* batch_input = vector + batch * m_cols;
       for (int col = 0; col < m_cols; ++col) {
         batch_output[row] += *weight * batch_input[col];
         ++weight;
       }
     }
   }
 }

 // Element-wise product: result[i] = vector1[i] * vector2[i] for the first
 // v_size elements. Each element is read before the matching result element
 // is written.
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   for (int i = 0; i < v_size; ++i) {
     result[i] = vector1[i] * vector2[i];
   }
 }

 // Element-wise multiply-accumulate: result[i] += vector1[i] * vector2[i] for
 // the first v_size elements. The products are added to the existing contents
 // of `result`, which the caller must have initialized.
 void VectorVectorCwiseProductAccumulate(const float* vector1,
                                         const float* vector2, int v_size,
                                         float* result) {
   for (int i = 0; i < v_size; ++i) {
     result[i] += vector1[i] * vector2[i];
   }
 }

 // Multiply-accumulates one vector against each vector of a batch:
 // `batch_vector` and `result` are walked as n_batch consecutive runs of
 // v_size floats, and for every run result[v] += vector[v] * batch_vector[v].
 // The products are added to the existing contents of `result`, which the
 // caller must have initialized.
 void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
                                              const float* batch_vector,
                                              int n_batch, float* result) {
   const float* in = batch_vector;
   float* out = result;
   for (int batch = 0; batch < n_batch; ++batch) {
     for (int i = 0; i < v_size; ++i) {
       *out += vector[i] * *in;
       ++out;
       ++in;
     }
   }
 }

 // Fills `batch_vector` with n_batch back-to-back copies of the v_size floats
 // in `vector`.
 void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
                              float* batch_vector) {
   const auto bytes_per_copy = v_size * sizeof(float);
   int batch = 0;
   while (batch < n_batch) {
     float* destination = batch_vector + batch * v_size;
     memcpy(destination, vector, bytes_per_copy);
     ++batch;
   }
 }

 // Writes ActivationFunctor(kActivationSigmoid) applied to each of the first
 // v_size elements of `vector` into `result`. Each element is read before the
 // matching result element is written.
 void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
   auto sigmoid = ActivationFunctor(kActivationSigmoid);
   for (int i = 0; i < v_size; ++i) {
     result[i] = sigmoid(vector[i]);
   }
 }

 // Apply activation function to elements of a vector.
 void ApplyActivationToVector(const float* vector, int v_size,
                              ActivationFn activation, float* result) {
   auto activation_func = ActivationFunctor(activation);
   for (int v = 0; v < v_size; v++) {
     *result++ = (activation_func)(*vector++);
   }
 }

 // Copies v_size floats from `vector` into `result` with memcpy.
 inline void CopyVector(const float* vector, int v_size, float* result) {
   const auto byte_count = v_size * sizeof(float);
   memcpy(result, vector, byte_count);
 }

 // Writes result[i] = 1.0f - vector[i] for the first v_size elements (used in
 // CIFG).
 void Sub1Vector(const float* vector, int v_size, float* result) {
   for (int i = 0; i < v_size; ++i) {
     result[i] = 1.0f - vector[i];
   }
 }

 // Clears the first v_size floats of `vector` by zeroing their bytes.
 void ZeroVector(float* vector, int v_size) {
   const auto byte_count = v_size * sizeof(float);
   memset(vector, 0, byte_count);
 }

 // Writes Clip(vector[i], abs_limit) into result[i] for the first v_size
 // elements. Each element is read before the matching result element is
 // written.
 void ClipVector(const float* vector, int v_size, float abs_limit,
                 float* result) {
   for (int i = 0; i < v_size; ++i) {
     result[i] = Clip(vector[i], abs_limit);
   }
 }

 // Returns the operand's buffer reinterpreted as a mutable pointer to T.
 template <typename T>
 inline T *GetBuffer(RunTimeOperandInfo* operand) {
   auto* raw = operand->buffer;
   return reinterpret_cast<T*>(raw);
 }

 // Returns the operand's buffer reinterpreted as a read-only pointer to T.
 template <typename T>
 inline const T *GetBuffer(const RunTimeOperandInfo* operand) {
   auto* raw = operand->buffer;
   return reinterpret_cast<const T*>(raw);
 }

 }  // anonymous namespace

 // Binds the cell to its operands: every input and output index listed in
 // `operation` is looked up in `operands` and the resulting pointer is stored
 // in the matching member, and the three scalar parameters (activation, cell
 // clip, projection clip) are read out of their operand buffers.
 LSTMCell::LSTMCell(const Operation& operation,
                    std::vector<RunTimeOperandInfo>& operands) {
   const std::vector<uint32_t>& input_indexes = operation.inputs;
   const std::vector<uint32_t>& output_indexes = operation.outputs;

   // Looks up the operand behind the operation's index-th input; yields
   // nullptr when the stored operand index is negative once read as an int.
   auto input_at = [&input_indexes,
                    &operands](uint32_t index) -> const RunTimeOperandInfo* {
     const int operand_index = input_indexes[index];
     return (operand_index < 0) ? nullptr : &operands[operand_index];
   };

   // Looks up the operand behind the operation's index-th output. The operand
   // index is expected to be in range.
   auto output_at = [&output_indexes,
                     &operands](uint32_t index) -> RunTimeOperandInfo* {
     const int operand_index = output_indexes[index];
     return &operands[operand_index];
   };

   input_ = input_at(kInputTensor);

   // Input weights.
   input_to_input_weights_ = input_at(kInputToInputWeightsTensor);  // optional
   input_to_forget_weights_ = input_at(kInputToForgetWeightsTensor);
   input_to_cell_weights_ = input_at(kInputToCellWeightsTensor);
   input_to_output_weights_ = input_at(kInputToOutputWeightsTensor);

   // Recurrent weights.
   recurrent_to_input_weights_ =
       input_at(kRecurrentToInputWeightsTensor);  // optional
   recurrent_to_forget_weights_ = input_at(kRecurrentToForgetWeightsTensor);
   recurrent_to_cell_weights_ = input_at(kRecurrentToCellWeightsTensor);
   recurrent_to_output_weights_ = input_at(kRecurrentToOutputWeightsTensor);

   // Cell-to-gate weights, all optional.
   cell_to_input_weights_ = input_at(kCellToInputWeightsTensor);
   cell_to_forget_weights_ = input_at(kCellToForgetWeightsTensor);
   cell_to_output_weights_ = input_at(kCellToOutputWeightsTensor);

   // Gate biases.
   input_gate_bias_ = input_at(kInputGateBiasTensor);
   forget_gate_bias_ = input_at(kForgetGateBiasTensor);
   cell_bias_ = input_at(kCellGateBiasTensor);
   output_gate_bias_ = input_at(kOutputGateBiasTensor);

   // Projection layer, both optional.
   projection_weights_ = input_at(kProjectionWeightsTensor);
   projection_bias_ = input_at(kProjectionBiasTensor);

   // Scalar parameters.
   RunTimeOperandInfo& activation_operand =
       operands[input_indexes[kActivationParam]];
   params_.activation_ =
       static_cast<ActivationFn>(getScalarData<int32_t>(activation_operand));
   RunTimeOperandInfo& cell_clip_operand =
       operands[input_indexes[kCellClipParam]];
   params_.cell_clip_ = getScalarData<float>(cell_clip_operand);
   RunTimeOperandInfo& proj_clip_operand =
       operands[input_indexes[kProjClipParam]];
   params_.proj_clip_ = getScalarData<float>(proj_clip_operand);

   output_state_ = output_at(kOutputStateTensor);
   cell_state_ = output_at(kCellStateTensor);
   output_ = output_at(kOutputTensor);

   scratch_buffer_ = output_at(kScratchBufferTensor);
 }

 // Runs one LSTM step over the operands bound in the constructor.
 //
 // The gate pre-activations are built up in scratch_buffer_ (bias, plus
 // input contribution, plus recurrent contribution, plus optional peephole
 // contribution), after which cell_state_ is updated in place and the result
 // is written to output_ and copied into output_state_. All buffers are
 // accessed as float. Always returns true.
 bool LSTMCell::Eval() {
   const uint32_t n_batch = input_->shape().dimensions[0];
   const uint32_t n_input = input_->shape().dimensions[1];
   // n_cell and n_output will be the same size when there is no projection.
   const uint32_t n_cell = input_to_output_weights_->shape().dimensions[0];
   const uint32_t n_output = recurrent_to_output_weights_->shape().dimensions[1];

   // Since we have already checked that weights are all there or none, we can
   // check the existence of only one to get the condition.
   // CIFG is in effect when the input-to-input weights have no buffer;
   // peephole is in effect when the cell-to-output weights have one.
   const bool use_cifg = (input_to_input_weights_->buffer == nullptr);
   const bool use_peephole = (cell_to_output_weights_->buffer != nullptr);

   // Index the scratch buffers pointers to the global scratch buffer.
   // Each region is n_cell * n_batch floats; with CIFG there are three regions
   // (cell, forget, output), otherwise four (input, cell, forget, output).
   float* input_gate_scratch = nullptr;
   float* cell_scratch = nullptr;
   float* forget_gate_scratch = nullptr;
   float* output_gate_scratch = nullptr;
   if (use_cifg) {
     cell_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
     forget_gate_scratch = cell_scratch + n_cell * n_batch;
     output_gate_scratch = cell_scratch + 2 * n_cell * n_batch;
   } else {
     input_gate_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
     cell_scratch = input_gate_scratch + n_cell * n_batch;
     forget_gate_scratch = input_gate_scratch + 2 * n_cell * n_batch;
     output_gate_scratch = input_gate_scratch + 3 * n_cell * n_batch;
   }

   // Initialize scratch buffers with bias.
   // The bias vector is replicated once per batch so that the accumulating
   // kernels below can add onto it.
   if (!use_cifg) {
     VectorBatchVectorAssign(GetBuffer<float>(input_gate_bias_), n_cell, n_batch,
                             input_gate_scratch);
   }
   VectorBatchVectorAssign(GetBuffer<float>(forget_gate_bias_), n_cell, n_batch,
                           forget_gate_scratch);
   VectorBatchVectorAssign(GetBuffer<float>(cell_bias_), n_cell, n_batch,
                           cell_scratch);
   VectorBatchVectorAssign(GetBuffer<float>(output_gate_bias_), n_cell, n_batch,
                           output_gate_scratch);

   // For each batch and cell: compute input_weight * input.
   if (!use_cifg) {
     MatrixBatchVectorMultiplyAccumulate(
         GetBuffer<float>(input_to_input_weights_), n_cell, n_input,
         GetBuffer<float>(input_), n_batch, input_gate_scratch);
   }
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(input_to_forget_weights_), n_cell, n_input,
       GetBuffer<float>(input_), n_batch, forget_gate_scratch);
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(input_to_cell_weights_), n_cell, n_input,
       GetBuffer<float>(input_), n_batch, cell_scratch);
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(input_to_output_weights_), n_cell, n_input,
       GetBuffer<float>(input_), n_batch, output_gate_scratch);

   // For each batch and cell: compute recurrent_weight * output_state.
   // output_state_ is read here and only overwritten at the very end.
   if (!use_cifg) {
     MatrixBatchVectorMultiplyAccumulate(
         GetBuffer<float>(recurrent_to_input_weights_), n_cell, n_output,
         GetBuffer<float>(output_state_), n_batch, input_gate_scratch);
   }
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(recurrent_to_forget_weights_), n_cell, n_output,
       GetBuffer<float>(output_state_), n_batch, forget_gate_scratch);
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(recurrent_to_cell_weights_), n_cell, n_output,
       GetBuffer<float>(output_state_), n_batch, cell_scratch);
   MatrixBatchVectorMultiplyAccumulate(
       GetBuffer<float>(recurrent_to_output_weights_), n_cell, n_output,
       GetBuffer<float>(output_state_), n_batch, output_gate_scratch);

   // For each batch and cell: update input gate.
   if (!use_cifg) {
     if (use_peephole) {
       VectorBatchVectorCwiseProductAccumulate(
           GetBuffer<float>(cell_to_input_weights_), n_cell,
           GetBuffer<float>(cell_state_), n_batch, input_gate_scratch);
     }
     ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
                          input_gate_scratch);
   }

   // For each batch and cell: update forget gate.
   // The peephole term uses cell_state_ before it is modified below.
   if (use_peephole) {
     VectorBatchVectorCwiseProductAccumulate(
         GetBuffer<float>(cell_to_forget_weights_), n_cell,
         GetBuffer<float>(cell_state_), n_batch, forget_gate_scratch);
   }
   ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
                        forget_gate_scratch);

   // For each batch and cell: update the cell.
   // First scale cell_state_ in place by the forget gate, then add the
   // activated cell input weighted by the input gate.
   VectorVectorCwiseProduct(forget_gate_scratch, GetBuffer<float>(cell_state_),
                            n_batch * n_cell, GetBuffer<float>(cell_state_));
   ApplyActivationToVector(cell_scratch, n_batch * n_cell, params_.activation_,
                           cell_scratch);
   if (use_cifg) {
     // With CIFG the input gate is (1 - forget gate); forget_gate_scratch is
     // overwritten with that value since the forget gate is no longer needed.
     Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch);
     VectorVectorCwiseProductAccumulate(cell_scratch, forget_gate_scratch,
                                        n_batch * n_cell,
                                        GetBuffer<float>(cell_state_));
   } else {
     VectorVectorCwiseProductAccumulate(cell_scratch, input_gate_scratch,
                                        n_batch * n_cell,
                                        GetBuffer<float>(cell_state_));
   }
   // Cell clipping is applied only for a strictly positive limit.
   if (params_.cell_clip_ > 0.0) {
     ClipVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
                params_.cell_clip_, GetBuffer<float>(cell_state_));
   }

   // For each batch and cell: update the output gate.
   // The peephole term here uses the freshly updated cell_state_.
   if (use_peephole) {
     VectorBatchVectorCwiseProductAccumulate(
         GetBuffer<float>(cell_to_output_weights_), n_cell,
         GetBuffer<float>(cell_state_), n_batch, output_gate_scratch);
   }
   ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                        output_gate_scratch);
   // cell_scratch is reused to hold the activated cell state, and
   // output_gate_scratch ends up holding gate * activated cell state.
   ApplyActivationToVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
                           params_.activation_, cell_scratch);
   VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, n_batch * n_cell,
                            output_gate_scratch);

   // For each batch: update the projection and output_state.
   const bool use_projection_weight = (projection_weights_->buffer != nullptr);
   const bool use_projection_bias = (projection_bias_->buffer != nullptr);
   if (use_projection_weight) {
     // Seed output_ with the projection bias (or zeros) because the matrix
     // multiply below accumulates into it.
     if (use_projection_bias) {
       VectorBatchVectorAssign(GetBuffer<float>(projection_bias_), n_output,
                               n_batch, GetBuffer<float>(output_));
     } else {
       ZeroVector(GetBuffer<float>(output_), n_batch * n_output);
     }
     MatrixBatchVectorMultiplyAccumulate(GetBuffer<float>(projection_weights_),
                                         n_output, n_cell, output_gate_scratch,
                                         n_batch, GetBuffer<float>(output_));
     // Projection clipping is applied only for a strictly positive limit.
     if (params_.proj_clip_ > 0.0) {
       ClipVector(GetBuffer<float>(output_), n_batch * n_output,
                  params_.proj_clip_, GetBuffer<float>(output_));
     }
   } else {
     // No projection: the gated cell output is the output as-is.
     CopyVector(output_gate_scratch, n_batch * n_output,
                GetBuffer<float>(output_));
   }
   // Publish the new output as the output state.
   CopyVector(GetBuffer<float>(output_), n_batch * n_output,
              GetBuffer<float>(output_state_));

   return true;
 }

 }  // namespace nn
 }  // namespace android
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "LSTM.h"

	#include "CpuExecutor.h"
	#include "HalInterfaces.h"

	namespace android {
	namespace nn {

	// TODO: move the kernels to a separate file as soon as we have the
	// optimized version ready.
	namespace {

	// Returns the first element of the operand's buffer, reinterpreting the
	// buffer as an array of T.
	template <typename T>
	T getScalarData(RunTimeOperandInfo& info) {
	  return *reinterpret_cast<T*>(info.buffer);
	}

	// Clamps f to the closed interval [-abs_limit, +abs_limit]. The upper bound
	// is applied first, then the lower bound.
	inline float Clip(float f, float abs_limit) {
	  float clipped = f;
	  if (abs_limit < f) {
	    clipped = abs_limit;
	  }
	  if (-abs_limit > clipped) {
	    clipped = -abs_limit;
	  }
	  return clipped;
	}

	// Multiply a matrix by a batch vector, and accumulate the results into a
	// batch-size vector.
	//
	// `matrix` is read as m_rows x m_cols floats in row-major order, `vector` as
	// n_batch consecutive vectors of m_cols floats, and `result` as n_batch
	// consecutive vectors of m_rows floats. Each dot product is added to the
	// value already stored in `result`, so the caller must initialize it.
	void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
	                                         int m_cols, const float* vector,
	                                         int n_batch, float* result) {
	  for (int b = 0; b < n_batch; b++) {
	    float* result_in_batch = result + b * m_rows;
	    // The same matrix is applied to every batch, so restart at its beginning.
	    const float* matrix_ptr = matrix;
	    for (int r = 0; r < m_rows; r++) {
	      const float* vector_in_batch = vector + b * m_cols;
	      for (int c = 0; c < m_cols; c++) {
	        // Dereference all three pointers: accumulate into the current
	        // result element rather than advancing the result pointer.
	        *result_in_batch += *matrix_ptr++ * *vector_in_batch++;
	      }
	      result_in_batch++;
	    }
	  }
	}

	// Cwise product of two vectors: result[v] = vector1[v] * vector2[v] for the
	// first v_size elements. Each element is read before the matching result
	// element is written.
	void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
	                              int v_size, float* result) {
	  for (int v = 0; v < v_size; v++) {
	    // Multiply the pointed-to values and store through the result pointer.
	    *result++ = *vector1++ * *vector2++;
	  }
	}

	// Cwise product and accumulation of two vectors:
	// result[v] += vector1[v] * vector2[v] for the first v_size elements. Since
	// it's a MAC operation, the assumption here is that result array is
	// initialized to valid values.
	void VectorVectorCwiseProductAccumulate(const float* vector1,
	                                        const float* vector2, int v_size,
	                                        float* result) {
	  for (int v = 0; v < v_size; v++) {
	    // Accumulate into the pointed-to result element.
	    *result++ += *vector1++ * *vector2++;
	  }
	}

	// Cwise product and accumulation of a vector and a batch-vector:
	// `batch_vector` and `result` are walked as n_batch consecutive runs of
	// v_size floats, and for every run result[v] += vector[v] * batch_vector[v].
	// Since it's a MAC operation, the assumption here is that result array is
	// initialized to valid values.
	void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
	                                             const float* batch_vector,
	                                             int n_batch, float* result) {
	  for (int b = 0; b < n_batch; b++) {
	    for (int v = 0; v < v_size; v++) {
	      // `vector` is indexed per run; the batch and result pointers advance
	      // continuously across runs.
	      *result++ += vector[v] * *batch_vector++;
	    }
	  }
	}

	// Fills `batch_vector` with n_batch back-to-back copies of the v_size floats
	// in `vector`.
	void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
	                             float* batch_vector) {
	  const auto bytes_per_copy = v_size * sizeof(float);
	  int batch = 0;
	  while (batch < n_batch) {
	    float* destination = batch_vector + batch * v_size;
	    memcpy(destination, vector, bytes_per_copy);
	    ++batch;
	  }
	}

	// Apply sigmoid to elements of a vector: writes
	// ActivationFunctor(kActivationSigmoid) applied to each of the first v_size
	// elements of `vector` into `result`.
	void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
	  auto sigmoid_func = ActivationFunctor(kActivationSigmoid);
	  for (int v = 0; v < v_size; v++) {
	    // Pass the pointed-to value to the functor and store through `result`.
	    *result++ = (sigmoid_func)(*vector++);
	  }
	}

	// Apply activation function to elements of a vector: writes the functor
	// built from `activation` applied to each of the first v_size elements of
	// `vector` into `result`.
	void ApplyActivationToVector(const float* vector, int v_size,
	                             ActivationFn activation, float* result) {
	  auto activation_func = ActivationFunctor(activation);
	  for (int v = 0; v < v_size; v++) {
	    // Pass the pointed-to value to the functor and store through `result`.
	    *result++ = (activation_func)(*vector++);
	  }
	}

	// Copies v_size floats from `vector` into `result` with memcpy.
	inline void CopyVector(const float* vector, int v_size, float* result) {
	  const auto byte_count = v_size * sizeof(float);
	  memcpy(result, vector, byte_count);
	}

	// Compute "1.0f - elements of vector" (used in CIFG): writes
	// result[v] = 1.0f - vector[v] for the first v_size elements.
	void Sub1Vector(const float* vector, int v_size, float* result) {
	  for (int v = 0; v < v_size; v++) {
	    // Subtract the pointed-to value and store through `result`.
	    *result++ = 1.0f - *vector++;
	  }
	}

	// Clears the first v_size floats of `vector` by zeroing their bytes.
	void ZeroVector(float* vector, int v_size) {
	  const auto byte_count = v_size * sizeof(float);
	  memset(vector, 0, byte_count);
	}

	// Clip elements of a vector using an abs_limit value: writes
	// Clip(vector[v], abs_limit) into result[v] for the first v_size elements.
	void ClipVector(const float* vector, int v_size, float abs_limit,
	                float* result) {
	  for (int v = 0; v < v_size; v++) {
	    // Clip the pointed-to value and store through `result`.
	    *result++ = Clip(*vector++, abs_limit);
	  }
	}

	template <typename T>
	inline T GetBuffer(RunTimeOperandInfo operand) {
	return reinterpret_cast<T*>(operand->buffer);
	}

	template <typename T>
	inline const T GetBuffer(const RunTimeOperandInfo operand) {
	return reinterpret_cast<const T*>(operand->buffer);
	}

	} // anonymous namespace

	// Binds the cell to its operands: every input and output index listed in
	// `operation` is looked up in `operands` and the resulting pointer is stored
	// in the matching member, and the three scalar parameters (activation, cell
	// clip, projection clip) are read out of their operand buffers.
	LSTMCell::LSTMCell(const Operation& operation,
	                   std::vector<RunTimeOperandInfo>& operands) {
	  const std::vector<uint32_t>& input_indexes = operation.inputs;
	  const std::vector<uint32_t>& output_indexes = operation.outputs;

	  // Looks up the operand behind the operation's index-th input; yields
	  // nullptr when the stored operand index is negative once read as an int.
	  auto input_at = [&input_indexes,
	                   &operands](uint32_t index) -> const RunTimeOperandInfo* {
	    const int operand_index = input_indexes[index];
	    return (operand_index < 0) ? nullptr : &operands[operand_index];
	  };

	  // Looks up the operand behind the operation's index-th output. The operand
	  // index is expected to be in range.
	  auto output_at = [&output_indexes,
	                    &operands](uint32_t index) -> RunTimeOperandInfo* {
	    const int operand_index = output_indexes[index];
	    return &operands[operand_index];
	  };

	  input_ = input_at(kInputTensor);

	  // Input weights.
	  input_to_input_weights_ = input_at(kInputToInputWeightsTensor);  // optional
	  input_to_forget_weights_ = input_at(kInputToForgetWeightsTensor);
	  input_to_cell_weights_ = input_at(kInputToCellWeightsTensor);
	  input_to_output_weights_ = input_at(kInputToOutputWeightsTensor);

	  // Recurrent weights.
	  recurrent_to_input_weights_ =
	      input_at(kRecurrentToInputWeightsTensor);  // optional
	  recurrent_to_forget_weights_ = input_at(kRecurrentToForgetWeightsTensor);
	  recurrent_to_cell_weights_ = input_at(kRecurrentToCellWeightsTensor);
	  recurrent_to_output_weights_ = input_at(kRecurrentToOutputWeightsTensor);

	  // Cell-to-gate weights, all optional.
	  cell_to_input_weights_ = input_at(kCellToInputWeightsTensor);
	  cell_to_forget_weights_ = input_at(kCellToForgetWeightsTensor);
	  cell_to_output_weights_ = input_at(kCellToOutputWeightsTensor);

	  // Gate biases.
	  input_gate_bias_ = input_at(kInputGateBiasTensor);
	  forget_gate_bias_ = input_at(kForgetGateBiasTensor);
	  cell_bias_ = input_at(kCellGateBiasTensor);
	  output_gate_bias_ = input_at(kOutputGateBiasTensor);

	  // Projection layer, both optional.
	  projection_weights_ = input_at(kProjectionWeightsTensor);
	  projection_bias_ = input_at(kProjectionBiasTensor);

	  // Scalar parameters.
	  RunTimeOperandInfo& activation_operand =
	      operands[input_indexes[kActivationParam]];
	  params_.activation_ =
	      static_cast<ActivationFn>(getScalarData<int32_t>(activation_operand));
	  RunTimeOperandInfo& cell_clip_operand =
	      operands[input_indexes[kCellClipParam]];
	  params_.cell_clip_ = getScalarData<float>(cell_clip_operand);
	  RunTimeOperandInfo& proj_clip_operand =
	      operands[input_indexes[kProjClipParam]];
	  params_.proj_clip_ = getScalarData<float>(proj_clip_operand);

	  output_state_ = output_at(kOutputStateTensor);
	  cell_state_ = output_at(kCellStateTensor);
	  output_ = output_at(kOutputTensor);

	  scratch_buffer_ = output_at(kScratchBufferTensor);
	}

	// Runs one LSTM step over the operands bound in the constructor.
	//
	// The gate pre-activations are built up in scratch_buffer_ (bias, plus
	// input contribution, plus recurrent contribution, plus optional peephole
	// contribution), after which cell_state_ is updated in place and the result
	// is written to output_ and copied into output_state_. All buffers are
	// accessed as float. Always returns true.
	bool LSTMCell::Eval() {
	const uint32_t n_batch = input_->shape().dimensions[0];
	const uint32_t n_input = input_->shape().dimensions[1];
	// n_cell and n_output will be the same size when there is no projection.
	const uint32_t n_cell = input_to_output_weights_->shape().dimensions[0];
	const uint32_t n_output = recurrent_to_output_weights_->shape().dimensions[1];

	// Since we have already checked that weights are all there or none, we can
	// check the existence of only one to get the condition.
	// CIFG is in effect when the input-to-input weights have no buffer;
	// peephole is in effect when the cell-to-output weights have one.
	const bool use_cifg = (input_to_input_weights_->buffer == nullptr);
	const bool use_peephole = (cell_to_output_weights_->buffer != nullptr);

	// Index the scratch buffers pointers to the global scratch buffer.
	// Each region is n_cell * n_batch floats; with CIFG there are three regions
	// (cell, forget, output), otherwise four (input, cell, forget, output).
	float* input_gate_scratch = nullptr;
	float* cell_scratch = nullptr;
	float* forget_gate_scratch = nullptr;
	float* output_gate_scratch = nullptr;
	if (use_cifg) {
	cell_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
	forget_gate_scratch = cell_scratch + n_cell * n_batch;
	output_gate_scratch = cell_scratch + 2 * n_cell * n_batch;
	} else {
	input_gate_scratch = reinterpret_cast<float*>(scratch_buffer_->buffer);
	cell_scratch = input_gate_scratch + n_cell * n_batch;
	forget_gate_scratch = input_gate_scratch + 2 * n_cell * n_batch;
	output_gate_scratch = input_gate_scratch + 3 * n_cell * n_batch;
	}

	// Initialize scratch buffers with bias.
	// The bias vector is replicated once per batch so that the accumulating
	// kernels below can add onto it.
	if (!use_cifg) {
	VectorBatchVectorAssign(GetBuffer<float>(input_gate_bias_), n_cell, n_batch,
	input_gate_scratch);
	}
	VectorBatchVectorAssign(GetBuffer<float>(forget_gate_bias_), n_cell, n_batch,
	forget_gate_scratch);
	VectorBatchVectorAssign(GetBuffer<float>(cell_bias_), n_cell, n_batch,
	cell_scratch);
	VectorBatchVectorAssign(GetBuffer<float>(output_gate_bias_), n_cell, n_batch,
	output_gate_scratch);

	// For each batch and cell: compute input_weight * input.
	if (!use_cifg) {
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(input_to_input_weights_), n_cell, n_input,
	GetBuffer<float>(input_), n_batch, input_gate_scratch);
	}
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(input_to_forget_weights_), n_cell, n_input,
	GetBuffer<float>(input_), n_batch, forget_gate_scratch);
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(input_to_cell_weights_), n_cell, n_input,
	GetBuffer<float>(input_), n_batch, cell_scratch);
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(input_to_output_weights_), n_cell, n_input,
	GetBuffer<float>(input_), n_batch, output_gate_scratch);

	// For each batch and cell: compute recurrent_weight * output_state.
	// output_state_ is read here and only overwritten at the very end.
	if (!use_cifg) {
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(recurrent_to_input_weights_), n_cell, n_output,
	GetBuffer<float>(output_state_), n_batch, input_gate_scratch);
	}
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(recurrent_to_forget_weights_), n_cell, n_output,
	GetBuffer<float>(output_state_), n_batch, forget_gate_scratch);
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(recurrent_to_cell_weights_), n_cell, n_output,
	GetBuffer<float>(output_state_), n_batch, cell_scratch);
	MatrixBatchVectorMultiplyAccumulate(
	GetBuffer<float>(recurrent_to_output_weights_), n_cell, n_output,
	GetBuffer<float>(output_state_), n_batch, output_gate_scratch);

	// For each batch and cell: update input gate.
	if (!use_cifg) {
	if (use_peephole) {
	VectorBatchVectorCwiseProductAccumulate(
	GetBuffer<float>(cell_to_input_weights_), n_cell,
	GetBuffer<float>(cell_state_), n_batch, input_gate_scratch);
	}
	ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
	input_gate_scratch);
	}

	// For each batch and cell: update forget gate.
	// The peephole term uses cell_state_ before it is modified below.
	if (use_peephole) {
	VectorBatchVectorCwiseProductAccumulate(
	GetBuffer<float>(cell_to_forget_weights_), n_cell,
	GetBuffer<float>(cell_state_), n_batch, forget_gate_scratch);
	}
	ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
	forget_gate_scratch);

	// For each batch and cell: update the cell.
	// First scale cell_state_ in place by the forget gate, then add the
	// activated cell input weighted by the input gate.
	VectorVectorCwiseProduct(forget_gate_scratch, GetBuffer<float>(cell_state_),
	n_batch * n_cell, GetBuffer<float>(cell_state_));
	ApplyActivationToVector(cell_scratch, n_batch * n_cell, params_.activation_,
	cell_scratch);
	if (use_cifg) {
	// With CIFG the input gate is (1 - forget gate); forget_gate_scratch is
	// overwritten with that value since the forget gate is no longer needed.
	Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch);
	VectorVectorCwiseProductAccumulate(cell_scratch, forget_gate_scratch,
	n_batch * n_cell,
	GetBuffer<float>(cell_state_));
	} else {
	VectorVectorCwiseProductAccumulate(cell_scratch, input_gate_scratch,
	n_batch * n_cell,
	GetBuffer<float>(cell_state_));
	}
	// Cell clipping is applied only for a strictly positive limit.
	if (params_.cell_clip_ > 0.0) {
	ClipVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
	params_.cell_clip_, GetBuffer<float>(cell_state_));
	}

	// For each batch and cell: update the output gate.
	// The peephole term here uses the freshly updated cell_state_.
	if (use_peephole) {
	VectorBatchVectorCwiseProductAccumulate(
	GetBuffer<float>(cell_to_output_weights_), n_cell,
	GetBuffer<float>(cell_state_), n_batch, output_gate_scratch);
	}
	ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
	output_gate_scratch);
	// cell_scratch is reused to hold the activated cell state, and
	// output_gate_scratch ends up holding gate * activated cell state.
	ApplyActivationToVector(GetBuffer<float>(cell_state_), n_batch * n_cell,
	params_.activation_, cell_scratch);
	VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, n_batch * n_cell,
	output_gate_scratch);

	// For each batch: update the projection and output_state.
	const bool use_projection_weight = (projection_weights_->buffer != nullptr);
	const bool use_projection_bias = (projection_bias_->buffer != nullptr);
	if (use_projection_weight) {
	// Seed output_ with the projection bias (or zeros) because the matrix
	// multiply below accumulates into it.
	if (use_projection_bias) {
	VectorBatchVectorAssign(GetBuffer<float>(projection_bias_), n_output,
	n_batch, GetBuffer<float>(output_));
	} else {
	ZeroVector(GetBuffer<float>(output_), n_batch * n_output);
	}
	MatrixBatchVectorMultiplyAccumulate(GetBuffer<float>(projection_weights_),
	n_output, n_cell, output_gate_scratch,
	n_batch, GetBuffer<float>(output_));
	// Projection clipping is applied only for a strictly positive limit.
	if (params_.proj_clip_ > 0.0) {
	ClipVector(GetBuffer<float>(output_), n_batch * n_output,
	params_.proj_clip_, GetBuffer<float>(output_));
	}
	} else {
	// No projection: the gated cell output is the output as-is.
	CopyVector(output_gate_scratch, n_batch * n_output,
	GetBuffer<float>(output_));
	}
	// Publish the new output as the output state.
	CopyVector(GetBuffer<float>(output_), n_batch * n_output,
	GetBuffer<float>(output_state_));

	return true;
	}

	} // namespace nn
	} // namespace android