Add filter rank support to SVDF

Bug: 67597181

Pulled in the latest SVDF implementation from TF Lite, which supports a
rank parameter greater than 1.

Split the state into input and output states.

Added a test spec for rank 2 SVDF.

Test: NeuralNetworksTest
Test: SVDFTest: adb shell /data/nativetest64/svdf_test/svdf_test

Change-Id: I207cc081019369bf5e643e16e91ff664d10975cc
(cherry picked from commit 062414c5c5e20d7432d17b00eb7daf0562396815)
diff --git a/common/Android.bp b/common/Android.bp
index 1e251cc..109d133 100644
--- a/common/Android.bp
+++ b/common/Android.bp
@@ -163,6 +163,9 @@
         "operations/SVDFTest.cpp",
     ],
     local_include_dirs: [ "include" ],
+    header_libs: [
+        "tensorflow_headers",
+    ],
     shared_libs: [
         "libneuralnetworks",
     ],
diff --git a/common/operations/SVDF.cpp b/common/operations/SVDF.cpp
index f600b26..38224a6 100644
--- a/common/operations/SVDF.cpp
+++ b/common/operations/SVDF.cpp
@@ -24,19 +24,14 @@
 
 namespace {
 
-// TODO: Implement this using circular buffer instead.
-// This is here temporarily only to show the logic.
-void svdf_right_shift_state(const float* state_in, int state_len, float shift_value,
-                            float* state_out) {
-  for (int i = 0; i < state_len - 1; i++) {
-    state_out[i] = state_in[i + 1];
-  }
-  state_out[state_len - 1] = shift_value;
+template <typename T>
+inline T *GetBuffer(RunTimeOperandInfo* operand) {
+  return reinterpret_cast<T*>(operand->buffer);
 }
 
-int32_t getInt32ScalarData(RunTimeOperandInfo& info) {
-    int32_t * data = reinterpret_cast<int32_t*>(info.buffer);
-    return data[0];
+template <typename T>
+inline const T *GetBuffer(const RunTimeOperandInfo* operand) {
+  return reinterpret_cast<const T*>(operand->buffer);
 }
 
 }
@@ -49,8 +44,8 @@
     bias_ = GetInput(operation, operands, kBiasTensor);
     state_in_ = GetInput(operation, operands, kStateInTensor);
 
-    params_.rank_ = getInt32ScalarData(*GetInput(operation, operands, kRankParam));
-    params_.activation_ = static_cast<ActivationFn>(getInt32ScalarData(
+    params_.rank_ = getScalarData<int>(*GetInput(operation, operands, kRankParam));
+    params_.activation_ = static_cast<TfLiteFusedActivation>(getScalarData<int>(
         *GetInput(operation, operands, kActivationParam)));
 
     state_out_ = GetOutput(operation, operands, kStateOutTensor);
@@ -63,6 +58,7 @@
                    Shape *outputShape) {
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
+
   NN_CHECK(num_inputs == 6 || num_inputs == 7);
   NN_CHECK_EQ(NumOutputs(operation), 2);
 
@@ -75,11 +71,14 @@
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
+  const int rank = getScalarData<int>(*GetInput(operation, operands, kRankParam));
   const uint32_t batch_size = SizeOfDimension(input, 0);
-  const uint32_t num_units = SizeOfDimension(weights_feature, 0);
+  const uint32_t num_filters = SizeOfDimension(weights_feature, 0);
+  NN_CHECK_EQ(num_filters % rank, 0);
+  const uint32_t num_units = num_filters / rank;
   const uint32_t memory_size = SizeOfDimension(weights_time, 1);
   NN_CHECK_EQ(SizeOfDimension(input, 1), SizeOfDimension(weights_feature, 1));
-  NN_CHECK_EQ(SizeOfDimension(weights_time, 0), num_units);
+  NN_CHECK_EQ(SizeOfDimension(weights_time, 0), num_filters);
 
   const RunTimeOperandInfo *bias =
       GetInput(operation, operands, kBiasTensor);
@@ -90,7 +89,7 @@
   // Resize state.
   const Shape &inputShape = input->shape();
   stateShape->type = inputShape.type;
-  stateShape->dimensions = { batch_size, memory_size * num_units };
+  stateShape->dimensions = { batch_size, memory_size * num_filters };
   stateShape->offset = inputShape.offset;
   stateShape->scale = inputShape.scale;
 
@@ -104,62 +103,80 @@
 }
 
 bool SVDF::Eval() {
-    const int batch_size = input_->shape().dimensions[0];
-    const int input_size = input_->shape().dimensions[1];
-    const int num_units = weights_feature_->shape().dimensions[0];
-    const int memory_size = weights_time_->shape().dimensions[1];
-    const int weights_feature_stride = weights_feature_->shape().dimensions[1];
-    const int weights_time_stride = weights_time_->shape().dimensions[1];
+    const int rank = params_.rank_;
+    const int batch_size = SizeOfDimension(input_, 0);
+    const int input_size = SizeOfDimension(input_, 1);
+    const int num_filters = SizeOfDimension(weights_feature_, 0);
+    const int num_units = num_filters / rank;
+    const int memory_size = SizeOfDimension(weights_time_, 1);
 
-    // Initialize weights_feature and weights_time pointers.
-    const float* weights_feature_ptr = reinterpret_cast<float *>(weights_feature_->buffer);
-    const float* weights_time_ptr = reinterpret_cast<float *>(weights_time_->buffer);
-
-    // For each batch
+    memcpy(GetBuffer<float>(state_out_), GetBuffer<float>(state_in_),
+           sizeof(float) * batch_size * memory_size * num_filters);
+    // Compute conv1d(inputs, weights_feature).
     for (int b = 0; b < batch_size; b++) {
-        // Initialize the pointer to input, output and bias.
-        const float* input_ptr_batch = reinterpret_cast<float *>(input_->buffer) + b * input_size;
-        float* output_ptr_batch = reinterpret_cast<float*>(output_->buffer) + b * num_units;
-        const float* state_in_ptr_batch = reinterpret_cast<const float*>(state_in_->buffer) + b * (memory_size - 1) * num_units;
-        float* state_out_ptr_batch = reinterpret_cast<float*>(state_out_->buffer) + b * (memory_size - 1) * num_units;
-
-        // For each unit
-        for (int c = 0; c < num_units; c++) {
-            float activation = 0.0;
-
-            // tf.nn.conv1d(inputs, weights_feature, feature_dim, "VALID")
-            for (int j = 0; j < input_size; j++) {
-                activation += input_ptr_batch[j] * weights_feature_ptr[j];
-            }
-
-            // Initialize state pointer for unit 'c'.
-            const float* state_in_ptr = state_in_ptr_batch + c * (memory_size - 1);
-            float* state_out_ptr = state_out_ptr_batch + c * (memory_size - 1);
-
-            // Apply bias if bias tensor exists.
-            output_ptr_batch[c] = bias_->buffer ? reinterpret_cast<float *>(bias_->buffer)[c] : 0.f;
-
-            // output = tf.matmul(state, weights_time)
-            output_ptr_batch[c] += weights_time_ptr[memory_size - 1] * activation;
-            for (int j = 0; j < memory_size - 1; j++) {
-                output_ptr_batch[c] += weights_time_ptr[j] * state_in_ptr[j];
-            }
-
-            // Apply activation.
-            output_ptr_batch[c] =
-                    (ActivationFunctor(params_.activation_))(output_ptr_batch[c]);
-
-            // Right shift the state and concatenate with activation.
-            svdf_right_shift_state(state_in_ptr, memory_size - 1, activation,
-                                   state_out_ptr);
-
-            // Update weight pointers.
-            weights_feature_ptr += weights_feature_stride;
-            weights_time_ptr += weights_time_stride;
+        float* state_ptr_batch = GetBuffer<float>(state_out_) + b * memory_size * num_filters;
+        for (int c = 0; c < num_filters; c++) {
+            float* state_ptr = state_ptr_batch + c * memory_size;
+            state_ptr[memory_size - 1] = 0.0;
         }
-        // Reset weight pointers for next batch.
-        weights_feature_ptr = reinterpret_cast<float*>(weights_feature_->buffer);
-        weights_time_ptr = reinterpret_cast<float*>(weights_time_->buffer);
+    }
+    // The state left most column is used to save current cycle activation. This
+    // is achieved by starting at state->data.f[memory_size - 1] and having the
+    // stride equal to memory_size.
+    tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        GetBuffer<float>(weights_feature_), num_filters, input_size,
+        GetBuffer<float>(input_),  batch_size,
+        &GetBuffer<float>(state_out_)[memory_size - 1], memory_size);
+
+    // Compute matmul(state, weights_time).
+    // The right most column is used to save temporary output (with the size of
+    // num_filters). This is achieved by starting at state->data.f and having the
+    // stride equal to memory_size.
+    float scratch[batch_size * num_filters];
+    for (int b = 0; b < batch_size; b++) {
+        float* state_out_ptr_batch =
+            GetBuffer<float>(state_out_) + b * memory_size * num_filters;
+        float* scratch_ptr_batch = scratch + b * num_filters;
+        tflite::tensor_utils::BatchVectorBatchVectorDotProduct(
+            GetBuffer<float>(weights_time_), state_out_ptr_batch, memory_size, num_filters,
+            scratch_ptr_batch, /*result_stride=*/1);
+    }
+
+    // Initialize output with bias if provided.
+    if (!IsNullInput(bias_)) {
+        tflite::tensor_utils::VectorBatchVectorAssign(
+            GetBuffer<float>(bias_), num_units, batch_size,
+            GetBuffer<float>(output_));
+    } else {
+        tflite::tensor_utils::ZeroVector(
+            GetBuffer<float>(output_), batch_size * num_units);
+    }
+
+    // Reduction sum
+    for (int b = 0; b < batch_size; b++) {
+        float* output_ptr_batch = GetBuffer<float>(output_) + b * num_units;
+        float* scratch_ptr_batch = scratch + b * num_filters;
+        tflite::tensor_utils::ReductionSumVector(
+            scratch_ptr_batch, output_ptr_batch, num_units, rank);
+    }
+
+    // Apply activation.
+    for (int b = 0; b < batch_size; b++) {
+        float* output_ptr_batch = GetBuffer<float>(output_) + b * num_units;
+        tflite::tensor_utils::ApplyActivationToVector(
+            output_ptr_batch, num_units,
+            params_.activation_, output_ptr_batch);
+    }
+
+    // Right shift the state.
+    for (int b = 0; b < batch_size; b++) {
+        float* state_out_ptr_batch =
+            GetBuffer<float>(state_out_) + b * memory_size * num_filters;
+        for (int f = 0; f < num_filters; f++) {
+            tflite::tensor_utils::VectorShiftLeft(state_out_ptr_batch, memory_size,
+                                          /*shift_value=*/0.0);
+            state_out_ptr_batch += memory_size;
+        }
     }
     return true;
 }
diff --git a/common/operations/SVDF.h b/common/operations/SVDF.h
index a219fe5..d5acfdb 100644
--- a/common/operations/SVDF.h
+++ b/common/operations/SVDF.h
@@ -17,7 +17,7 @@
 #ifndef FRAMEWORKS_ML_NN_SVDF_H
 #define FRAMEWORKS_ML_NN_SVDF_H
 
-#include "ActivationFunctor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 
 #include <algorithm>
 #include <cmath>
@@ -37,7 +37,7 @@
 
 struct SVDFParams {
     int rank_;
-    ActivationFn activation_;
+    TfLiteFusedActivation activation_;
 };
 
 struct RunTimeOperandInfo;
diff --git a/common/operations/SVDFTest.cpp b/common/operations/SVDFTest.cpp
index 0bd59ba..da96dd2 100644
--- a/common/operations/SVDFTest.cpp
+++ b/common/operations/SVDFTest.cpp
@@ -73,6 +73,38 @@
                              -0.89477462, 1.67204106,  -0.53235275,
                              -0.89477462, 1.67204106,  -0.53235275};
 
+static float svdf_input_rank2[] = {
+    0.12609188,  -0.46347019, -0.89598465,
+    0.35867718,  0.36897406,  0.73463392,
+
+    0.14278367,  -1.64410412, -0.75222826,
+    -0.57290924, 0.12729003,  0.7567004,
+
+    0.49837467,  0.19278903,  0.26584083,
+    0.17660543,  0.52949083,  -0.77931279,
+
+    -0.11186574, 0.13164264,  -0.05349274,
+    -0.72674477, -0.5683046,  0.55900657,
+
+    -0.68892461, 0.37783599,  0.18263303,
+    -0.63690937, 0.44483393,  -0.71817774,
+
+    -0.81299269, -0.86831826, 1.43940818,
+    -0.95760226, 1.82078898,  0.71135032,
+
+    -1.45006323, -0.82251364, -1.69082689,
+    -1.65087092, -1.89238167, 1.54172635,
+
+    0.03966608,  -0.24936394, -0.77526885,
+    2.06740379,  -1.51439476, 1.43768692,
+
+    0.11771342,  -0.23761693, -0.65898693,
+    0.31088525,  -1.55601168, -0.87661445,
+
+    -0.89477462, 1.67204106,  -0.53235275,
+    -0.6230064,  0.29819036,  1.06939757,
+};
+
 static float svdf_golden_output[] = {
     0.014899,    -0.0517661, -0.143725, -0.00271883,
     0.014899,    -0.0517661, -0.143725, -0.00271883,
@@ -104,6 +136,38 @@
     0.36726,     -0.522303,  -0.456502, -0.175475,
     0.36726,     -0.522303,  -0.456502, -0.175475};
 
+static float svdf_golden_output_rank_2[] = {
+    -0.09623547, -0.10193135, 0.11083051,  -0.0347917,
+    0.1141196,   0.12965347,  -0.12652366, 0.01007236,
+
+    -0.16396809, -0.21247184, 0.11259045,  -0.04156673,
+    0.10132131,  -0.06143532, -0.00924693, 0.10084561,
+
+    0.01257364,  0.0506071,   -0.19287863, -0.07162561,
+    -0.02033747, 0.22673416,  0.15487903,  0.02525555,
+
+    -0.1411963,  -0.37054959, 0.01774767,  0.05867489,
+    0.09607603,  -0.0141301,  -0.08995658, 0.12867066,
+
+    -0.27142537, -0.16955489, 0.18521598,  -0.12528358,
+    0.00331409,  0.11167502,  0.02218599,  -0.07309391,
+
+    0.09593632,  -0.28361851, -0.0773851,  0.17199151,
+    -0.00075242, 0.33691186,  -0.1536046,  0.16572715,
+
+    -0.27916506, -0.27626723, 0.42615682,  0.3225764,
+    -0.37472126, -0.55655634, -0.05013514, 0.289112,
+
+    -0.24418658, 0.07540751,  -0.1940318,  -0.08911639,
+    0.00732617,  0.46737891,  0.26449674,  0.24888524,
+
+    -0.17225097, -0.54660404, -0.38795233, 0.08389944,
+    0.07736043,  -0.28260678, 0.15666828,  1.14949894,
+
+    -0.57454878, -0.64704704, 0.73235172,  -0.34616736,
+    0.21120001,  -0.22927976, 0.02455296,  -0.35906726,
+};
+
 #define FOR_ALL_INPUT_AND_WEIGHT_TENSORS(ACTION) \
   ACTION(Input)                                  \
   ACTION(WeightsFeature)                         \
@@ -218,7 +282,7 @@
     ASSERT_EQ(execution.setInput(SVDF::kRankParam, &rank_, sizeof(rank_)),
               Result::NO_ERROR);
 
-    int activation = ActivationFn::kActivationNone;
+    int activation = TfLiteFusedActivation::kTfLiteActNone;
     ASSERT_EQ(execution.setInput(SVDF::kActivationParam, &activation,
                                  sizeof(activation)),
               Result::NO_ERROR);
@@ -318,6 +382,68 @@
   }
 }
 
+TEST(SVDFOpTest, BlackBoxTestRank2) {
+  SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                   /*memory_size=*/10, /*rank=*/2);
+  svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
+                          0.12416199,  0.15785322,  0.27901134,  0.3905206,
+                          0.21931258,  -0.36137494, -0.10640851, 0.31053296,
+                          -0.36118156, -0.0976817,  -0.36916667, 0.22197971,
+                          0.15294972,  0.38031587,  0.27557442,  0.39635518,
+                          -0.21580373, -0.06634006, -0.02702999, 0.27072677});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657,
+
+       -0.14884081, 0.19931212,  -0.36002168, 0.34663299,  -0.11405486,
+       0.12672701,  0.39463779,  -0.07886535, -0.06384811, 0.08249187,
+
+       -0.26816407, -0.19905911, 0.29211238,  0.31264046,  -0.28664589,
+       0.05698794,  0.11613581,  0.14078894,  0.02187902,  -0.21781836,
+
+       -0.15567942, 0.08693647,  -0.38256618, 0.36580828,  -0.22922277,
+       -0.0226903,  0.12878349,  -0.28122205, -0.10850525, -0.11955214,
+
+       0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
+       0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
+
+  svdf.SetBias({});
+
+  svdf.ResetState();
+  const int svdf_num_batches = svdf.num_batches();
+  const int svdf_input_size = svdf.input_size();
+  const int svdf_num_units = svdf.num_units();
+  const int input_sequence_size =
+      sizeof(svdf_input_rank2) / sizeof(float) / (svdf_input_size * svdf_num_batches);
+  // Going over each input batch, setting the input tensor, invoking the SVDF op
+  // and checking the output with the expected golden values.
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = svdf_input_rank2 + i * svdf_input_size * svdf_num_batches;
+    float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
+    svdf.SetInput(0, batch_start, batch_end);
+
+    svdf.Invoke();
+
+    float* golden_start =
+        svdf_golden_output_rank_2 + i * svdf_num_units * svdf_num_batches;
+    float* golden_end = golden_start + svdf_num_units * svdf_num_batches;
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
 }  // namespace wrapper
 }  // namespace nn
 }  // namespace android
diff --git a/runtime/test/generated/all_generated_tests.cpp b/runtime/test/generated/all_generated_tests.cpp
index 96e89f8..4235a17 100644
--- a/runtime/test/generated/all_generated_tests.cpp
+++ b/runtime/test/generated/all_generated_tests.cpp
@@ -1807,6 +1807,20 @@
             space_to_depth_quant8_2::examples);
 }
 
+namespace svdf2 {
+std::vector<MixedTypedExample> examples = {
+// Generated svdf2 test
+#include "generated/examples/svdf2.example.cpp"
+};
+// Generated model constructor
+#include "generated/models/svdf2.model.cpp"
+} // namespace svdf2
+TEST_F(GeneratedTests, svdf2) {
+    execute(svdf2::CreateModel,
+            svdf2::is_ignored,
+            svdf2::examples);
+}
+
 namespace svdf {
 std::vector<MixedTypedExample> examples = {
 // Generated svdf test
diff --git a/runtime/test/generated/examples/svdf.example.cpp b/runtime/test/generated/examples/svdf.example.cpp
index 949421d..16f4e7a 100644
--- a/runtime/test/generated/examples/svdf.example.cpp
+++ b/runtime/test/generated/examples/svdf.example.cpp
@@ -4,7 +4,7 @@
 //Input(s)
 { // See tools/test_generator/include/TestHarness.h:MixedTyped
   // int -> FLOAT32 map
-  {{0, {0.12609188f, -0.46347019f, -0.89598465f, 0.12609188f, -0.46347019f, -0.89598465f}}, {1, {-0.31930989f, -0.36118156f, 0.0079667f, 0.37613347f, 0.22197971f, 0.12416199f, 0.27901134f, 0.27557442f, 0.3905206f, -0.36137494f, -0.06634006f, -0.10640851f}}, {2, {-0.31930989f, 0.37613347f, 0.27901134f, -0.36137494f, -0.36118156f, 0.22197971f, 0.27557442f, -0.06634006f, 0.0079667f, 0.12416199f, 0.3905206f, -0.10640851f, -0.0976817f, 0.15294972f, 0.39635518f, -0.02702999f, 0.39296314f, 0.15785322f, 0.21931258f, 0.31053296f, -0.36916667f, 0.38031587f, -0.21580373f, 0.27072677f, 0.23622236f, 0.34936687f, 0.18174365f, 0.35907319f, -0.17493086f, 0.324846f, -0.10781813f, 0.27201805f, 0.14324132f, -0.23681851f, -0.27115166f, -0.01580888f, -0.14943552f, 0.15465137f, 0.09784451f, -0.0337657f}}, {3, {}}, {4, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  {{0, {0.12609188f, -0.46347019f, -0.89598465f, 0.12609188f, -0.46347019f, -0.89598465f}}, {1, {-0.31930989f, -0.36118156f, 0.0079667f, 0.37613347f, 0.22197971f, 0.12416199f, 0.27901134f, 0.27557442f, 0.3905206f, -0.36137494f, -0.06634006f, -0.10640851f}}, {2, {-0.31930989f, 0.37613347f, 0.27901134f, -0.36137494f, -0.36118156f, 0.22197971f, 0.27557442f, -0.06634006f, 0.0079667f, 0.12416199f, 0.3905206f, -0.10640851f, -0.0976817f, 0.15294972f, 0.39635518f, -0.02702999f, 0.39296314f, 0.15785322f, 0.21931258f, 0.31053296f, -0.36916667f, 0.38031587f, -0.21580373f, 0.27072677f, 0.23622236f, 0.34936687f, 0.18174365f, 0.35907319f, -0.17493086f, 0.324846f, -0.10781813f, 0.27201805f, 0.14324132f, -0.23681851f, -0.27115166f, -0.01580888f, -0.14943552f, 0.15465137f, 0.09784451f, -0.0337657f}}, {3, {}}, {4, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
   // int -> INT32 map
   {{5, {1}}, {6, {0}}},
   // int -> QUANT8_ASYMM map
@@ -13,7 +13,7 @@
 //Output(s)
 { // See tools/test_generator/include/TestHarness.h:MixedTyped
   // int -> FLOAT32 map
-  {{1, {0.014899f, -0.0517661f, -0.143725f, -0.00271883f, 0.014899f, -0.0517661f, -0.143725f, -0.00271883f}}, {0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  {{1, {0.014899f, -0.0517661f, -0.143725f, -0.00271883f, 0.014899f, -0.0517661f, -0.143725f, -0.00271883f}}, {0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
   // int -> INT32 map
   {},
   // int -> QUANT8_ASYMM map
diff --git a/runtime/test/generated/examples/svdf2.example.cpp b/runtime/test/generated/examples/svdf2.example.cpp
new file mode 100644
index 0000000..db2d32b
--- /dev/null
+++ b/runtime/test/generated/examples/svdf2.example.cpp
@@ -0,0 +1,22 @@
+// Generated file (from: svdf2.mod.py). Do not edit
+// Begin of an example
+{
+//Input(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{0, {0.12609188f, -0.46347019f, -0.89598465f, 0.35867718f, 0.36897406f, 0.73463392f}}, {1, {-0.31930989f, 0.0079667f, 0.39296314f, 0.37613347f, 0.12416199f, 0.15785322f, 0.27901134f, 0.3905206f, 0.21931258f, -0.36137494f, -0.10640851f, 0.31053296f, -0.36118156f, -0.0976817f, -0.36916667f, 0.22197971f, 0.15294972f, 0.38031587f, 0.27557442f, 0.39635518f, -0.21580373f, -0.06634006f, -0.02702999f, 0.27072677f}}, {2, {-0.31930989f, 0.37613347f, 0.27901134f, -0.36137494f, -0.36118156f, 0.22197971f, 0.27557442f, -0.06634006f, 0.0079667f, 0.12416199f, 0.3905206f, -0.10640851f, -0.0976817f, 0.15294972f, 0.39635518f, -0.02702999f, 0.39296314f, 0.15785322f, 0.21931258f, 0.31053296f, -0.36916667f, 0.38031587f, -0.21580373f, 0.27072677f, 0.23622236f, 0.34936687f, 0.18174365f, 0.35907319f, -0.17493086f, 0.324846f, -0.10781813f, 0.27201805f, 0.14324132f, -0.23681851f, -0.27115166f, -0.01580888f, -0.14943552f, 0.15465137f, 0.09784451f, -0.0337657f, -0.14884081f, 0.19931212f, -0.36002168f, 0.34663299f, -0.11405486f, 0.12672701f, 0.39463779f, -0.07886535f, -0.06384811f, 0.08249187f, -0.26816407f, -0.19905911f, 0.29211238f, 0.31264046f, -0.28664589f, 0.05698794f, 0.11613581f, 0.14078894f, 0.02187902f, -0.21781836f, -0.15567942f, 0.08693647f, -0.38256618f, 0.36580828f, -0.22922277f, -0.0226903f, 0.12878349f, -0.28122205f, -0.10850525f, -0.11955214f, 0.27179423f, -0.04710215f, 0.31069002f, 0.22672787f, 0.09580326f, 0.08682203f, 0.1258215f, 0.1851041f, 0.29228821f, 0.12366763f}}, {3, {}}, {4, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  // int -> INT32 map
+  {{5, {2}}, {6, {0}}},
+  // int -> QUANT8_ASYMM map
+  {}
+},
+//Output(s)
+{ // See tools/test_generator/include/TestHarness.h:MixedTyped
+  // int -> FLOAT32 map
+  {{1, {-0.09623547f, -0.10193135f, 0.11083051f, -0.0347917f, 0.1141196f, 0.12965347f, -0.12652366f, 0.01007236f}}, {0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  // int -> INT32 map
+  {},
+  // int -> QUANT8_ASYMM map
+  {}
+}
+}, // End of an example
diff --git a/runtime/test/generated/examples/svdf_state.example.cpp b/runtime/test/generated/examples/svdf_state.example.cpp
index ffe0212..a3f96a6 100644
--- a/runtime/test/generated/examples/svdf_state.example.cpp
+++ b/runtime/test/generated/examples/svdf_state.example.cpp
@@ -4,7 +4,7 @@
 //Input(s)
 { // See tools/test_generator/include/TestHarness.h:MixedTyped
   // int -> FLOAT32 map
-  {{0, {0.14278367f, -1.64410412f, -0.75222826f, 0.14278367f, -1.64410412f, -0.75222826f}}, {1, {-0.31930989f, -0.36118156f, 0.0079667f, 0.37613347f, 0.22197971f, 0.12416199f, 0.27901134f, 0.27557442f, 0.3905206f, -0.36137494f, -0.06634006f, -0.10640851f}}, {2, {-0.31930989f, 0.37613347f, 0.27901134f, -0.36137494f, -0.36118156f, 0.22197971f, 0.27557442f, -0.06634006f, 0.0079667f, 0.12416199f, 0.3905206f, -0.10640851f, -0.0976817f, 0.15294972f, 0.39635518f, -0.02702999f, 0.39296314f, 0.15785322f, 0.21931258f, 0.31053296f, -0.36916667f, 0.38031587f, -0.21580373f, 0.27072677f, 0.23622236f, 0.34936687f, 0.18174365f, 0.35907319f, -0.17493086f, 0.324846f, -0.10781813f, 0.27201805f, 0.14324132f, -0.23681851f, -0.27115166f, -0.01580888f, -0.14943552f, 0.15465137f, 0.09784451f, -0.0337657f}}, {3, {}}, {4, {0, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  {{0, {0.14278367f, -1.64410412f, -0.75222826f, 0.14278367f, -1.64410412f, -0.75222826f}}, {1, {-0.31930989f, -0.36118156f, 0.0079667f, 0.37613347f, 0.22197971f, 0.12416199f, 0.27901134f, 0.27557442f, 0.3905206f, -0.36137494f, -0.06634006f, -0.10640851f}}, {2, {-0.31930989f, 0.37613347f, 0.27901134f, -0.36137494f, -0.36118156f, 0.22197971f, 0.27557442f, -0.06634006f, 0.0079667f, 0.12416199f, 0.3905206f, -0.10640851f, -0.0976817f, 0.15294972f, 0.39635518f, -0.02702999f, 0.39296314f, 0.15785322f, 0.21931258f, 0.31053296f, -0.36916667f, 0.38031587f, -0.21580373f, 0.27072677f, 0.23622236f, 0.34936687f, 0.18174365f, 0.35907319f, -0.17493086f, 0.324846f, -0.10781813f, 0.27201805f, 0.14324132f, -0.23681851f, -0.27115166f, -0.01580888f, -0.14943552f, 0.15465137f, 0.09784451f, -0.0337657f}}, {3, {}}, {4, {0, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0}}},
   // int -> INT32 map
   {{5, {1}}, {6, {0}}},
   // int -> QUANT8_ASYMM map
@@ -13,7 +13,7 @@
 //Output(s)
 { // See tools/test_generator/include/TestHarness.h:MixedTyped
   // int -> FLOAT32 map
-  {{1, {0.068281f, -0.162217f, -0.152268f, 0.00323521f, 0.068281f, -0.162217f, -0.152268f, 0.00323521f}}, {0, {0, 0, 0, 0, 0, 0, 0, 0.119996f, 0.542235f, 0, 0, 0, 0, 0, 0, 0, -0.166701f, -0.40465f, 0, 0, 0, 0, 0, 0, 0, -0.44244f, -0.706995f, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0.137515f, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0.542235f, 0, 0, 0, 0, 0, 0, 0, -0.166701f, -0.40465f, 0, 0, 0, 0, 0, 0, 0, -0.44244f, -0.706995f, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0.137515f, 0, 0, 0, 0, 0, 0, 0, 0}}},
+  {{1, {0.068281f, -0.162217f, -0.152268f, 0.00323521f, 0.068281f, -0.162217f, -0.152268f, 0.00323521f}}, {0, {0, 0, 0, 0, 0, 0, 0, 0.119996f, 0.542235f, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, -0.40465f, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, -0.706995f, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0.137515f, 0, 0, 0, 0, 0, 0, 0, 0, 0.119996f, 0.542235f, 0, 0, 0, 0, 0, 0, 0, 0, -0.166701f, -0.40465f, 0, 0, 0, 0, 0, 0, 0, 0, -0.44244f, -0.706995f, 0, 0, 0, 0, 0, 0, 0, 0, 0.0805206f, 0.137515f, 0}}},
   // int -> INT32 map
   {},
   // int -> QUANT8_ASYMM map
diff --git a/runtime/test/generated/models/svdf2.model.cpp b/runtime/test/generated/models/svdf2.model.cpp
new file mode 100644
index 0000000..2a08657
--- /dev/null
+++ b/runtime/test/generated/models/svdf2.model.cpp
@@ -0,0 +1,32 @@
+// Generated file (from: svdf2.mod.py). Do not edit
+void CreateModel(Model *model) {
+  OperandType type0(Type::TENSOR_FLOAT32, {2, 3});
+  OperandType type6(Type::TENSOR_FLOAT32, {2, 4});
+  OperandType type4(Type::TENSOR_FLOAT32, {2, 80});
+  OperandType type3(Type::TENSOR_FLOAT32, {4});
+  OperandType type2(Type::TENSOR_FLOAT32, {8, 10});
+  OperandType type1(Type::TENSOR_FLOAT32, {8, 3});
+  OperandType type5(Type::TENSOR_INT32, {1});
+  // Phase 1, operands
+  auto input = model->addOperand(&type0);
+  auto weights_feature = model->addOperand(&type1);
+  auto weights_time = model->addOperand(&type2);
+  auto bias = model->addOperand(&type3);
+  auto state_in = model->addOperand(&type4);
+  auto rank_param = model->addOperand(&type5);
+  auto activation_param = model->addOperand(&type5);
+  auto state_out = model->addOperand(&type4);
+  auto output = model->addOperand(&type6);
+  // Phase 2, operations
+  model->addOperation(ANEURALNETWORKS_SVDF, {input, weights_feature, weights_time, bias, state_in, rank_param, activation_param}, {state_out, output});
+  // Phase 3, inputs and outputs
+  model->identifyInputsAndOutputs(
+    {input, weights_feature, weights_time, bias, state_in, rank_param, activation_param},
+    {state_out, output});
+  assert(model->isValid());
+}
+
+bool is_ignored(int i) {
+  static std::set<int> ignore = {0};
+  return ignore.find(i) != ignore.end();
+}
diff --git a/runtime/test/specs/svdf.mod.py b/runtime/test/specs/svdf.mod.py
index 0843ff1..3c8a440 100644
--- a/runtime/test/specs/svdf.mod.py
+++ b/runtime/test/specs/svdf.mod.py
@@ -15,26 +15,29 @@
 #
 
 batches = 2
-units = 4
+features = 4
+rank = 1
+units = int(features / rank)
 input_size = 3
 memory_size = 10
 
 model = Model()
 
 input = Input("input", "TENSOR_FLOAT32", "{%d, %d}" % (batches, input_size))
-weights_feature = Input("weights_feature", "TENSOR_FLOAT32", "{%d, %d}" % (units, input_size))
-weights_time = Input("weights_time", "TENSOR_FLOAT32", "{%d, %d}" % (units, memory_size))
+weights_feature = Input("weights_feature", "TENSOR_FLOAT32", "{%d, %d}" % (features, input_size))
+weights_time = Input("weights_time", "TENSOR_FLOAT32", "{%d, %d}" % (features, memory_size))
 bias = Input("bias", "TENSOR_FLOAT32", "{%d}" % (units))
-state_in = Input("state_in", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*units))
+state_in = Input("state_in", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*features))
 rank_param = Input("rank_param", "TENSOR_INT32", "{1}")
 activation_param = Input("activation_param", "TENSOR_INT32", "{1}")
-state_out = IgnoredOutput("state_out", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*units))
+state_out = IgnoredOutput("state_out", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*features))
 output = Output("output", "TENSOR_FLOAT32", "{%d, %d}" % (batches, units))
 
 model = model.Operation("SVDF", input, weights_feature, weights_time, bias, state_in,
                         rank_param, activation_param).To([state_out, output])
 
 input0 = {
+    input: [],
     weights_feature: [
         -0.31930989, -0.36118156, 0.0079667, 0.37613347,
       0.22197971, 0.12416199, 0.27901134, 0.27557442,
@@ -54,11 +57,11 @@
       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657
     ],
     bias: [],
-    rank_param: [1],
+    state_in: [0 for _ in range(batches * memory_size * features)],
+    rank_param: [rank],
     activation_param: [0],
 }
 
-# TODO: State is an intermediate buffer, don't check this against test result.
 test_inputs = [
     0.12609188,  -0.46347019, -0.89598465,
     0.12609188,  -0.46347019, -0.89598465,
@@ -123,17 +126,14 @@
     0.36726,     -0.522303,  -0.456502, -0.175475
 ]
 
-input_sequence_size = int(len(test_inputs) / input_size / batches)
+output0 = {state_out: [0 for _ in range(batches * memory_size * features)],
+           output: []}
 
 # TODO: enable more data points after fixing the reference issue
-#for i in range(input_sequence_size):
 for i in range(1):
   batch_start = i * input_size * batches
   batch_end = batch_start + input_size * batches
   input0[input] = test_inputs[batch_start:batch_end]
-  input0[state_in]  = [0 for _ in range(batches * (memory_size - 1) * units)]
-  output0 = {state_out:[0 for x in range(batches * (memory_size - 1) * units)],
-             output: []}
   golden_start = i * units * batches
   golden_end = golden_start + units * batches
   output0[output] = golden_outputs[golden_start:golden_end]
diff --git a/runtime/test/specs/svdf2.mod.py b/runtime/test/specs/svdf2.mod.py
new file mode 100644
index 0000000..82f24a8
--- /dev/null
+++ b/runtime/test/specs/svdf2.mod.py
@@ -0,0 +1,155 @@
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+batches = 2
+features = 8
+rank = 2
+units = int(features / rank)
+input_size = 3
+memory_size = 10
+
+model = Model()
+
+input = Input("input", "TENSOR_FLOAT32", "{%d, %d}" % (batches, input_size))
+weights_feature = Input("weights_feature", "TENSOR_FLOAT32", "{%d, %d}" % (features, input_size))
+weights_time = Input("weights_time", "TENSOR_FLOAT32", "{%d, %d}" % (features, memory_size))
+bias = Input("bias", "TENSOR_FLOAT32", "{%d}" % (units))
+state_in = Input("state_in", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*features))
+rank_param = Input("rank_param", "TENSOR_INT32", "{1}")
+activation_param = Input("activation_param", "TENSOR_INT32", "{1}")
+state_out = IgnoredOutput("state_out", "TENSOR_FLOAT32", "{%d, %d}" % (batches, memory_size*features))
+output = Output("output", "TENSOR_FLOAT32", "{%d, %d}" % (batches, units))
+
+model = model.Operation("SVDF", input, weights_feature, weights_time, bias, state_in,
+                        rank_param, activation_param).To([state_out, output])
+
+input0 = {
+    input: [],
+    weights_feature: [
+      -0.31930989, 0.0079667,   0.39296314,  0.37613347,
+      0.12416199,  0.15785322,  0.27901134,  0.3905206,
+      0.21931258,  -0.36137494, -0.10640851, 0.31053296,
+      -0.36118156, -0.0976817,  -0.36916667, 0.22197971,
+      0.15294972,  0.38031587,  0.27557442,  0.39635518,
+      -0.21580373, -0.06634006, -0.02702999, 0.27072677
+    ],
+    weights_time: [
+      -0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657,
+
+       -0.14884081, 0.19931212,  -0.36002168, 0.34663299,  -0.11405486,
+       0.12672701,  0.39463779,  -0.07886535, -0.06384811, 0.08249187,
+
+       -0.26816407, -0.19905911, 0.29211238,  0.31264046,  -0.28664589,
+       0.05698794,  0.11613581,  0.14078894,  0.02187902,  -0.21781836,
+
+       -0.15567942, 0.08693647,  -0.38256618, 0.36580828,  -0.22922277,
+       -0.0226903,  0.12878349,  -0.28122205, -0.10850525, -0.11955214,
+
+       0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
+       0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763
+    ],
+    bias: [],
+    state_in: [0 for _ in range(batches * memory_size * features)],
+    rank_param: [rank],
+    activation_param: [0],
+}
+
+test_inputs = [
+    0.12609188,  -0.46347019, -0.89598465,
+    0.35867718,  0.36897406,  0.73463392,
+
+    0.14278367,  -1.64410412, -0.75222826,
+    -0.57290924, 0.12729003,  0.7567004,
+
+    0.49837467,  0.19278903,  0.26584083,
+    0.17660543,  0.52949083,  -0.77931279,
+
+    -0.11186574, 0.13164264,  -0.05349274,
+    -0.72674477, -0.5683046,  0.55900657,
+
+    -0.68892461, 0.37783599,  0.18263303,
+    -0.63690937, 0.44483393,  -0.71817774,
+
+    -0.81299269, -0.86831826, 1.43940818,
+    -0.95760226, 1.82078898,  0.71135032,
+
+    -1.45006323, -0.82251364, -1.69082689,
+    -1.65087092, -1.89238167, 1.54172635,
+
+    0.03966608,  -0.24936394, -0.77526885,
+    2.06740379,  -1.51439476, 1.43768692,
+
+    0.11771342,  -0.23761693, -0.65898693,
+    0.31088525,  -1.55601168, -0.87661445,
+
+    -0.89477462, 1.67204106,  -0.53235275,
+    -0.6230064,  0.29819036,  1.06939757,
+]
+
+golden_outputs = [
+    -0.09623547, -0.10193135, 0.11083051,  -0.0347917,
+    0.1141196,   0.12965347,  -0.12652366, 0.01007236,
+
+    -0.16396809, -0.21247184, 0.11259045,  -0.04156673,
+    0.10132131,  -0.06143532, -0.00924693, 0.10084561,
+
+    0.01257364,  0.0506071,   -0.19287863, -0.07162561,
+    -0.02033747, 0.22673416,  0.15487903,  0.02525555,
+
+    -0.1411963,  -0.37054959, 0.01774767,  0.05867489,
+    0.09607603,  -0.0141301,  -0.08995658, 0.12867066,
+
+    -0.27142537, -0.16955489, 0.18521598,  -0.12528358,
+    0.00331409,  0.11167502,  0.02218599,  -0.07309391,
+
+    0.09593632,  -0.28361851, -0.0773851,  0.17199151,
+    -0.00075242, 0.33691186,  -0.1536046,  0.16572715,
+
+    -0.27916506, -0.27626723, 0.42615682,  0.3225764,
+    -0.37472126, -0.55655634, -0.05013514, 0.289112,
+
+    -0.24418658, 0.07540751,  -0.1940318,  -0.08911639,
+    0.00732617,  0.46737891,  0.26449674,  0.24888524,
+
+    -0.17225097, -0.54660404, -0.38795233, 0.08389944,
+    0.07736043,  -0.28260678, 0.15666828,  1.14949894,
+
+    -0.57454878, -0.64704704, 0.73235172,  -0.34616736,
+    0.21120001,  -0.22927976, 0.02455296,  -0.35906726,
+]
+
+output0 = {state_out: [0 for _ in range(batches * memory_size * features)],
+           output: []}
+
+# TODO: enable more data points after fixing the reference issue
+for i in range(1):
+  batch_start = i * input_size * batches
+  batch_end = batch_start + input_size * batches
+  input0[input] = test_inputs[batch_start:batch_end]
+  golden_start = i * units * batches
+  golden_end = golden_start + units * batches
+  output0[output] = golden_outputs[golden_start:golden_end]
+  Example((input0, output0))
diff --git a/runtime/test/specs/svdf_state.mod.py b/runtime/test/specs/svdf_state.mod.py
index aad2114..3d204cb 100644
--- a/runtime/test/specs/svdf_state.mod.py
+++ b/runtime/test/specs/svdf_state.mod.py
@@ -67,22 +67,22 @@
   0, 0, 0, 0,
   0.119996, 0, 0, 0,
   0, 0, 0, 0,
-  0, -0.166701, 0, 0,
+  0, 0, -0.166701, 0,
   0, 0, 0, 0,
-  0, 0, -0.44244, 0,
   0, 0, 0, 0,
-  0, 0, 0, 0.0805206,
+  -0.44244, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0.0805206, 0,
   0, 0, 0, 0,
   0, 0, 0, 0,
   0.119996, 0, 0, 0,
   0, 0, 0, 0,
-  0, -0.166701, 0, 0,
-  0, 0, 0, 0,
-  0, 0, -0.44244, 0,
-  0, 0, 0, 0,
-  0, 0, 0, 0.0805206,
+  0, 0, -0.166701, 0,
   0, 0, 0, 0,
   0, 0, 0, 0,
+  -0.44244, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0.0805206, 0,
 ]
 output0 = {
     state_out : [
@@ -90,22 +90,22 @@
   0, 0, 0, 0.119996,
   0.542235, 0, 0, 0,
   0, 0, 0, 0,
-  -0.166701, -0.40465, 0, 0,
+  0, -0.166701, -0.40465, 0,
   0, 0, 0, 0,
-  0, -0.44244, -0.706995, 0,
+  0, 0, 0, -0.44244,
+  -0.706995, 0, 0, 0,
   0, 0, 0, 0,
-  0, 0, 0.0805206, 0.137515,
+  0, 0.0805206, 0.137515, 0,
   0, 0, 0, 0,
   0, 0, 0, 0.119996,
   0.542235, 0, 0, 0,
   0, 0, 0, 0,
-  -0.166701, -0.40465, 0, 0,
+  0, -0.166701, -0.40465, 0,
   0, 0, 0, 0,
-  0, -0.44244, -0.706995, 0,
+  0, 0, 0, -0.44244,
+  -0.706995, 0, 0, 0,
   0, 0, 0, 0,
-  0, 0, 0.0805206, 0.137515,
-  0, 0, 0, 0,
-  0, 0, 0, 0,
+  0, 0.0805206, 0.137515, 0,
     ],
     output : [
   0.068281,    -0.162217,  -0.152268, 0.00323521,