Implement IntrinsicBLAS for RS C++ API

Change-Id: I2337340ce9ed43ab49b55b37d349b696bb0679a1
diff --git a/cpp/Android.mk b/cpp/Android.mk
index 0c59f44..fe45db7 100644
--- a/cpp/Android.mk
+++ b/cpp/Android.mk
@@ -28,6 +28,7 @@
 	Script.cpp \
 	ScriptC.cpp \
 	ScriptIntrinsics.cpp \
+	ScriptIntrinsicBLAS.cpp \
 	Sampler.cpp
 
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
diff --git a/cpp/ScriptIntrinsicBLAS.cpp b/cpp/ScriptIntrinsicBLAS.cpp
new file mode 100644
index 0000000..63a8f0c
--- /dev/null
+++ b/cpp/ScriptIntrinsicBLAS.cpp
@@ -0,0 +1,1848 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "RenderScript.h"
+#include "rsCppInternal.h"
+
+using namespace android;
+using namespace RSC;
+
+// ScriptIntrinsicBLAS APIS
+ScriptIntrinsicBLAS::ScriptIntrinsicBLAS(sp<RS> rs, sp<const Element> e)
+    : ScriptIntrinsic(rs, RS_SCRIPT_INTRINSIC_ID_BLAS, e) {
+
+}
+
+sp<ScriptIntrinsicBLAS> ScriptIntrinsicBLAS::create(sp<RS> rs) {
+    return new ScriptIntrinsicBLAS(rs, Element::U32(rs));
+}
+
+enum RsBlasDataType {
+    SINGLE,
+    DOUBLE,
+    SINGLE_COMPLEX,
+    DOUBLE_COMPLEX
+};
+
+static RsBlasCall
+setUpBLASCall(RsBlasDataType dataType, RsBlasFunction func,
+              int TransA, int TransB, int Side, int Uplo, int Diag,
+              int M, int N, int K, int incX, int incY, int KL, int KU,
+              float alphaF, float betaF, double alphaD, double betaD,
+              float alphaCX, float alphaCY, float betaCX, float betaCY,
+              double alphaZX, double alphaZY, double betaZX, double betaZY
+              ) {
+    RsBlasCall call;
+    memset(&call, 0, sizeof(call));
+    call.func = func;
+    call.transA = (RsBlasTranspose)TransA;
+    call.transB = (RsBlasTranspose)TransB;
+    call.side = (RsBlasSide)Side;
+    call.uplo = (RsBlasUplo)Uplo;
+    call.diag = (RsBlasDiag)Diag;
+    call.M = M;
+    call.N = N;
+    call.K = K;
+
+    switch (dataType) {
+        case SINGLE:
+            // For Single-precision BLAS.
+            call.alpha.f = alphaF;
+            call.beta.f = betaF;
+            break;
+        case DOUBLE:
+            // For Double-precision BLAS.
+            call.alpha.d = alphaD;
+            call.beta.d = betaD;
+            break;
+        case SINGLE_COMPLEX:
+            // For Single-precision complex BLAS.
+            call.alpha.c.r = alphaCX;
+            call.alpha.c.i = alphaCY;
+            call.beta.c.r = betaCX;
+            call.beta.c.i = betaCY;
+            break;
+        case DOUBLE_COMPLEX:
+            // For Double-precision complex BLAS.
+            call.alpha.z.r = alphaZX;
+            call.alpha.z.i = alphaZY;
+            call.beta.z.r = betaZX;
+            call.beta.z.i = betaZY;
+            break;
+        default:
+            break;
+    }
+
+    call.incX = incX;
+    call.incY = incY;
+    call.KL = KL;
+    call.KU = KU;
+
+    return call;
+}
+
+static void
+nScriptIntrinsicBLAS_Single(RS* mRS, RsContext con, RsScript id, RsBlasFunction func, int TransA,
+                            int TransB, int Side, int Uplo, int Diag, int M, int N, int K,
+                            float alpha, RsAllocation A, RsAllocation B,
+                            float beta, RsAllocation C, int incX, int incY, int KL, int KU) {
+    RsBlasCall call = setUpBLASCall(SINGLE, func, TransA, TransB, Side, Uplo, Diag,
+                                    M, N, K, incX, incY, KL, KU, alpha, beta, 0.0, 0.0,
+                                    0.0f, 0.0f, 0.0f, 0.0f, 0.0, 0.0, 0.0, 0.0);
+    RsAllocation in_allocs[3] = {A, B, C};
+    tryDispatch(mRS, RS::dispatch->ScriptForEachMulti(con, id, 0, in_allocs, sizeof(in_allocs), nullptr,
+                                                      &call, sizeof(call), nullptr, 0));
+}
+
+
+static void
+nScriptIntrinsicBLAS_Double(RS* mRS, RsContext con, RsScript id, RsBlasFunction func, int TransA,
+                            int TransB, int Side, int Uplo, int Diag, int M, int N, int K,
+                            double alpha, RsAllocation A, RsAllocation B,
+                            double beta, RsAllocation C, int incX, int incY, int KL, int KU) {
+    RsBlasCall call = setUpBLASCall(DOUBLE, func, TransA, TransB, Side, Uplo, Diag,
+                                    M, N, K, incX, incY, KL, KU, 0.0f, 0.0f, alpha, beta,
+                                    0.0f, 0.0f, 0.0f, 0.0f, 0.0, 0.0, 0.0, 0.0);
+    RsAllocation in_allocs[3] = {A, B, C};
+    tryDispatch(mRS, RS::dispatch->ScriptForEachMulti(con, id, 0, in_allocs, sizeof(in_allocs), nullptr,
+                                                      &call, sizeof(call), nullptr, 0));
+}
+
+static void
+nScriptIntrinsicBLAS_Complex(RS* mRS, RsContext con, RsScript id, RsBlasFunction func, int TransA,
+                             int TransB, int Side, int Uplo, int Diag, int M, int N, int K,
+                             float alphaX, float alphaY, RsAllocation A, RsAllocation B,
+                             float betaX, float betaY, RsAllocation C, int incX, int incY, int KL, int KU) {
+    RsBlasCall call = setUpBLASCall(SINGLE_COMPLEX, func, TransA, TransB, Side, Uplo, Diag,
+                                    M, N, K, incX, incY, KL, KU, 0.0f, 0.0f, 0.0, 0.0,
+                                    alphaX, alphaY, betaX, betaY, 0.0, 0.0, 0.0, 0.0);
+    RsAllocation in_allocs[3] = {A, B, C};
+    tryDispatch(mRS, RS::dispatch->ScriptForEachMulti(con, id, 0, in_allocs, sizeof(in_allocs), nullptr,
+                                                      &call, sizeof(call), nullptr, 0));
+}
+
+static void
+nScriptIntrinsicBLAS_Z(RS* mRS, RsContext con, RsScript id, RsBlasFunction func, int TransA,
+                       int TransB, int Side, int Uplo, int Diag, int M, int N, int K,
+                       double alphaX, double alphaY, RsAllocation A, RsAllocation B,
+                       double betaX, double betaY, RsAllocation C, int incX, int incY, int KL, int KU) {
+    RsBlasCall call = setUpBLASCall(DOUBLE_COMPLEX, func, TransA, TransB, Side, Uplo, Diag,
+                                    M, N, K, incX, incY, KL, KU, 0.0f, 0.0f, 0.0, 0.0,
+                                    0.0f, 0.0f, 0.0f, 0.0f, alphaX, alphaY, betaX, betaY);
+    RsAllocation in_allocs[3] = {A, B, C};
+    tryDispatch(mRS, RS::dispatch->ScriptForEachMulti(con, id, 0, in_allocs, sizeof(in_allocs), nullptr,
+                                                      &call, sizeof(call), nullptr, 0));
+}
+
+
+static void
+nScriptIntrinsicBLAS_BNNM(RS* mRS, RsContext con, RsScript id, int M, int N, int K,
+                          RsAllocation A, int a_offset, RsAllocation B, int b_offset,
+                          RsAllocation C, int c_offset, int c_mult_int) {
+    RsBlasCall call;
+    memset(&call, 0, sizeof(call));
+    call.func = RsBlas_bnnm;
+    call.M = M;
+    call.N = N;
+    call.K = K;
+    call.a_offset = a_offset & 0xFF;
+    call.b_offset = b_offset & 0xFF;
+    call.c_offset = c_offset;
+    call.c_mult_int = c_mult_int;
+
+    RsAllocation in_allocs[3] = {A, B, C};
+    tryDispatch(mRS, RS::dispatch->ScriptForEachMulti(con, id, 0, in_allocs, sizeof(in_allocs), nullptr,
+                                                      &call, sizeof(call), nullptr, 0));
+}
+
+/**
+ * Level 2 BLAS
+ */
+static void validateGEMV(RS* mRS, sp<const Element> e, RsBlasTranspose TransA, sp<Allocation> A,
+                         sp<Allocation> X, int incX, sp<Allocation> Y, int incY) {
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = -1, expectedYDim = -1;
+    if (TransA == RsBlasNoTrans) {
+        expectedXDim = 1 + (N - 1) * incX;
+        expectedYDim = 1 + (M - 1) * incY;
+    } else {
+        expectedXDim = 1 + (M - 1) * incX;
+        expectedYDim = 1 + (N - 1) * incY;
+    }
+    if ((int)X->getType()->getX() != expectedXDim ||
+        (int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for GEMV");
+    }
+}
+
+void ScriptIntrinsicBLAS::SGEMV(RsBlasTranspose TransA, float alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, float beta, sp<Allocation> Y, int incY) {
+    validateGEMV(mRS, Element::F32(mRS), TransA, A, X, incX, Y, incY);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sgemv,
+                                TransA, 0, 0, 0, 0, M, N, 0,
+                                alpha, A->getID(), X->getID(),
+                                beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DGEMV(RsBlasTranspose TransA, double alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, double beta, sp<Allocation> Y, int incY) {
+    validateGEMV(mRS, Element::F64(mRS), TransA, A, X, incX, Y, incY);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dgemv,
+                                TransA, 0, 0, 0, 0, M, N, 0,
+                                alpha, A->getID(), X->getID(),
+                                beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CGEMV(RsBlasTranspose TransA, Float2 alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, Float2 beta, sp<Allocation> Y, int incY) {
+    validateGEMV(mRS, Element::F32_2(mRS), TransA, A, X, incX, Y, incY);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cgemv,
+                                 TransA, 0, 0, 0, 0, M, N, 0,
+                                 alpha.x, alpha.y, A->getID(), X->getID(),
+                                 beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZGEMV(RsBlasTranspose TransA, Double2 alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, Double2 beta, sp<Allocation> Y, int incY) {
+    validateGEMV(mRS, Element::F64_2(mRS), TransA, A, X, incX, Y, incY);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zgemv,
+                           TransA, 0, 0, 0, 0, M, N, 0,
+                           alpha.x, alpha.y, A->getID(), X->getID(),
+                           beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SGBMV(RsBlasTranspose TransA, int KL, int KU, float alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, float beta, sp<Allocation> Y, int incY) {
+    // GBMV has the same validation requirements as GEMV + KL and KU >= 0
+    validateGEMV(mRS, Element::F32(mRS), TransA, A, X, incX, Y, incY);
+    if (KL < 0 || KU < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "KL and KU must be greater than or equal to 0");
+    }
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sgbmv,
+                                TransA, 0, 0, 0, 0, M, N, 0,
+                                alpha, A->getID(), X->getID(),
+                                beta, Y->getID(), incX, incY, KL, KU);
+}
+
+void ScriptIntrinsicBLAS::DGBMV(RsBlasTranspose TransA, int KL, int KU, double alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, double beta, sp<Allocation> Y, int incY) {
+    // GBMV has the same validation requirements as GEMV + KL and KU >= 0
+    validateGEMV(mRS, Element::F64(mRS), TransA, A, X, incX, Y, incY);
+    if (KL < 0 || KU < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "KL and KU must be greater than or equal to 0");
+    }
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dgbmv,
+                                TransA, 0, 0, 0, 0, M, N, 0,
+                                alpha, A->getID(), X->getID(),
+                                beta, Y->getID(), incX, incY, KL, KU);
+}
+
+void ScriptIntrinsicBLAS::CGBMV(RsBlasTranspose TransA, int KL, int KU, Float2 alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, Float2 beta, sp<Allocation> Y, int incY) {
+    // GBMV has the same validation requirements as GEMV + KL and KU >= 0
+    validateGEMV(mRS, Element::F32_2(mRS), TransA, A, X, incX, Y, incY);
+    if (KL < 0 || KU < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "KL and KU must be greater than or equal to 0");
+    }
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cgbmv,
+                                 TransA, 0, 0, 0, 0, M, N, 0,
+                                 alpha.x, alpha.y, A->getID(), X->getID(),
+                                 beta.x, beta.y, Y->getID(), incX, incY, KL, KU);
+}
+
+void ScriptIntrinsicBLAS::ZGBMV(RsBlasTranspose TransA, int KL, int KU, Double2 alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, Double2 beta, sp<Allocation> Y, int incY) {
+    // GBMV has the same validation requirements as GEMV + KL and KU >= 0
+    validateGEMV(mRS, Element::F64_2(mRS), TransA, A, X, incX, Y, incY);
+    if (KL < 0 || KU < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "KL and KU must be greater than or equal to 0");
+    }
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zgbmv,
+                           TransA, 0, 0, 0, 0, M, N, 0,
+                           alpha.x, alpha.y, A->getID(), X->getID(),
+                           beta.x, beta.y, Y->getID(), incX, incY, KL, KU);
+}
+
+static void validateTRMV(RS* mRS, sp<const Element> e, RsBlasUplo Uplo, RsBlasTranspose TransA,
+                         RsBlasDiag Diag, sp<Allocation> A, sp<Allocation> X, int incX) {
+    int N = A->getType()->getY();
+    if ((int)A->getType()->getX() != N) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "A must be a square matrix for TRMV");
+    }
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (incX <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for TRMV");
+    }
+}
+
+static int validateTPMV(RS* mRS, sp<const Element> e,  RsBlasUplo Uplo, RsBlasTranspose TransA,
+                        RsBlasDiag Diag, sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    if (!Ap->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (Ap->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Ap must have a Y dimension of 0 or 1");
+    }
+
+    int N = sqrt((double)Ap->getType()->getX() * 2);
+    if ((int)Ap->getType()->getX() != ((N * (N+1)) / 2)) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid dimension for Ap");
+    }
+    if (incX <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for TPMV");
+    }
+
+    return N;
+}
+
+
+void ScriptIntrinsicBLAS::STRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    validateTRMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_strmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    validateTRMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtrmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    validateTRMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctrmv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                                 A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    validateTRMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztrmv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                           A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::STBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBMV has the same requirements as TRMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    validateTRMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_stbmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, K, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBMV has the same requirements as TRMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    validateTRMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtbmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, K, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBMV has the same requirements as TRMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    validateTRMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctbmv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0,
+                                 A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBMV has the same requirements as TRMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    validateTRMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztbmv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0,
+                           A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::STPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    int N = validateTPMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_stpmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                Ap->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    int N = validateTPMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtpmv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                Ap->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap,  sp<Allocation> X,  int incX) {
+    int N = validateTPMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctpmv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                                 Ap->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    int N = validateTPMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztpmv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                           Ap->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::STRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TRSV is the same as TRMV
+    validateTRMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_strsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A,  sp<Allocation> X,  int incX) {
+    // TRSV is the same as TRMV
+    validateTRMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtrsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+
+}
+
+void ScriptIntrinsicBLAS::CTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TRSV is the same as TRMV
+    validateTRMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctrsv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                                 A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+
+}
+
+void ScriptIntrinsicBLAS::ZTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TRSV is the same as TRMV
+    validateTRMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztrsv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                           A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+
+}
+
+void ScriptIntrinsicBLAS::STBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBSV is the same as TRMV + K >= 0
+    validateTRMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Number of diagonals must be positive");
+    }
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_stbsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, K, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBSV is the same as TRMV + K >= 0
+    validateTRMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Number of diagonals must be positive");
+    }
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtbsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, K, 0,
+                                A->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBSV is the same as TRMV + K >= 0
+    validateTRMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Number of diagonals must be positive");
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctbsv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, K,
+                                 0, 0, A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                int K, sp<Allocation> A, sp<Allocation> X, int incX) {
+    // TBSV is the same as TRMV + K >= 0
+    validateTRMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
+    int N = A->getType()->getY();
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Number of diagonals must be positive");
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztbsv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0,
+                           A->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::STPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    // TPSV is same as TPMV
+    int N = validateTPMV(mRS, Element::F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_stpsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                Ap->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    // TPSV is same as TPMV
+    int N = validateTPMV(mRS, Element::F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtpsv,
+                                TransA, 0, 0, Uplo, Diag, 0, N, 0, 0,
+                                Ap->getID(), X->getID(), 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    // TPSV is same as TPMV
+    int N = validateTPMV(mRS, Element::F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctpsv,
+                                 TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                                 Ap->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                sp<Allocation> Ap, sp<Allocation> X, int incX) {
+    // TPSV is same as TPMV
+    int N = validateTPMV(mRS, Element::F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztpsv,
+                           TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0,
+                           Ap->getID(), X->getID(), 0, 0, 0, incX, 0, 0, 0);
+}
+
+/**
+ * Level 2, S and D only
+ */
+static int validateSYMV(RS* mRS, sp<const Element> e, RsBlasUplo Uplo, sp<Allocation> A,
+                        sp<Allocation> X, sp<Allocation> Y, int incX, int incY) {
+    int N = A->getType()->getY();
+    if ((int)A->getType()->getX() != N) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "A must be a square matrix for SYMV");
+    }
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e) ) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SYMV");
+    }
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SYMV");
+    }
+    return N;
+}
+static int validateSPMV(RS* mRS, sp<const Element> e, RsBlasUplo Uplo, sp<Allocation> Ap,
+                        sp<Allocation> X, int incX, sp<Allocation> Y, int incY) {
+    if (!Ap->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (Ap->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Ap must have a Y dimension of 0 or 1");
+    }
+
+    int N = sqrt((double)Ap->getType()->getX() * 2);
+    if ((int)Ap->getType()->getX() != ((N * (N+1)) / 2)) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid dimension for Ap");
+    }
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SPMV");
+    }
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SPMV");
+    }
+
+    return N;
+}
+static void validateGER(RS* mRS, sp<const Element> e, sp<Allocation> X, int incX,
+                        sp<Allocation> Y, int incY, sp<Allocation> A) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e) ) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+
+    if (N < 1 || M < 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "M and N must be 1 or greater for GER");
+    }
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (M - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for GER");
+    }
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for GER");
+    }
+
+
+}
+static int validateSYR(RS* mRS, sp<const Element> e, RsBlasUplo Uplo,
+                       sp<Allocation> X, int incX, sp<Allocation> A) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+
+    int N = A->getType()->getX();
+
+    if (X->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+    if (N != (int)A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "A must be a symmetric matrix");
+    }
+    if (incX <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SYR");
+    }
+    return N;
+}
+static int validateSPR(RS* mRS, sp<const Element> e, RsBlasUplo Uplo,
+                       sp<Allocation> X, int incX, sp<Allocation> Ap) {
+    if (!Ap->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (Ap->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Ap must have a Y dimension of 0 or 1");
+    }
+
+    int N = sqrt((double)Ap->getType()->getX() * 2);
+    if ((int)Ap->getType()->getX() != ((N * (N+1)) / 2)) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid dimension for Ap");
+    }
+    if (incX <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SPR");
+    }
+
+    return N;
+}
+
+static int validateSYR2(RS* mRS, sp<const Element> e, RsBlasUplo Uplo, sp<Allocation> X,
+                        int incX, sp<Allocation> Y, int incY, sp<Allocation> A) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    int N = A->getType()->getX();
+
+    if (N != (int)A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "A must be a symmetric matrix");
+    }
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)X->getType()->getX() != expectedXDim || (int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SYR");
+    }
+    return N;
+
+}
+static int validateSPR2(RS* mRS, sp<const Element> e, RsBlasUplo Uplo, sp<Allocation> X,
+                        int incX, sp<Allocation> Y, int incY, sp<Allocation> Ap) {
+    if (!Ap->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    if (Ap->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Ap must have a Y dimension of 0 or 1");
+    }
+
+    int N = sqrt((double)Ap->getType()->getX() * 2);
+    if ((int)Ap->getType()->getX() != ((N * (N+1)) / 2)) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid dimension for Ap");
+    }
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (N - 1) * incX;
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)X->getType()->getX() != expectedXDim || (int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for SPR2");
+    }
+
+    return N;
+}
+
+void ScriptIntrinsicBLAS::SSYMV(RsBlasUplo Uplo, float alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, float beta, sp<Allocation> Y, int incY) {
+    int N = validateSYMV(mRS, Element::F32(mRS), Uplo, A, X, Y, incX, incY);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssymv,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                A->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSBMV(RsBlasUplo Uplo, int K, float alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, float beta, sp<Allocation> Y, int incY) {
+    // SBMV is the same as SYMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    int N = validateSYMV(mRS, Element::F32(mRS), Uplo, A, X, Y, incX, incY);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssbmv,
+                                0, 0, 0, Uplo, 0, 0, N, K, alpha,
+                                A->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSPMV(RsBlasUplo Uplo, float alpha, sp<Allocation> Ap, sp<Allocation> X,
+                                int incX, float beta, sp<Allocation> Y, int incY) {
+    int N = validateSPMV(mRS, Element::F32(mRS), Uplo, Ap, X, incX, Y, incY);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sspmv,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                Ap->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SGER(float alpha, sp<Allocation> X, int incX,
+                               sp<Allocation> Y, int incY, sp<Allocation> A) {
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    validateGER(mRS, Element::F32(mRS), X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sger,
+                                0, 0, 0, 0, 0, M, N, 0, alpha,
+                                X->getID(), Y->getID(), 0.f, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSYR(RsBlasUplo Uplo, float alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> A) {
+    int N = validateSYR(mRS, Element::F32(mRS), Uplo, X, incX, A);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssyr,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), A->getID(), 0.f, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSPR(RsBlasUplo Uplo, float alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> Ap) {
+    int N = validateSPR(mRS, Element::F32(mRS), Uplo, X, incX, Ap);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sspr,
+                                0, 0, 0, Uplo, 0, 0, N, 0,
+                                alpha, X->getID(), Ap->getID(), 0.f, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSYR2(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    int N = validateSYR2(mRS, Element::F32(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssyr2,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), Y->getID(), 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSPR2(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> Ap) {
+    int N = validateSPR2(mRS, Element::F32(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sspr2,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), Y->getID(), 0, Ap->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYMV(RsBlasUplo Uplo, double alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, double beta, sp<Allocation> Y, int incY) {
+    int N = validateSYMV(mRS, Element::F64(mRS), Uplo, A, X, Y, incX, incY);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsymv,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                A->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSBMV(RsBlasUplo Uplo, int K, double alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, double beta, sp<Allocation> Y, int incY) {
+    // SBMV is the same as SYMV + K >= 0
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be greater than or equal to 0");
+    }
+    int N = validateSYMV(mRS, Element::F64(mRS), Uplo, A, X, Y, incX, incY);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsbmv,
+                                0, 0, 0, Uplo, 0, 0, N, K, alpha,
+                                A->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSPMV(RsBlasUplo Uplo, double alpha, sp<Allocation> Ap, sp<Allocation> X,
+                                int incX, double beta, sp<Allocation> Y, int incY) {
+    int N = validateSPMV(mRS, Element::F64(mRS), Uplo, Ap, X, incX, Y, incY);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dspmv,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                Ap->getID(), X->getID(), beta, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DGER(double alpha, sp<Allocation> X, int incX, sp<Allocation> Y,
+                               int incY, sp<Allocation> A) {
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    validateGER(mRS, Element::F64(mRS), X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dger,
+                                0, 0, 0, 0, 0, M, N, 0, alpha,
+                                X->getID(), Y->getID(), 0.f, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYR(RsBlasUplo Uplo, double alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> A) {
+    int N = validateSYR(mRS, Element::F64(mRS), Uplo, X, incX, A);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsyr,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), A->getID(), 0.f, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSPR(RsBlasUplo Uplo, double alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> Ap) {
+    int N = validateSPR(mRS, Element::F64(mRS), Uplo, X, incX, Ap);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dspr,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), Ap->getID(), 0.f, 0, incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYR2(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    int N = validateSYR2(mRS, Element::F64(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsyr2,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), Y->getID(), 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSPR2(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> Ap) {
+    int N = validateSPR2(mRS, Element::F64(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dspr2,
+                                0, 0, 0, Uplo, 0, 0, N, 0, alpha,
+                                X->getID(), Y->getID(), 0, Ap->getID(), incX, incY, 0, 0);
+}
+
+
+/**
+ * Level 2, C and Z only
+ */
+
+static void validateGERU(RS* mRS, sp<const Element> e, sp<Allocation> X, int incX,
+                         sp<Allocation> Y, int incY, sp<Allocation> A) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !X->getType()->getElement()->isCompatible(e) ||
+        !Y->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (X->getType()->getY() > 1 || Y->getType()->getY() > 1) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "BLAS vectors must have Y dimension of 0 or 1");
+    }
+
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    if (incX <= 0 || incY <= 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Vector increments must be greater than 0");
+    }
+    int expectedXDim = 1 + (M - 1) * incX;
+    if ((int)X->getType()->getX() != expectedXDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for GERU");
+    }
+    int expectedYDim = 1 + (N - 1) * incY;
+    if ((int)Y->getType()->getX() != expectedYDim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Incorrect vector dimensions for GERU");
+    }
+
+}
+
+void ScriptIntrinsicBLAS::CHEMV(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, Float2 beta, sp<Allocation> Y, int incY) {
+    // HEMV is the same as SYR2 validation-wise
+    int N = validateSYR2(mRS, Element::F32_2(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chemv,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha.x, alpha.y, A->getID(), X->getID(),
+                                 beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHBMV(RsBlasUplo Uplo, int K, Float2 alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, Float2 beta, sp<Allocation> Y, int incY) {
+    // HBMV is the same as SYR2 validation-wise
+    int N = validateSYR2(mRS, Element::F32_2(mRS), Uplo, X, incX, Y, incY, A);
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be 0 or greater for HBMV");
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chbmv,
+                                 0, 0, 0, Uplo, 0, 0, N, K,
+                                 alpha.x, alpha.y, A->getID(), X->getID(),
+                                 beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHPMV(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> Ap,
+                                sp<Allocation> X, int incX, Float2 beta, sp<Allocation> Y, int incY) {
+    // HPMV is the same as SPR2
+    int N = validateSPR2(mRS, Element::F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chpmv,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha.x, alpha.y, Ap->getID(), X->getID(),
+                                 beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CGERU(Float2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    validateGERU(mRS, Element::F32_2(mRS), X, incX, Y, incY, A);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cgeru,
+                                 0, 0, 0, 0, 0, M, N, 0,
+                                 alpha.x, alpha.y, X->getID(), Y->getID(),
+                                 0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CGERC(Float2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    // Same as GERU
+    validateGERU(mRS, Element::F32_2(mRS), X, incX, Y, incY, A);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cgerc,
+                                 0, 0, 0, 0, 0, M, N, 0,
+                                 alpha.x, alpha.y, X->getID(), Y->getID(),
+                                 0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHER(RsBlasUplo Uplo, float alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> A) {
+    // Same as SYR
+    int N = validateSYR(mRS, Element::F32_2(mRS), Uplo, X, incX, A);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cher,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha, 0, X->getID(), 0,
+                                 0, 0, A->getID(), incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHPR(RsBlasUplo Uplo, float alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> Ap) {
+    // Equivalent to SPR for validation
+    int N = validateSPR(mRS, Element::F32_2(mRS), Uplo, X, incX, Ap);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chpr,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha, 0, X->getID(), 0,
+                                 0, 0, Ap->getID(), incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHER2(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    // Same as SYR2
+    int N = validateSYR2(mRS, Element::F32_2(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cher2,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha.x, alpha.y, X->getID(), Y->getID(),
+                                 0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CHPR2(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> Ap) {
+    // Same as SPR2
+    int N = validateSPR2(mRS, Element::F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chpr2,
+                                 0, 0, 0, Uplo, 0, 0, N, 0,
+                                 alpha.x, alpha.y, X->getID(), Y->getID(),
+                                 0, 0, Ap->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHEMV(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> A,
+                                sp<Allocation> X, int incX, Double2 beta, sp<Allocation> Y, int incY) {
+    // HEMV is the same as SYR2 validation-wise
+    int N = validateSYR2(mRS, Element::F64_2(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhemv,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha.x, alpha.y, A->getID(), X->getID(),
+                           beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHBMV(RsBlasUplo Uplo, int K, Double2 alpha, sp<Allocation> A, sp<Allocation> X,
+                                int incX, Double2 beta, sp<Allocation> Y, int incY) {
+    // HBMV is the same as SYR2 validation-wise
+    int N = validateSYR2(mRS, Element::F64_2(mRS), Uplo, X, incX, Y, incY, A);
+    if (K < 0) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "K must be 0 or greater for HBMV");
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhbmv,
+                           0, 0, 0, Uplo, 0, 0, N, K,
+                           alpha.x, alpha.y, A->getID(), X->getID(),
+                           beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHPMV(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> Ap, sp<Allocation> X,
+                                int incX, Double2 beta, sp<Allocation> Y, int incY) {
+    // HPMV is the same as SPR2
+    int N = validateSPR2(mRS, Element::F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhpmv,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha.x, alpha.y, Ap->getID(), X->getID(),
+                           beta.x, beta.y, Y->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZGERU(Double2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    validateGERU(mRS, Element::F64_2(mRS), X, incX, Y, incY, A);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zgeru,
+                           0, 0, 0, 0, 0, M, N, 0,
+                           alpha.x, alpha.y, X->getID(), Y->getID(),
+                           0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZGERC(Double2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    // Same as GERU
+    validateGERU(mRS, Element::F64_2(mRS), X, incX, Y, incY, A);
+    int M = A->getType()->getY();
+    int N = A->getType()->getX();
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zgerc,
+                           0, 0, 0, 0, 0, M, N, 0,
+                           alpha.x, alpha.y, X->getID(), Y->getID(),
+                           0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHER(RsBlasUplo Uplo, double alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> A) {
+    // Same as SYR
+    int N = validateSYR(mRS, Element::F64_2(mRS), Uplo, X, incX, A);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zher,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha, 0, X->getID(), 0,
+                           0, 0, A->getID(), incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHPR(RsBlasUplo Uplo, double alpha, sp<Allocation> X,
+                               int incX, sp<Allocation> Ap) {
+    // Equivalent to SPR for validation
+    int N = validateSPR(mRS, Element::F64_2(mRS), Uplo, X, incX, Ap);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhpr,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha, 0, X->getID(), 0,
+                           0, 0, Ap->getID(), incX, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHER2(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> A) {
+    // Same as SYR2
+    int N = validateSYR2(mRS, Element::F64_2(mRS), Uplo, X, incX, Y, incY, A);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zher2,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha.x, alpha.y, X->getID(), Y->getID(),
+                           0, 0, A->getID(), incX, incY, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHPR2(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> X, int incX,
+                                sp<Allocation> Y, int incY, sp<Allocation> Ap) {
+    // Same as SPR2
+    int N = validateSPR2(mRS, Element::F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhpr2,
+                           0, 0, 0, Uplo, 0, 0, N, 0,
+                           alpha.x, alpha.y, X->getID(), Y->getID(),
+                           0, 0, Ap->getID(), incX, incY, 0, 0);
+}
+
+
+/**
+ * Level 3 BLAS
+ */
+
+static void validateL3(RS* mRS, sp<const Element> e, int TransA, int TransB, int Side,
+                       sp<Allocation> A, sp<Allocation> B, sp<Allocation> C) {
+    int aM = -1, aN = -1, bM = -1, bN = -1, cM = -1, cN = -1;
+    if ((A != nullptr && !A->getType()->getElement()->isCompatible(e)) ||
+        (B != nullptr && !B->getType()->getElement()->isCompatible(e)) ||
+        (C != nullptr && !C->getType()->getElement()->isCompatible(e))) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (C == nullptr) {
+        // Since matrix C is used to store the result, it cannot be null.
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Allocation C cannot be null");
+    }
+    cM = C->getType()->getY();
+    cN = C->getType()->getX();
+
+    if (Side == RsBlasRight) {
+        if ((A == nullptr && B != nullptr) || (A != nullptr && B == nullptr)) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Provided Matrix A without Matrix B, or vice versa");
+        }
+        if (B != nullptr) {
+            bM = A->getType()->getY();
+            bN = A->getType()->getX();
+        }
+        if (A != nullptr) {
+            aM = B->getType()->getY();
+            aN = B->getType()->getX();
+        }
+    } else {
+        if (A != nullptr) {
+            if (TransA == RsBlasTrans || TransA == RsBlasConjTrans) {
+                aN = A->getType()->getY();
+                aM = A->getType()->getX();
+            } else {
+                aM = A->getType()->getY();
+                aN = A->getType()->getX();
+            }
+        }
+        if (B != nullptr) {
+            if (TransB == RsBlasTrans || TransB == RsBlasConjTrans) {
+                bN = B->getType()->getY();
+                bM = B->getType()->getX();
+            } else {
+                bM = B->getType()->getY();
+                bN = B->getType()->getX();
+            }
+        }
+    }
+    if (A != nullptr && B != nullptr && C != nullptr) {
+        if (aN != bM || aM != cM || bN != cN) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called BLAS with invalid dimensions");
+        }
+    } else if (A != nullptr && C != nullptr) {
+        // A and C only, for SYRK
+        if (cM != cN) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Matrix C is not symmetric");
+        }
+        if (aM != cM) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called BLAS with invalid dimensions");
+        }
+    } else if (A != nullptr && B != nullptr) {
+        // A and B only
+        if (aN != bM) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called BLAS with invalid dimensions");
+        }
+    }
+
+}
+
+void ScriptIntrinsicBLAS::SGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, float alpha,
+                                sp<Allocation> A, sp<Allocation> B, float beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F32(mRS), TransA, TransB, 0, A, B, C);
+
+    int M = -1, N = -1, K = -1;
+    if (TransA != RsBlasNoTrans) {
+        M = A->getType()->getX();
+        K = A->getType()->getY();
+    } else {
+        M = A->getType()->getY();
+        K = A->getType()->getX();
+    }
+    if (TransB != RsBlasNoTrans) {
+        N = B->getType()->getY();
+    } else {
+        N = B->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_sgemm,
+                                TransA, TransB, 0, 0, 0, M, N, K,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, double alpha,
+                                sp<Allocation> A, sp<Allocation> B, double beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F64(mRS), TransA, TransB, 0, A, B, C);
+    int M = -1, N = -1, K = -1;
+    if (TransA != RsBlasNoTrans) {
+        M = A->getType()->getX();
+        K = A->getType()->getY();
+    } else {
+        M = A->getType()->getY();
+        K = A->getType()->getX();
+    }
+    if (TransB != RsBlasNoTrans) {
+        N = B->getType()->getY();
+    } else {
+        N = B->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dgemm,
+                                TransA, TransB, 0, 0, 0, M, N, K,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, Float2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Float2 beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F32_2(mRS), TransA, TransB, 0, A, B, C);
+    int M = -1, N = -1, K = -1;
+    if (TransA != RsBlasNoTrans) {
+        M = A->getType()->getX();
+        K = A->getType()->getY();
+    } else {
+        M = A->getType()->getY();
+        K = A->getType()->getX();
+    }
+    if (TransB != RsBlasNoTrans) {
+        N = B->getType()->getY();
+    } else {
+        N = B->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cgemm,
+                                 TransA, TransB, 0, 0, 0, M, N, K,
+                                 alpha.x, alpha.y, A->getID(), B->getID(),
+                                 beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, Double2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Double2 beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F64_2(mRS), TransA, TransB, 0, A, B, C);
+    int M = -1, N = -1, K = -1;
+    if (TransA != RsBlasNoTrans) {
+        M = A->getType()->getX();
+        K = A->getType()->getY();
+    } else {
+        M = A->getType()->getY();
+        K = A->getType()->getX();
+    }
+    if (TransB != RsBlasNoTrans) {
+        N = B->getType()->getY();
+    } else {
+        N = B->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zgemm,
+                           TransA, TransB, 0, 0, 0, M, N, K,
+                           alpha.x, alpha.y, A->getID(), B->getID(),
+                           beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSYMM(RsBlasSide Side, RsBlasUplo Uplo, float alpha,
+                                sp<Allocation> A, sp<Allocation> B, float beta, sp<Allocation> C) {
+    //For SYMM, Matrix A should be symmetric
+    if (A->getType()->getX() != A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Matrix A is not symmetric");
+    }
+    validateL3(mRS, Element::F32(mRS), 0, 0, Side, A, B, C);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssymm,
+                                0, 0, Side, Uplo, 0, C->getType()->getY(), C->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYMM(RsBlasSide Side, RsBlasUplo Uplo, double alpha,
+                                sp<Allocation> A, sp<Allocation> B, double beta, sp<Allocation> C) {
+    if (A->getType()->getX() != A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Matrix A is not symmetric");
+    }
+    validateL3(mRS, Element::F64(mRS), 0, 0, Side, A, B, C);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsymm,
+                                0, 0, Side, Uplo, 0, C->getType()->getY(), C->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CSYMM(RsBlasSide Side, RsBlasUplo Uplo, Float2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Float2 beta, sp<Allocation> C) {
+    if (A->getType()->getX() != A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Matrix A is not symmetric");
+    }
+    validateL3(mRS, Element::F32_2(mRS), 0, 0, Side, A, B, C);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_csymm,
+                                 0, 0, Side, Uplo, 0, C->getType()->getY(), C->getType()->getX(), 0,
+                                 alpha.x, alpha.y, A->getID(), B->getID(),
+                                 beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZSYMM(RsBlasSide Side, RsBlasUplo Uplo, Double2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Double2 beta, sp<Allocation> C) {
+    if (A->getType()->getX() != A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Matrix A is not symmetric");
+    }
+    validateL3(mRS, Element::F64_2(mRS), 0, 0, Side, A, B, C);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zsymm,
+                           0, 0, Side, Uplo, 0, C->getType()->getY(), C->getType()->getX(), 0,
+                           alpha.x, alpha.y, A->getID(), B->getID(),
+                           beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::SSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha,
+                                sp<Allocation> A, float beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F32(mRS), Trans, 0, 0, A, nullptr, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssyrk,
+                                Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                alpha, A->getID(), 0,
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha,
+                                sp<Allocation> A, double beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F64(mRS), Trans, 0, 0, A, nullptr, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsyrk,
+                                Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                alpha, A->getID(), 0,
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha,
+                                sp<Allocation> A, Float2 beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F32_2(mRS), Trans, 0, 0, A, nullptr, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_csyrk,
+                                 Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                 alpha.x, alpha.y, A->getID(), 0,
+                                 beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha,
+                                sp<Allocation> A, Double2 beta, sp<Allocation> C) {
+    validateL3(mRS, Element::F64_2(mRS), Trans, 0, 0, A, nullptr, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zsyrk,
+                           Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                           alpha.x, alpha.y, A->getID(), 0,
+                           beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+static void validateSYR2K(RS* mRS, sp<const Element> e, RsBlasTranspose Trans,
+                          sp<Allocation> A, sp<Allocation> B, sp<Allocation> C) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !B->getType()->getElement()->isCompatible(e) ||
+        !C->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    int Cdim = -1;
+    // A is n x k if no transpose, k x n if transpose
+    // C is n x n
+    if (Trans == RsBlasTrans) {
+        // check columns versus C
+        Cdim = A->getType()->getX();
+    } else {
+        // check rows versus C
+        Cdim = A->getType()->getY();
+    }
+    if ((int)C->getType()->getX() != Cdim || (int)C->getType()->getY() != Cdim) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid symmetric matrix in SYR2K");
+    }
+    // A dims == B dims
+    if (A->getType()->getX() != B->getType()->getX() || A->getType()->getY() != B->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid A and B in SYR2K");
+    }
+}
+
+void ScriptIntrinsicBLAS::SSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha,
+                                 sp<Allocation> A, sp<Allocation> B, float beta, sp<Allocation> C) {
+    validateSYR2K(mRS, Element::F32(mRS), Trans, A, B, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_ssyr2k,
+                                Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha,
+                                 sp<Allocation> A, sp<Allocation> B, double beta, sp<Allocation> C) {
+    validateSYR2K(mRS, Element::F64(mRS), Trans, A, B, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dsyr2k,
+                                Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                alpha, A->getID(), B->getID(),
+                                beta, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha,
+                                 sp<Allocation> A, sp<Allocation> B, Float2 beta, sp<Allocation> C) {
+    validateSYR2K(mRS, Element::F32_2(mRS), Trans, A, B, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_csyr2k,
+                                 Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                                 alpha.x, alpha.y, A->getID(), B->getID(),
+                                 beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha,
+                                 sp<Allocation> A, sp<Allocation> B, Double2 beta, sp<Allocation> C) {
+    validateSYR2K(mRS, Element::F64_2(mRS), Trans, A, B, C);
+    int K = -1;
+    if (Trans != RsBlasNoTrans) {
+        K = A->getType()->getY();
+    } else {
+        K = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zsyr2k,
+                           Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), K,
+                           alpha.x, alpha.y, A->getID(), B->getID(),
+                           beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+static void validateTRMM(RS* mRS, sp<const Element> e, RsBlasSide Side, RsBlasTranspose TransA,
+                         sp<Allocation> A, sp<Allocation> B) {
+    int aM = -1, aN = -1, bM = -1, bN = -1;
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !B->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+
+    aM = A->getType()->getY();
+    aN = A->getType()->getX();
+    if (aM != aN) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRMM with a non-symmetric matrix A");
+    }
+
+    bM = B->getType()->getY();
+    bN = B->getType()->getX();
+    if (Side == RsBlasLeft) {
+        if (aN != bM) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRMM with invalid matrices");
+        }
+    } else {
+        if (bN != aM) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRMM with invalid matrices");
+        }
+    }
+}
+
+void ScriptIntrinsicBLAS::STRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                float alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRMM(mRS, Element::F32(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_strmm,
+                                TransA, 0, Side, Uplo, Diag,\
+                                B->getType()->getY(), B->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(), 0.f, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                double alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRMM(mRS, Element::F64(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtrmm,
+                                TransA, 0, Side, Uplo, Diag,
+                                B->getType()->getY(), B->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                Float2 alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRMM(mRS, Element::F32_2(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctrmm,
+                                 TransA, 0, Side, Uplo, Diag,
+                                 B->getType()->getY(), B->getType()->getX(), 0,
+                                 alpha.x, alpha.y, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                Double2 alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRMM(mRS, Element::F64_2(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztrmm,
+                           TransA, 0, Side, Uplo, Diag,
+                           B->getType()->getY(), B->getType()->getX(), 0,
+                           alpha.x, alpha.y, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0, 0);
+}
+
+static void validateTRSM(RS* mRS, sp<const Element> e, RsBlasSide Side, RsBlasTranspose TransA,
+                         sp<Allocation> A, sp<Allocation> B) {
+    int adim = -1, bM = -1, bN = -1;
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !B->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    adim = A->getType()->getX();
+    if (adim != (int)A->getType()->getY()) {
+        // This may be unnecessary, the restriction could potentially be relaxed.
+        // Allocation A needs to contain at least that symmetric matrix but could theoretically
+        // be larger for now we assume adapters are sufficient, will reevaluate in the future.
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRSM with a non-symmetric matrix A");
+    }
+    bM = B->getType()->getY();
+    bN = B->getType()->getX();
+    if (Side == RsBlasLeft) {
+        // A is M*M
+        if (adim != bM) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRSM with invalid matrix dimensions");
+        }
+    } else {
+        // A is N*N
+        if (adim != bN) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called TRSM with invalid matrix dimensions");
+        }
+    }
+}
+
+void ScriptIntrinsicBLAS::STRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                float alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRSM(mRS, Element::F32(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Single(mRS, mRS->getContext(), getID(), RsBlas_strsm,
+                                TransA, 0, Side, Uplo, Diag,
+                                B->getType()->getY(), B->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::DTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                double alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRSM(mRS, Element::F64(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Double(mRS, mRS->getContext(), getID(), RsBlas_dtrsm,
+                                TransA, 0, Side, Uplo, Diag,
+                                B->getType()->getY(), B->getType()->getX(), 0,
+                                alpha, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::CTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                Float2 alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRSM(mRS, Element::F32_2(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_ctrsm,
+                                 TransA, 0, Side, Uplo, Diag,
+                                 B->getType()->getY(), B->getType()->getX(), 0,
+                                 alpha.x, alpha.y, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+                                Double2 alpha, sp<Allocation> A, sp<Allocation> B) {
+    validateTRSM(mRS, Element::F64_2(mRS), Side, TransA, A, B);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_ztrsm,
+                           TransA, 0, Side, Uplo, Diag,
+                           B->getType()->getY(), B->getType()->getX(), 0,
+                           alpha.x, alpha.y, A->getID(), B->getID(), 0, 0, 0, 0, 0, 0, 0);
+}
+
+static void validateHEMM(RS* mRS, sp<const Element> e, RsBlasSide Side,
+                         sp<Allocation> A, sp<Allocation> B, sp<Allocation> C) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !B->getType()->getElement()->isCompatible(e) ||
+        !C->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+
+    // A must be square; can potentially be relaxed similar to TRSM
+    int adim = A->getType()->getX();
+    if (adim != (int)A->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HEMM with non-square A");
+    }
+    if ((Side == RsBlasLeft && adim != (int)B->getType()->getY()) ||
+        (Side == RsBlasRight && adim != (int)B->getType()->getX())) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HEMM with invalid B");
+    }
+    if (B->getType()->getX() != C->getType()->getX() ||
+        B->getType()->getY() != C->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HEMM with mismatched B and C");
+    }
+}
+
+void ScriptIntrinsicBLAS::CHEMM(RsBlasSide Side, RsBlasUplo Uplo, Float2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Float2 beta, sp<Allocation> C) {
+    validateHEMM(mRS, Element::F32_2(mRS), Side, A, B, C);
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_chemm,
+                                 0, 0, Side, Uplo, 0,
+                                 C->getType()->getY(), C->getType()->getX(), 0,
+                                 alpha.x, alpha.y, A->getID(), B->getID(),
+                                 beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHEMM(RsBlasSide Side, RsBlasUplo Uplo, Double2 alpha,
+                                sp<Allocation> A, sp<Allocation> B, Double2 beta, sp<Allocation> C) {
+    validateHEMM(mRS, Element::F64_2(mRS), Side, A, B, C);
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zhemm,
+                           0, 0, Side, Uplo, 0,
+                           C->getType()->getY(), C->getType()->getX(), 0,
+                           alpha.x, alpha.y, A->getID(), B->getID(),
+                           beta.x, beta.y, C->getID(), 0, 0, 0, 0);
+}
+
+static void validateHERK(RS* mRS, sp<const Element> e, RsBlasTranspose Trans,
+                         sp<Allocation> A, sp<Allocation> C) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !C->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (Trans != RsBlasNoTrans && Trans != RsBlasConjTrans) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Call HERK with invalid Transpose");
+    }
+    int cdim = C->getType()->getX();
+    if (cdim != (int)C->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HERK with non-square C");
+    }
+    if (Trans == RsBlasNoTrans) {
+        if (cdim != (int)A->getType()->getY()) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HERK with invalid A");
+        }
+    } else {
+        if (cdim != (int)A->getType()->getX()) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HERK with invalid A");
+        }
+    }
+}
+
+void ScriptIntrinsicBLAS::CHERK(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha,
+                                sp<Allocation> A, float beta, sp<Allocation> C) {
+    validateHERK(mRS, Element::F32_2(mRS), Trans, A, C);
+    int k = 0;
+    if (Trans == RsBlasConjTrans) {
+        k = A->getType()->getY();
+    } else {
+        k = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cherk,
+                                 Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), k,
+                                 alpha, 0, A->getID(), 0,
+                                 beta, 0, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHERK(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha,
+                                sp<Allocation> A, double beta, sp<Allocation> C) {
+    validateHERK(mRS, Element::F64_2(mRS), Trans, A, C);
+    int k = 0;
+    if (Trans == RsBlasConjTrans) {
+        k = A->getType()->getY();
+    } else {
+        k = A->getType()->getX();
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zherk,
+                           Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), k,
+                           alpha, 0, A->getID(), 0,
+                           beta, 0, C->getID(), 0, 0, 0, 0);
+}
+
+static void validateHER2K(RS* mRS, sp<const Element> e, RsBlasTranspose Trans,
+                          sp<Allocation> A, sp<Allocation> B, sp<Allocation> C) {
+    if (!A->getType()->getElement()->isCompatible(e) ||
+        !B->getType()->getElement()->isCompatible(e) ||
+        !C->getType()->getElement()->isCompatible(e)) {
+        mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Called BLAS with wrong Element type");
+    }
+    if (Trans != RsBlasNoTrans && Trans != RsBlasConjTrans) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Call HERK with invalid Transpose");
+    }
+    int cdim = C->getType()->getX();
+    if (cdim != (int)C->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HER2K with non-square C");
+    }
+    if (Trans == RsBlasNoTrans) {
+        if ((int)A->getType()->getY() != cdim) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HER2K with invalid matrices");
+        }
+    } else {
+        if ((int)A->getType()->getX() != cdim) {
+            mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HER2K with invalid matrices");
+        }
+    }
+    if (A->getType()->getX() != B->getType()->getX() || A->getType()->getY() != B->getType()->getY()) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Called HER2K with invalid A and B matrices");
+    }
+}
+
+void ScriptIntrinsicBLAS::CHER2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha,
+                                 sp<Allocation> A, sp<Allocation> B, float beta, sp<Allocation> C) {
+    validateHER2K(mRS, Element::F32_2(mRS), Trans, A, B, C);
+    int k = 0;
+    if (Trans == RsBlasNoTrans) {
+        k = A->getType()->getX();
+    } else {
+        k = A->getType()->getY();
+    }
+    nScriptIntrinsicBLAS_Complex(mRS, mRS->getContext(), getID(), RsBlas_cher2k,
+                                 Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), k,
+                                 alpha.x, alpha.y, A->getID(), B->getID(),
+                                 beta, 0, C->getID(), 0, 0, 0, 0);
+}
+
+void ScriptIntrinsicBLAS::ZHER2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha,
+                                 sp<Allocation> A, sp<Allocation> B, double beta, sp<Allocation> C) {
+    validateHER2K(mRS, Element::F64_2(mRS), Trans, A, B, C);
+    int k = 0;
+    if (Trans == RsBlasNoTrans) {
+        k = A->getType()->getX();
+    } else {
+        k = A->getType()->getY();
+    }
+    nScriptIntrinsicBLAS_Z(mRS, mRS->getContext(), getID(), RsBlas_zher2k,
+                           Trans, 0, 0, Uplo, 0, 0, C->getType()->getX(), k,
+                           alpha.x, alpha.y, A->getID(), B->getID(),
+                           beta, 0, C->getID(), 0, 0, 0, 0);
+}
+
+
+
+void ScriptIntrinsicBLAS::BNNM(sp<Allocation> A, int a_offset, sp<Allocation> B, int b_offset,
+                               sp<Allocation> C, int c_offset, int c_mult) {
+    validateL3(mRS, Element::U8(mRS), RsBlasNoTrans, RsBlasTrans, 0, A, B, C);
+
+    if (a_offset < 0 || a_offset > 255) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid a_offset passed to BNNM");
+    }
+    if (b_offset < 0 || b_offset > 255) {
+        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Invalid b_offset passed to BNNM");
+    }
+    int M = -1, N = -1, K = -1;
+    M = A->getType()->getY();
+    N = B->getType()->getY();
+    K = A->getType()->getX();
+
+    nScriptIntrinsicBLAS_BNNM(mRS, mRS->getContext(), getID(), M, N, K, A->getID(), a_offset,
+                              B->getID(), b_offset, C->getID(), c_offset, c_mult);
+}
diff --git a/cpp/ScriptIntrinsics.cpp b/cpp/ScriptIntrinsics.cpp
index 54ce465..f159d11 100644
--- a/cpp/ScriptIntrinsics.cpp
+++ b/cpp/ScriptIntrinsics.cpp
@@ -635,4 +635,4 @@
     }
 
     Script::forEach(0, nullptr, out, nullptr, 0);
-}
+}
\ No newline at end of file
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index fd531f1..03ef3d5 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -86,6 +86,277 @@
      RS_INIT_MAX = 32
  };
 
+
+class Byte2 {
+ public:
+  int8_t x, y;
+
+  Byte2(int8_t initX, int8_t initY)
+    : x(initX), y(initY) {}
+  Byte2() : x(0), y(0) {}
+};
+
+class Byte3 {
+ public:
+  int8_t x, y, z;
+
+  Byte3(int8_t initX, int8_t initY, int8_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Byte3() : x(0), y(0), z(0) {}
+};
+
+class Byte4 {
+ public:
+  int8_t x, y, z, w;
+
+  Byte4(int8_t initX, int8_t initY, int8_t initZ, int8_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Byte4() : x(0), y(0), z(0), w(0) {}
+};
+
+class UByte2 {
+ public:
+  uint8_t x, y;
+
+  UByte2(uint8_t initX, uint8_t initY)
+    : x(initX), y(initY) {}
+  UByte2() : x(0), y(0) {}
+};
+
+class UByte3 {
+ public:
+  uint8_t x, y, z;
+
+  UByte3(uint8_t initX, uint8_t initY, uint8_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  UByte3() : x(0), y(0), z(0) {}
+};
+
+class UByte4 {
+ public:
+  uint8_t x, y, z, w;
+
+  UByte4(uint8_t initX, uint8_t initY, uint8_t initZ, uint8_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  UByte4() : x(0), y(0), z(0), w(0) {}
+};
+
+class Short2 {
+ public:
+  short x, y;
+
+  Short2(short initX, short initY)
+    : x(initX), y(initY) {}
+  Short2() : x(0), y(0) {}
+};
+
+class Short3 {
+ public:
+  short x, y, z;
+
+  Short3(short initX, short initY, short initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Short3() : x(0), y(0), z(0) {}
+};
+
+class Short4 {
+ public:
+  short x, y, z, w;
+
+  Short4(short initX, short initY, short initZ, short initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Short4() : x(0), y(0), z(0), w(0) {}
+};
+
+class UShort2 {
+ public:
+  uint16_t x, y;
+
+  UShort2(uint16_t initX, uint16_t initY)
+    : x(initX), y(initY) {}
+  UShort2() : x(0), y(0) {}
+};
+
+class UShort3 {
+ public:
+  uint16_t x, y, z;
+
+  UShort3(uint16_t initX, uint16_t initY, uint16_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  UShort3() : x(0), y(0), z(0) {}
+};
+
+class UShort4 {
+ public:
+  uint16_t x, y, z, w;
+
+  UShort4(uint16_t initX, uint16_t initY, uint16_t initZ, uint16_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  UShort4() : x(0), y(0), z(0), w(0) {}
+};
+
+class Int2 {
+ public:
+  int x, y;
+
+  Int2(int initX, int initY)
+    : x(initX), y(initY) {}
+  Int2() : x(0), y(0) {}
+};
+
+class Int3 {
+ public:
+  int x, y, z;
+
+  Int3(int initX, int initY, int initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Int3() : x(0), y(0), z(0) {}
+};
+
+class Int4 {
+ public:
+  int x, y, z, w;
+
+  Int4(int initX, int initY, int initZ, int initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Int4() : x(0), y(0), z(0), w(0) {}
+};
+
+class UInt2 {
+ public:
+  uint32_t x, y;
+
+  UInt2(uint32_t initX, uint32_t initY)
+    : x(initX), y(initY) {}
+  UInt2() : x(0), y(0) {}
+};
+
+class UInt3 {
+ public:
+  uint32_t x, y, z;
+
+  UInt3(uint32_t initX, uint32_t initY, uint32_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  UInt3() : x(0), y(0), z(0) {}
+};
+
+class UInt4 {
+ public:
+  uint32_t x, y, z, w;
+
+  UInt4(uint32_t initX, uint32_t initY, uint32_t initZ, uint32_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  UInt4() : x(0), y(0), z(0), w(0) {}
+};
+
+class Long2 {
+ public:
+  int64_t x, y;
+
+  Long2(int64_t initX, int64_t initY)
+    : x(initX), y(initY) {}
+  Long2() : x(0), y(0) {}
+};
+
+class Long3 {
+ public:
+  int64_t x, y, z;
+
+  Long3(int64_t initX, int64_t initY, int64_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Long3() : x(0), y(0), z(0) {}
+};
+
+class Long4 {
+ public:
+  int64_t x, y, z, w;
+
+  Long4(int64_t initX, int64_t initY, int64_t initZ, int64_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Long4() : x(0), y(0), z(0), w(0) {}
+};
+
+class ULong2 {
+ public:
+  uint64_t x, y;
+
+  ULong2(uint64_t initX, uint64_t initY)
+    : x(initX), y(initY) {}
+  ULong2() : x(0), y(0) {}
+};
+
+class ULong3 {
+ public:
+  uint64_t x, y, z;
+
+  ULong3(uint64_t initX, uint64_t initY, uint64_t initZ)
+    : x(initX), y(initY), z(initZ) {}
+  ULong3() : x(0), y(0), z(0) {}
+};
+
+class ULong4 {
+ public:
+  uint64_t x, y, z, w;
+
+  ULong4(uint64_t initX, uint64_t initY, uint64_t initZ, uint64_t initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  ULong4() : x(0), y(0), z(0), w(0) {}
+};
+
+class Float2 {
+ public:
+  float x, y;
+
+  Float2(float initX, float initY)
+    : x(initX), y(initY) {}
+  Float2() : x(0), y(0) {}
+};
+
+class Float3 {
+ public:
+  float x, y, z;
+
+  Float3(float initX, float initY, float initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Float3() : x(0.f), y(0.f), z(0.f) {}
+};
+
+class Float4 {
+ public:
+  float x, y, z, w;
+
+  Float4(float initX, float initY, float initZ, float initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Float4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
+};
+
+class Double2 {
+ public:
+  double x, y;
+
+  Double2(double initX, double initY)
+    : x(initX), y(initY) {}
+  Double2() : x(0), y(0) {}
+};
+
+class Double3 {
+ public:
+  double x, y, z;
+
+  Double3(double initX, double initY, double initZ)
+    : x(initX), y(initY), z(initZ) {}
+  Double3() : x(0), y(0), z(0) {}
+};
+
+class Double4 {
+ public:
+  double x, y, z, w;
+
+  Double4(double initX, double initY, double initZ, double initW)
+    : x(initX), y(initY), z(initZ), w(initW) {}
+  Double4() : x(0), y(0), z(0), w(0) {}
+};
+
  /**
   * The RenderScript context. This class controls initialization, resource management, and teardown.
   */
@@ -1512,6 +1783,1946 @@
     void setLUT(sp<Allocation> lut);
 };
 
+
+/**
+ * Intrinsic kernel provides high performance RenderScript APIs to BLAS.
+ *
+ * The BLAS (Basic Linear Algebra Subprograms) are routines that provide standard
+ * building blocks for performing basic vector and matrix operations.
+ *
+ * For detailed description of BLAS, please refer to http://www.netlib.org/blas/
+ *
+ **/
+class ScriptIntrinsicBLAS : public ScriptIntrinsic {
+ private:
+    ScriptIntrinsicBLAS(sp<RS> rs, sp<const Element> e);
+ public:
+    /**
+     * Create an intrinsic to access BLAS subroutines.
+     *
+     * @param rs The RenderScript context
+     * @return ScriptIntrinsicBLAS
+     */
+    static sp<ScriptIntrinsicBLAS> create(sp<RS> rs);
+
+    /**
+     * SGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d58/sgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void SGEMV(RsBlasTranspose TransA,
+               float alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               float beta, sp<Allocation> Y, int incY);
+
+    /**
+     * DGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void DGEMV(RsBlasTranspose TransA,
+               double alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               double beta, sp<Allocation> Y, int incY);
+
+    /**
+     * CGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d8a/cgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void CGEMV(RsBlasTranspose TransA,
+               Float2 alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               Float2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * ZGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d40/zgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void ZGEMV(RsBlasTranspose TransA,
+               Double2 alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               Double2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * SGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d46/sgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void SGBMV(RsBlasTranspose TransA,
+               int KL, int KU, float alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               float beta, sp<Allocation> Y, int incY);
+
+    /**
+     * DGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d3f/dgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void DGBMV(RsBlasTranspose TransA,
+               int KL, int KU, double alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, double beta, sp<Allocation> Y, int incY);
+
+    /**
+     * CGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d75/cgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void CGBMV(RsBlasTranspose TransA,
+               int KL, int KU, Float2 alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, Float2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * ZGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d46/zgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void ZGBMV(RsBlasTranspose TransA,
+               int KL, int KU, Double2 alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               Double2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * STRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d45/strmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * DTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/d7e/dtrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * CTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d78/ctrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * ZTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/dd1/ztrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTRMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * STBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d7d/stbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * DTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d29/dtbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * CTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/dcd/ctbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * ZTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d39/ztbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTBMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * STPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/db1/stpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * DTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/dcd/dtpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * CTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dbb/ctpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * ZTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d9e/ztpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTPMV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * STRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d2a/strsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * DTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d96/dtrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * CTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dc8/ctrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * ZTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d2f/ztrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTRSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * STBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d1f/stbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * DTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dcf/dtbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * CTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d5f/ctbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * ZTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d5a/ztbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTBSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               int K, sp<Allocation> A, sp<Allocation> X, int incX);
+
+    /**
+     * STPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d7c/stpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void STPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * DTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d84/dtpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void DTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * CTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d56/ctpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void CTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * ZTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/da/d57/ztpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
+    void ZTPSV(RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               sp<Allocation> Ap, sp<Allocation> X, int incX);
+
+    /**
+     * SSYMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d94/ssymv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void SSYMV(RsBlasUplo Uplo, float alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, float beta, sp<Allocation> Y, int incY);
+
+    /**
+     * SSBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/da1/ssbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void SSBMV(RsBlasUplo Uplo, int K, float alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, float beta, sp<Allocation> Y, int incY);
+
+    /**
+     * SSPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d68/sspmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void SSPMV(RsBlasUplo Uplo, float alpha, sp<Allocation> Ap, sp<Allocation> X,
+               int incX, float beta, sp<Allocation> Y, int incY);
+
+    /**
+     * SGER performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d5c/sger_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     */
+    void SGER(float alpha, sp<Allocation> X, int incX, sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * SSYR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/dac/ssyr_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     */
+    void SSYR(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX, sp<Allocation> A);
+
+    /**
+     * SSPR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d9b/sspr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32}.
+     */
+    void SSPR(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX, sp<Allocation> Ap);
+
+    /**
+     * SSYR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d99/ssyr2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     */
+    void SSYR2(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * SSPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d3e/sspr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32}.
+     */
+    void SSPR2(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> Ap);
+
+    /**
+     * DSYMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/dbe/dsymv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void DSYMV(RsBlasUplo Uplo, double alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               double beta, sp<Allocation> Y, int incY);
+
+    /**
+     * DSBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d1e/dsbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void DSBMV(RsBlasUplo Uplo, int K, double alpha, sp<Allocation> A, sp<Allocation> X, int incX,
+               double beta, sp<Allocation> Y, int incY);
+
+    /**
+     * DSPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d85/dspmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void DSPMV(RsBlasUplo Uplo, double alpha, sp<Allocation> Ap, sp<Allocation> X, int incX,
+               double beta, sp<Allocation> Y, int incY);
+
+    /**
+     * DGER performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dger_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     */
+    void DGER(double alpha, sp<Allocation> X, int incX, sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * DSYR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d60/dsyr_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     */
+    void DSYR(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX, sp<Allocation> A);
+
+    /**
+     * DSPR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/dba/dspr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64}.
+     */
+    void DSPR(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX, sp<Allocation> Ap);
+
+    /**
+     * DSYR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d41/dsyr2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     */
+    void DSYR2(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * DSPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d9e/dspr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64}.
+     */
+    void DSPR2(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> Ap);
+
+    /**
+     * CHEMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d51/chemv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void CHEMV(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, Float2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * CHBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/dc2/chbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void CHBMV(RsBlasUplo Uplo, int K, Float2 alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, Float2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * CHPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d06/chpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void CHPMV(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> Ap, sp<Allocation> X,
+               int incX, Float2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * CGERU performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d5f/cgeru_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CGERU(Float2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * CGERC performs the rank 1 operation
+     * A := alpha*x*y**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d84/cgerc_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CGERC(Float2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * CHER performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d6d/cher_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CHER(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX, sp<Allocation> A);
+
+    /**
+     * CHPR performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/dcd/chpr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CHPR(RsBlasUplo Uplo, float alpha, sp<Allocation> X, int incX, sp<Allocation> Ap);
+
+    /**
+     * CHER2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d87/cher2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CHER2(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * CHPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d44/chpr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     */
+    void CHPR2(RsBlasUplo Uplo, Float2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> Ap);
+
+    /**
+     * ZHEMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/ddd/zhemv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void ZHEMV(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, Double2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * ZHBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d1a/zhbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void ZHBMV(RsBlasUplo Uplo, int K, Double2 alpha, sp<Allocation> A, sp<Allocation> X,
+               int incX, Double2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * ZHPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d60/zhpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
+    void ZHPMV(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> Ap, sp<Allocation> X,
+               int incX, Double2 beta, sp<Allocation> Y, int incY);
+
+    /**
+     * ZGERU performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d12/zgeru_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZGERU(Double2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * ZGERC performs the rank 1 operation
+     * A := alpha*x*y**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/dad/zgerc_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZGERC(Double2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * ZHER performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d0e/zher_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZHER(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX, sp<Allocation> A);
+
+    /**
+     * ZHPR performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/de1/zhpr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZHPR(RsBlasUplo Uplo, double alpha, sp<Allocation> X, int incX, sp<Allocation> Ap);
+
+    /**
+     * ZHER2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/da/d8a/zher2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZHER2(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> A);
+
+    /**
+     * ZHPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d5/d52/zhpr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type: {Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type: {Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     */
+    void ZHPR2(RsBlasUplo Uplo, Double2 alpha, sp<Allocation> X, int incX,
+               sp<Allocation> Y, int incY, sp<Allocation> Ap);
+
+    /**
+     * SGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32}.
+     */
+    void SGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, float alpha, sp<Allocation> A,
+                      sp<Allocation> B, float beta, sp<Allocation> C);
+
+
+    /**
+     * DGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d2b/dgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64}.
+     */
+    void DGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, double alpha, sp<Allocation> A,
+                      sp<Allocation> B, double beta, sp<Allocation> C);
+
+    /**
+     * CGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d5b/cgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, Float2 alpha, sp<Allocation> A,
+                      sp<Allocation> B, Float2 beta, sp<Allocation> C);
+
+    /**
+     * ZGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d76/zgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2
+     */
+    void ZGEMM(RsBlasTranspose TransA, RsBlasTranspose TransB, Double2 alpha, sp<Allocation> A,
+                      sp<Allocation> B, Double2 beta, sp<Allocation> C);
+
+    /**
+     * SSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d42/ssymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32}.
+     */
+    void SSYMM(RsBlasSide Side, RsBlasUplo Uplo, float alpha, sp<Allocation> A,
+                      sp<Allocation> B, float beta, sp<Allocation> C);
+
+    /**
+     * DSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/db0/dsymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64}.
+     */
+    void DSYMM(RsBlasSide Side, RsBlasUplo Uplo, double alpha, sp<Allocation> A,
+                      sp<Allocation> B, double beta, sp<Allocation> C);
+
+    /**
+     * CSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d59/csymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CSYMM(RsBlasSide Side, RsBlasUplo Uplo, Float2 alpha, sp<Allocation> A,
+                      sp<Allocation> B, Float2 beta, sp<Allocation> C);
+
+    /**
+     * ZSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d51/zsymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZSYMM(RsBlasSide Side, RsBlasUplo Uplo, Double2 alpha, sp<Allocation> A,
+                      sp<Allocation> B, Double2 beta, sp<Allocation> C);
+
+    /**
+     * SSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d40/ssyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32}.
+     */
+    void SSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha,
+               sp<Allocation> A, float beta, sp<Allocation> C);
+
+    /**
+     * DSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/d05/dsyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64}.
+     */
+    void DSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha,
+               sp<Allocation> A, double beta, sp<Allocation> C);
+
+    /**
+     * CSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d6a/csyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha,
+               sp<Allocation> A, Float2 beta, sp<Allocation> C);
+
+    /**
+     * ZSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d54/zsyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZSYRK(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha,
+               sp<Allocation> A, Double2 beta, sp<Allocation> C);
+
+    /**
+     * SSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d3d/ssyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32}.
+     */
+    void SSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha,
+                sp<Allocation> A, sp<Allocation> B, float beta, sp<Allocation> C);
+
+    /**
+     * DSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/dec/dsyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64}.
+     */
+    void DSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha,
+                sp<Allocation> A, sp<Allocation> B, double beta, sp<Allocation> C);
+
+    /**
+     * CSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d7e/csyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha,
+                sp<Allocation> A, sp<Allocation> B, Float2 beta, sp<Allocation> C);
+
+    /**
+     * ZSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d20/zsyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZSYR2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha,
+                sp<Allocation> A, sp<Allocation> B, Double2 beta, sp<Allocation> C);
+
+    /**
+     * STRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d01/strmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32}.
+     */
+    void STRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA,
+               RsBlasDiag Diag, float alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * DTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d19/dtrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64}.
+     */
+    void DTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               double alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * CTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d9b/ctrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     */
+    void CTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               Float2 alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * ZTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/de1/ztrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     */
+    void ZTRMM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               Double2 alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * STRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d8b/strsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32}.
+     */
+    void STRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               float alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * DTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/da7/dtrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64}.
+     */
+    void DTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               double alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * CTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d30/ctrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     */
+    void CTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               Float2 alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * ZTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d39/ztrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     */
+    void ZTRSM(RsBlasSide Side, RsBlasUplo Uplo, RsBlasTranspose TransA, RsBlasDiag Diag,
+               Double2 alpha, sp<Allocation> A, sp<Allocation> B);
+
+    /**
+     * CHEMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d66/chemm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CHEMM(RsBlasSide Side, RsBlasUplo Uplo, Float2 alpha, sp<Allocation> A,
+               sp<Allocation> B, Float2 beta, sp<Allocation> C);
+
+    /**
+     * ZHEMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d3e/zhemm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZHEMM(RsBlasSide Side, RsBlasUplo Uplo, Double2 alpha, sp<Allocation> A,
+               sp<Allocation> B, Double2 beta, sp<Allocation> C);
+
+    /**
+     * CHERK performs one of the hermitian rank k operations
+     * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d52/cherk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CHERK(RsBlasUplo Uplo, RsBlasTranspose Trans, float alpha, sp<Allocation> A,
+               float beta, sp<Allocation> C);
+
+    /**
+     * ZHERK performs one of the hermitian rank k operations
+     * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/db1/zherk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZHERK(RsBlasUplo Uplo, RsBlasTranspose Trans, double alpha, sp<Allocation> A,
+               double beta, sp<Allocation> C);
+
+    /**
+     * CHER2K performs one of the hermitian rank 2k operations
+     * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d82/cher2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F32_2}.
+     */
+    void CHER2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Float2 alpha, sp<Allocation> A,
+                sp<Allocation> B, float beta, sp<Allocation> C);
+
+    /**
+     * ZHER2K performs one of the hermitian rank 2k operations
+     * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/dfa/zher2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type: {Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#F64_2}.
+     */
+    void ZHER2K(RsBlasUplo Uplo, RsBlasTranspose Trans, Double2 alpha, sp<Allocation> A,
+                sp<Allocation> B, double beta, sp<Allocation> C);
+
+    /**
+     * 8-bit GEMM-like operation for neural networks: C = A * Transpose(B)
+     * Calculations are done in 1.10.21 fixed-point format for the final output,
+     * just before there's a shift down to drop the fractional parts. The output
+     * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
+     * gives some headroom to avoid wrapping around on small overflows.
+     *
+     * @param A The input allocation contains matrix A, supported elements type: {Element#U8}.
+     * @param a_offset The offset for all values in matrix A, e.g A[i,j] = A[i,j] - a_offset. Value should be from 0 to 255.
+     * @param B The input allocation contains matrix B, supported elements type: {Element#U8}.
+     * @param b_offset The offset for all values in matrix B, e.g B[i,j] = B[i,j] - b_offset. Value should be from 0 to 255.
+     * @param C The input allocation contains matrix C, supported elements type: {Element#U8}.
+     * @param c_offset The offset for all values in matrix C.
+     * @param c_mult The multiplier for all values in matrix C, e.g C[i,j] = (C[i,j] + c_offset) * c_mult.
+     **/
+    void BNNM(sp<Allocation> A, int a_offset, sp<Allocation> B, int b_offset, sp<Allocation> C,
+              int c_offset, int c_mult);
+};
+
 /**
  * Intrinsic kernel for blending two Allocations.
  */
@@ -2114,276 +4325,6 @@
 
 };
 
-class Byte2 {
- public:
-  int8_t x, y;
-
-  Byte2(int8_t initX, int8_t initY)
-    : x(initX), y(initY) {}
-  Byte2() : x(0), y(0) {}
-};
-
-class Byte3 {
- public:
-  int8_t x, y, z;
-
-  Byte3(int8_t initX, int8_t initY, int8_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Byte3() : x(0), y(0), z(0) {}
-};
-
-class Byte4 {
- public:
-  int8_t x, y, z, w;
-
-  Byte4(int8_t initX, int8_t initY, int8_t initZ, int8_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Byte4() : x(0), y(0), z(0), w(0) {}
-};
-
-class UByte2 {
- public:
-  uint8_t x, y;
-
-  UByte2(uint8_t initX, uint8_t initY)
-    : x(initX), y(initY) {}
-  UByte2() : x(0), y(0) {}
-};
-
-class UByte3 {
- public:
-  uint8_t x, y, z;
-
-  UByte3(uint8_t initX, uint8_t initY, uint8_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  UByte3() : x(0), y(0), z(0) {}
-};
-
-class UByte4 {
- public:
-  uint8_t x, y, z, w;
-
-  UByte4(uint8_t initX, uint8_t initY, uint8_t initZ, uint8_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  UByte4() : x(0), y(0), z(0), w(0) {}
-};
-
-class Short2 {
- public:
-  short x, y;
-
-  Short2(short initX, short initY)
-    : x(initX), y(initY) {}
-  Short2() : x(0), y(0) {}
-};
-
-class Short3 {
- public:
-  short x, y, z;
-
-  Short3(short initX, short initY, short initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Short3() : x(0), y(0), z(0) {}
-};
-
-class Short4 {
- public:
-  short x, y, z, w;
-
-  Short4(short initX, short initY, short initZ, short initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Short4() : x(0), y(0), z(0), w(0) {}
-};
-
-class UShort2 {
- public:
-  uint16_t x, y;
-
-  UShort2(uint16_t initX, uint16_t initY)
-    : x(initX), y(initY) {}
-  UShort2() : x(0), y(0) {}
-};
-
-class UShort3 {
- public:
-  uint16_t x, y, z;
-
-  UShort3(uint16_t initX, uint16_t initY, uint16_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  UShort3() : x(0), y(0), z(0) {}
-};
-
-class UShort4 {
- public:
-  uint16_t x, y, z, w;
-
-  UShort4(uint16_t initX, uint16_t initY, uint16_t initZ, uint16_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  UShort4() : x(0), y(0), z(0), w(0) {}
-};
-
-class Int2 {
- public:
-  int x, y;
-
-  Int2(int initX, int initY)
-    : x(initX), y(initY) {}
-  Int2() : x(0), y(0) {}
-};
-
-class Int3 {
- public:
-  int x, y, z;
-
-  Int3(int initX, int initY, int initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Int3() : x(0), y(0), z(0) {}
-};
-
-class Int4 {
- public:
-  int x, y, z, w;
-
-  Int4(int initX, int initY, int initZ, int initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Int4() : x(0), y(0), z(0), w(0) {}
-};
-
-class UInt2 {
- public:
-  uint32_t x, y;
-
-  UInt2(uint32_t initX, uint32_t initY)
-    : x(initX), y(initY) {}
-  UInt2() : x(0), y(0) {}
-};
-
-class UInt3 {
- public:
-  uint32_t x, y, z;
-
-  UInt3(uint32_t initX, uint32_t initY, uint32_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  UInt3() : x(0), y(0), z(0) {}
-};
-
-class UInt4 {
- public:
-  uint32_t x, y, z, w;
-
-  UInt4(uint32_t initX, uint32_t initY, uint32_t initZ, uint32_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  UInt4() : x(0), y(0), z(0), w(0) {}
-};
-
-class Long2 {
- public:
-  int64_t x, y;
-
-  Long2(int64_t initX, int64_t initY)
-    : x(initX), y(initY) {}
-  Long2() : x(0), y(0) {}
-};
-
-class Long3 {
- public:
-  int64_t x, y, z;
-
-  Long3(int64_t initX, int64_t initY, int64_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Long3() : x(0), y(0), z(0) {}
-};
-
-class Long4 {
- public:
-  int64_t x, y, z, w;
-
-  Long4(int64_t initX, int64_t initY, int64_t initZ, int64_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Long4() : x(0), y(0), z(0), w(0) {}
-};
-
-class ULong2 {
- public:
-  uint64_t x, y;
-
-  ULong2(uint64_t initX, uint64_t initY)
-    : x(initX), y(initY) {}
-  ULong2() : x(0), y(0) {}
-};
-
-class ULong3 {
- public:
-  uint64_t x, y, z;
-
-  ULong3(uint64_t initX, uint64_t initY, uint64_t initZ)
-    : x(initX), y(initY), z(initZ) {}
-  ULong3() : x(0), y(0), z(0) {}
-};
-
-class ULong4 {
- public:
-  uint64_t x, y, z, w;
-
-  ULong4(uint64_t initX, uint64_t initY, uint64_t initZ, uint64_t initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  ULong4() : x(0), y(0), z(0), w(0) {}
-};
-
-class Float2 {
- public:
-  float x, y;
-
-  Float2(float initX, float initY)
-    : x(initX), y(initY) {}
-  Float2() : x(0), y(0) {}
-};
-
-class Float3 {
- public:
-  float x, y, z;
-
-  Float3(float initX, float initY, float initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Float3() : x(0.f), y(0.f), z(0.f) {}
-};
-
-class Float4 {
- public:
-  float x, y, z, w;
-
-  Float4(float initX, float initY, float initZ, float initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Float4() : x(0.f), y(0.f), z(0.f), w(0.f) {}
-};
-
-class Double2 {
- public:
-  double x, y;
-
-  Double2(double initX, double initY)
-    : x(initX), y(initY) {}
-  Double2() : x(0), y(0) {}
-};
-
-class Double3 {
- public:
-  double x, y, z;
-
-  Double3(double initX, double initY, double initZ)
-    : x(initX), y(initY), z(initZ) {}
-  Double3() : x(0), y(0), z(0) {}
-};
-
-class Double4 {
- public:
-  double x, y, z, w;
-
-  Double4(double initX, double initY, double initZ, double initW)
-    : x(initX), y(initY), z(initZ), w(initW) {}
-  Double4() : x(0), y(0), z(0), w(0) {}
-};
-
 }
 
 }