Add support for 2D strided copies to/from an allocation with the C++ API.

Change-Id: I55cd7512f683f8d36d2b75f894931fd0657521bc
diff --git a/cpp/Allocation.cpp b/cpp/Allocation.cpp
index 2e3597e..d6dfa94 100644
--- a/cpp/Allocation.cpp
+++ b/cpp/Allocation.cpp
@@ -221,7 +221,7 @@
                                  const void *data) {
     validate2DRange(xoff, yoff, w, h);
     rsAllocation2DData(mRS->getContext(), getIDSafe(), xoff, yoff, mSelectedLOD, mSelectedFace,
-                       w, h, data, w * h * mType->getElement()->getSizeBytes());
+                       w, h, data, w * h * mType->getElement()->getSizeBytes(), w * mType->getElement()->getSizeBytes());
 }
 
 void Allocation::copy2DRangeFrom(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
@@ -237,9 +237,32 @@
                                void* data) {
     validate2DRange(xoff, yoff, w, h);
     rsAllocation2DRead(mRS->getContext(), getIDSafe(), xoff, yoff, mSelectedLOD, mSelectedFace,
-                       w, h, data, w * h * mType->getElement()->getSizeBytes());
+                       w, h, data, w * h * mType->getElement()->getSizeBytes(), w * mType->getElement()->getSizeBytes());
 }
 
+void Allocation::copy2DStridedFrom(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
+                                   const void *data, size_t stride) {
+    validate2DRange(xoff, yoff, w, h);
+    rsAllocation2DData(mRS->getContext(), getIDSafe(), xoff, yoff, mSelectedLOD, mSelectedFace,
+                       w, h, data, w * h * mType->getElement()->getSizeBytes(), stride);
+}
+
+void Allocation::copy2DStridedFrom(const void* data, size_t stride) {
+    copy2DStridedFrom(0, 0, mCurrentDimX, mCurrentDimY, data, stride);
+}
+
+void Allocation::copy2DStridedTo(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
+                                 void *data, size_t stride) {
+    validate2DRange(xoff, yoff, w, h);
+    rsAllocation2DRead(mRS->getContext(), getIDSafe(), xoff, yoff, mSelectedLOD, mSelectedFace,
+                       w, h, data, w * h * mType->getElement()->getSizeBytes(), stride);
+}
+
+void Allocation::copy2DStridedTo(void* data, size_t stride) {
+    copy2DStridedTo(0, 0, mCurrentDimX, mCurrentDimY, data, stride);
+}
+
+
 /*
 void resize(int dimX) {
     if ((mType.getY() > 0)|| (mType.getZ() > 0) || mType.hasFaces() || mType.hasMipmaps()) {
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 8d3a9af..a430c35 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -238,6 +238,14 @@
     void copy2DRangeFrom(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
                          sp<const Allocation> data, uint32_t dataXoff, uint32_t dataYoff);
 
+    void copy2DStridedFrom(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
+                           const void *data, size_t stride);
+    void copy2DStridedFrom(const void *data, size_t stride);
+
+    void copy2DStridedTo(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h,
+                         void *data, size_t stride);
+    void copy2DStridedTo(void *data, size_t stride);
+
     void resize(int dimX);
     void resize(int dimX, int dimY);
 
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 928f777..82d87c2 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -582,11 +582,14 @@
 
 void rsdAllocationData2D(const Context *rsc, const Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-                         uint32_t w, uint32_t h, const void *data, size_t sizeBytes) {
+                         uint32_t w, uint32_t h, const void *data, size_t sizeBytes, size_t stride) {
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
     uint32_t lineSize = eSize * w;
+    if (!stride) {
+        stride = lineSize;
+    }
 
     if (alloc->mHal.drvState.lod[0].mallocPtr) {
         const uint8_t *src = static_cast<const uint8_t *>(data);
@@ -598,7 +601,7 @@
                 alloc->decRefs(dst, w);
             }
             memcpy(dst, src, lineSize);
-            src += lineSize;
+            src += stride;
             dst += alloc->mHal.drvState.lod[lod].stride;
         }
         drv->uploadDeferred = true;
@@ -623,10 +626,13 @@
 }
 
 void rsdAllocationRead2D(const Context *rsc, const Allocation *alloc,
-                         uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-                         uint32_t w, uint32_t h, void *data, size_t sizeBytes) {
+                                uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                                uint32_t w, uint32_t h, void *data, size_t sizeBytes, size_t stride) {
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
     uint32_t lineSize = eSize * w;
+    if (!stride) {
+        stride = lineSize;
+    }
 
     if (alloc->mHal.drvState.lod[0].mallocPtr) {
         uint8_t *dst = static_cast<uint8_t *>(data);
@@ -634,7 +640,7 @@
 
         for (uint32_t line=yoff; line < (yoff+h); line++) {
             memcpy(dst, src, lineSize);
-            dst += lineSize;
+            dst += stride;
             src += alloc->mHal.drvState.lod[lod].stride;
         }
     } else {
@@ -642,6 +648,7 @@
     }
 }
 
+
 void rsdAllocationRead3D(const Context *rsc, const Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t zoff,
                          uint32_t lod, RsAllocationCubemapFace face,
diff --git a/driver/rsdAllocation.h b/driver/rsdAllocation.h
index d2ecc9a..5f916d1 100644
--- a/driver/rsdAllocation.h
+++ b/driver/rsdAllocation.h
@@ -88,7 +88,7 @@
                          const android::renderscript::Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
                          uint32_t w, uint32_t h,
-                         const void *data, uint32_t sizeBytes);
+                         const void *data, uint32_t sizeBytes, size_t stride);
 void rsdAllocationData3D(const android::renderscript::Context *rsc,
                          const android::renderscript::Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t zoff,
@@ -103,7 +103,7 @@
                          const android::renderscript::Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
                          uint32_t w, uint32_t h,
-                         void *data, uint32_t sizeBytes);
+                         void *data, uint32_t sizeBytes, size_t stride);
 void rsdAllocationRead3D(const android::renderscript::Context *rsc,
                          const android::renderscript::Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t zoff,
diff --git a/rs.spec b/rs.spec
index 679b481..5802a0c 100644
--- a/rs.spec
+++ b/rs.spec
@@ -200,6 +200,7 @@
     param uint32_t w
     param uint32_t h
     param const void *data
+    param size_t stride
     }
 
 Allocation2DElementData {
@@ -238,6 +239,7 @@
     param uint32_t w
     param uint32_t h
     param void *data
+    param size_t stride
 }
 
 AllocationSyncAll {
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 259c3f8..df0a79e 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -89,19 +89,28 @@
 }
 
 void Allocation::data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-             uint32_t w, uint32_t h, const void *data, size_t sizeBytes) {
+                      uint32_t w, uint32_t h, const void *data, size_t sizeBytes) {
+
     const size_t eSize = mHal.state.elementSizeBytes;
     const size_t lineSize = eSize * w;
 
-    //ALOGE("data2d %p,  %i %i %i %i %i %i %p %i", this, xoff, yoff, lod, face, w, h, data, sizeBytes);
-
     if ((lineSize * h) != sizeBytes) {
         ALOGE("Allocation size mismatch, expected %zu, got %zu", (lineSize * h), sizeBytes);
         rsAssert(!"Allocation::subData called with mismatched size");
         return;
     }
 
-    rsc->mHal.funcs.allocation.data2D(rsc, this, xoff, yoff, lod, face, w, h, data, sizeBytes);
+    this->data(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes, lineSize);
+}
+
+void Allocation::data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                      uint32_t w, uint32_t h, const void *data, size_t sizeBytes, size_t stride) {
+    const size_t eSize = mHal.state.elementSizeBytes;
+    const size_t lineSize = eSize * w;
+
+    //ALOGE("data2d %p,  %i %i %i %i %i %i %p %i", this, xoff, yoff, lod, face, w, h, data, sizeBytes);
+
+    rsc->mHal.funcs.allocation.data2D(rsc, this, xoff, yoff, lod, face, w, h, data, sizeBytes, stride);
     sendDirty(rsc);
 }
 
@@ -111,7 +120,7 @@
 }
 
 void Allocation::read(Context *rsc, uint32_t xoff, uint32_t lod,
-                         uint32_t count, void *data, size_t sizeBytes) {
+                      uint32_t count, void *data, size_t sizeBytes) {
     const size_t eSize = mHal.state.type->getElementSizeBytes();
 
     if ((count * eSize) != sizeBytes) {
@@ -131,7 +140,7 @@
 
 
 void Allocation::read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-             uint32_t w, uint32_t h, void *data, size_t sizeBytes) {
+                      uint32_t w, uint32_t h, void *data, size_t sizeBytes) {
     const size_t eSize = mHal.state.elementSizeBytes;
     const size_t lineSize = eSize * w;
 
@@ -141,7 +150,18 @@
         return;
     }
 
-    rsc->mHal.funcs.allocation.read2D(rsc, this, xoff, yoff, lod, face, w, h, data, sizeBytes);
+    read(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes, lineSize);
+}
+
+void Allocation::read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+                      uint32_t w, uint32_t h, void *data, size_t sizeBytes, size_t stride) {
+    const size_t eSize = mHal.state.elementSizeBytes;
+    const size_t lineSize = eSize * w;
+    if (!stride) {
+        stride = lineSize;
+    }
+
+    rsc->mHal.funcs.allocation.read2D(rsc, this, xoff, yoff, lod, face, w, h, data, sizeBytes, stride);
 }
 
 void Allocation::read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff,
@@ -526,9 +546,9 @@
 }
 
 void rsi_Allocation2DData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-                          uint32_t w, uint32_t h, const void *data, size_t sizeBytes) {
+                          uint32_t w, uint32_t h, const void *data, size_t sizeBytes, size_t stride) {
     Allocation *a = static_cast<Allocation *>(va);
-    a->data(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes);
+    a->data(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes, stride);
 }
 
 void rsi_AllocationRead(Context *rsc, RsAllocation va, void *data, size_t sizeBytes) {
@@ -674,9 +694,9 @@
 
 void rsi_Allocation2DRead(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff,
                           uint32_t lod, RsAllocationCubemapFace face, uint32_t w,
-                          uint32_t h, void *data, size_t sizeBytes) {
+                          uint32_t h, void *data, size_t sizeBytes, size_t stride) {
     Allocation *a = static_cast<Allocation *>(va);
-    a->read(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes);
+    a->read(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes, stride);
 }
 
 }
diff --git a/rsAllocation.h b/rsAllocation.h
index de79cba..b552ca3 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -103,16 +103,20 @@
 
     void data(Context *rsc, uint32_t xoff, uint32_t lod, uint32_t count, const void *data, size_t sizeBytes);
     void data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-                 uint32_t w, uint32_t h, const void *data, size_t sizeBytes);
+              uint32_t w, uint32_t h, const void *data, size_t sizeBytes);
+    void data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+              uint32_t w, uint32_t h, const void *data, size_t sizeBytes, size_t stride);
     void data(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t lod, RsAllocationCubemapFace face,
-                 uint32_t w, uint32_t h, uint32_t d, const void *data, size_t sizeBytes);
+              uint32_t w, uint32_t h, uint32_t d, const void *data, size_t sizeBytes);
 
     void read(Context *rsc, uint32_t xoff, uint32_t lod, uint32_t count, void *data, size_t sizeBytes);
     void readUnchecked(Context *rsc, uint32_t xoff, uint32_t lod, uint32_t count, void *data, size_t sizeBytes);
     void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
-                 uint32_t w, uint32_t h, void *data, size_t sizeBytes);
+              uint32_t w, uint32_t h, void *data, size_t sizeBytes);
+    void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
+              uint32_t w, uint32_t h, void *data, size_t sizeBytes, size_t stride);
     void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t lod, RsAllocationCubemapFace face,
-                 uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes);
+              uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes);
 
     void elementData(Context *rsc, uint32_t x,
                      const void *data, uint32_t elementOff, size_t sizeBytes);
diff --git a/rsFont.cpp b/rsFont.cpp
index 82fb90f..0d14d1b 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -469,7 +469,7 @@
 
     mRSC->mHal.funcs.allocation.data2D(mRSC, mTextTexture.get(), 0, 0, 0,
         RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X, mCacheWidth, mCacheHeight,
-        mCacheBuffer, mCacheWidth*mCacheHeight);
+        mCacheBuffer, mCacheWidth*mCacheHeight, mCacheWidth);
 
     mFontShaderF->bindTexture(mRSC, 0, mTextTexture.get());
 
diff --git a/rs_hal.h b/rs_hal.h
index 877fd96..4f562d5 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -157,7 +157,7 @@
         void (*data2D)(const Context *rsc, const Allocation *alloc,
                        uint32_t xoff, uint32_t yoff, uint32_t lod,
                        RsAllocationCubemapFace face, uint32_t w, uint32_t h,
-                       const void *data, size_t sizeBytes);
+                       const void *data, size_t sizeBytes, size_t stride);
         void (*data3D)(const Context *rsc, const Allocation *alloc,
                        uint32_t xoff, uint32_t yoff, uint32_t zoff,
                        uint32_t lod, RsAllocationCubemapFace face,
@@ -169,7 +169,7 @@
         void (*read2D)(const Context *rsc, const Allocation *alloc,
                        uint32_t xoff, uint32_t yoff, uint32_t lod,
                        RsAllocationCubemapFace face, uint32_t w, uint32_t h,
-                       void *data, size_t sizeBytes);
+                       void *data, size_t sizeBytes, size_t stride);
         void (*read3D)(const Context *rsc, const Allocation *alloc,
                        uint32_t xoff, uint32_t yoff, uint32_t zoff,
                        uint32_t lod, RsAllocationCubemapFace face,
diff --git a/tests/cppstrided/Android.mk b/tests/cppstrided/Android.mk
new file mode 100644
index 0000000..febfd38
--- /dev/null
+++ b/tests/cppstrided/Android.mk
@@ -0,0 +1,34 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES:= \
+	multiply.rs \
+	compute.cpp
+
+LOCAL_SHARED_LIBRARIES := \
+	libRS \
+	libRScpp \
+	libz \
+	libcutils \
+	libutils \
+	libEGL \
+	libGLESv1_CM \
+	libGLESv2 \
+	libui \
+	libbcc \
+	libbcinfo \
+	libgui
+
+LOCAL_MODULE:= rstest-cppstrided
+
+LOCAL_MODULE_TAGS := tests
+
+intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
+
+LOCAL_C_INCLUDES += frameworks/rs/cpp
+LOCAL_C_INCLUDES += frameworks/rs
+LOCAL_C_INCLUDES += $(intermediates)
+
+
+include $(BUILD_EXECUTABLE)
+
diff --git a/tests/cppstrided/compute.cpp b/tests/cppstrided/compute.cpp
new file mode 100644
index 0000000..b705f77
--- /dev/null
+++ b/tests/cppstrided/compute.cpp
@@ -0,0 +1,74 @@
+
+#include "RenderScript.h"
+
+#include "ScriptC_multiply.h"
+
+using namespace android;
+using namespace RSC;
+
+int main(int argc, char** argv)
+{
+
+    uint32_t numElems = 1024;
+    uint32_t stride = 1025;
+
+    if (argc >= 2) {
+        int tempStride = atoi(argv[1]);
+        if (tempStride < 1024) {
+            printf("stride must be greater than or equal to 1024\n");
+            return 1;
+        }
+        stride = (uint32_t) tempStride;
+    }
+
+    sp<RS> rs = new RS();
+
+    bool r = rs->init();
+
+    sp<const Element> e = Element::U32(rs);
+
+    Type::Builder tb(rs, e);
+    tb.setX(numElems);
+    tb.setY(numElems);
+    sp<const Type> t = tb.create();
+
+    sp<Allocation> ain = Allocation::createTyped(rs, t);
+    sp<Allocation> aout = Allocation::createTyped(rs, t);
+
+    sp<ScriptC_multiply> sc = new ScriptC_multiply(rs, NULL, 0);
+
+    uint32_t* buf = (uint32_t*) malloc(stride * numElems * sizeof(uint32_t));
+    if (!buf) {
+        printf("malloc failed\n");
+        return 1;
+    }
+
+    for (uint32_t i = 0; i < numElems; i++) {
+        for (uint32_t ct=0; ct < numElems; ct++) {
+            *(buf+(stride*i)+ct) = (uint32_t)ct + (i * numElems);
+        }
+    }
+
+    ain->copy2DStridedFrom(buf, stride * sizeof(uint32_t));
+
+    sc->forEach_multiply(ain, aout);
+
+    aout->copy2DStridedTo(buf, stride * sizeof(uint32_t));
+
+    for (uint32_t i = 0; i < numElems; i++) {
+        for (uint32_t ct=0; ct < numElems; ct++) {
+            if (*(buf+(stride*i)+ct) != (uint32_t)(ct + (i * numElems)) * 2) {
+                printf("Mismatch at location %d, %d: %u\n", i, ct, *(buf+(stride*i)+ct));
+                return 1;
+            }
+        }
+    }
+
+    printf("Test successful with %u stride!\n", stride);
+
+    sc.clear();
+    t.clear();
+    e.clear();
+    ain.clear();
+    aout.clear();
+}
diff --git a/tests/cppstrided/multiply.rs b/tests/cppstrided/multiply.rs
new file mode 100644
index 0000000..d1ffefb
--- /dev/null
+++ b/tests/cppstrided/multiply.rs
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma version(1)
+#pragma rs java_package_name(unused)
+#pragma rs_fp_relaxed
+
+uint32_t __attribute__((kernel)) multiply(uint32_t in) {
+    return in * 2;
+}
+
+