Fix stride 16-byte alignment issues in RS.

Change-Id: I549d3acd4ebd921c36be3de6d2734bb89ec8a50e
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index cec99ed..aacdac3 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -245,7 +245,9 @@
     alloc->mHal.drvState.lod[0].dimY = type->getDimY();
     alloc->mHal.drvState.lod[0].dimZ = type->getDimZ();
     alloc->mHal.drvState.lod[0].mallocPtr = 0;
-    alloc->mHal.drvState.lod[0].stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+    // Stride needs to be 16-byte aligned too!
+    size_t stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+    alloc->mHal.drvState.lod[0].stride = rsRound(stride, 16);
     alloc->mHal.drvState.lodCount = type->getLODCount();
     alloc->mHal.drvState.faceCount = type->getDimFaces();
 
@@ -262,7 +264,8 @@
             alloc->mHal.drvState.lod[lod].dimX = tx;
             alloc->mHal.drvState.lod[lod].dimY = ty;
             alloc->mHal.drvState.lod[lod].dimZ = tz;
-            alloc->mHal.drvState.lod[lod].stride = tx * type->getElementSizeBytes();
+            alloc->mHal.drvState.lod[lod].stride =
+                    rsRound(tx * type->getElementSizeBytes(), 16);
             offsets[lod] = o;
             o += alloc->mHal.drvState.lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
             if (tx > 1) tx >>= 1;
@@ -540,6 +543,7 @@
             bounds, &dst);
     alloc->mHal.drvState.lod[0].mallocPtr = dst;
     alloc->mHal.drvState.lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
+    rsAssert((alloc->mHal.drvState.lod[0].stride & 0xf) == 0);
 
     return true;
 }
diff --git a/rsUtils.h b/rsUtils.h
index ebfc679..78341d7 100644
--- a/rsUtils.h
+++ b/rsUtils.h
@@ -91,6 +91,22 @@
     return 1 << rsFindHighBit(v);
 }
 
+// Rounds v up to a multiple of r (a non-zero power of 2); else rsAssert fires and v is returned.
+template<typename T>
+T rsRound(T v, unsigned int r) {
+    if (r == 0 || (r & (r - 1)) != 0) {
+        rsAssert(false && "Must be power of 2 for rounding up");
+        return v;
+    }
+    T res = v + (r - 1);
+    if (res < v) {
+        rsAssert(false && "Overflow of rounding operation");
+        return v;
+    }
+    // Build the mask in T: an unsigned int mask zero-extends and would clear a wider T's high bits.
+    return static_cast<T>(res & ~(static_cast<T>(r) - 1));
+}
+
 static inline uint16_t rs888to565(uint32_t r, uint32_t g, uint32_t b) {
     uint16_t t = 0;
     t |= b >> 3;