Fix 16-byte stride alignment issues in RS allocations.

Change-Id: I549d3acd4ebd921c36be3de6d2734bb89ec8a50e
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index cec99ed..aacdac3 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -245,7 +245,9 @@
     alloc->mHal.drvState.lod[0].dimY = type->getDimY();
     alloc->mHal.drvState.lod[0].dimZ = type->getDimZ();
     alloc->mHal.drvState.lod[0].mallocPtr = 0;
-    alloc->mHal.drvState.lod[0].stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+    // Stride needs to be 16-byte aligned too!
+    size_t stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+    alloc->mHal.drvState.lod[0].stride = rsRound(stride, 16);
     alloc->mHal.drvState.lodCount = type->getLODCount();
     alloc->mHal.drvState.faceCount = type->getDimFaces();
 
@@ -262,7 +264,8 @@
             alloc->mHal.drvState.lod[lod].dimX = tx;
             alloc->mHal.drvState.lod[lod].dimY = ty;
             alloc->mHal.drvState.lod[lod].dimZ = tz;
-            alloc->mHal.drvState.lod[lod].stride = tx * type->getElementSizeBytes();
+            alloc->mHal.drvState.lod[lod].stride =
+                    rsRound(tx * type->getElementSizeBytes(), 16);
             offsets[lod] = o;
             o += alloc->mHal.drvState.lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
             if (tx > 1) tx >>= 1;
@@ -540,6 +543,7 @@
             bounds, &dst);
     alloc->mHal.drvState.lod[0].mallocPtr = dst;
     alloc->mHal.drvState.lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
+    rsAssert((alloc->mHal.drvState.lod[0].stride & 0xf) == 0);
 
     return true;
 }