Fix stride 16-byte alignment issues in RS.
Change-Id: I549d3acd4ebd921c36be3de6d2734bb89ec8a50e
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index cec99ed..aacdac3 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -245,7 +245,9 @@
alloc->mHal.drvState.lod[0].dimY = type->getDimY();
alloc->mHal.drvState.lod[0].dimZ = type->getDimZ();
alloc->mHal.drvState.lod[0].mallocPtr = 0;
- alloc->mHal.drvState.lod[0].stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+ // Stride needs to be 16-byte aligned too!
+ size_t stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+ alloc->mHal.drvState.lod[0].stride = rsRound(stride, 16);
alloc->mHal.drvState.lodCount = type->getLODCount();
alloc->mHal.drvState.faceCount = type->getDimFaces();
@@ -262,7 +264,8 @@
alloc->mHal.drvState.lod[lod].dimX = tx;
alloc->mHal.drvState.lod[lod].dimY = ty;
alloc->mHal.drvState.lod[lod].dimZ = tz;
- alloc->mHal.drvState.lod[lod].stride = tx * type->getElementSizeBytes();
+ alloc->mHal.drvState.lod[lod].stride =
+ rsRound(tx * type->getElementSizeBytes(), 16);
offsets[lod] = o;
o += alloc->mHal.drvState.lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
if (tx > 1) tx >>= 1;
@@ -540,6 +543,7 @@
bounds, &dst);
alloc->mHal.drvState.lod[0].mallocPtr = dst;
alloc->mHal.drvState.lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
+ rsAssert((alloc->mHal.drvState.lod[0].stride & 0xf) == 0);
return true;
}
diff --git a/rsUtils.h b/rsUtils.h
index ebfc679..78341d7 100644
--- a/rsUtils.h
+++ b/rsUtils.h
@@ -91,6 +91,22 @@
return 1 << rsFindHighBit(v);
}
+template<typename T>
+T rsRound(T v, unsigned int r) {
+    // Rounds v up to a multiple of r (power-of-2 r only); asserts and returns v unchanged on a bad r or on overflow.
+    if (r == 0 || (r & (r - 1)) != 0) {
+        rsAssert(false && "Must be power of 2 for rounding up");
+        return v;
+    }
+    T res = v + (r - 1);
+    if (res < v) {
+        rsAssert(false && "Overflow of rounding operation");
+        return v;
+    }
+    res &= ~static_cast<T>(r - 1);  // Mask built in T: a 32-bit ~(r - 1) would zero-extend and clear the high bits of a 64-bit res.
+    return res;
+}
+
static inline uint16_t rs888to565(uint32_t r, uint32_t g, uint32_t b) {
uint16_t t = 0;
t |= b >> 3;