Expand RS vector3 types to vector4.

BUG=5609007

The underlying LLVM implementation for vector3 types does this implicitly. If
RS does not adjust its implementation, we will always be misaligned for any
subsequent data after a vector3 type. We previously inserted padding into the
reflected layers from llvm-rs-cc (hence the skip padding part of this change).
We can safely ignore the padding now that the Java/native code is updated to
use the expanded size. The compiler will also need modification to ensure that
we don't mistakenly skip over any end-of-struct padding.

This fixes the 3-component vector padding problem.

Change-Id: If68af42287deb8f4b28addcd19a9fa314656be44
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 35d812d..c1192fe 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -195,6 +195,81 @@
          prefix, getPtr(), mHal.state.usageFlags, mHal.state.mipmapControl);
 }
 
+uint32_t Allocation::getPackedSize() const {  // item count times the unpadded element size, in bytes
+    const uint32_t packedElemBytes = mHal.state.type->getElement()->getSizeBytesUnpadded();
+    return (mHal.state.type->getSizeBytes() / mHal.state.type->getElementSizeBytes()) * packedElemBytes;
+}
+
+void Allocation::writePackedData(const Type *type,
+                                 uint8_t *dst, const uint8_t *src, bool dstPadded) {
+    // Copies every item of 'type' from src to dst while switching layouts.
+    // dstPadded == true:  src is in unpadded layout, dst is in padded layout.
+    // dstPadded == false: src is in padded layout, dst is in unpadded layout.
+
+    const Element *element = type->getElement();
+    const uint32_t packedStride = element->getSizeBytesUnpadded();
+    const uint32_t paddedStride = element->getSizeBytes();
+    const uint32_t itemCount = type->getSizeBytes() / paddedStride;
+    const uint32_t fields = element->getFieldCount();
+
+    // Exactly one side advances by the padded stride, the other by the unpadded one.
+    const uint32_t dstStride = dstPadded ? paddedStride : packedStride;
+    const uint32_t srcStride = dstPadded ? packedStride : paddedStride;
+
+    if (fields == 0) {
+        // No sub-elements: every item is copied as one run of unpadded bytes.
+        for (uint32_t item = 0; item < itemCount; ++item, src += srcStride, dst += dstStride) {
+            memcpy(dst, src, packedStride);
+        }
+        return;
+    }
+
+    // Per-field lookup tables, filled once so the copy loop does not re-query the element.
+    uint32_t *fieldSrcOffset = new uint32_t[fields];
+    uint32_t *fieldDstOffset = new uint32_t[fields];
+    uint32_t *fieldBytes = new uint32_t[fields];
+
+    for (uint32_t f = 0; f < fields; ++f) {
+        const uint32_t paddedOffset = element->getFieldOffsetBytes(f);
+        const uint32_t packedOffset = element->getFieldOffsetBytesUnpadded(f);
+        fieldSrcOffset[f] = dstPadded ? packedOffset : paddedOffset;
+        fieldDstOffset[f] = dstPadded ? paddedOffset : packedOffset;
+        fieldBytes[f] = element->getField(f)->getSizeBytesUnpadded();
+    }
+
+    // Structured elements: copy each field's unpadded bytes between its two offsets.
+    for (uint32_t item = 0; item < itemCount; ++item, src += srcStride, dst += dstStride) {
+        for (uint32_t f = 0; f < fields; ++f) {
+            memcpy(dst + fieldDstOffset[f], src + fieldSrcOffset[f], fieldBytes[f]);
+        }
+    }
+
+    delete[] fieldBytes;
+    delete[] fieldDstOffset;
+    delete[] fieldSrcOffset;
+}
+
+void Allocation::unpackVec3Allocation(const void *data, uint32_t dataSize) {
+    // Copies 'data' into this allocation's storage via writePackedData with
+    // dstPadded = true. NOTE(review): dataSize is not used by this function.
+    uint8_t *paddedOut = (uint8_t*)getPtr();
+    writePackedData(getType(), paddedOut, (const uint8_t*)data, true);
+}
+
+void Allocation::packVec3Allocation(OStream *stream) const {
+    // Repacks this allocation's padded storage into a temporary unpadded buffer
+    // (writePackedData with dstPadded = false) and hands it to stream->addByteArray.
+    const Element *element = getType()->getElement();
+    const uint32_t itemCount = mHal.state.type->getSizeBytes() / element->getSizeBytes();
+    const uint32_t packedBytes = itemCount * element->getSizeBytesUnpadded();
+
+    uint8_t *packed = new uint8_t[packedBytes];
+    writePackedData(getType(), packed, (const uint8_t*)getPtr(), false);
+    stream->addByteArray(packed, getPackedSize());
+
+    delete[] packed;
+}
+
 void Allocation::serialize(OStream *stream) const {
     // Need to identify ourselves
     stream->addU32((uint32_t)getClassId());
@@ -207,10 +282,17 @@
     mHal.state.type->serialize(stream);
 
     uint32_t dataSize = mHal.state.type->getSizeBytes();
+    // 3 element vectors are padded to 4 in memory, but padding isn't serialized
+    uint32_t packedSize = getPackedSize();
     // Write how much data we are storing
-    stream->addU32(dataSize);
-    // Now write the data
-    stream->addByteArray(getPtr(), dataSize);
+    stream->addU32(packedSize);
+    if (dataSize == packedSize) {
+        // Now write the data
+        stream->addByteArray(getPtr(), dataSize);
+    } else {
+        // Sizes differ: write the data through the packing path instead
+        packVec3Allocation(stream);
+    }
 }
 
 Allocation *Allocation::createFromStream(Context *rsc, IStream *stream) {
@@ -230,22 +312,30 @@
     }
     type->compute();
 
+    Allocation *alloc = Allocation::createAllocation(rsc, type, RS_ALLOCATION_USAGE_SCRIPT);
+    type->decUserRef();
+
     // Number of bytes we wrote out for this allocation
     uint32_t dataSize = stream->loadU32();
-    if (dataSize != type->getSizeBytes()) {
+    // 3 element vectors are padded to 4 in memory, but padding isn't serialized
+    uint32_t packedSize = alloc->getPackedSize();
+    if (dataSize != type->getSizeBytes() &&
+        dataSize != packedSize) {
         LOGE("failed to read allocation because numbytes written is not the same loaded type wants\n");
+        ObjectBase::checkDelete(alloc);
         ObjectBase::checkDelete(type);
         return NULL;
     }
 
-    Allocation *alloc = Allocation::createAllocation(rsc, type, RS_ALLOCATION_USAGE_SCRIPT);
     alloc->setName(name.string(), name.size());
-    type->decUserRef();
 
-    uint32_t count = dataSize / type->getElementSizeBytes();
-
-    // Read in all of our allocation data
-    alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
+    if (dataSize == type->getSizeBytes()) {
+        uint32_t count = dataSize / type->getElementSizeBytes();
+        // Read in all of our allocation data
+        alloc->data(rsc, 0, 0, count, stream->getPtr() + stream->getPos(), dataSize);
+    } else {
+        alloc->unpackVec3Allocation(stream->getPtr() + stream->getPos(), dataSize);
+    }
     stream->reset(stream->getPos() + dataSize);
 
     return alloc;