merge in KFS78N (no-op)
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index 7546b38..4e38459 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -119,6 +119,10 @@
         return;
     }
     const uchar *pinY = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    if (pinY == NULL) {
+        ALOGE("YuvToRGB executed without data, skipping");
+        return;
+    }
 
     size_t strideY = cp->alloc->mHal.drvState.lod[0].stride;
 
@@ -128,15 +132,11 @@
     }
     const uchar *Y = pinY + (p->y * strideY);
 
-    //    ALOGE("pinY, %p, Y, %p, p->y, %d, strideY, %d", pinY, Y, p->y, strideY);
-    //    ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX, cp->alloc->mHal.drvState.lod[0].dimY);
-    //    ALOGE("p->dimX, %d, p->dimY, %d", p->dimX, p->dimY);
-
     uchar4 *out = (uchar4 *)p->out;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
-    const size_t cstep = cp->alloc->mHal.drvState.yuv.step;
+    size_t cstep = cp->alloc->mHal.drvState.yuv.step;
 
     const uchar *pinU = (const uchar *)cp->alloc->mHal.drvState.lod[1].mallocPtr;
     const size_t strideU = cp->alloc->mHal.drvState.lod[1].stride;
@@ -146,17 +146,25 @@
     const size_t strideV = cp->alloc->mHal.drvState.lod[2].stride;
     const uchar *v = pinV + ((p->y >> 1) * strideV);
 
+    //ALOGE("pinY, %p, Y, %p, p->y, %d, strideY, %d", pinY, Y, p->y, strideY);
+    //ALOGE("pinU, %p, U, %p, p->y, %d, strideU, %d", pinU, u, p->y, strideU);
+    //ALOGE("pinV, %p, V, %p, p->y, %d, strideV, %d", pinV, v, p->y, strideV);
+    //ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX, cp->alloc->mHal.drvState.lod[0].dimY);
+    //ALOGE("p->dimX, %d, p->dimY, %d", p->dimX, p->dimY);
+
     if (pinU == NULL) {
         // Legacy yuv support didn't fill in uv
         v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
             (strideY * p->dimY) +
             ((p->y >> 1) * strideY);
         u = v + 1;
+        cstep = 2;
     }
 
 #if defined(ARCH_ARM_HAVE_VFP)
     if((x2 > x1) && gArchUseSIMD) {
-        int32_t len = (x2 - x1 - 1) >> 3;
+        // The neon paths may over-read by up to 8 bytes. x1/x2 are unsigned, so spans shorter than 8 must not wrap into a huge len.
+        int32_t len = ((x2 - x1) >= 8) ? (int32_t)((x2 - x1 - 8) >> 3) : 0;
         if(len > 0) {
             if (cstep == 1) {
                 rsdIntrinsicYuv2_K(out, Y, u, v, len, YuvCoeff);
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 9f217e8..817c9d8 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -673,6 +673,8 @@
         mapper.unlock(drv->wndBuffer->handle);
         old->cancelBuffer(old, drv->wndBuffer, -1);
         drv->wndSurface = NULL;
+
+        native_window_api_disconnect(old, NATIVE_WINDOW_API_CPU);
         old->decStrong(NULL);
     }
 
@@ -687,6 +689,12 @@
             flags |= GRALLOC_USAGE_HW_RENDER;
         }
 
+        r = native_window_api_connect(nw, NATIVE_WINDOW_API_CPU);
+        if (r) {
+            rsc->setError(RS_ERROR_DRIVER, "Error connecting IO output buffer to the CPU API.");
+            goto error;
+        }
+
         r = native_window_set_usage(nw, flags);
         if (r) {
             rsc->setError(RS_ERROR_DRIVER, "Error setting IO output buffer usage.");
@@ -818,10 +826,12 @@
             dst += alloc->mHal.drvState.lod[lod].stride;
         }
         if (alloc->mHal.state.yuv) {
+            size_t clineSize = lineSize;
             int lod = 1;
             int maxLod = 2;
             if (alloc->mHal.state.yuv == HAL_PIXEL_FORMAT_YV12) {
                 maxLod = 3;
+                clineSize >>= 1;
             } else if (alloc->mHal.state.yuv == HAL_PIXEL_FORMAT_YCrCb_420_SP) {
                 lod = 2;
                 maxLod = 3;
@@ -831,7 +841,7 @@
                 uint8_t *dst = GetOffsetPtr(alloc, xoff, yoff, 0, lod, face);
 
                 for (uint32_t line=(yoff >> 1); line < ((yoff+h)>>1); line++) {
-                    memcpy(dst, src, lineSize);
+                    memcpy(dst, src, clineSize);
                     src += alloc->mHal.drvState.lod[lod].stride;
                     dst += alloc->mHal.drvState.lod[lod].stride;
                 }
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index 964853b..a307776 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -269,9 +269,9 @@
 
     const size_t cstep = alloc->mHal.drvState.yuv.step;
     const size_t shift = alloc->mHal.drvState.yuv.shift;
-    const size_t stride = alloc->mHal.drvState.lod[2].stride;
+    const size_t stride = alloc->mHal.drvState.lod[1].stride;
 
-    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[2].mallocPtr;
+    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[1].mallocPtr;
 
     return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
@@ -283,9 +283,9 @@
 
     const size_t cstep = alloc->mHal.drvState.yuv.step;
     const size_t shift = alloc->mHal.drvState.yuv.shift;
-    const size_t stride = alloc->mHal.drvState.lod[1].stride;
+    const size_t stride = alloc->mHal.drvState.lod[2].stride;
 
-    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[1].mallocPtr;
+    const uchar *pin = (const uchar *)alloc->mHal.drvState.lod[2].mallocPtr;
 
     return pin[((x >> shift) * cstep) + ((y >> shift) * stride)];
 }
diff --git a/java/tests/SampleTest/src/com/android/rs/sample/SampleRSActivity.java b/java/tests/SampleTest/src/com/android/rs/sample/SampleRSActivity.java
index 77cbf84..dd4a98a 100644
--- a/java/tests/SampleTest/src/com/android/rs/sample/SampleRSActivity.java
+++ b/java/tests/SampleTest/src/com/android/rs/sample/SampleRSActivity.java
@@ -31,6 +31,7 @@
 import android.renderscript.Type;
 import android.renderscript.Type.Builder;
 import android.util.Log;
+import android.view.Surface;
 import android.view.TextureView;
 import android.view.TextureView.SurfaceTextureListener;
 import android.view.View;
@@ -52,16 +53,20 @@
         }
 
         public void onSurfaceTextureSizeChanged(SurfaceTexture surface, int width, int height) {
-            mOutPixelsAllocation.setSurfaceTexture(surface);
+            if (surface != null) {
+                mOutPixelsAllocation.setSurface(new Surface(surface));
+            }
         }
 
         public void onSurfaceTextureAvailable(SurfaceTexture surface, int width, int height) {
-            mOutPixelsAllocation.setSurfaceTexture(surface);
+            if (surface != null) {
+                mOutPixelsAllocation.setSurface(new Surface(surface));
+            }
             filterAlloc(mOutPixelsAllocation, mSampler);
         }
 
         public boolean onSurfaceTextureDestroyed(SurfaceTexture surface) {
-            mOutPixelsAllocation.setSurfaceTexture(null);
+            mOutPixelsAllocation.setSurface(null);
             return true;
         }
     }
diff --git a/rsGrallocConsumer.cpp b/rsGrallocConsumer.cpp
index c5d37b2..e016e7d 100644
--- a/rsGrallocConsumer.cpp
+++ b/rsGrallocConsumer.cpp
@@ -142,8 +142,8 @@
     //mAlloc->frameNumber = b.mFrameNumber;
 
     if (mAlloc->mHal.state.yuv) {
-        mAlloc->mHal.drvState.lod[1].mallocPtr = ycbcr.cb;
-        mAlloc->mHal.drvState.lod[2].mallocPtr = ycbcr.cr;
+        mAlloc->mHal.drvState.lod[1].mallocPtr = ycbcr.cr;
+        mAlloc->mHal.drvState.lod[2].mallocPtr = ycbcr.cb;
 
         mAlloc->mHal.drvState.lod[0].stride = ycbcr.ystride;
         mAlloc->mHal.drvState.lod[1].stride = ycbcr.cstride;
diff --git a/tests/latency/latency.cpp b/tests/latency/latency.cpp
index 5bf134d..bea9237 100644
--- a/tests/latency/latency.cpp
+++ b/tests/latency/latency.cpp
@@ -53,7 +53,11 @@
 
     sp<RS> rs = new RS();
 
-    bool r = rs->init(forceCpu, synchronous);
+    uint32_t flags = 0;
+    if (forceCpu) flags |= RS_INIT_LOW_LATENCY;
+    if (synchronous) flags |= RS_INIT_SYNCHRONOUS;
+
+    bool r = rs->init(flags);
 
     sp<const Element> e = Element::U32(rs);
 
diff --git a/tests/latency/latency.rs b/tests/latency/latency.rs
index 7ddf3fa..6fc4bfd 100644
--- a/tests/latency/latency.rs
+++ b/tests/latency/latency.rs
@@ -18,7 +18,7 @@
 #pragma rs java_package_name(com.android.rs.cpptests)
 #pragma rs_fp_relaxed
 
-void root(const uchar4 *v_in, uchar4 *v_out) {
+void root(const uint32_t *v_in, uint32_t *v_out) {
 
 }