1st cut of ForEach and test.

Change-Id: I6534569c8d26db8b9691666134a555c8bf94184e
diff --git a/RenderScript.h b/RenderScript.h
index 745df95..6636fef 100644
--- a/RenderScript.h
+++ b/RenderScript.h
@@ -255,6 +255,19 @@
     const char* objectName;
 } RsFileIndexEntry;
 
+// Script to Script: optional sub-range handed to Script::runForEach().
+typedef struct {
+    uint32_t xStart;      // first x index visited when xEnd != 0
+    uint32_t xEnd;        // one past the last x index; 0 selects the full X extent
+    uint32_t yStart;      // first y index visited when yEnd != 0
+    uint32_t yEnd;        // one past the last y index; 0 selects the full Y extent
+    uint32_t zStart;      // not read by ScriptC::runForEach in this change
+    uint32_t zEnd;        // not read by ScriptC::runForEach in this change
+    uint32_t arrayStart;  // not read by ScriptC::runForEach in this change
+    uint32_t arrayEnd;    // not read by ScriptC::runForEach in this change
+
+} RsScriptCall;
+
 #ifndef NO_RS_FUNCS
 #include "rsgApiFuncDecl.h"
 #endif
diff --git a/java/ImageProcessing/res/raw/horizontal_blur.rs b/java/ImageProcessing/res/raw/horizontal_blur.rs
new file mode 100644
index 0000000..7b0e6bc
--- /dev/null
+++ b/java/ImageProcessing/res/raw/horizontal_blur.rs
@@ -0,0 +1,43 @@
+#pragma version(1)
+
+#include "../../../../scriptc/rs_types.rsh"
+#include "../../../../scriptc/rs_math.rsh"
+
+#include "ip.rsh"
+
+uchar4 * ScratchPixel;
+
+#pragma rs export_var(ScratchPixel)
+
+void root(const void *v_in, void *v_out, const void *usrData, uint32_t x, uint32_t y) {
+    // Horizontal blur of one pixel; v_in and v_out address element (x, y) itself, not its row start.
+    uchar4 *output = (uchar4 *)v_out;
+    const uchar4 *input = (const uchar4 *)v_in;
+    const FilterStruct *fs = (const FilterStruct *)usrData;
+    float4 blurredPixel = 0;
+    float4 currentPixel = 0;
+    for(int r = -fs->radius; r <= fs->radius; r ++) {
+        // Stepping left and right away from the pixel
+        int validW = (int)x + r;
+        // Clamp to [0, width - 1]; max() isn't exposed for ints yet
+        if(validW < 0) {
+            validW = 0;
+        }
+        if(validW > fs->width - 1) {
+            validW = fs->width - 1;
+        }
+        // Reach the clamped column relative to the current pixel, since input is already at column x.
+        const uchar4 *src = input + (validW - (int)x);
+        float weight = fs->gaussian[r + fs->radius];
+        currentPixel.x = (float)(src->x);
+        currentPixel.y = (float)(src->y);
+        currentPixel.z = (float)(src->z);
+        // The w channel is not accumulated.
+
+        blurredPixel += currentPixel * weight;
+    }
+
+    output->x = (uint8_t)blurredPixel.x;
+    output->y = (uint8_t)blurredPixel.y;
+    output->z = (uint8_t)blurredPixel.z;
+}
diff --git a/java/ImageProcessing/res/raw/horizontal_blur_bc.bc b/java/ImageProcessing/res/raw/horizontal_blur_bc.bc
new file mode 100644
index 0000000..c9ba5d9
--- /dev/null
+++ b/java/ImageProcessing/res/raw/horizontal_blur_bc.bc
Binary files differ
diff --git a/java/ImageProcessing/res/raw/ip.rsh b/java/ImageProcessing/res/raw/ip.rsh
new file mode 100644
index 0000000..4073304
--- /dev/null
+++ b/java/ImageProcessing/res/raw/ip.rsh
@@ -0,0 +1,15 @@
+#pragma rs java_package_name(com.android.rs.image)
+
+#define MAX_RADIUS 25
+
+typedef struct {  // parameter block passed to the blur scripts as the rsForEach usrData pointer
+    float *gaussian; // blur weights, read at index r + radius; threshold.rs points this at a [MAX_RADIUS * 2 + 1] table
+    rs_matrix3x3 colorMat; // not read by the blur scripts in this change
+
+    int height; // image height; vertical_blur.rs clamps rows to [0, height - 1]
+    int width; // image width; used to clamp columns and as the row pitch into ScratchPixel
+    int radius; // taps run from -radius to +radius inclusive
+
+} FilterStruct;
+
+
diff --git a/java/ImageProcessing/res/raw/threshold.rs b/java/ImageProcessing/res/raw/threshold.rs
index 9585e92..ecbfac4 100644
--- a/java/ImageProcessing/res/raw/threshold.rs
+++ b/java/ImageProcessing/res/raw/threshold.rs
@@ -2,12 +2,8 @@
 
 #include "../../../../scriptc/rs_types.rsh"
 #include "../../../../scriptc/rs_math.rsh"
-#include "../../../../scriptc/rs_graphics.rsh"
 
-#pragma rs java_package_name(com.android.rs.image)
-
-
-#define MAX_RADIUS 25
+#include "ip.rsh"
 
 int height;
 int width;
@@ -28,11 +24,15 @@
 static float inWMinInB;
 static float outWMinOutB;
 static float overInWMinInB;
-//static float3 gammaV;
+static FilterStruct filterStruct;
 
-#pragma rs export_var(height, width, radius, InPixel, OutPixel, ScratchPixel, inBlack, outBlack, inWhite, outWhite, gamma, saturation, InPixel, OutPixel, ScratchPixel)
+#pragma rs export_var(height, width, radius, InPixel, OutPixel, ScratchPixel, inBlack, outBlack, inWhite, outWhite, gamma, saturation, InPixel, OutPixel, ScratchPixel, vBlurScript, hBlurScript)
 #pragma rs export_func(filter, filterBenchmark);
 
+rs_script vBlurScript;
+rs_script hBlurScript;
+
+
 // Store our coefficients here
 static float gaussian[MAX_RADIUS * 2 + 1];
 static rs_matrix3x3 colorMat;
@@ -145,48 +145,6 @@
     rsSendToClient(&count, 1, 4, 0);
 }
 
-static void horizontalBlur() {
-    float4 blurredPixel = 0;
-    float4 currentPixel = 0;
-    // Horizontal blur
-    int w, h, r;
-    for(h = 0; h < height; h ++) {
-        uchar4 *input = InPixel + h*width;
-        uchar4 *output = ScratchPixel + h*width;
-
-        for(w = 0; w < width; w ++) {
-            blurredPixel = 0;
-
-            for(r = -radius; r <= radius; r ++) {
-                // Stepping left and right away from the pixel
-                int validW = w + r;
-                // Clamp to zero and width max() isn't exposed for ints yet
-                if(validW < 0) {
-                    validW = 0;
-                }
-                if(validW > width - 1) {
-                    validW = width - 1;
-                }
-                //int validW = rsClamp(w + r, 0, width - 1);
-
-                float weight = gaussian[r + radius];
-                currentPixel.x = (float)(input[validW].x);
-                currentPixel.y = (float)(input[validW].y);
-                currentPixel.z = (float)(input[validW].z);
-                //currentPixel.w = (float)(input->a);
-
-                blurredPixel += currentPixel*weight;
-            }
-
-            output->x = (uint8_t)blurredPixel.x;
-            output->y = (uint8_t)blurredPixel.y;
-            output->z = (uint8_t)blurredPixel.z;
-            //output->a = (uint8_t)blurredPixel.w;
-            output++;
-        }
-    }
-}
-
 static void horizontalBlurLevels() {
     float4 blurredPixel = 0;
     float4 currentPixel = 0;
@@ -232,52 +190,11 @@
     }
 }
 
-static void verticalBlur() {
-    float4 blurredPixel = 0;
-    float4 currentPixel = 0;
-    // Vertical blur
-    int w, h, r;
-    for(h = 0; h < height; h ++) {
-        uchar4 *output = OutPixel + h*width;
-
-        for(w = 0; w < width; w ++) {
-
-            blurredPixel = 0;
-            for(r = -radius; r <= radius; r ++) {
-#if 1
-                int validH = h + r;
-                // Clamp to zero and width
-                if(validH < 0) {
-                    validH = 0;
-                }
-                if(validH > height - 1) {
-                    validH = height - 1;
-                }
-
-                uchar4 *input = ScratchPixel + validH*width + w;
-
-                float weight = gaussian[r + radius];
-
-                currentPixel.x = (float)(input->x);
-                currentPixel.y = (float)(input->y);
-                currentPixel.z = (float)(input->z);
-
-                blurredPixel.xyz += currentPixel.xyz * weight;
-#else
-                int validH = rsClamp(h + r, 0, height - 1);
-                uchar4 *input = ScratchPixel + validH*width + w;
-                blurredPixel.xyz += convert_float3(input->xyz) * gaussian[r + radius];
-#endif
-            }
-
-            //output->xyz = convert_uchar3(blurredPixel.xyz);
-            output->x = (uint8_t)blurredPixel.x;
-            output->y = (uint8_t)blurredPixel.y;
-            output->z = (uint8_t)blurredPixel.z;
-            //output->a = (uint8_t)blurredPixel.w;
-            output++;
-        }
-    }
+static void initStructs() {  // copy the current globals into filterStruct before it is handed to rsForEach
+    filterStruct.radius = radius;
+    filterStruct.height = height;
+    filterStruct.width = width;
+    filterStruct.gaussian = gaussian;  // shares the static weight table; no copy is made
 }
 
 void filter() {
@@ -285,6 +202,8 @@
     RS_DEBUG(width);
     RS_DEBUG(radius);
 
+    initStructs();
+
     computeColorMatrix();
 
     if(radius == 0) {
@@ -295,18 +214,30 @@
     computeGaussianWeights();
 
     horizontalBlurLevels();
-    verticalBlur();
+
+    rsForEach(vBlurScript,
+              rsGetAllocation(InPixel),
+              rsGetAllocation(OutPixel),
+              &filterStruct);
 
     int count = 0;
     rsSendToClient(&count, 1, 4, 0);
 }
 
 void filterBenchmark() {
+    initStructs();
 
     computeGaussianWeights();
 
-    horizontalBlur();
-    verticalBlur();
+    rsForEach(hBlurScript,  // horizontal pass must land in ScratchPixel, which vBlurScript reads from
+              rsGetAllocation(InPixel),
+              rsGetAllocation(ScratchPixel),
+              &filterStruct);
+
+    rsForEach(vBlurScript,
+              rsGetAllocation(InPixel),
+              rsGetAllocation(OutPixel),
+              &filterStruct);
 
     int count = 0;
     rsSendToClient(&count, 1, 4, 0);
diff --git a/java/ImageProcessing/res/raw/threshold_bc.bc b/java/ImageProcessing/res/raw/threshold_bc.bc
index 95dcd8d..8f37fdc 100644
--- a/java/ImageProcessing/res/raw/threshold_bc.bc
+++ b/java/ImageProcessing/res/raw/threshold_bc.bc
Binary files differ
diff --git a/java/ImageProcessing/res/raw/vertical_blur.rs b/java/ImageProcessing/res/raw/vertical_blur.rs
new file mode 100644
index 0000000..846f515
--- /dev/null
+++ b/java/ImageProcessing/res/raw/vertical_blur.rs
@@ -0,0 +1,51 @@
+#pragma version(1)
+
+#include "../../../../scriptc/rs_types.rsh"
+#include "../../../../scriptc/rs_math.rsh"
+
+#include "ip.rsh"
+
+uchar4 * ScratchPixel;
+
+#pragma rs export_var(ScratchPixel)
+
+void root(const void *v_in, void *v_out, const void *usrData, uint32_t x, uint32_t y) {
+    uchar4 *output = (uchar4 *)v_out;
+    // Vertical blur of one pixel. v_in is not used: the taps are read from the ScratchPixel buffer.
+    const FilterStruct *fs = (const FilterStruct *)usrData;
+
+    float4 blurredPixel = 0;
+    float4 currentPixel = 0;
+    for(int r = -fs->radius; r <= fs->radius; r ++) {
+#if 1
+        int validH = (int)y + r;
+        // Clamp to zero and height
+        if(validH < 0) {
+            validH = 0;
+        }
+        if(validH > fs->height - 1) {
+            validH = fs->height - 1;
+        }
+
+        const uchar4 *input = ScratchPixel + validH * fs->width + x;
+
+        float weight = fs->gaussian[r + fs->radius];
+
+        currentPixel.x = (float)(input->x);
+        currentPixel.y = (float)(input->y);
+        currentPixel.z = (float)(input->z);
+
+        blurredPixel.xyz += currentPixel.xyz * weight;
+#else
+        int validH = rsClamp((int)y + r, 0, fs->height - 1);
+        const uchar4 *input = ScratchPixel + validH * fs->width + x;
+        blurredPixel.xyz += convert_float3(input->xyz) * fs->gaussian[r + fs->radius];
+#endif
+    }
+
+    // Only x, y and z are written; the w channel of the output element is left untouched.
+    output->x = (uint8_t)blurredPixel.x;
+    output->y = (uint8_t)blurredPixel.y;
+    output->z = (uint8_t)blurredPixel.z;
+}
+
diff --git a/java/ImageProcessing/res/raw/vertical_blur_bc.bc b/java/ImageProcessing/res/raw/vertical_blur_bc.bc
new file mode 100644
index 0000000..af1cd8e
--- /dev/null
+++ b/java/ImageProcessing/res/raw/vertical_blur_bc.bc
Binary files differ
diff --git a/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java b/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
index 7bf6596..21c3d74 100644
--- a/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
+++ b/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
@@ -42,6 +42,8 @@
     private Bitmap mBitmapOut;
     private Bitmap mBitmapScratch;
     private ScriptC_Threshold mScript;
+    private ScriptC_Vertical_blur mScriptVBlur;
+    private ScriptC_Horizontal_blur mScriptHBlur;
     private int mRadius = 0;
     private SeekBar mRadiusSeekBar;
 
@@ -373,6 +375,12 @@
         mOutPixelsAllocation = Allocation.createBitmapRef(mRS, mBitmapOut);
         mScratchPixelsAllocation = Allocation.createBitmapRef(mRS, mBitmapScratch);
 
+        mScriptVBlur = new ScriptC_Vertical_blur(mRS, getResources(), R.raw.vertical_blur_bc, false);
+        mScriptVBlur.bind_ScratchPixel(mScratchPixelsAllocation);
+
+        mScriptHBlur = new ScriptC_Horizontal_blur(mRS, getResources(), R.raw.horizontal_blur_bc, false);
+        mScriptHBlur.bind_ScratchPixel(mScratchPixelsAllocation);
+
         mScript = new ScriptC_Threshold(mRS, getResources(), R.raw.threshold_bc, false);
         mScript.set_width(mBitmapIn.getWidth());
         mScript.set_height(mBitmapIn.getHeight());
@@ -388,6 +396,9 @@
         mScript.bind_InPixel(mInPixelsAllocation);
         mScript.bind_OutPixel(mOutPixelsAllocation);
         mScript.bind_ScratchPixel(mScratchPixelsAllocation);
+
+        mScript.set_vBlurScript(mScriptVBlur);
+        mScript.set_hBlurScript(mScriptHBlur);
     }
 
     private Bitmap loadBitmap(int resource) {
diff --git a/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java
new file mode 100644
index 0000000..8ee50a8
--- /dev/null
+++ b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image;
+
+import android.renderscript.*;
+import android.content.res.Resources;
+import android.util.Log;
+
+public class ScriptC_Horizontal_blur extends ScriptC {
+    // Constructor: forwards the context, resources, raw resource id and isRoot flag to ScriptC.
+    public  ScriptC_Horizontal_blur(RenderScript rs, Resources resources, int id, boolean isRoot) {
+        super(rs, resources, id, isRoot);
+    }
+
+    private final static int mExportVarIdx_ScratchPixel = 0; // ScratchPixel is the only name in horizontal_blur.rs's export_var list
+    private Allocation mExportVar_ScratchPixel; // last allocation passed to bind_ScratchPixel
+    public void bind_ScratchPixel(Allocation v) { // remembers v and binds it (or null) to the ScratchPixel slot
+        mExportVar_ScratchPixel = v;
+        if(v == null) bindAllocation(null, mExportVarIdx_ScratchPixel);
+        else bindAllocation(v, mExportVarIdx_ScratchPixel);
+    }
+
+    public Allocation get_ScratchPixel() { // returns the cached reference; the script is not queried
+        return mExportVar_ScratchPixel;
+    }
+
+}
+
diff --git a/java/ImageProcessing/src/com/android/rs/image/ScriptC_Threshold.java b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Threshold.java
index ea363d3..c23dca1 100644
--- a/java/ImageProcessing/src/com/android/rs/image/ScriptC_Threshold.java
+++ b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Threshold.java
@@ -161,6 +161,28 @@
         return mExportVar_saturation;
     }
 
+    private final static int mExportVarIdx_vBlurScript = 12; // presumably vBlurScript's position in threshold.rs's export_var list - confirm against the generator
+    private Script mExportVar_vBlurScript; // last script passed to set_vBlurScript
+    public void set_vBlurScript(Script v) { // remembers v and writes its ID into the script variable
+        mExportVar_vBlurScript = v;
+        setVar(mExportVarIdx_vBlurScript, (v == null) ? 0 : v.getID()); // a null script is sent as ID 0
+    }
+
+    public Script get_vBlurScript() { // returns the cached reference; the script is not queried
+        return mExportVar_vBlurScript;
+    }
+
+    private final static int mExportVarIdx_hBlurScript = 13; // presumably hBlurScript's position in threshold.rs's export_var list - confirm against the generator
+    private Script mExportVar_hBlurScript; // last script passed to set_hBlurScript
+    public void set_hBlurScript(Script v) { // remembers v and writes its ID into the script variable
+        mExportVar_hBlurScript = v;
+        setVar(mExportVarIdx_hBlurScript, (v == null) ? 0 : v.getID()); // a null script is sent as ID 0
+    }
+
+    public Script get_hBlurScript() { // returns the cached reference; the script is not queried
+        return mExportVar_hBlurScript;
+    }
+
     private final static int mExportFuncIdx_filter = 0;
     public void invoke_filter() {
         invoke(mExportFuncIdx_filter);
diff --git a/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java
new file mode 100644
index 0000000..0215f60
--- /dev/null
+++ b/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image;
+
+import android.renderscript.*;
+import android.content.res.Resources;
+import android.util.Log;
+
+public class ScriptC_Vertical_blur extends ScriptC {
+    // Constructor: forwards the context, resources, raw resource id and isRoot flag to ScriptC.
+    public  ScriptC_Vertical_blur(RenderScript rs, Resources resources, int id, boolean isRoot) {
+        super(rs, resources, id, isRoot);
+    }
+
+    private final static int mExportVarIdx_ScratchPixel = 0; // ScratchPixel is the only name in vertical_blur.rs's export_var list
+    private Allocation mExportVar_ScratchPixel; // last allocation passed to bind_ScratchPixel
+    public void bind_ScratchPixel(Allocation v) { // remembers v and binds it (or null) to the ScratchPixel slot
+        mExportVar_ScratchPixel = v;
+        if(v == null) bindAllocation(null, mExportVarIdx_ScratchPixel);
+        else bindAllocation(v, mExportVarIdx_ScratchPixel);
+    }
+
+    public Allocation get_ScratchPixel() { // returns the cached reference; the script is not queried
+        return mExportVar_ScratchPixel;
+    }
+
+}
+
diff --git a/rsScript.h b/rsScript.h
index ea6aec5..0717059 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -64,9 +64,11 @@
 
     void setVar(uint32_t slot, const void *val, uint32_t len);
 
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout) = 0;
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout, uint32_t xStart, uint32_t xEnd) = 0;
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout, uint32_t xStart, uint32_t yStart, uint32_t xEnd, uint32_t yEnd) = 0;
+    virtual void runForEach(Context *rsc,            // run the script over the elements of ain
+                            const Allocation * ain,  // input allocation
+                            Allocation * aout,       // output allocation; ScriptC accepts NULL here
+                            const void * usr,        // opaque pointer handed through to the script
+                            const RsScriptCall *sc = NULL) = 0;  // optional sub-range; omitted means no restriction
 
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, uint32_t len) = 0;
     virtual void setupScript(Context *rsc) = 0;
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index 975b704..b87ac28 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -136,55 +136,95 @@
     return ret;
 }
 
-void ScriptC::runForEach(Context *rsc, const Allocation *ain, Allocation *aout,
-                         uint32_t xStart, uint32_t yStart, uint32_t xEnd, uint32_t yEnd)
-{
-    LOGE("ScriptC::runForEach not implemented");
-}
 
-void ScriptC::runForEach(Context *rsc, const Allocation *ain, Allocation *aout, uint32_t xStart, uint32_t xEnd)
+void ScriptC::runForEach(Context *rsc,             // calls the script's root() once per selected element
+                         const Allocation * ain,    // input; its Type supplies the iteration extents
+                         Allocation * aout,         // output; may be NULL
+                         const void * usr,          // forwarded untouched to root()
+                         const RsScriptCall *sc)    // NULL, or an End field of 0, selects a whole dimension
 {
     uint32_t dimX = ain->getType()->getDimX();
-    rsAssert(xStart < dimX);
-    rsAssert(xEnd <= dimX);
-    rsAssert(ain->getType()->getDimY() == 0);
-    rsAssert(ain->getType()->getDimZ() == 0);
+    uint32_t dimY = ain->getType()->getDimY();
+    uint32_t dimZ = ain->getType()->getDimZ();
+    uint32_t dimA = 0;//ain->getType()->getDimArray();
 
-    if (xStart >= dimX) xStart = dimX - 1;
-    if (xEnd >= dimX) xEnd = dimX - 1;
-    if (xStart > xEnd) return;
+    uint32_t xStart = 0;
+    uint32_t xEnd = 0;
+    uint32_t yStart = 0;
+    uint32_t yEnd = 0;
+    uint32_t zStart = 0;
+    uint32_t zEnd = 0;
+    uint32_t arrayStart = 0;
+    uint32_t arrayEnd = 0;
+    // Validate the caller's request (sc->...), then clamp it to the allocation's extent.
+    if (!sc || (sc->xEnd == 0)) {
+        xStart = 0;
+        xEnd = ain->getType()->getDimX();
+    } else {
+        rsAssert(sc->xStart < dimX);
+        rsAssert(sc->xEnd <= dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        xStart = rsMin(dimX, sc->xStart);
+        xEnd = rsMin(dimX, sc->xEnd);
+        if (xStart >= xEnd) return;
+    }
+
+    if (!sc || (sc->yEnd == 0)) {
+        yStart = 0;
+        yEnd = ain->getType()->getDimY();
+    } else {
+        rsAssert(sc->yStart < dimY);
+        rsAssert(sc->yEnd <= dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        yStart = rsMin(dimY, sc->yStart);
+        yEnd = rsMin(dimY, sc->yEnd);
+        if (yStart >= yEnd) return;
+    }
+    // A dimension whose End is still 0 must run its loop exactly once.
+    xEnd = rsMax((uint32_t)1, xEnd);
+    yEnd = rsMax((uint32_t)1, yEnd);
+    zEnd = rsMax((uint32_t)1, zEnd);
+    arrayEnd = rsMax((uint32_t)1, arrayEnd);
+
+    rsAssert(ain->getType()->getDimZ() == 0);
 
     setupScript(rsc);
     Script * oldTLS = setTLS(this);
 
-    typedef int (*rs_t)(const void *, void *, uint32_t);
+    typedef int (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+
     const uint8_t *ptrIn = (const uint8_t *)ain->getPtr();
-    uint32_t strideIn = ain->getType()->getElementSizeBytes();
+    uint32_t eStrideIn = ain->getType()->getElementSizeBytes();
 
     uint8_t *ptrOut = NULL;
-    uint32_t strideOut = 0;
+    uint32_t eStrideOut = 0;
     if (aout) {
         ptrOut = (uint8_t *)aout->getPtr();
-        strideOut = aout->getType()->getElementSizeBytes();
+        eStrideOut = aout->getType()->getElementSizeBytes();
     }
 
-    for (uint32_t ct=xStart; ct < xEnd; ct++) {
-        ((rs_t)mProgram.mRoot) (ptrIn + (strideIn * ct), ptrOut + (strideOut * ct), ct);
+    for (uint32_t ar = arrayStart; ar < arrayEnd; ar++) {
+        for (uint32_t z = zStart; z < zEnd; z++) {
+            for (uint32_t y = yStart; y < yEnd; y++) {
+                uint32_t offset = dimX * dimY * dimZ * ar +
+                                  dimX * dimY * z +
+                                  dimX * y + xStart;
+                uint8_t *xPtrOut = ptrOut + (eStrideOut * offset);
+                const uint8_t *xPtrIn = ptrIn + (eStrideIn * offset);
+                // offset includes xStart so the pointers address the same element as the x passed below.
+                for (uint32_t x = xStart; x < xEnd; x++) {
+                    ((rs_t)mProgram.mRoot) (xPtrIn, xPtrOut, usr, x, y, z, ar);
+                    xPtrIn += eStrideIn;
+                    xPtrOut += eStrideOut;
+                }
+            }
+        }
+
     }
 
     setTLS(oldTLS);
 }
 
-void ScriptC::runForEach(Context *rsc, const Allocation *ain, Allocation *aout)
-{
-    if (ain->getType()->getDimY()) {
-        runForEach(rsc, ain, aout, 0, 0, 0xffffffff, 0xffffffff);
-    } else {
-        runForEach(rsc, ain, aout, 0, 0xffffffff);
-    }
-}
-
-
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, uint32_t len)
 {
     //LOGE("rsi_ScriptInvoke %i", slot);
@@ -246,7 +286,7 @@
 
 void ScriptCState::runCompiler(Context *rsc, ScriptC *s)
 {
-    LOGE("ScriptCState::runCompiler ");
+    LOGV("ScriptCState::runCompiler ");
 
     s->mBccScript = bccCreateScript();
     bccScriptBitcode(s->mBccScript, s->mEnviroment.mScriptText, s->mEnviroment.mScriptTextLength);
@@ -254,7 +294,7 @@
     bccCompileScript(s->mBccScript);
     bccGetScriptLabel(s->mBccScript, "root", (BCCvoid**) &s->mProgram.mRoot);
     bccGetScriptLabel(s->mBccScript, "init", (BCCvoid**) &s->mProgram.mInit);
-    LOGE("root %p,  init %p", s->mProgram.mRoot, s->mProgram.mInit);
+    LOGV("root %p,  init %p", s->mProgram.mRoot, s->mProgram.mInit);
 
     if (s->mProgram.mInit) {
         s->mProgram.mInit();
@@ -268,24 +308,9 @@
         bccGetExportFuncs(s->mBccScript, NULL, s->mEnviroment.mInvokeFunctionCount, (BCCvoid **) s->mEnviroment.mInvokeFunctions);
     }
 
-//    s->mEnviroment.mInvokeFunctions = (Script::InvokeFunc_t *)calloc(100, sizeof(void *));
-//    BCCchar **labels = new char*[100];
-//    bccGetFunctions(s->mBccScript, (BCCsizei *)&s->mEnviroment.mInvokeFunctionCount,
-//                    100, (BCCchar **)labels);
-    //LOGE("func count %i", s->mEnviroment.mInvokeFunctionCount);
-//    for (uint32_t i=0; i < s->mEnviroment.mInvokeFunctionCount; i++) {
-//        BCCsizei length;
-//        bccGetFunctionBinary(s->mBccScript, labels[i], (BCCvoid **)&(s->mEnviroment.mInvokeFunctions[i]), &length);
-        //LOGE("func %i %p", i, s->mEnviroment.mInvokeFunctions[i]);
-  //  }
-
     s->mEnviroment.mFieldAddress = (void **)calloc(100, sizeof(void *));
     bccGetExportVars(s->mBccScript, (BCCsizei *)&s->mEnviroment.mFieldCount,
                      100, s->mEnviroment.mFieldAddress);
-    //LOGE("var count %i", s->mEnviroment.mFieldCount);
-    for (uint32_t i=0; i < s->mEnviroment.mFieldCount; i++) {
-        //LOGE("var %i %p", i, s->mEnviroment.mFieldAddress[i]);
-    }
 
     s->mEnviroment.mFragment.set(rsc->getDefaultProgramFragment());
     s->mEnviroment.mVertex.set(rsc->getDefaultProgramVertex());
diff --git a/rsScriptC.h b/rsScriptC.h
index 50e8a4c..9d09b0b 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -57,9 +57,11 @@
 
     virtual uint32_t run(Context *);
 
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout);
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout, uint32_t xStart, uint32_t xEnd);
-    virtual void runForEach(Context *rsc, const Allocation *ain, Allocation *aout, uint32_t xStart, uint32_t yStart, uint32_t xEnd, uint32_t yEnd);
+    virtual void runForEach(Context *rsc,            // calls the compiled root() once per selected element of ain
+                            const Allocation * ain,  // input; its Type supplies the iteration extents
+                            Allocation * aout,       // output; may be NULL
+                            const void * usr,        // forwarded untouched to root()
+                            const RsScriptCall *sc = NULL);  // NULL, or an End field of 0, selects a whole dimension
 
     virtual void serialize(OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index ea01134..8d9ca9f 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -376,58 +376,31 @@
 }
 
 
-void SC_ForEachii(RsScript vs, RsAllocation vin)
+void SC_ForEach(RsScript vs,        // runtime side of the script-callable 4-argument rsForEach()
+                RsAllocation vin,   // input allocation handle
+                RsAllocation vout,  // output allocation handle
+                const void *usr)    // opaque pointer passed on to the target script
 {
     GET_TLS();
-    Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
-    s->runForEach(rsc, ain, NULL);
-}
-
-void SC_ForEachiii(RsScript vs, RsAllocation vin, RsAllocation vout)
-{
-    GET_TLS();
-    Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
+    const Allocation *ain = static_cast<const Allocation *>(vin);
     Allocation *aout = static_cast<Allocation *>(vout);
-    s->runForEach(rsc, ain, aout);
+    Script *s = static_cast<Script *>(vs);
+    s->runForEach(rsc, ain, aout, usr);  // no RsScriptCall given, so sc takes its NULL default
 }
 
-void SC_ForEachiiii(RsScript vs, RsAllocation vin, int xStart, int xEnd)
+void SC_ForEach2(RsScript vs,               // like SC_ForEach, plus an explicit sub-range
+                RsAllocation vin,           // input allocation handle
+                RsAllocation vout,          // output allocation handle
+                const void *usr,            // opaque pointer passed on to the target script
+                const RsScriptCall *call)   // sub-range forwarded to runForEach
 {
     GET_TLS();
-    Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
-    s->runForEach(rsc, ain, NULL, xStart, xEnd);
-}
-
-void SC_ForEachiiiii(RsScript vs, RsAllocation vin, RsAllocation vout, int xStart, int xEnd)
-{
-    GET_TLS();
-    Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
+    const Allocation *ain = static_cast<const Allocation *>(vin);
     Allocation *aout = static_cast<Allocation *>(vout);
-    s->runForEach(rsc, ain, aout, xStart, xEnd);
-}
-
-void SC_ForEachiiiiii(RsScript vs, RsAllocation vin, int xStart, int yStart, int xEnd, int yEnd)
-{
-    GET_TLS();
     Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
-    s->runForEach(rsc, ain, NULL, xStart, yStart, xEnd, yEnd);
+    s->runForEach(rsc, ain, aout, usr, call);  // NOTE(review): the symbol-table entry for this function is commented out in this change
 }
 
-void SC_ForEachiiiiiii(RsScript vs, RsAllocation vin, RsAllocation vout, int xStart, int yStart, int xEnd, int yEnd)
-{
-    GET_TLS();
-    Script *s = static_cast<Script *>(vs);
-    Allocation *ain = static_cast<Allocation *>(vin);
-    Allocation *aout = static_cast<Allocation *>(vout);
-    s->runForEach(rsc, ain, aout, xStart, yStart, xEnd, yEnd);
-}
-
-
 //////////////////////////////////////////////////////////////////////////////
 // Class implementation
 //////////////////////////////////////////////////////////////////////////////
@@ -502,12 +475,8 @@
     { "rsMatrixScale", (void *)&SC_matrixScale },
     { "rsMatrixTranslate", (void *)&SC_matrixTranslate },
 
-    { "_Z9rsForEachii", (void *)&SC_ForEachii },
-    { "_Z9rsForEachiii", (void *)&SC_ForEachiii },
-    { "_Z9rsForEachiiii", (void *)&SC_ForEachiiii },
-    { "_Z9rsForEachiiiii", (void *)&SC_ForEachiiiii },
-    { "_Z9rsForEachiiiiii", (void *)&SC_ForEachiiiiii },
-    { "_Z9rsForEachiiiiiii", (void *)&SC_ForEachiiiiiii },
+    { "_Z9rsForEach9rs_script13rs_allocationS0_PKv", (void *)&SC_ForEach },
+    //{ "_Z9rsForEach9rs_script13rs_allocationS0_PKv", (void *)&SC_ForEach2 },
 
 ////////////////////////////////////////////////////////////////////
 
diff --git a/scriptc/rs_math.rsh b/scriptc/rs_math.rsh
index 4390a5d..e11c832 100644
--- a/scriptc/rs_math.rsh
+++ b/scriptc/rs_math.rsh
@@ -48,11 +48,7 @@
 extern int rsSendToClient(void *data, int cmdID, int len, int waitForSpace);
 
 // Script to Script
-typedef struct rs_script_call_rec {
-    rs_script script;
-    rs_allocation input;
-    rs_allocation output;
-
+typedef struct rs_script_call {
     uint32_t xStart;
     uint32_t xEnd;
     uint32_t yStart;
@@ -62,14 +58,17 @@
     uint32_t arrayStart;
     uint32_t arrayEnd;
 
-    const void * usrData;
-} rs_script_call;
+} rs_script_call_t;
 
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input);
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input, rs_allocation output);
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input, int xStart, int xEnd);
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input, rs_allocation output, int xStart, int xEnd);
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input, int xStart, int yStart, int xEnd, int yEnd);
-extern void __attribute__((overloadable))rsForEach(rs_script, rs_allocation input, rs_allocation output, int xStart, int yStart, int xEnd, int yEnd);
+extern void __attribute__((overloadable))rsForEach(rs_script script,      // script whose root() is run
+                                                   rs_allocation input,   // allocation iterated over
+                                                   rs_allocation output,  // allocation receiving results
+                                                   const void * usrData); // opaque pointer passed to root() as usrData
+
+extern void __attribute__((overloadable))rsForEach(rs_script script,      // script whose root() is run
+                                                   rs_allocation input,   // allocation iterated over
+                                                   rs_allocation output,  // allocation receiving results
+                                                   const void * usrData,  // opaque pointer passed to root() as usrData
+                                                   const rs_script_call_t *); // sub-range; NOTE(review): no runtime symbol is registered for this overload in this change
 
 #endif