Cleanup of ForEachParams in cpu ref

Change-Id: I8cc51915b2a605c240d98e3010619b741a13bae2
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 738eb84..40f4745 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -164,7 +164,7 @@
 
     // fast path for very small launches
     MTLaunchStruct *mtls = (MTLaunchStruct *)data;
-    if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+    if (mtls && mtls->fep.dim.y <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
         if (mWorkers.mLaunchCallback) {
             mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
         }
@@ -344,106 +344,100 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-typedef void (*walk_loop_t)(MTLaunchStruct*,
-                            RsExpandKernelParams&,
+typedef void (*walk_loop_t)(const MTLaunchStruct*,
+                            RsExpandKernelDriverInfo,
                             outer_foreach_t);
 
+static void kparamSetup(RsExpandKernelParams *kparams, const RsExpandKernelDriverInfo *fep) {
+    //ALOGE("kp  usr %p", fep->usr);
+    //ALOGE("kp  slot %i", fep->slot);
+    //ALOGE("kp  dim %i %i %i", fep->dim.x, fep->dim.y, fep->dim.z);
+    //ALOGE("kp  lid %i", fep->lid);
+    //ALOGE("kp  in[0] stride %i  ptr %p", fep->inStride[0], fep->inPtr[0]);
+    //ALOGE("kp  out[0] ptr %p", fep->outPtr[0]);
+    //ALOGE("kp  loc %i %i %i", fep->current.x, fep->current.y, fep->current.z);
 
-static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
-    uint32_t inLen = mtls->fep.inLen;
-
-    RsExpandKernelParams kparams;
-    kparams.takeFields(mtls->fep);
-
-    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
-    kparams.lid = idx;
-
-    if (inLen > 0) {
-        // Allocate space for our input base pointers.
-        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
-        // Allocate space for our input stride information.
-        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
-        // Fill our stride information.
-        for (int inIndex = inLen; --inIndex >= 0;) {
-          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
-        }
-    }
-
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-
-    walk_loop(mtls, kparams, fn);
+    kparams->usr  = fep->usr;
+    kparams->slot = fep->slot;
+    kparams->dimX = fep->dim.x;
+    kparams->dimY = fep->dim.y;
+    kparams->dimZ = fep->dim.z;
+    kparams->lid = fep->lid;
+    kparams->inEStrides = (uint32_t *)&fep->inStride[0];
+    kparams->ins = (const void **)&fep->inPtr[0];
+    kparams->out = fep->outPtr[0];
+    kparams->y = fep->current.y;
+    kparams->z = fep->current.z;
 }
 
+static inline void fepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+                               uint32_t x, uint32_t y,
+                               uint32_t z = 0, uint32_t lod = 0,
+                               RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
+                               uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
+
+    for (uint32_t i = 0; i < fep->inLen; i++) {
+        fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+    }
+
+    if (mtls->aout[0] != nullptr) {
+        fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
+    }
+}
+
+
 static void walk_2d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-            uint32_t yEnd   = yStart + mtls->mSliceSize;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-            yEnd = rsMin(yEnd, mtls->yEnd);
+        yEnd = rsMin(yEnd, mtls->yEnd);
 
-            if (yEnd <= yStart) {
-                return;
-            }
-
-            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
-                kparams.out = mtls->fep.outPtr +
-                              (mtls->fep.outStride.yStride * kparams.y) +
-                              (mtls->fep.outStride.eStride * mtls->xStart);
-
-                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                    StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                    kparams.ins[inIndex] =
-                      mtls->fep.inPtrs[inIndex] +
-                      (strides.yStride * kparams.y) +
-                      (strides.eStride * mtls->xStart);
-                }
-
-                fn(&kparams, mtls->xStart, mtls->xEnd,
-                   mtls->fep.outStride.eStride);
-            }
+        if (yEnd <= yStart) {
+            return;
         }
-    });
+
+        for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
+            fepPtrSetup(mtls, &fep, mtls->xStart, fep.current.y);
+
+            RsExpandKernelParams kparams;
+            kparamSetup(&kparams, &fep);
+
+            outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+            fn(&kparams, mtls->xStart, mtls->xEnd, fep.outStride[0]);
+        }
+    }
 }
 
 static void walk_1d(void *usr, uint32_t idx) {
-    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
-                              RsExpandKernelParams &kparams,
-                              outer_foreach_t fn) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
 
-        while (1) {
-            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-            uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-            uint32_t xEnd   = xStart + mtls->mSliceSize;
+    while (1) {
+        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xEnd   = xStart + mtls->mSliceSize;
 
-            xEnd = rsMin(xEnd, mtls->xEnd);
+        xEnd = rsMin(xEnd, mtls->xEnd);
 
-            if (xEnd <= xStart) {
-                return;
-            }
-
-            kparams.out = mtls->fep.outPtr +
-                          (mtls->fep.outStride.eStride * xStart);
-
-            for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
-                StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                kparams.ins[inIndex] =
-                  mtls->fep.inPtrs[inIndex] + (strides.eStride * xStart);
-            }
-
-            fn(&kparams, xStart, xEnd, mtls->fep.outStride.eStride);
+        if (xEnd <= xStart) {
+            return;
         }
-    });
+
+        fepPtrSetup(mtls, &fep, xStart, 0);
+
+        RsExpandKernelParams kparams;
+        kparamSetup(&kparams, &fep);
+
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        fn(&kparams, xStart, xEnd, fep.outStride[0]);
+    }
 }
 
 
@@ -459,17 +453,17 @@
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
 
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+        if (mtls->fep.dim.y > 1) {
+            uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.yStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.yStride;
+            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
+                s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
             } else {
                 // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
+                s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -479,16 +473,16 @@
 
             launchThreads(walk_2d, mtls);
         } else {
-            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.outStride.eStride) {
-                s2 = targetByteChunk / mtls->fep.outStride.eStride;
+            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
+                s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
             } else {
                 // We know that there is either an output or an input.
-                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
+                s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -501,53 +495,21 @@
         mInForEach = false;
 
     } else {
-        RsExpandKernelParams kparams;
-        kparams.takeFields(mtls->fep);
-
-        if (inLen > 0) {
-            // Allocate space for our input base pointers.
-            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
-
-            // Allocate space for our input stride information.
-            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
-
-            // Fill our stride information.
-            for (int inIndex = inLen; --inIndex >= 0;) {
-                kparams.inEStrides[inIndex] =
-                    mtls->fep.inStrides[inIndex].eStride;
-            }
-        }
-
-        //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
         for (uint32_t arrayIndex = mtls->arrayStart;
              arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
-                 kparams.z++) {
+            for (mtls->fep.current.z = mtls->zStart; mtls->fep.current.z < mtls->zEnd;
+                 mtls->fep.current.z++) {
 
-                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
-                     kparams.y++) {
+                for (mtls->fep.current.y = mtls->yStart; mtls->fep.current.y < mtls->yEnd;
+                     mtls->fep.current.y++) {
 
-                    uint32_t offset =
-                      mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
-                      mtls->fep.dimY * kparams.z + kparams.y;
+                    fepPtrSetup(mtls, &mtls->fep, mtls->xStart, mtls->fep.current.y, mtls->fep.current.z);
 
-                    kparams.out = mtls->fep.outPtr +
-                                  (mtls->fep.outStride.yStride * offset) +
-                                  (mtls->fep.outStride.eStride * mtls->xStart);
-
-                    for (int inIndex = inLen; --inIndex >= 0;) {
-                        StridePair &strides = mtls->fep.inStrides[inIndex];
-
-                        kparams.ins[inIndex] =
-                          mtls->fep.inPtrs[inIndex] +
-                          (strides.yStride * offset) +
-                          (strides.eStride * mtls->xStart);
-                    }
-
-                    fn(&kparams, mtls->xStart, mtls->xEnd,
-                       mtls->fep.outStride.eStride);
+                    RsExpandKernelParams kparams;
+                    kparamSetup(&kparams, &mtls->fep);
+                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.outStride[0]);
                 }
             }
         }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 8060826..a42cef7 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,7 +25,7 @@
 
 #include <string>
 
-#define RS_KERNEL_INPUT_THRESHOLD 32
+#define RS_KERNEL_INPUT_LIMIT 8
 
 namespace bcc {
     class BCCContext;
@@ -41,39 +41,42 @@
   uint32_t yStride;
 };
 
+struct RsLaunchDimensions {
+    uint32_t x;
+    uint32_t y;
+    uint32_t z;
+    uint32_t lod;
+    uint32_t faces;
+    uint32_t array[4 /*make a define*/];
+};
+
 struct RsExpandKernelDriverInfo {
-    const uint8_t **inPtrs;
+    // Warning: This structure is shared with the compiler
+    // Any change to the fields here requires a matching compiler change
+
+    const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
+    uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
     uint32_t inLen;
 
-    uint8_t *outPtr;
+    uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
+    uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
+    uint32_t outLen;
 
-    StridePair *inStrides;
-    StridePair  outStride;
+    // Dimension of the launch
+    RsLaunchDimensions dim;
 
-    uint32_t dimX;
-    uint32_t dimY;
-    uint32_t dimZ;
-
-    uint32_t slot;
+    // The walking iterator of the launch
+    RsLaunchDimensions current;
 
     const void *usr;
     uint32_t usrLen;
 
-    bool heapAllocatedArrays;
 
-    RsExpandKernelDriverInfo() : heapAllocatedArrays(false) {}
 
-    ~RsExpandKernelDriverInfo() {
-        if (heapAllocatedArrays) {
-            if (inPtrs != nullptr) {
-                delete[] inPtrs;
-            }
+    // Items below this line are not used by the compiler and can be changed in the driver
+    uint32_t lid;
+    uint32_t slot;
 
-            if (inStrides != nullptr) {
-                delete[] inStrides;
-            }
-        }
-    }
 };
 
 struct RsExpandKernelParams {
@@ -99,16 +102,6 @@
      *        modify blur to not need it.
      */
     uint32_t slot;
-
-    /// Copy fields needed by a kernel from a driver struct.
-    void takeFields(const RsExpandKernelDriverInfo &dstruct) {
-        this->usr  = dstruct.usr;
-        this->slot = dstruct.slot;
-
-        this->dimX = dstruct.dimX;
-        this->dimY = dstruct.dimY;
-        this->dimZ = dstruct.dimZ;
-    }
 };
 
 extern bool gArchUseSIMD;
@@ -134,13 +127,17 @@
 
     ForEachFunc_t kernel;
     uint32_t sig;
-    const Allocation ** ains;
-    Allocation * aout;
+    const Allocation * ains[RS_KERNEL_INPUT_LIMIT];
+    Allocation * aout[RS_KERNEL_INPUT_LIMIT];
 
     uint32_t mSliceSize;
     volatile int mSliceNum;
     bool isThreadable;
 
+    // origin of the launch
+    RsLaunchDimensions origin;
+
+    // TODO: convert to RsLaunchDimensions
     uint32_t xStart;
     uint32_t xEnd;
     uint32_t yStart;
@@ -149,9 +146,6 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
-
-    const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
-    StridePair     inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
 };
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index cfb6da2..f312866 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -1013,9 +1013,9 @@
         const Allocation *ain0   = ains[0];
         const Type       *inType = ain0->getType();
 
-        mtls->fep.dimX = inType->getDimX();
-        mtls->fep.dimY = inType->getDimY();
-        mtls->fep.dimZ = inType->getDimZ();
+        mtls->fep.dim.x = inType->getDimX();
+        mtls->fep.dim.y = inType->getDimY();
+        mtls->fep.dim.z = inType->getDimZ();
 
         for (int Index = inLen; --Index >= 1;) {
             if (!ain0->hasSameDims(ains[Index])) {
@@ -1029,9 +1029,9 @@
     } else if (aout != nullptr) {
         const Type *outType = aout->getType();
 
-        mtls->fep.dimX = outType->getDimX();
-        mtls->fep.dimY = outType->getDimY();
-        mtls->fep.dimZ = outType->getDimZ();
+        mtls->fep.dim.x = outType->getDimX();
+        mtls->fep.dim.y = outType->getDimY();
+        mtls->fep.dim.z = outType->getDimZ();
 
     } else {
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
@@ -1049,35 +1049,35 @@
     }
 
     if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dimX;
+        mtls->xEnd = mtls->fep.dim.x;
     } else {
-        rsAssert(sc->xStart < mtls->fep.dimX);
-        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < mtls->fep.dim.x);
+        rsAssert(sc->xEnd <= mtls->fep.dim.x);
         rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dim.x, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dim.x, sc->xEnd);
         if (mtls->xStart >= mtls->xEnd) return;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dimY;
+        mtls->yEnd = mtls->fep.dim.y;
     } else {
-        rsAssert(sc->yStart < mtls->fep.dimY);
-        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < mtls->fep.dim.y);
+        rsAssert(sc->yEnd <= mtls->fep.dim.y);
         rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dim.y, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dim.y, sc->yEnd);
         if (mtls->yStart >= mtls->yEnd) return;
     }
 
     if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dimZ;
+        mtls->zEnd = mtls->fep.dim.z;
     } else {
-        rsAssert(sc->zStart < mtls->fep.dimZ);
-        rsAssert(sc->zEnd <= mtls->fep.dimZ);
+        rsAssert(sc->zStart < mtls->fep.dim.z);
+        rsAssert(sc->zEnd <= mtls->fep.dim.z);
         rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+        mtls->zStart = rsMin(mtls->fep.dim.z, sc->zStart);
+        mtls->zEnd = rsMin(mtls->fep.dim.z, sc->zEnd);
         if (mtls->zStart >= mtls->zEnd) return;
     }
 
@@ -1089,52 +1089,28 @@
     rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
 
     mtls->rsc        = mCtx;
-    mtls->ains       = ains;
-    mtls->aout       = aout;
+    if (ains) {
+        memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
+    }
+    mtls->aout[0]    = aout;
     mtls->fep.usr    = usr;
     mtls->fep.usrLen = usrLen;
     mtls->mSliceSize = 1;
     mtls->mSliceNum  = 0;
 
-    mtls->fep.inPtrs    = nullptr;
-    mtls->fep.inStrides = nullptr;
     mtls->isThreadable  = mIsThreadable;
 
     if (inLen > 0) {
-
-        if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
-            mtls->fep.inPtrs    = (const uint8_t**)mtls->inPtrsBuff;
-            mtls->fep.inStrides = mtls->inStridesBuff;
-        } else {
-            mtls->fep.heapAllocatedArrays = true;
-
-            mtls->fep.inPtrs    = new const uint8_t*[inLen];
-            mtls->fep.inStrides = new StridePair[inLen];
-        }
-
         mtls->fep.inLen = inLen;
-
         for (int index = inLen; --index >= 0;) {
-            const Allocation *ain = ains[index];
-
-            mtls->fep.inPtrs[index] =
-              (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
-
-            mtls->fep.inStrides[index].eStride =
-              ain->getType()->getElementSizeBytes();
-            mtls->fep.inStrides[index].yStride =
-              ain->mHal.drvState.lod[0].stride;
+            mtls->fep.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
+            mtls->fep.inStride[index] = ains[index]->getType()->getElementSizeBytes();
         }
     }
 
-    mtls->fep.outPtr            = nullptr;
-    mtls->fep.outStride.eStride = 0;
-    mtls->fep.outStride.yStride = 0;
     if (aout != nullptr) {
-        mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-
-        mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
-        mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
+        mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
     }
 }