Add a basic implementation of the reduce kernel API to the CPU
reference implementation.

Bug: 22631253

For now, this just runs a serial reduction on one thread.

Change-Id: I34c96d24bb6f44274de72bb53160abcf79d143b0
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 1636369..4cb3f9f 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -93,7 +93,7 @@
                                           uint32_t usrLen,
                                           const RsScriptCall *sc) {
 
-    MTLaunchStruct mtls;
+    MTLaunchStructForEach mtls;
 
     preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 
@@ -101,21 +101,21 @@
         mtls.script = this;
         mtls.fep.slot = slot;
 
-        mtls.kernel = (void (*)())mRootPtr;
+        mtls.kernel = mRootPtr;
         mtls.fep.usr = this;
 
         RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-        mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+        mCtx->launchForEach(ains, inLen, aout, sc, &mtls);
         mCtx->setTLS(oldTLS);
     }
 
     postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
-void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
 
     mtls->script = this;
     mtls->fep.slot = slot;
-    mtls->kernel = (void (*)())mRootPtr;
+    mtls->kernel = mRootPtr;
     mtls->fep.usr = this;
 }