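Enable synchronous mode (functional)

Add a fast path to RsdCpuReferenceImpl::launchThreads for very small
launches: when dimY <= 1 and the X range is no larger than mSliceSize,
invoke the launch callback directly on the calling thread and return
without signalling the worker threads.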

Change-Id: I613610013e7e4d1623620ab94d2d25d8a1bd82b3
Bug: 5972398
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 5ea28d4..e22b730 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -132,6 +132,16 @@
 void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
     mWorkers.mLaunchData = data;
     mWorkers.mLaunchCallback = cbk;
+
+    // fast path for very small launches
+    MTLaunchStruct *mtls = (MTLaunchStruct *)data;
+    if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+        if (mWorkers.mLaunchCallback) {
+            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+        }
+        return;
+    }
+
     android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
         mWorkers.mLaunchSignals[ct].set();
@@ -140,7 +150,7 @@
     // We use the calling thread as one of the workers so we can start without
     // the delay of the thread wakeup.
     if (mWorkers.mLaunchCallback) {
-       mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
     }
 
     while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {