blob: 43e4521873671d8e9f7bb94e69a3fc8bcd8fc93e [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuCore.h"
18#include "rsCpuScript.h"
19#include "rsCpuScriptGroup.h"
Yang Ni1ffd86b2015-01-07 09:16:40 -080020#include "rsCpuScriptGroup2.h"
Jason Sams709a0972012-11-15 18:18:04 -080021
22#include <malloc.h>
23#include "rsContext.h"
24
25#include <sys/types.h>
26#include <sys/resource.h>
27#include <sched.h>
Jason Sams709a0972012-11-15 18:18:04 -080028#include <sys/syscall.h>
Matt Wala11fd9ec2015-07-10 16:40:12 -070029#include <stdio.h>
Jason Sams709a0972012-11-15 18:18:04 -080030#include <string.h>
Stephen Hinesb0934b62013-07-03 17:27:38 -070031#include <unistd.h>
Tim Murray0b575de2013-03-15 15:56:43 -070032
// Emit a verbose log line only when the launch structure's reduce logging
// level (mtls->logReduce) is at least `level`. The do { } while(0) wrapper
// lets the macro be used as a single statement.
#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
David Gross35dbc8c2016-03-29 13:48:41 -070034
// pthread TLS key under which each thread stores its ScriptTLSStruct pointer.
static pthread_key_t gThreadTLSKey = 0;
// Count adjusted under gInitMutex by init() (incremented) and by the
// RsdCpuReferenceImpl destructor (decremented). The key is created when init()
// sees a count of zero and deleted when the destructor brings it back to zero.
static uint32_t gThreadTLSKeyCount = 0;
// Mutex taken by lockMutex()/unlockMutex(); guards the key and its count.
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
38
Chih-Hung Hsieh462de212016-11-16 11:33:57 -080039namespace android {
40namespace renderscript {
41
// Set by GetCpuInfo() when /proc/cpuinfo lists the SIMD feature flag that the
// build's architecture macros select (" neon"/" asimd" or " ssse3").
bool gArchUseSIMD = false;
Jason Samsf5ef8df2013-08-06 13:49:25 -070043
Jason Sams709a0972012-11-15 18:18:04 -080044RsdCpuReference::~RsdCpuReference() {
45}
46
47RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
Jason Samscadfac42013-03-06 18:09:08 -080048 uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn
David Grossb043df02015-05-29 11:38:15 -070049 , RSSelectRTCallback pSelectRTCallback,
Stephen Hines00511322014-01-31 11:20:23 -080050 const char *pBccPluginName
Jason Samscadfac42013-03-06 18:09:08 -080051 ) {
Jason Sams709a0972012-11-15 18:18:04 -080052
53 RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
54 if (!cpu) {
Chris Wailes44bef6f2014-08-12 13:51:10 -070055 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -080056 }
57 if (!cpu->init(version_major, version_minor, lfn, slfn)) {
58 delete cpu;
Chris Wailes44bef6f2014-08-12 13:51:10 -070059 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -080060 }
Stephen Hinesf218bf12013-02-12 19:32:38 -080061
Stephen Hines1d476622013-03-29 22:08:49 -070062 cpu->setSelectRTCallback(pSelectRTCallback);
Stephen Hines00511322014-01-31 11:20:23 -080063 if (pBccPluginName) {
64 cpu->setBccPluginName(pBccPluginName);
65 }
Stephen Hinesf218bf12013-02-12 19:32:38 -080066
Jason Sams709a0972012-11-15 18:18:04 -080067 return cpu;
68}
69
70
71Context * RsdCpuReference::getTlsContext() {
72 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
73 return tls->mContext;
74}
75
76const Script * RsdCpuReference::getTlsScript() {
77 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
78 return tls->mScript;
79}
80
Stephen Hinesf218bf12013-02-12 19:32:38 -080081pthread_key_t RsdCpuReference::getThreadTLSKey(){ return gThreadTLSKey; }
Jason Sams709a0972012-11-15 18:18:04 -080082
83////////////////////////////////////////////////////////////
84///
85
86RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
87 mRSC = rsc;
88
89 version_major = 0;
90 version_minor = 0;
David Gross35dbc8c2016-03-29 13:48:41 -070091 mInKernel = false;
Jason Sams709a0972012-11-15 18:18:04 -080092 memset(&mWorkers, 0, sizeof(mWorkers));
93 memset(&mTlsStruct, 0, sizeof(mTlsStruct));
94 mExit = false;
Chris Wailes44bef6f2014-08-12 13:51:10 -070095 mSelectRTCallback = nullptr;
Stephen Hines8409d642015-04-28 18:49:56 -070096 mEmbedGlobalInfo = true;
97 mEmbedGlobalInfoSkipConstant = true;
Jason Sams709a0972012-11-15 18:18:04 -080098}
99
100
// Entry point of every worker thread started by init().
//
// vrsc is the owning RsdCpuReferenceImpl. The thread claims a worker slot by
// atomically incrementing mWorkers.mLaunchCount, initializes that slot's
// launch signal, records its kernel thread id, and registers the driver's
// mTlsStruct as its thread-specific value. It then loops until mExit is seen:
// wait on its launch signal, run the current launch callback (if one is set),
// decrement mWorkers.mRunningCount, and raise mCompleteSignal.
//
// Always returns nullptr.
void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;

    // Each thread gets a distinct slot index from the shared launch counter.
    uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);

    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);

    dc->mWorkers.mLaunchSignals[idx].init();
    dc->mWorkers.mNativeThreadId[idx] = gettid();

    // NOTE(review): mTlsStruct is a single member of the driver object, so this
    // memset also clears the mContext/mScript values that init() stored in it
    // before creating the threads -- confirm this is intended.
    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

    // Disabled CPU-affinity experiment. Note that it refers to `rsc`, which is
    // not declared in this function, so it would not compile if enabled as-is.
#if 0
    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
    cpu_set_t cpuset;
    memset(&cpuset, 0, sizeof(cpuset));
    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
              sizeof(cpuset), &cpuset);
    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
#endif

    while (!dc->mExit) {
        dc->mWorkers.mLaunchSignals[idx].wait();
        if (dc->mWorkers.mLaunchCallback) {
           // idx +1 is used because the calling thread is always worker 0.
           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
        }
        // Report completion of this pass: the launcher (and init()/the
        // destructor) watch mRunningCount reach zero.
        __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
        dc->mWorkers.mCompleteSignal.set();
    }

    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
    return nullptr;
}
140
// Launch a kernel.
// The callback function is called to execute the kernel: once on the calling
// thread with worker index 0 and, unless the fast path is taken, once on each
// helper thread (which passes its slot index + 1).
//
// cbk  - callback to run; may be null, in which case nothing is invoked.
// data - opaque pointer handed to the callback. It is also read here as an
//        MTLaunchStructCommon to decide whether the launch is small enough for
//        the single-threaded fast path.
//
// Returns once the calling thread's callback has finished and
// mWorkers.mRunningCount has dropped to zero.
void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
    mWorkers.mLaunchData = data;
    mWorkers.mLaunchCallback = cbk;

    // fast path for very small launches
    // (at most one row and an x range that fits in a single slice: run it on
    // the calling thread and do not wake the helpers at all)
    MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data;
    if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
        if (mWorkers.mLaunchCallback) {
            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
        }
        return;
    }

    // Publish the number of helpers expected to report back before any of
    // them is signalled; the full barrier orders this store (and the launch
    // data/callback stores above) ahead of the wakeups.
    mWorkers.mRunningCount = mWorkers.mCount;
    __sync_synchronize();

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }

    // We use the calling thread as one of the workers so we can start without
    // the delay of the thread wakeup.
    if (mWorkers.mLaunchCallback) {
        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
    }

    // __sync_fetch_and_or(..., 0) is used as an atomic read of the counter.
    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        mWorkers.mCompleteSignal.wait();
    }
}
173
174
175void RsdCpuReferenceImpl::lockMutex() {
176 pthread_mutex_lock(&gInitMutex);
177}
178
179void RsdCpuReferenceImpl::unlockMutex() {
180 pthread_mutex_unlock(&gInitMutex);
181}
182
Matt Wala11fd9ec2015-07-10 16:40:12 -0700183// Determine if the CPU we're running on supports SIMD instructions.
Jason Samsf5ef8df2013-08-06 13:49:25 -0700184static void GetCpuInfo() {
Matt Wala11fd9ec2015-07-10 16:40:12 -0700185 // Read the CPU flags from /proc/cpuinfo.
186 FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
Jason Samsf5ef8df2013-08-06 13:49:25 -0700187
Matt Wala11fd9ec2015-07-10 16:40:12 -0700188 if (!cpuinfo) {
Jason Samsf5ef8df2013-08-06 13:49:25 -0700189 return;
190 }
191
Matt Wala11fd9ec2015-07-10 16:40:12 -0700192 char cpuinfostr[4096];
Miao Wang5d70cb52015-07-17 11:53:04 -0700193 // fgets() ends with newline or EOF, need to check the whole
194 // "cpuinfo" file to make sure we can use SIMD or not.
195 while (fgets(cpuinfostr, sizeof(cpuinfostr), cpuinfo)) {
196#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
197 gArchUseSIMD = strstr(cpuinfostr, " neon") || strstr(cpuinfostr, " asimd");
198#elif defined(ARCH_X86_HAVE_SSSE3)
199 gArchUseSIMD = strstr(cpuinfostr, " ssse3");
200#endif
201 if (gArchUseSIMD) {
202 break;
203 }
Matt Wala11fd9ec2015-07-10 16:40:12 -0700204 }
205 fclose(cpuinfo);
Jason Samsf5ef8df2013-08-06 13:49:25 -0700206}
Jason Samsf5ef8df2013-08-06 13:49:25 -0700207
Jason Sams709a0972012-11-15 18:18:04 -0800208bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
209 sym_lookup_t lfn, script_lookup_t slfn) {
Jason Sams709a0972012-11-15 18:18:04 -0800210 mSymLookupFn = lfn;
211 mScriptLookupFn = slfn;
212
213 lockMutex();
214 if (!gThreadTLSKeyCount) {
Chris Wailes44bef6f2014-08-12 13:51:10 -0700215 int status = pthread_key_create(&gThreadTLSKey, nullptr);
Jason Sams709a0972012-11-15 18:18:04 -0800216 if (status) {
217 ALOGE("Failed to init thread tls key.");
218 unlockMutex();
219 return false;
220 }
221 }
222 gThreadTLSKeyCount++;
223 unlockMutex();
224
225 mTlsStruct.mContext = mRSC;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700226 mTlsStruct.mScript = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800227 int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
228 if (status) {
229 ALOGE("pthread_setspecific %i", status);
230 }
231
David Gross35dbc8c2016-03-29 13:48:41 -0700232 mPageSize = sysconf(_SC_PAGE_SIZE);
David Gross013ff532016-04-01 12:46:58 -0700233 // ALOGV("page size = %ld", mPageSize);
David Gross35dbc8c2016-03-29 13:48:41 -0700234
Jason Samsf5ef8df2013-08-06 13:49:25 -0700235 GetCpuInfo();
Jason Samsf5ef8df2013-08-06 13:49:25 -0700236
Jason Sams77d57a32014-10-23 17:43:53 -0700237 int cpu = sysconf(_SC_NPROCESSORS_CONF);
Jason Sams709a0972012-11-15 18:18:04 -0800238 if(mRSC->props.mDebugMaxThreads) {
239 cpu = mRSC->props.mDebugMaxThreads;
240 }
241 if (cpu < 2) {
242 mWorkers.mCount = 0;
243 return true;
244 }
245
246 // Subtract one from the cpu count because we also use the command thread as a worker.
247 mWorkers.mCount = (uint32_t)(cpu - 1);
248
Yang Ni554054c2016-04-20 09:04:02 -0700249 if (mRSC->props.mLogScripts) {
250 ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
251 }
Jason Sams709a0972012-11-15 18:18:04 -0800252
253 mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
254 mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
255 mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
Chris Wailes44bef6f2014-08-12 13:51:10 -0700256 mWorkers.mLaunchCallback = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800257
258 mWorkers.mCompleteSignal.init();
259
Tim Murray0b575de2013-03-15 15:56:43 -0700260 mWorkers.mRunningCount = mWorkers.mCount;
261 mWorkers.mLaunchCount = 0;
262 __sync_synchronize();
Jason Sams709a0972012-11-15 18:18:04 -0800263
264 pthread_attr_t threadAttr;
265 status = pthread_attr_init(&threadAttr);
266 if (status) {
267 ALOGE("Failed to init thread attribute.");
268 return false;
269 }
270
271 for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
272 status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
273 if (status) {
274 mWorkers.mCount = ct;
275 ALOGE("Created fewer than expected number of RS threads.");
276 break;
277 }
278 }
Tim Murray0b575de2013-03-15 15:56:43 -0700279 while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
Jason Sams709a0972012-11-15 18:18:04 -0800280 usleep(100);
281 }
282
283 pthread_attr_destroy(&threadAttr);
284 return true;
285}
286
287
288void RsdCpuReferenceImpl::setPriority(int32_t priority) {
289 for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
290 setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
291 }
292}
293
// Shuts down the helper threads and releases per-driver and global state.
//
// Sequence: raise mExit, clear the launch callback/data so a woken helper has
// nothing to run, wake every helper through its launch signal, join them, free
// the worker bookkeeping, and finally drop this instance's reference on the
// process-wide TLS key (deleting the key when the count reaches zero).
RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
    mExit = true;
    mWorkers.mLaunchData = nullptr;
    mWorkers.mLaunchCallback = nullptr;
    mWorkers.mRunningCount = mWorkers.mCount;
    // Make the stores above visible before any helper is woken.
    __sync_synchronize();
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }
    void *res;
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        pthread_join(mWorkers.mThreadId[ct], &res);
    }
    // b/23109602
    // TODO: Refactor the implementation with threadpool to
    // fix the race condition in the destructor.
    // rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
    free(mWorkers.mThreadId);
    free(mWorkers.mNativeThreadId);
    delete[] mWorkers.mLaunchSignals;

    // Global structure cleanup.
    lockMutex();
    --gThreadTLSKeyCount;
    if (!gThreadTLSKeyCount) {
        pthread_key_delete(gThreadTLSKey);
    }
    unlockMutex();

}
324
Matt Wala14ce0072015-07-30 17:30:25 -0700325// Set up the appropriate input and output pointers to the kernel driver info structure.
326// Inputs:
327// mtls - The MTLaunchStruct holding information about the kernel launch
328// fep - The forEach parameters (driver info structure)
329// x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension
330static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep,
Jason Samsc0d68472015-01-20 14:29:52 -0800331 uint32_t x, uint32_t y,
332 uint32_t z = 0, uint32_t lod = 0,
333 RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
334 uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
Jason Samsc0d68472015-01-20 14:29:52 -0800335 for (uint32_t i = 0; i < fep->inLen; i++) {
336 fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
337 }
Jason Samsc0d68472015-01-20 14:29:52 -0800338 if (mtls->aout[0] != nullptr) {
339 fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
340 }
341}
342
David Gross6c1876b2016-01-15 11:52:14 -0800343// Set up the appropriate input and output pointers to the kernel driver info structure.
344// Inputs:
345// mtls - The MTLaunchStruct holding information about the kernel launch
346// redp - The reduce parameters (driver info structure)
347// x, y, z - The start offsets into each dimension
David Grossae2ec3f2016-06-01 14:45:47 -0700348static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
David Gross6c1876b2016-01-15 11:52:14 -0800349 uint32_t x, uint32_t y, uint32_t z) {
350 for (uint32_t i = 0; i < redp->inLen; i++) {
351 redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
352 }
353}
354
// Peels one dimension's coordinate off a linear slice number.
//
// For a non-empty range [start, end) the coordinate written to *p is
// start + (val mod range size) and the quotient (val / range size) is returned
// so it can be fed to the next dimension. For an empty range (start >= end)
// *p is set to start and val is returned unchanged.
static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
    if (end <= start) {
        *p = start;
        return val;
    }

    const uint32_t span = end - start;
    *p = start + (val % span);
    return val / span;
}
367
David Gross6c1876b2016-01-15 11:52:14 -0800368static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800369 uint32_t r = sliceNum;
David Gross6c1876b2016-01-15 11:52:14 -0800370 r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
371 r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
372 r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
373 r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
374 r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
375 r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
376 r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800377 return r == 0;
378}
379
David Grossf9bd1f22016-04-06 16:45:26 -0700380static bool SelectZSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
381 return sliceInt(&info->current.z, sliceNum, mtls->start.z, mtls->end.z) == 0;
382}
Jason Samsbf2111d2015-01-26 18:13:41 -0800383
David Grossf9bd1f22016-04-06 16:45:26 -0700384static void walk_general_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700385 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsbf2111d2015-01-26 18:13:41 -0800386 RsExpandKernelDriverInfo fep = mtls->fep;
387 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700388 ForEachFunc_t fn = mtls->kernel;
Jason Samsbf2111d2015-01-26 18:13:41 -0800389
Jason Samsbf2111d2015-01-26 18:13:41 -0800390 while(1) {
391 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
392
Jason Sams59b35f22015-03-17 12:59:17 -0700393 if (!SelectOuterSlice(mtls, &fep, slice)) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800394 return;
395 }
396
Jason Sams59b35f22015-03-17 12:59:17 -0700397 for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y;
398 fep.current.y++) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800399
Jason Sams59b35f22015-03-17 12:59:17 -0700400 FepPtrSetup(mtls, &fep, mtls->start.x,
401 fep.current.y, fep.current.z, fep.current.lod,
402 (RsAllocationCubemapFace)fep.current.face,
403 fep.current.array[0], fep.current.array[1],
404 fep.current.array[2], fep.current.array[3]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800405
Jason Sams59b35f22015-03-17 12:59:17 -0700406 fn(&fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800407 }
408 }
Jason Samsbf2111d2015-01-26 18:13:41 -0800409}
Jason Samsc0d68472015-01-20 14:29:52 -0800410
David Grossf9bd1f22016-04-06 16:45:26 -0700411static void walk_2d_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700412 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsc0d68472015-01-20 14:29:52 -0800413 RsExpandKernelDriverInfo fep = mtls->fep;
414 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700415 ForEachFunc_t fn = mtls->kernel;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700416
Jason Samsc0d68472015-01-20 14:29:52 -0800417 while (1) {
418 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
Jason Samsbf2111d2015-01-26 18:13:41 -0800419 uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
Jason Samsc0d68472015-01-20 14:29:52 -0800420 uint32_t yEnd = yStart + mtls->mSliceSize;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700421
Jason Samsbf2111d2015-01-26 18:13:41 -0800422 yEnd = rsMin(yEnd, mtls->end.y);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700423
Jason Samsc0d68472015-01-20 14:29:52 -0800424 if (yEnd <= yStart) {
425 return;
Stephen Hines4b2bea32014-08-13 17:32:10 +0000426 }
Jason Samsc0d68472015-01-20 14:29:52 -0800427
428 for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800429 FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
Jason Samsc0d68472015-01-20 14:29:52 -0800430
David Grossb0abb142015-03-12 15:23:03 -0700431 fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
Jason Samsc0d68472015-01-20 14:29:52 -0800432 }
433 }
Stephen Hines4b2bea32014-08-13 17:32:10 +0000434}
435
David Gross35dbc8c2016-03-29 13:48:41 -0700436static void walk_1d_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700437 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsc0d68472015-01-20 14:29:52 -0800438 RsExpandKernelDriverInfo fep = mtls->fep;
439 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700440 ForEachFunc_t fn = mtls->kernel;
Chris Wailesf3712132014-07-16 15:18:30 -0700441
Jason Samsc0d68472015-01-20 14:29:52 -0800442 while (1) {
443 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
Jason Samsbf2111d2015-01-26 18:13:41 -0800444 uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
Jason Samsc0d68472015-01-20 14:29:52 -0800445 uint32_t xEnd = xStart + mtls->mSliceSize;
Chris Wailesf3712132014-07-16 15:18:30 -0700446
Jason Samsbf2111d2015-01-26 18:13:41 -0800447 xEnd = rsMin(xEnd, mtls->end.x);
Chris Wailesf3712132014-07-16 15:18:30 -0700448
Jason Samsc0d68472015-01-20 14:29:52 -0800449 if (xEnd <= xStart) {
450 return;
Chris Wailesf3712132014-07-16 15:18:30 -0700451 }
Jason Samsc0d68472015-01-20 14:29:52 -0800452
Jason Samsbf2111d2015-01-26 18:13:41 -0800453 FepPtrSetup(mtls, &fep, xStart, 0);
Jason Samsc0d68472015-01-20 14:29:52 -0800454
David Grossb0abb142015-03-12 15:23:03 -0700455 fn(&fep, xStart, xEnd, fep.outStride[0]);
Jason Samsc0d68472015-01-20 14:29:52 -0800456 }
Chris Wailesf3712132014-07-16 15:18:30 -0700457}
458
// format_bytes() is a logging helper that renders a byte buffer as text.
//
// Output format:
// - begins with ": "
// - every input byte becomes two lowercase hex digits
// - consecutive bytes are separated by "." except that every fourth
//   separator is "|"
// - when the input is longer than kFormatInBytesMax, only the first
//   kFormatInBytesMax bytes are shown and "..." is appended
//
// Arguments:
// - outBuf -- pointer to a buffer of type "FormatBuf" that receives the text
// - inBuf -- pointer to the bytes to be formatted
// - inBytes -- number of bytes available in inBuf
//
// Constant:
// - kFormatInBytesMax -- at most min(kFormatInBytesMax, inBytes) bytes are
//   read from inBuf
//
// Return value:
// - pointer (const char *) to the formatted, null-terminated text, which
//   lives in outBuf
//
static const int kFormatInBytesMax = 16;
// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
using FormatBuf = char[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
    static const char kHexDigits[] = "0123456789abcdef";

    char *cursor = *outBuf;
    *cursor++ = ':';
    *cursor++ = ' ';

    const int shown = std::min(kFormatInBytesMax, inBytes);
    for (int i = 0; i < shown; ++i) {
        if (i != 0) {
            // Group the bytes in fours: "aa.bb.cc.dd|ee..."
            *cursor++ = (i % 4 == 0) ? '|' : '.';
        }
        const uint8_t byte = inBuf[i];
        *cursor++ = kHexDigits[byte >> 4];
        *cursor++ = kHexDigits[byte & 0x0f];
    }

    if (inBytes > kFormatInBytesMax) {
        // Input was truncated; say so.
        *cursor++ = '.';
        *cursor++ = '.';
        *cursor++ = '.';
    }
    *cursor = '\0';

    return *outBuf;
}
501
David Grossae2ec3f2016-06-01 14:45:47 -0700502static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
503 const char *walkerName, uint32_t threadIdx) {
David Grossf9bd1f22016-04-06 16:45:26 -0700504 rsAssert(!accumPtr);
505
506 uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
507 if (mtls->outFunc) {
508 accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
509 } else {
510 if (accumIdx == 0) {
511 accumPtr = mtls->redp.outPtr[0];
512 } else {
513 accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
514 }
515 }
David Grossae2ec3f2016-06-01 14:45:47 -0700516 REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
517 walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
David Grossf9bd1f22016-04-06 16:45:26 -0700518 // initialize accumulator
519 if (mtls->initFunc) {
520 mtls->initFunc(accumPtr);
521 } else {
522 memset(accumPtr, 0, mtls->accumSize);
523 }
524}
525
David Grossae2ec3f2016-06-01 14:45:47 -0700526static void walk_1d_reduce(void *usr, uint32_t idx) {
527 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Gross35dbc8c2016-03-29 13:48:41 -0700528 RsExpandKernelDriverInfo redp = mtls->redp;
529
530 // find accumulator
531 uint8_t *&accumPtr = mtls->accumPtr[idx];
532 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700533 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Gross35dbc8c2016-03-29 13:48:41 -0700534 }
535
536 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700537 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Gross35dbc8c2016-03-29 13:48:41 -0700538 while (1) {
539 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
540 uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
541 uint32_t xEnd = xStart + mtls->mSliceSize;
542
543 xEnd = rsMin(xEnd, mtls->end.x);
544
545 if (xEnd <= xStart) {
546 return;
547 }
548
549 RedpPtrSetup(mtls, &redp, xStart, 0, 0);
550 fn(&redp, xStart, xEnd, accumPtr);
551
David Grossf9bd1f22016-04-06 16:45:26 -0700552 // Emit log line after slice has been run, so that we can include
553 // the results of the run on that line.
David Gross35dbc8c2016-03-29 13:48:41 -0700554 FormatBuf fmt;
David Gross013ff532016-04-01 12:46:58 -0700555 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700556 format_bytes(&fmt, accumPtr, mtls->accumSize);
557 } else {
558 fmt[0] = 0;
559 }
David Grossae2ec3f2016-06-01 14:45:47 -0700560 REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
561 mtls->accumFunc, idx, xStart, xEnd, fmt);
David Gross35dbc8c2016-03-29 13:48:41 -0700562 }
563}
564
David Grossae2ec3f2016-06-01 14:45:47 -0700565static void walk_2d_reduce(void *usr, uint32_t idx) {
566 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Grossf9bd1f22016-04-06 16:45:26 -0700567 RsExpandKernelDriverInfo redp = mtls->redp;
568
569 // find accumulator
570 uint8_t *&accumPtr = mtls->accumPtr[idx];
571 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700572 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Grossf9bd1f22016-04-06 16:45:26 -0700573 }
574
575 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700576 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Grossf9bd1f22016-04-06 16:45:26 -0700577 while (1) {
578 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
579 uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
580 uint32_t yEnd = yStart + mtls->mSliceSize;
581
582 yEnd = rsMin(yEnd, mtls->end.y);
583
584 if (yEnd <= yStart) {
585 return;
586 }
587
588 for (redp.current.y = yStart; redp.current.y < yEnd; redp.current.y++) {
589 RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, 0);
590 fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
591 }
592
593 FormatBuf fmt;
594 if (mtls->logReduce >= 3) {
595 format_bytes(&fmt, accumPtr, mtls->accumSize);
596 } else {
597 fmt[0] = 0;
598 }
David Grossae2ec3f2016-06-01 14:45:47 -0700599 REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
600 mtls->accumFunc, idx, yStart, yEnd, fmt);
David Grossf9bd1f22016-04-06 16:45:26 -0700601 }
602}
603
David Grossae2ec3f2016-06-01 14:45:47 -0700604static void walk_3d_reduce(void *usr, uint32_t idx) {
605 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Grossf9bd1f22016-04-06 16:45:26 -0700606 RsExpandKernelDriverInfo redp = mtls->redp;
607
608 // find accumulator
609 uint8_t *&accumPtr = mtls->accumPtr[idx];
610 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700611 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Grossf9bd1f22016-04-06 16:45:26 -0700612 }
613
614 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700615 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Grossf9bd1f22016-04-06 16:45:26 -0700616 while (1) {
617 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
618
619 if (!SelectZSlice(mtls, &redp, slice)) {
620 return;
621 }
622
623 for (redp.current.y = mtls->start.y; redp.current.y < mtls->end.y; redp.current.y++) {
624 RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, redp.current.z);
625 fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
626 }
627
628 FormatBuf fmt;
629 if (mtls->logReduce >= 3) {
630 format_bytes(&fmt, accumPtr, mtls->accumSize);
631 } else {
632 fmt[0] = 0;
633 }
David Grossae2ec3f2016-06-01 14:45:47 -0700634 REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
635 mtls->accumFunc, idx, redp.current.z, fmt);
David Grossf9bd1f22016-04-06 16:45:26 -0700636 }
637}
638
David Gross6c1876b2016-01-15 11:52:14 -0800639// Launch a general reduce-style kernel.
640// Inputs:
641// ains[0..inLen-1]: Array of allocations that contain the inputs
642// aout: The allocation that will hold the output
643// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700644void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
645 uint32_t inLen,
646 Allocation * aout,
647 MTLaunchStructReduce *mtls) {
David Gross013ff532016-04-01 12:46:58 -0700648 mtls->logReduce = mRSC->props.mLogReduce;
David Gross35dbc8c2016-03-29 13:48:41 -0700649 if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
David Grossae2ec3f2016-06-01 14:45:47 -0700650 launchReduceParallel(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700651 } else {
David Grossae2ec3f2016-06-01 14:45:47 -0700652 launchReduceSerial(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700653 }
654}
655
656// Launch a general reduce-style kernel, single-threaded.
657// Inputs:
658// ains[0..inLen-1]: Array of allocations that contain the inputs
659// aout: The allocation that will hold the output
660// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700661void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
662 uint32_t inLen,
663 Allocation * aout,
664 MTLaunchStructReduce *mtls) {
665 REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
666 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
David Gross35dbc8c2016-03-29 13:48:41 -0700667
David Gross6c1876b2016-01-15 11:52:14 -0800668 // In the presence of outconverter, we allocate temporary memory for
669 // the accumulator.
670 //
671 // In the absence of outconverter, we use the output allocation as the
672 // accumulator.
673 uint8_t *const accumPtr = (mtls->outFunc
674 ? static_cast<uint8_t *>(malloc(mtls->accumSize))
675 : mtls->redp.outPtr[0]);
676
677 // initialize
678 if (mtls->initFunc) {
679 mtls->initFunc(accumPtr);
680 } else {
681 memset(accumPtr, 0, mtls->accumSize);
682 }
683
684 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700685 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Gross6c1876b2016-01-15 11:52:14 -0800686 uint32_t slice = 0;
687 while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
688 for (mtls->redp.current.y = mtls->start.y;
689 mtls->redp.current.y < mtls->end.y;
690 mtls->redp.current.y++) {
691 RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
692 fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
693 }
694 }
695
696 // outconvert
697 if (mtls->outFunc) {
698 mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
699 free(accumPtr);
700 }
701}
702
David Gross35dbc8c2016-03-29 13:48:41 -0700703// Launch a general reduce-style kernel, multi-threaded.
704// Inputs:
705// ains[0..inLen-1]: Array of allocations that contain the inputs
706// aout: The allocation that will hold the output
707// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700708void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
709 uint32_t inLen,
710 Allocation * aout,
711 MTLaunchStructReduce *mtls) {
David Grossf9bd1f22016-04-06 16:45:26 -0700712 // For now, we don't know how to go parallel in the absence of a combiner.
713 if (!mtls->combFunc) {
David Grossae2ec3f2016-06-01 14:45:47 -0700714 launchReduceSerial(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700715 return;
716 }
717
718 // Number of threads = "main thread" + number of other (worker) threads
719 const uint32_t numThreads = mWorkers.mCount + 1;
720
721 // In the absence of outconverter, we use the output allocation as
722 // an accumulator, and therefore need to allocate one fewer accumulator.
723 const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
724
725 // If mDebugReduceSplitAccum, then we want each accumulator to start
726 // on a page boundary. (TODO: Would some unit smaller than a page
727 // be sufficient to avoid false sharing?)
728 if (mRSC->props.mDebugReduceSplitAccum) {
729 // Round up accumulator size to an integral number of pages
730 mtls->accumStride =
731 (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
732 ~(unsigned(mPageSize)-1);
733 // Each accumulator gets its own page. Alternatively, if we just
734 // wanted to make sure no two accumulators are on the same page,
735 // we could instead do
736 // allocSize = mtls->accumStride * (numAllocation - 1) + mtls->accumSize
737 const size_t allocSize = mtls->accumStride * numAllocAccum;
738 mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
739 } else {
740 mtls->accumStride = mtls->accumSize;
741 mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
742 }
743
744 const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
745 mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
746 memset(mtls->accumPtr, 0, accumPtrArrayBytes);
747
748 mtls->accumCount = 0;
749
750 rsAssert(!mInKernel);
751 mInKernel = true;
David Grossae2ec3f2016-06-01 14:45:47 -0700752 REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
753 mtls->accumFunc,
754 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
755 numThreads, mtls->accumAlloc);
David Grossf9bd1f22016-04-06 16:45:26 -0700756 if (mtls->redp.dim.z > 1) {
757 mtls->mSliceSize = 1;
David Grossae2ec3f2016-06-01 14:45:47 -0700758 launchThreads(walk_3d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700759 } else if (mtls->redp.dim.y > 1) {
760 mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
David Grossae2ec3f2016-06-01 14:45:47 -0700761 launchThreads(walk_2d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700762 } else {
763 mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
David Grossae2ec3f2016-06-01 14:45:47 -0700764 launchThreads(walk_1d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700765 }
David Gross35dbc8c2016-03-29 13:48:41 -0700766 mInKernel = false;
767
768 // Combine accumulators and identify final accumulator
769 uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
770 // Loop over accumulators, combining into finalAccumPtr. If finalAccumPtr
771 // is null, then the first accumulator I find becomes finalAccumPtr.
772 for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
773 uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
774 if (finalAccumPtr) {
775 if (finalAccumPtr != thisAccumPtr) {
776 if (mtls->combFunc) {
David Gross013ff532016-04-01 12:46:58 -0700777 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700778 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700779 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
780 mtls->accumFunc,
781 format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
782 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulator[%d]%s",
783 mtls->accumFunc, idx,
784 format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
David Gross35dbc8c2016-03-29 13:48:41 -0700785 }
786 mtls->combFunc(finalAccumPtr, thisAccumPtr);
787 } else {
788 rsAssert(!"expected combiner");
789 }
790 }
791 } else {
792 finalAccumPtr = thisAccumPtr;
793 }
794 }
795 rsAssert(finalAccumPtr != nullptr);
David Gross013ff532016-04-01 12:46:58 -0700796 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700797 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700798 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
799 mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
David Gross35dbc8c2016-03-29 13:48:41 -0700800 }
801
802 // Outconvert
803 if (mtls->outFunc) {
804 mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
David Gross013ff532016-04-01 12:46:58 -0700805 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700806 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700807 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
808 mtls->accumFunc,
809 format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
David Gross35dbc8c2016-03-29 13:48:41 -0700810 }
811 }
812
813 // Clean up
814 free(mtls->accumPtr);
815 free(mtls->accumAlloc);
816}
817
818
Matt Wala14ce0072015-07-30 17:30:25 -0700819void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
Chris Wailesf3712132014-07-16 15:18:30 -0700820 uint32_t inLen,
821 Allocation* aout,
822 const RsScriptCall* sc,
Matt Wala14ce0072015-07-30 17:30:25 -0700823 MTLaunchStructForEach* mtls) {
Stephen Hines4b2bea32014-08-13 17:32:10 +0000824
825 //android::StopWatch kernel_time("kernel time");
826
Jason Samsbf2111d2015-01-26 18:13:41 -0800827 bool outerDims = (mtls->start.z != mtls->end.z) ||
828 (mtls->start.face != mtls->end.face) ||
829 (mtls->start.lod != mtls->end.lod) ||
830 (mtls->start.array[0] != mtls->end.array[0]) ||
831 (mtls->start.array[1] != mtls->end.array[1]) ||
832 (mtls->start.array[2] != mtls->end.array[2]) ||
833 (mtls->start.array[3] != mtls->end.array[3]);
834
David Gross35dbc8c2016-03-29 13:48:41 -0700835 if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
Stephen Hines4b2bea32014-08-13 17:32:10 +0000836 const size_t targetByteChunk = 16 * 1024;
David Gross35dbc8c2016-03-29 13:48:41 -0700837 mInKernel = true; // NOTE: The guard immediately above ensures this was !mInKernel
Chris Wailesf3712132014-07-16 15:18:30 -0700838
Jason Samsbf2111d2015-01-26 18:13:41 -0800839 if (outerDims) {
840 // No fancy logic for chunk size
841 mtls->mSliceSize = 1;
David Grossf9bd1f22016-04-06 16:45:26 -0700842 launchThreads(walk_general_foreach, mtls);
Jason Samsbf2111d2015-01-26 18:13:41 -0800843 } else if (mtls->fep.dim.y > 1) {
Jason Samsc0d68472015-01-20 14:29:52 -0800844 uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000845 uint32_t s2 = 0;
846
847 // This chooses our slice size to rate limit atomic ops to
848 // one per 16k bytes of reads/writes.
Jason Samsc0d68472015-01-20 14:29:52 -0800849 if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
850 s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
Jason Samsa9139c72015-04-16 15:11:04 -0700851 } else if (mtls->ains[0]) {
Jason Samsc0d68472015-01-20 14:29:52 -0800852 s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
Jason Samsa9139c72015-04-16 15:11:04 -0700853 } else {
854 // Launch option only case
855 // Use s1 based only on the dimensions
856 s2 = s1;
Stephen Hines4b2bea32014-08-13 17:32:10 +0000857 }
858 mtls->mSliceSize = rsMin(s1, s2);
859
860 if(mtls->mSliceSize < 1) {
861 mtls->mSliceSize = 1;
862 }
863
David Grossf9bd1f22016-04-06 16:45:26 -0700864 launchThreads(walk_2d_foreach, mtls);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000865 } else {
Jason Samsc0d68472015-01-20 14:29:52 -0800866 uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000867 uint32_t s2 = 0;
868
869 // This chooses our slice size to rate limit atomic ops to
870 // one per 16k bytes of reads/writes.
Jason Samsc0d68472015-01-20 14:29:52 -0800871 if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
872 s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
Jason Samsa9139c72015-04-16 15:11:04 -0700873 } else if (mtls->ains[0]) {
Jason Samsc0d68472015-01-20 14:29:52 -0800874 s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
Jason Samsa9139c72015-04-16 15:11:04 -0700875 } else {
876 // Launch option only case
877 // Use s1 based only on the dimensions
878 s2 = s1;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700879 }
880 mtls->mSliceSize = rsMin(s1, s2);
881
882 if (mtls->mSliceSize < 1) {
883 mtls->mSliceSize = 1;
884 }
885
David Gross35dbc8c2016-03-29 13:48:41 -0700886 launchThreads(walk_1d_foreach, mtls);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700887 }
David Gross35dbc8c2016-03-29 13:48:41 -0700888 mInKernel = false;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700889
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700890 } else {
Matt Wala14ce0072015-07-30 17:30:25 -0700891 ForEachFunc_t fn = mtls->kernel;
Jason Samsbf2111d2015-01-26 18:13:41 -0800892 uint32_t slice = 0;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700893
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700894
Jason Sams59b35f22015-03-17 12:59:17 -0700895 while(SelectOuterSlice(mtls, &mtls->fep, slice++)) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800896 for (mtls->fep.current.y = mtls->start.y;
897 mtls->fep.current.y < mtls->end.y;
898 mtls->fep.current.y++) {
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700899
Jason Samsbf2111d2015-01-26 18:13:41 -0800900 FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
901 mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
902 (RsAllocationCubemapFace) mtls->fep.current.face,
903 mtls->fep.current.array[0], mtls->fep.current.array[1],
904 mtls->fep.current.array[2], mtls->fep.current.array[3]);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700905
David Grossb0abb142015-03-12 15:23:03 -0700906 fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700907 }
908 }
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700909 }
910}
911
Jason Sams709a0972012-11-15 18:18:04 -0800912RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
913 //ALOGE("setTls %p", sc);
914 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
915 rsAssert(tls);
916 RsdCpuScriptImpl *old = tls->mImpl;
917 tls->mImpl = sc;
918 tls->mContext = mRSC;
919 if (sc) {
920 tls->mScript = sc->getScript();
921 } else {
Chris Wailes44bef6f2014-08-12 13:51:10 -0700922 tls->mScript = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800923 }
924 return old;
925}
926
927const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
928 return mSymLookupFn(mRSC, name);
929}
930
931
932RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
933 char const *resName, char const *cacheDir,
934 uint8_t const *bitcode, size_t bitcodeSize,
935 uint32_t flags) {
936
937 RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
Stephen Hines00511322014-01-31 11:20:23 -0800938 if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags
Stephen Hines00511322014-01-31 11:20:23 -0800939 , getBccPluginName()
Stephen Hines00511322014-01-31 11:20:23 -0800940 )) {
Jason Sams709a0972012-11-15 18:18:04 -0800941 delete i;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700942 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800943 }
944 return i;
945}
946
Jason Sams7c4b8882013-01-04 10:50:05 -0800947extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
948 const Script *s, const Element *e);
Jason Samsc905efd2012-11-26 15:20:18 -0800949extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
950 const Script *s, const Element *e);
951extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
952 const Script *s, const Element *e);
953extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
954 const Script *s, const Element *e);
955extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
956 const Script *s, const Element *e);
957extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
958 const Script *s, const Element *e);
959extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
960 const Script *s, const Element *e);
961extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
962 const Script *s, const Element *e);
Jason Sams2282e282013-06-17 16:52:01 -0700963extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
964 const Script *s, const Element *e);
Jason Sams39ab94a2014-04-16 17:14:05 -0700965extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
966 const Script *s, const Element *e);
Tim Murray64c682b2015-01-09 12:08:43 -0800967extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
968 const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -0800969
970RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
971 RsScriptIntrinsicID iid, Element *e) {
972
Chris Wailes44bef6f2014-08-12 13:51:10 -0700973 RsdCpuScriptImpl *i = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800974 switch (iid) {
Jason Sams7c4b8882013-01-04 10:50:05 -0800975 case RS_SCRIPT_INTRINSIC_ID_3DLUT:
976 i = rsdIntrinsic_3DLUT(this, s, e);
977 break;
Jason Sams709a0972012-11-15 18:18:04 -0800978 case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
Jason Samsc905efd2012-11-26 15:20:18 -0800979 i = rsdIntrinsic_Convolve3x3(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800980 break;
981 case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
Jason Samsc905efd2012-11-26 15:20:18 -0800982 i = rsdIntrinsic_ColorMatrix(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800983 break;
984 case RS_SCRIPT_INTRINSIC_ID_LUT:
Jason Samsc905efd2012-11-26 15:20:18 -0800985 i = rsdIntrinsic_LUT(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800986 break;
987 case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
Jason Samsc905efd2012-11-26 15:20:18 -0800988 i = rsdIntrinsic_Convolve5x5(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800989 break;
990 case RS_SCRIPT_INTRINSIC_ID_BLUR:
Jason Samsc905efd2012-11-26 15:20:18 -0800991 i = rsdIntrinsic_Blur(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800992 break;
993 case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
Jason Samsc905efd2012-11-26 15:20:18 -0800994 i = rsdIntrinsic_YuvToRGB(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800995 break;
996 case RS_SCRIPT_INTRINSIC_ID_BLEND:
Jason Samsc905efd2012-11-26 15:20:18 -0800997 i = rsdIntrinsic_Blend(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800998 break;
Jason Sams2282e282013-06-17 16:52:01 -0700999 case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
1000 i = rsdIntrinsic_Histogram(this, s, e);
1001 break;
Jason Sams39ab94a2014-04-16 17:14:05 -07001002 case RS_SCRIPT_INTRINSIC_ID_RESIZE:
1003 i = rsdIntrinsic_Resize(this, s, e);
1004 break;
Tim Murray64c682b2015-01-09 12:08:43 -08001005 case RS_SCRIPT_INTRINSIC_ID_BLAS:
1006 i = rsdIntrinsic_BLAS(this, s, e);
1007 break;
Jason Sams709a0972012-11-15 18:18:04 -08001008
1009 default:
1010 rsAssert(0);
1011 }
1012
1013 return i;
1014}
1015
Yang Ni1ffd86b2015-01-07 09:16:40 -08001016void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {
1017 switch (sg->getApiVersion()) {
1018 case ScriptGroupBase::SG_V1: {
1019 CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
1020 if (!sgi->init()) {
Jason Sams709a0972012-11-15 18:18:04 -08001021 delete sgi;
Chris Wailes44bef6f2014-08-12 13:51:10 -07001022 return nullptr;
Yang Ni1ffd86b2015-01-07 09:16:40 -08001023 }
1024 return sgi;
Jason Sams709a0972012-11-15 18:18:04 -08001025 }
Yang Ni1ffd86b2015-01-07 09:16:40 -08001026 case ScriptGroupBase::SG_V2: {
1027 return new CpuScriptGroup2Impl(this, sg);
1028 }
1029 }
1030 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -08001031}
Chih-Hung Hsieh462de212016-11-16 11:33:57 -08001032
1033} // namespace renderscript
1034} // namespace android