blob: 23ce72fb77360d51e54f1fca1a40388ab3bf80f9 [file] [log] [blame]
Jason Sams709a0972012-11-15 18:18:04 -08001/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuCore.h"
18#include "rsCpuScript.h"
19#include "rsCpuScriptGroup.h"
Yang Ni1ffd86b2015-01-07 09:16:40 -080020#include "rsCpuScriptGroup2.h"
Jason Sams709a0972012-11-15 18:18:04 -080021
22#include <malloc.h>
23#include "rsContext.h"
24
25#include <sys/types.h>
26#include <sys/resource.h>
27#include <sched.h>
Jason Sams709a0972012-11-15 18:18:04 -080028#include <sys/syscall.h>
Matt Wala11fd9ec2015-07-10 16:40:12 -070029#include <stdio.h>
Jason Sams709a0972012-11-15 18:18:04 -080030#include <string.h>
Stephen Hinesb0934b62013-07-03 17:27:38 -070031#include <unistd.h>
Tim Murray0b575de2013-03-15 15:56:43 -070032
Stephen Hinesb0934b62013-07-03 17:27:38 -070033#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
Tim Murray0b575de2013-03-15 15:56:43 -070034#include <cutils/properties.h>
Jason Sams709a0972012-11-15 18:18:04 -080035#include "utils/StopWatch.h"
Tim Murray0b575de2013-03-15 15:56:43 -070036#endif
37
38#ifdef RS_SERVER
39// Android exposes gettid(), standard Linux does not
40static pid_t gettid() {
41 return syscall(SYS_gettid);
42}
43#endif
Jason Sams709a0972012-11-15 18:18:04 -080044
45using namespace android;
46using namespace android::renderscript;
47
David Grossae2ec3f2016-06-01 14:45:47 -070048#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
David Gross35dbc8c2016-03-29 13:48:41 -070049
Jason Sams709a0972012-11-15 18:18:04 -080050static pthread_key_t gThreadTLSKey = 0;
51static uint32_t gThreadTLSKeyCount = 0;
52static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
53
Jason Samsf5ef8df2013-08-06 13:49:25 -070054bool android::renderscript::gArchUseSIMD = false;
55
Jason Sams709a0972012-11-15 18:18:04 -080056RsdCpuReference::~RsdCpuReference() {
57}
58
59RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
Jason Samscadfac42013-03-06 18:09:08 -080060 uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn
David Grossb043df02015-05-29 11:38:15 -070061 , RSSelectRTCallback pSelectRTCallback,
Stephen Hines00511322014-01-31 11:20:23 -080062 const char *pBccPluginName
Jason Samscadfac42013-03-06 18:09:08 -080063 ) {
Jason Sams709a0972012-11-15 18:18:04 -080064
65 RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
66 if (!cpu) {
Chris Wailes44bef6f2014-08-12 13:51:10 -070067 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -080068 }
69 if (!cpu->init(version_major, version_minor, lfn, slfn)) {
70 delete cpu;
Chris Wailes44bef6f2014-08-12 13:51:10 -070071 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -080072 }
Stephen Hinesf218bf12013-02-12 19:32:38 -080073
Stephen Hines1d476622013-03-29 22:08:49 -070074 cpu->setSelectRTCallback(pSelectRTCallback);
Stephen Hines00511322014-01-31 11:20:23 -080075 if (pBccPluginName) {
76 cpu->setBccPluginName(pBccPluginName);
77 }
Stephen Hinesf218bf12013-02-12 19:32:38 -080078
Jason Sams709a0972012-11-15 18:18:04 -080079 return cpu;
80}
81
82
83Context * RsdCpuReference::getTlsContext() {
84 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
85 return tls->mContext;
86}
87
88const Script * RsdCpuReference::getTlsScript() {
89 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
90 return tls->mScript;
91}
92
Stephen Hinesf218bf12013-02-12 19:32:38 -080093pthread_key_t RsdCpuReference::getThreadTLSKey(){ return gThreadTLSKey; }
Jason Sams709a0972012-11-15 18:18:04 -080094
95////////////////////////////////////////////////////////////
96///
97
98RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
99 mRSC = rsc;
100
101 version_major = 0;
102 version_minor = 0;
David Gross35dbc8c2016-03-29 13:48:41 -0700103 mInKernel = false;
Jason Sams709a0972012-11-15 18:18:04 -0800104 memset(&mWorkers, 0, sizeof(mWorkers));
105 memset(&mTlsStruct, 0, sizeof(mTlsStruct));
106 mExit = false;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700107 mSelectRTCallback = nullptr;
Stephen Hines8409d642015-04-28 18:49:56 -0700108 mEmbedGlobalInfo = true;
109 mEmbedGlobalInfoSkipConstant = true;
Jason Sams709a0972012-11-15 18:18:04 -0800110}
111
112
113void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
114 RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
115
Tim Murray0b575de2013-03-15 15:56:43 -0700116 uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);
Jason Sams709a0972012-11-15 18:18:04 -0800117
118 //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
119
120 dc->mWorkers.mLaunchSignals[idx].init();
121 dc->mWorkers.mNativeThreadId[idx] = gettid();
122
123 memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
124 int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
125 if (status) {
126 ALOGE("pthread_setspecific %i", status);
127 }
128
129#if 0
130 typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
131 cpu_set_t cpuset;
132 memset(&cpuset, 0, sizeof(cpuset));
133 cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
134 int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
135 sizeof(cpuset), &cpuset);
136 ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
137#endif
138
139 while (!dc->mExit) {
140 dc->mWorkers.mLaunchSignals[idx].wait();
141 if (dc->mWorkers.mLaunchCallback) {
142 // idx +1 is used because the calling thread is always worker 0.
143 dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
144 }
Tim Murray0b575de2013-03-15 15:56:43 -0700145 __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
Jason Sams709a0972012-11-15 18:18:04 -0800146 dc->mWorkers.mCompleteSignal.set();
147 }
148
149 //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
Chris Wailes44bef6f2014-08-12 13:51:10 -0700150 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800151}
152
Matt Wala14ce0072015-07-30 17:30:25 -0700153// Launch a kernel.
154// The callback function is called to execute the kernel.
Jason Sams709a0972012-11-15 18:18:04 -0800155void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
156 mWorkers.mLaunchData = data;
157 mWorkers.mLaunchCallback = cbk;
Tim Murray4d252d62012-11-29 14:37:59 -0800158
159 // fast path for very small launches
Matt Wala14ce0072015-07-30 17:30:25 -0700160 MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data;
161 if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
Tim Murray4d252d62012-11-29 14:37:59 -0800162 if (mWorkers.mLaunchCallback) {
163 mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
164 }
165 return;
166 }
167
Tim Murray0b575de2013-03-15 15:56:43 -0700168 mWorkers.mRunningCount = mWorkers.mCount;
169 __sync_synchronize();
170
Jason Sams709a0972012-11-15 18:18:04 -0800171 for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
172 mWorkers.mLaunchSignals[ct].set();
173 }
174
175 // We use the calling thread as one of the workers so we can start without
176 // the delay of the thread wakeup.
177 if (mWorkers.mLaunchCallback) {
Tim Murray4d252d62012-11-29 14:37:59 -0800178 mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
Jason Sams709a0972012-11-15 18:18:04 -0800179 }
180
Tim Murray0b575de2013-03-15 15:56:43 -0700181 while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
Jason Sams709a0972012-11-15 18:18:04 -0800182 mWorkers.mCompleteSignal.wait();
183 }
184}
185
186
187void RsdCpuReferenceImpl::lockMutex() {
188 pthread_mutex_lock(&gInitMutex);
189}
190
191void RsdCpuReferenceImpl::unlockMutex() {
192 pthread_mutex_unlock(&gInitMutex);
193}
194
Matt Wala11fd9ec2015-07-10 16:40:12 -0700195// Determine if the CPU we're running on supports SIMD instructions.
Jason Samsf5ef8df2013-08-06 13:49:25 -0700196static void GetCpuInfo() {
Matt Wala11fd9ec2015-07-10 16:40:12 -0700197 // Read the CPU flags from /proc/cpuinfo.
198 FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
Jason Samsf5ef8df2013-08-06 13:49:25 -0700199
Matt Wala11fd9ec2015-07-10 16:40:12 -0700200 if (!cpuinfo) {
Jason Samsf5ef8df2013-08-06 13:49:25 -0700201 return;
202 }
203
Matt Wala11fd9ec2015-07-10 16:40:12 -0700204 char cpuinfostr[4096];
Miao Wang5d70cb52015-07-17 11:53:04 -0700205 // fgets() ends with newline or EOF, need to check the whole
206 // "cpuinfo" file to make sure we can use SIMD or not.
207 while (fgets(cpuinfostr, sizeof(cpuinfostr), cpuinfo)) {
208#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
209 gArchUseSIMD = strstr(cpuinfostr, " neon") || strstr(cpuinfostr, " asimd");
210#elif defined(ARCH_X86_HAVE_SSSE3)
211 gArchUseSIMD = strstr(cpuinfostr, " ssse3");
212#endif
213 if (gArchUseSIMD) {
214 break;
215 }
Matt Wala11fd9ec2015-07-10 16:40:12 -0700216 }
217 fclose(cpuinfo);
Jason Samsf5ef8df2013-08-06 13:49:25 -0700218}
Jason Samsf5ef8df2013-08-06 13:49:25 -0700219
Jason Sams709a0972012-11-15 18:18:04 -0800220bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
221 sym_lookup_t lfn, script_lookup_t slfn) {
Jason Sams709a0972012-11-15 18:18:04 -0800222 mSymLookupFn = lfn;
223 mScriptLookupFn = slfn;
224
225 lockMutex();
226 if (!gThreadTLSKeyCount) {
Chris Wailes44bef6f2014-08-12 13:51:10 -0700227 int status = pthread_key_create(&gThreadTLSKey, nullptr);
Jason Sams709a0972012-11-15 18:18:04 -0800228 if (status) {
229 ALOGE("Failed to init thread tls key.");
230 unlockMutex();
231 return false;
232 }
233 }
234 gThreadTLSKeyCount++;
235 unlockMutex();
236
237 mTlsStruct.mContext = mRSC;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700238 mTlsStruct.mScript = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800239 int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
240 if (status) {
241 ALOGE("pthread_setspecific %i", status);
242 }
243
David Gross35dbc8c2016-03-29 13:48:41 -0700244 mPageSize = sysconf(_SC_PAGE_SIZE);
David Gross013ff532016-04-01 12:46:58 -0700245 // ALOGV("page size = %ld", mPageSize);
David Gross35dbc8c2016-03-29 13:48:41 -0700246
Jason Samsf5ef8df2013-08-06 13:49:25 -0700247 GetCpuInfo();
Jason Samsf5ef8df2013-08-06 13:49:25 -0700248
Jason Sams77d57a32014-10-23 17:43:53 -0700249 int cpu = sysconf(_SC_NPROCESSORS_CONF);
Jason Sams709a0972012-11-15 18:18:04 -0800250 if(mRSC->props.mDebugMaxThreads) {
251 cpu = mRSC->props.mDebugMaxThreads;
252 }
253 if (cpu < 2) {
254 mWorkers.mCount = 0;
255 return true;
256 }
257
258 // Subtract one from the cpu count because we also use the command thread as a worker.
259 mWorkers.mCount = (uint32_t)(cpu - 1);
260
Yang Ni554054c2016-04-20 09:04:02 -0700261 if (mRSC->props.mLogScripts) {
262 ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
263 }
Jason Sams709a0972012-11-15 18:18:04 -0800264
265 mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
266 mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
267 mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
Chris Wailes44bef6f2014-08-12 13:51:10 -0700268 mWorkers.mLaunchCallback = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800269
270 mWorkers.mCompleteSignal.init();
271
Tim Murray0b575de2013-03-15 15:56:43 -0700272 mWorkers.mRunningCount = mWorkers.mCount;
273 mWorkers.mLaunchCount = 0;
274 __sync_synchronize();
Jason Sams709a0972012-11-15 18:18:04 -0800275
276 pthread_attr_t threadAttr;
277 status = pthread_attr_init(&threadAttr);
278 if (status) {
279 ALOGE("Failed to init thread attribute.");
280 return false;
281 }
282
283 for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
284 status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
285 if (status) {
286 mWorkers.mCount = ct;
287 ALOGE("Created fewer than expected number of RS threads.");
288 break;
289 }
290 }
Tim Murray0b575de2013-03-15 15:56:43 -0700291 while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
Jason Sams709a0972012-11-15 18:18:04 -0800292 usleep(100);
293 }
294
295 pthread_attr_destroy(&threadAttr);
296 return true;
297}
298
299
300void RsdCpuReferenceImpl::setPriority(int32_t priority) {
301 for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
302 setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
303 }
304}
305
306RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
307 mExit = true;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700308 mWorkers.mLaunchData = nullptr;
309 mWorkers.mLaunchCallback = nullptr;
Tim Murray0b575de2013-03-15 15:56:43 -0700310 mWorkers.mRunningCount = mWorkers.mCount;
311 __sync_synchronize();
Jason Sams709a0972012-11-15 18:18:04 -0800312 for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
313 mWorkers.mLaunchSignals[ct].set();
314 }
315 void *res;
316 for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
317 pthread_join(mWorkers.mThreadId[ct], &res);
318 }
Miao Wang19601aa2016-04-21 20:36:57 -0700319 // b/23109602
320 // TODO: Refactor the implementation with threadpool to
321 // fix the race condition in the destuctor.
322 // rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
Jens Gulin07ef7042014-02-19 18:16:01 +0100323 free(mWorkers.mThreadId);
324 free(mWorkers.mNativeThreadId);
325 delete[] mWorkers.mLaunchSignals;
Jason Sams709a0972012-11-15 18:18:04 -0800326
327 // Global structure cleanup.
328 lockMutex();
329 --gThreadTLSKeyCount;
330 if (!gThreadTLSKeyCount) {
331 pthread_key_delete(gThreadTLSKey);
332 }
333 unlockMutex();
334
335}
336
Matt Wala14ce0072015-07-30 17:30:25 -0700337// Set up the appropriate input and output pointers to the kernel driver info structure.
338// Inputs:
339// mtls - The MTLaunchStruct holding information about the kernel launch
340// fep - The forEach parameters (driver info structure)
341// x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension
342static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep,
Jason Samsc0d68472015-01-20 14:29:52 -0800343 uint32_t x, uint32_t y,
344 uint32_t z = 0, uint32_t lod = 0,
345 RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
346 uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
Jason Samsc0d68472015-01-20 14:29:52 -0800347 for (uint32_t i = 0; i < fep->inLen; i++) {
348 fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
349 }
Jason Samsc0d68472015-01-20 14:29:52 -0800350 if (mtls->aout[0] != nullptr) {
351 fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
352 }
353}
354
David Gross6c1876b2016-01-15 11:52:14 -0800355// Set up the appropriate input and output pointers to the kernel driver info structure.
356// Inputs:
357// mtls - The MTLaunchStruct holding information about the kernel launch
358// redp - The reduce parameters (driver info structure)
359// x, y, z - The start offsets into each dimension
David Grossae2ec3f2016-06-01 14:45:47 -0700360static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
David Gross6c1876b2016-01-15 11:52:14 -0800361 uint32_t x, uint32_t y, uint32_t z) {
362 for (uint32_t i = 0; i < redp->inLen; i++) {
363 redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
364 }
365}
366
// Peels one dimension off a linear slice number.
//
// For a non-empty range [start, end) the coordinate within the range is
// written to *p and the quotient (the part of `val` that belongs to the
// remaining dimensions) is returned. For an empty range (start >= end) the
// dimension is pinned to `start` and `val` is returned unchanged.
static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
    if (end <= start) {
        *p = start;
        return val;
    }

    const uint32_t extent = end - start;
    *p = start + (val % extent);
    return val / extent;
}
379
David Gross6c1876b2016-01-15 11:52:14 -0800380static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800381 uint32_t r = sliceNum;
David Gross6c1876b2016-01-15 11:52:14 -0800382 r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
383 r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
384 r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
385 r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
386 r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
387 r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
388 r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800389 return r == 0;
390}
391
David Grossf9bd1f22016-04-06 16:45:26 -0700392static bool SelectZSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
393 return sliceInt(&info->current.z, sliceNum, mtls->start.z, mtls->end.z) == 0;
394}
Jason Samsbf2111d2015-01-26 18:13:41 -0800395
David Grossf9bd1f22016-04-06 16:45:26 -0700396static void walk_general_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700397 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsbf2111d2015-01-26 18:13:41 -0800398 RsExpandKernelDriverInfo fep = mtls->fep;
399 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700400 ForEachFunc_t fn = mtls->kernel;
Jason Samsbf2111d2015-01-26 18:13:41 -0800401
Jason Samsbf2111d2015-01-26 18:13:41 -0800402 while(1) {
403 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
404
Jason Sams59b35f22015-03-17 12:59:17 -0700405 if (!SelectOuterSlice(mtls, &fep, slice)) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800406 return;
407 }
408
Jason Sams59b35f22015-03-17 12:59:17 -0700409 for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y;
410 fep.current.y++) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800411
Jason Sams59b35f22015-03-17 12:59:17 -0700412 FepPtrSetup(mtls, &fep, mtls->start.x,
413 fep.current.y, fep.current.z, fep.current.lod,
414 (RsAllocationCubemapFace)fep.current.face,
415 fep.current.array[0], fep.current.array[1],
416 fep.current.array[2], fep.current.array[3]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800417
Jason Sams59b35f22015-03-17 12:59:17 -0700418 fn(&fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
Jason Samsbf2111d2015-01-26 18:13:41 -0800419 }
420 }
Jason Samsbf2111d2015-01-26 18:13:41 -0800421}
Jason Samsc0d68472015-01-20 14:29:52 -0800422
David Grossf9bd1f22016-04-06 16:45:26 -0700423static void walk_2d_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700424 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsc0d68472015-01-20 14:29:52 -0800425 RsExpandKernelDriverInfo fep = mtls->fep;
426 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700427 ForEachFunc_t fn = mtls->kernel;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700428
Jason Samsc0d68472015-01-20 14:29:52 -0800429 while (1) {
430 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
Jason Samsbf2111d2015-01-26 18:13:41 -0800431 uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
Jason Samsc0d68472015-01-20 14:29:52 -0800432 uint32_t yEnd = yStart + mtls->mSliceSize;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700433
Jason Samsbf2111d2015-01-26 18:13:41 -0800434 yEnd = rsMin(yEnd, mtls->end.y);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700435
Jason Samsc0d68472015-01-20 14:29:52 -0800436 if (yEnd <= yStart) {
437 return;
Stephen Hines4b2bea32014-08-13 17:32:10 +0000438 }
Jason Samsc0d68472015-01-20 14:29:52 -0800439
440 for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800441 FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
Jason Samsc0d68472015-01-20 14:29:52 -0800442
David Grossb0abb142015-03-12 15:23:03 -0700443 fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
Jason Samsc0d68472015-01-20 14:29:52 -0800444 }
445 }
Stephen Hines4b2bea32014-08-13 17:32:10 +0000446}
447
David Gross35dbc8c2016-03-29 13:48:41 -0700448static void walk_1d_foreach(void *usr, uint32_t idx) {
Matt Wala14ce0072015-07-30 17:30:25 -0700449 MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
Jason Samsc0d68472015-01-20 14:29:52 -0800450 RsExpandKernelDriverInfo fep = mtls->fep;
451 fep.lid = idx;
Matt Wala14ce0072015-07-30 17:30:25 -0700452 ForEachFunc_t fn = mtls->kernel;
Chris Wailesf3712132014-07-16 15:18:30 -0700453
Jason Samsc0d68472015-01-20 14:29:52 -0800454 while (1) {
455 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
Jason Samsbf2111d2015-01-26 18:13:41 -0800456 uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
Jason Samsc0d68472015-01-20 14:29:52 -0800457 uint32_t xEnd = xStart + mtls->mSliceSize;
Chris Wailesf3712132014-07-16 15:18:30 -0700458
Jason Samsbf2111d2015-01-26 18:13:41 -0800459 xEnd = rsMin(xEnd, mtls->end.x);
Chris Wailesf3712132014-07-16 15:18:30 -0700460
Jason Samsc0d68472015-01-20 14:29:52 -0800461 if (xEnd <= xStart) {
462 return;
Chris Wailesf3712132014-07-16 15:18:30 -0700463 }
Jason Samsc0d68472015-01-20 14:29:52 -0800464
Jason Samsbf2111d2015-01-26 18:13:41 -0800465 FepPtrSetup(mtls, &fep, xStart, 0);
Jason Samsc0d68472015-01-20 14:29:52 -0800466
David Grossb0abb142015-03-12 15:23:03 -0700467 fn(&fep, xStart, xEnd, fep.outStride[0]);
Jason Samsc0d68472015-01-20 14:29:52 -0800468 }
Chris Wailesf3712132014-07-16 15:18:30 -0700469}
470
// The function format_bytes() is an auxiliary function to assist in logging.
//
// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
// to an output (outBuf).
//
// Output format:
// - starts with ": "
// - each input byte is translated to a pair of lowercase hex digits
// - bytes are separated by "." except that every fourth separator is "|"
// - if more than kFormatInBytesMax bytes are supplied, the output is
//   truncated and terminated with "..."
//
// Arguments:
// - outBuf  -- Pointer to buffer of type "FormatBuf" into which output is written
// - inBuf   -- Pointer to bytes which are to be formatted into outBuf
// - inBytes -- Number of bytes in inBuf (zero or negative formats no bytes)
//
// Constant:
// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
//                        from inBuf
//
// Return value:
// - pointer (const char *) to the null-terminated output (which is part of outBuf)
//
static const int kFormatInBytesMax = 16;
// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
    char *const out = *outBuf;
    const size_t capacity = sizeof(FormatBuf);
    size_t pos = 0;

    out[pos++] = ':';
    out[pos++] = ' ';

    const int lim = (inBytes < kFormatInBytesMax) ? inBytes : kFormatInBytesMax;
    for (int i = 0; i < lim; ++i) {
        if (i) {
            // Separators are written as plain characters rather than being
            // passed to a printf-style function as the format string.
            out[pos++] = (i % 4) ? '.' : '|';
        }
        snprintf(out + pos, capacity - pos, "%02x", inBuf[i]);
        pos += 2;
    }

    // Terminates the string in every case, and marks truncated input.
    const char *tail = (kFormatInBytesMax < inBytes) ? "..." : "";
    snprintf(out + pos, capacity - pos, "%s", tail);
    return out;
}
513
David Grossae2ec3f2016-06-01 14:45:47 -0700514static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
515 const char *walkerName, uint32_t threadIdx) {
David Grossf9bd1f22016-04-06 16:45:26 -0700516 rsAssert(!accumPtr);
517
518 uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
519 if (mtls->outFunc) {
520 accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
521 } else {
522 if (accumIdx == 0) {
523 accumPtr = mtls->redp.outPtr[0];
524 } else {
525 accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
526 }
527 }
David Grossae2ec3f2016-06-01 14:45:47 -0700528 REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
529 walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
David Grossf9bd1f22016-04-06 16:45:26 -0700530 // initialize accumulator
531 if (mtls->initFunc) {
532 mtls->initFunc(accumPtr);
533 } else {
534 memset(accumPtr, 0, mtls->accumSize);
535 }
536}
537
David Grossae2ec3f2016-06-01 14:45:47 -0700538static void walk_1d_reduce(void *usr, uint32_t idx) {
539 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Gross35dbc8c2016-03-29 13:48:41 -0700540 RsExpandKernelDriverInfo redp = mtls->redp;
541
542 // find accumulator
543 uint8_t *&accumPtr = mtls->accumPtr[idx];
544 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700545 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Gross35dbc8c2016-03-29 13:48:41 -0700546 }
547
548 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700549 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Gross35dbc8c2016-03-29 13:48:41 -0700550 while (1) {
551 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
552 uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
553 uint32_t xEnd = xStart + mtls->mSliceSize;
554
555 xEnd = rsMin(xEnd, mtls->end.x);
556
557 if (xEnd <= xStart) {
558 return;
559 }
560
561 RedpPtrSetup(mtls, &redp, xStart, 0, 0);
562 fn(&redp, xStart, xEnd, accumPtr);
563
David Grossf9bd1f22016-04-06 16:45:26 -0700564 // Emit log line after slice has been run, so that we can include
565 // the results of the run on that line.
David Gross35dbc8c2016-03-29 13:48:41 -0700566 FormatBuf fmt;
David Gross013ff532016-04-01 12:46:58 -0700567 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700568 format_bytes(&fmt, accumPtr, mtls->accumSize);
569 } else {
570 fmt[0] = 0;
571 }
David Grossae2ec3f2016-06-01 14:45:47 -0700572 REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
573 mtls->accumFunc, idx, xStart, xEnd, fmt);
David Gross35dbc8c2016-03-29 13:48:41 -0700574 }
575}
576
David Grossae2ec3f2016-06-01 14:45:47 -0700577static void walk_2d_reduce(void *usr, uint32_t idx) {
578 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Grossf9bd1f22016-04-06 16:45:26 -0700579 RsExpandKernelDriverInfo redp = mtls->redp;
580
581 // find accumulator
582 uint8_t *&accumPtr = mtls->accumPtr[idx];
583 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700584 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Grossf9bd1f22016-04-06 16:45:26 -0700585 }
586
587 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700588 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Grossf9bd1f22016-04-06 16:45:26 -0700589 while (1) {
590 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
591 uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
592 uint32_t yEnd = yStart + mtls->mSliceSize;
593
594 yEnd = rsMin(yEnd, mtls->end.y);
595
596 if (yEnd <= yStart) {
597 return;
598 }
599
600 for (redp.current.y = yStart; redp.current.y < yEnd; redp.current.y++) {
601 RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, 0);
602 fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
603 }
604
605 FormatBuf fmt;
606 if (mtls->logReduce >= 3) {
607 format_bytes(&fmt, accumPtr, mtls->accumSize);
608 } else {
609 fmt[0] = 0;
610 }
David Grossae2ec3f2016-06-01 14:45:47 -0700611 REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
612 mtls->accumFunc, idx, yStart, yEnd, fmt);
David Grossf9bd1f22016-04-06 16:45:26 -0700613 }
614}
615
David Grossae2ec3f2016-06-01 14:45:47 -0700616static void walk_3d_reduce(void *usr, uint32_t idx) {
617 const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
David Grossf9bd1f22016-04-06 16:45:26 -0700618 RsExpandKernelDriverInfo redp = mtls->redp;
619
620 // find accumulator
621 uint8_t *&accumPtr = mtls->accumPtr[idx];
622 if (!accumPtr) {
David Grossae2ec3f2016-06-01 14:45:47 -0700623 reduce_get_accumulator(accumPtr, mtls, __func__, idx);
David Grossf9bd1f22016-04-06 16:45:26 -0700624 }
625
626 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700627 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Grossf9bd1f22016-04-06 16:45:26 -0700628 while (1) {
629 uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
630
631 if (!SelectZSlice(mtls, &redp, slice)) {
632 return;
633 }
634
635 for (redp.current.y = mtls->start.y; redp.current.y < mtls->end.y; redp.current.y++) {
636 RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, redp.current.z);
637 fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
638 }
639
640 FormatBuf fmt;
641 if (mtls->logReduce >= 3) {
642 format_bytes(&fmt, accumPtr, mtls->accumSize);
643 } else {
644 fmt[0] = 0;
645 }
David Grossae2ec3f2016-06-01 14:45:47 -0700646 REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
647 mtls->accumFunc, idx, redp.current.z, fmt);
David Grossf9bd1f22016-04-06 16:45:26 -0700648 }
649}
650
David Gross6c1876b2016-01-15 11:52:14 -0800651// Launch a general reduce-style kernel.
652// Inputs:
653// ains[0..inLen-1]: Array of allocations that contain the inputs
654// aout: The allocation that will hold the output
655// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700656void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
657 uint32_t inLen,
658 Allocation * aout,
659 MTLaunchStructReduce *mtls) {
David Gross013ff532016-04-01 12:46:58 -0700660 mtls->logReduce = mRSC->props.mLogReduce;
David Gross35dbc8c2016-03-29 13:48:41 -0700661 if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
David Grossae2ec3f2016-06-01 14:45:47 -0700662 launchReduceParallel(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700663 } else {
David Grossae2ec3f2016-06-01 14:45:47 -0700664 launchReduceSerial(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700665 }
666}
667
668// Launch a general reduce-style kernel, single-threaded.
669// Inputs:
670// ains[0..inLen-1]: Array of allocations that contain the inputs
671// aout: The allocation that will hold the output
672// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700673void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
674 uint32_t inLen,
675 Allocation * aout,
676 MTLaunchStructReduce *mtls) {
677 REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
678 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
David Gross35dbc8c2016-03-29 13:48:41 -0700679
David Gross6c1876b2016-01-15 11:52:14 -0800680 // In the presence of outconverter, we allocate temporary memory for
681 // the accumulator.
682 //
683 // In the absence of outconverter, we use the output allocation as the
684 // accumulator.
685 uint8_t *const accumPtr = (mtls->outFunc
686 ? static_cast<uint8_t *>(malloc(mtls->accumSize))
687 : mtls->redp.outPtr[0]);
688
689 // initialize
690 if (mtls->initFunc) {
691 mtls->initFunc(accumPtr);
692 } else {
693 memset(accumPtr, 0, mtls->accumSize);
694 }
695
696 // accumulate
David Grossae2ec3f2016-06-01 14:45:47 -0700697 const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
David Gross6c1876b2016-01-15 11:52:14 -0800698 uint32_t slice = 0;
699 while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
700 for (mtls->redp.current.y = mtls->start.y;
701 mtls->redp.current.y < mtls->end.y;
702 mtls->redp.current.y++) {
703 RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
704 fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
705 }
706 }
707
708 // outconvert
709 if (mtls->outFunc) {
710 mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
711 free(accumPtr);
712 }
713}
714
David Gross35dbc8c2016-03-29 13:48:41 -0700715// Launch a general reduce-style kernel, multi-threaded.
716// Inputs:
717// ains[0..inLen-1]: Array of allocations that contain the inputs
718// aout: The allocation that will hold the output
719// mtls: Holds launch parameters
David Grossae2ec3f2016-06-01 14:45:47 -0700720void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
721 uint32_t inLen,
722 Allocation * aout,
723 MTLaunchStructReduce *mtls) {
David Grossf9bd1f22016-04-06 16:45:26 -0700724 // For now, we don't know how to go parallel in the absence of a combiner.
725 if (!mtls->combFunc) {
David Grossae2ec3f2016-06-01 14:45:47 -0700726 launchReduceSerial(ains, inLen, aout, mtls);
David Gross35dbc8c2016-03-29 13:48:41 -0700727 return;
728 }
729
730 // Number of threads = "main thread" + number of other (worker) threads
731 const uint32_t numThreads = mWorkers.mCount + 1;
732
733 // In the absence of outconverter, we use the output allocation as
734 // an accumulator, and therefore need to allocate one fewer accumulator.
735 const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
736
737 // If mDebugReduceSplitAccum, then we want each accumulator to start
738 // on a page boundary. (TODO: Would some unit smaller than a page
739 // be sufficient to avoid false sharing?)
740 if (mRSC->props.mDebugReduceSplitAccum) {
741 // Round up accumulator size to an integral number of pages
742 mtls->accumStride =
743 (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
744 ~(unsigned(mPageSize)-1);
745 // Each accumulator gets its own page. Alternatively, if we just
746 // wanted to make sure no two accumulators are on the same page,
747 // we could instead do
748 // allocSize = mtls->accumStride * (numAllocation - 1) + mtls->accumSize
749 const size_t allocSize = mtls->accumStride * numAllocAccum;
750 mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
751 } else {
752 mtls->accumStride = mtls->accumSize;
753 mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
754 }
755
756 const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
757 mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
758 memset(mtls->accumPtr, 0, accumPtrArrayBytes);
759
760 mtls->accumCount = 0;
761
762 rsAssert(!mInKernel);
763 mInKernel = true;
David Grossae2ec3f2016-06-01 14:45:47 -0700764 REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
765 mtls->accumFunc,
766 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
767 numThreads, mtls->accumAlloc);
David Grossf9bd1f22016-04-06 16:45:26 -0700768 if (mtls->redp.dim.z > 1) {
769 mtls->mSliceSize = 1;
David Grossae2ec3f2016-06-01 14:45:47 -0700770 launchThreads(walk_3d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700771 } else if (mtls->redp.dim.y > 1) {
772 mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
David Grossae2ec3f2016-06-01 14:45:47 -0700773 launchThreads(walk_2d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700774 } else {
775 mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
David Grossae2ec3f2016-06-01 14:45:47 -0700776 launchThreads(walk_1d_reduce, mtls);
David Grossf9bd1f22016-04-06 16:45:26 -0700777 }
David Gross35dbc8c2016-03-29 13:48:41 -0700778 mInKernel = false;
779
780 // Combine accumulators and identify final accumulator
781 uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
782 // Loop over accumulators, combining into finalAccumPtr. If finalAccumPtr
783 // is null, then the first accumulator I find becomes finalAccumPtr.
784 for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
785 uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
786 if (finalAccumPtr) {
787 if (finalAccumPtr != thisAccumPtr) {
788 if (mtls->combFunc) {
David Gross013ff532016-04-01 12:46:58 -0700789 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700790 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700791 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
792 mtls->accumFunc,
793 format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
794 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulator[%d]%s",
795 mtls->accumFunc, idx,
796 format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
David Gross35dbc8c2016-03-29 13:48:41 -0700797 }
798 mtls->combFunc(finalAccumPtr, thisAccumPtr);
799 } else {
800 rsAssert(!"expected combiner");
801 }
802 }
803 } else {
804 finalAccumPtr = thisAccumPtr;
805 }
806 }
807 rsAssert(finalAccumPtr != nullptr);
David Gross013ff532016-04-01 12:46:58 -0700808 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700809 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700810 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
811 mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
David Gross35dbc8c2016-03-29 13:48:41 -0700812 }
813
814 // Outconvert
815 if (mtls->outFunc) {
816 mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
David Gross013ff532016-04-01 12:46:58 -0700817 if (mtls->logReduce >= 3) {
David Gross35dbc8c2016-03-29 13:48:41 -0700818 FormatBuf fmt;
David Grossae2ec3f2016-06-01 14:45:47 -0700819 REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
820 mtls->accumFunc,
821 format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
David Gross35dbc8c2016-03-29 13:48:41 -0700822 }
823 }
824
825 // Clean up
826 free(mtls->accumPtr);
827 free(mtls->accumAlloc);
828}
829
830
Matt Wala14ce0072015-07-30 17:30:25 -0700831void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
Chris Wailesf3712132014-07-16 15:18:30 -0700832 uint32_t inLen,
833 Allocation* aout,
834 const RsScriptCall* sc,
Matt Wala14ce0072015-07-30 17:30:25 -0700835 MTLaunchStructForEach* mtls) {
Stephen Hines4b2bea32014-08-13 17:32:10 +0000836
837 //android::StopWatch kernel_time("kernel time");
838
Jason Samsbf2111d2015-01-26 18:13:41 -0800839 bool outerDims = (mtls->start.z != mtls->end.z) ||
840 (mtls->start.face != mtls->end.face) ||
841 (mtls->start.lod != mtls->end.lod) ||
842 (mtls->start.array[0] != mtls->end.array[0]) ||
843 (mtls->start.array[1] != mtls->end.array[1]) ||
844 (mtls->start.array[2] != mtls->end.array[2]) ||
845 (mtls->start.array[3] != mtls->end.array[3]);
846
David Gross35dbc8c2016-03-29 13:48:41 -0700847 if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
Stephen Hines4b2bea32014-08-13 17:32:10 +0000848 const size_t targetByteChunk = 16 * 1024;
David Gross35dbc8c2016-03-29 13:48:41 -0700849 mInKernel = true; // NOTE: The guard immediately above ensures this was !mInKernel
Chris Wailesf3712132014-07-16 15:18:30 -0700850
Jason Samsbf2111d2015-01-26 18:13:41 -0800851 if (outerDims) {
852 // No fancy logic for chunk size
853 mtls->mSliceSize = 1;
David Grossf9bd1f22016-04-06 16:45:26 -0700854 launchThreads(walk_general_foreach, mtls);
Jason Samsbf2111d2015-01-26 18:13:41 -0800855 } else if (mtls->fep.dim.y > 1) {
Jason Samsc0d68472015-01-20 14:29:52 -0800856 uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000857 uint32_t s2 = 0;
858
859 // This chooses our slice size to rate limit atomic ops to
860 // one per 16k bytes of reads/writes.
Jason Samsc0d68472015-01-20 14:29:52 -0800861 if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
862 s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
Jason Samsa9139c72015-04-16 15:11:04 -0700863 } else if (mtls->ains[0]) {
Jason Samsc0d68472015-01-20 14:29:52 -0800864 s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
Jason Samsa9139c72015-04-16 15:11:04 -0700865 } else {
866 // Launch option only case
867 // Use s1 based only on the dimensions
868 s2 = s1;
Stephen Hines4b2bea32014-08-13 17:32:10 +0000869 }
870 mtls->mSliceSize = rsMin(s1, s2);
871
872 if(mtls->mSliceSize < 1) {
873 mtls->mSliceSize = 1;
874 }
875
David Grossf9bd1f22016-04-06 16:45:26 -0700876 launchThreads(walk_2d_foreach, mtls);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000877 } else {
Jason Samsc0d68472015-01-20 14:29:52 -0800878 uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
Stephen Hines4b2bea32014-08-13 17:32:10 +0000879 uint32_t s2 = 0;
880
881 // This chooses our slice size to rate limit atomic ops to
882 // one per 16k bytes of reads/writes.
Jason Samsc0d68472015-01-20 14:29:52 -0800883 if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
884 s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
Jason Samsa9139c72015-04-16 15:11:04 -0700885 } else if (mtls->ains[0]) {
Jason Samsc0d68472015-01-20 14:29:52 -0800886 s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
Jason Samsa9139c72015-04-16 15:11:04 -0700887 } else {
888 // Launch option only case
889 // Use s1 based only on the dimensions
890 s2 = s1;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700891 }
892 mtls->mSliceSize = rsMin(s1, s2);
893
894 if (mtls->mSliceSize < 1) {
895 mtls->mSliceSize = 1;
896 }
897
David Gross35dbc8c2016-03-29 13:48:41 -0700898 launchThreads(walk_1d_foreach, mtls);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700899 }
David Gross35dbc8c2016-03-29 13:48:41 -0700900 mInKernel = false;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700901
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700902 } else {
Matt Wala14ce0072015-07-30 17:30:25 -0700903 ForEachFunc_t fn = mtls->kernel;
Jason Samsbf2111d2015-01-26 18:13:41 -0800904 uint32_t slice = 0;
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700905
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700906
Jason Sams59b35f22015-03-17 12:59:17 -0700907 while(SelectOuterSlice(mtls, &mtls->fep, slice++)) {
Jason Samsbf2111d2015-01-26 18:13:41 -0800908 for (mtls->fep.current.y = mtls->start.y;
909 mtls->fep.current.y < mtls->end.y;
910 mtls->fep.current.y++) {
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700911
Jason Samsbf2111d2015-01-26 18:13:41 -0800912 FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
913 mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
914 (RsAllocationCubemapFace) mtls->fep.current.face,
915 mtls->fep.current.array[0], mtls->fep.current.array[1],
916 mtls->fep.current.array[2], mtls->fep.current.array[3]);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700917
David Grossb0abb142015-03-12 15:23:03 -0700918 fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700919 }
920 }
Chris Wailes4b3c34e2014-06-11 12:00:29 -0700921 }
922}
923
Jason Sams709a0972012-11-15 18:18:04 -0800924RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
925 //ALOGE("setTls %p", sc);
926 ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
927 rsAssert(tls);
928 RsdCpuScriptImpl *old = tls->mImpl;
929 tls->mImpl = sc;
930 tls->mContext = mRSC;
931 if (sc) {
932 tls->mScript = sc->getScript();
933 } else {
Chris Wailes44bef6f2014-08-12 13:51:10 -0700934 tls->mScript = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800935 }
936 return old;
937}
938
939const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
940 return mSymLookupFn(mRSC, name);
941}
942
943
944RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
945 char const *resName, char const *cacheDir,
946 uint8_t const *bitcode, size_t bitcodeSize,
947 uint32_t flags) {
948
949 RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
Stephen Hines00511322014-01-31 11:20:23 -0800950 if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags
Stephen Hines00511322014-01-31 11:20:23 -0800951 , getBccPluginName()
Stephen Hines00511322014-01-31 11:20:23 -0800952 )) {
Jason Sams709a0972012-11-15 18:18:04 -0800953 delete i;
Chris Wailes44bef6f2014-08-12 13:51:10 -0700954 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800955 }
956 return i;
957}
958
Jason Sams7c4b8882013-01-04 10:50:05 -0800959extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
960 const Script *s, const Element *e);
Jason Samsc905efd2012-11-26 15:20:18 -0800961extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
962 const Script *s, const Element *e);
963extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
964 const Script *s, const Element *e);
965extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
966 const Script *s, const Element *e);
967extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
968 const Script *s, const Element *e);
969extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
970 const Script *s, const Element *e);
971extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
972 const Script *s, const Element *e);
973extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
974 const Script *s, const Element *e);
Jason Sams2282e282013-06-17 16:52:01 -0700975extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
976 const Script *s, const Element *e);
Jason Sams39ab94a2014-04-16 17:14:05 -0700977extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
978 const Script *s, const Element *e);
Tim Murray64c682b2015-01-09 12:08:43 -0800979extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
980 const Script *s, const Element *e);
Jason Sams709a0972012-11-15 18:18:04 -0800981
982RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
983 RsScriptIntrinsicID iid, Element *e) {
984
Chris Wailes44bef6f2014-08-12 13:51:10 -0700985 RsdCpuScriptImpl *i = nullptr;
Jason Sams709a0972012-11-15 18:18:04 -0800986 switch (iid) {
Jason Sams7c4b8882013-01-04 10:50:05 -0800987 case RS_SCRIPT_INTRINSIC_ID_3DLUT:
988 i = rsdIntrinsic_3DLUT(this, s, e);
989 break;
Jason Sams709a0972012-11-15 18:18:04 -0800990 case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
Jason Samsc905efd2012-11-26 15:20:18 -0800991 i = rsdIntrinsic_Convolve3x3(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800992 break;
993 case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
Jason Samsc905efd2012-11-26 15:20:18 -0800994 i = rsdIntrinsic_ColorMatrix(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800995 break;
996 case RS_SCRIPT_INTRINSIC_ID_LUT:
Jason Samsc905efd2012-11-26 15:20:18 -0800997 i = rsdIntrinsic_LUT(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -0800998 break;
999 case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
Jason Samsc905efd2012-11-26 15:20:18 -08001000 i = rsdIntrinsic_Convolve5x5(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001001 break;
1002 case RS_SCRIPT_INTRINSIC_ID_BLUR:
Jason Samsc905efd2012-11-26 15:20:18 -08001003 i = rsdIntrinsic_Blur(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001004 break;
1005 case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
Jason Samsc905efd2012-11-26 15:20:18 -08001006 i = rsdIntrinsic_YuvToRGB(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001007 break;
1008 case RS_SCRIPT_INTRINSIC_ID_BLEND:
Jason Samsc905efd2012-11-26 15:20:18 -08001009 i = rsdIntrinsic_Blend(this, s, e);
Jason Sams709a0972012-11-15 18:18:04 -08001010 break;
Jason Sams2282e282013-06-17 16:52:01 -07001011 case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
1012 i = rsdIntrinsic_Histogram(this, s, e);
1013 break;
Jason Sams39ab94a2014-04-16 17:14:05 -07001014 case RS_SCRIPT_INTRINSIC_ID_RESIZE:
1015 i = rsdIntrinsic_Resize(this, s, e);
1016 break;
Tim Murray64c682b2015-01-09 12:08:43 -08001017 case RS_SCRIPT_INTRINSIC_ID_BLAS:
1018 i = rsdIntrinsic_BLAS(this, s, e);
1019 break;
Jason Sams709a0972012-11-15 18:18:04 -08001020
1021 default:
1022 rsAssert(0);
1023 }
1024
1025 return i;
1026}
1027
Yang Ni1ffd86b2015-01-07 09:16:40 -08001028void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {
1029 switch (sg->getApiVersion()) {
1030 case ScriptGroupBase::SG_V1: {
1031 CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
1032 if (!sgi->init()) {
Jason Sams709a0972012-11-15 18:18:04 -08001033 delete sgi;
Chris Wailes44bef6f2014-08-12 13:51:10 -07001034 return nullptr;
Yang Ni1ffd86b2015-01-07 09:16:40 -08001035 }
1036 return sgi;
Jason Sams709a0972012-11-15 18:18:04 -08001037 }
Yang Ni1ffd86b2015-01-07 09:16:40 -08001038 case ScriptGroupBase::SG_V2: {
1039 return new CpuScriptGroup2Impl(this, sg);
1040 }
1041 }
1042 return nullptr;
Jason Sams709a0972012-11-15 18:18:04 -08001043}