Blame - toolkit/TaskProcessor.cpp - platform/frameworks/rs

blob: d9ae83ca1f58e7b7ff339192f8e0e5fc90cb7640 [file] [log] [blame]

Jean-Luc Brouillet	9b7aff5	2021-03-05 12:11:37 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2021 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "TaskProcessor.h"
				18
				19	#include <assert.h>
				20	#include <sys/prctl.h>
				21
Jean-Luc Brouillet	5d9c8b6	2021-03-10 15:25:06 -0800	[diff] [blame]	22	#include "RenderScriptToolkit.h"
Jean-Luc Brouillet	9b7aff5	2021-03-05 12:11:37 -0800	[diff] [blame]	23	#include "Utils.h"
				24
				25	#define LOG_TAG "renderscript.toolkit.TaskProcessor"
				26
				27	namespace android {
				28	namespace renderscript {
				29
				30	int Task::setTiling(unsigned int targetTileSizeInBytes) {
				31	// Empirically, values smaller than 1000 are unlikely to give good performance.
				32	targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
				33	const size_t cellSizeInBytes =
				34	mVectorSize; // If we add float support, vectorSize * 4 for that.
				35	const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
				36	assert(targetCellsPerTile > 0);
				37
				38	size_t cellsToProcessY;
				39	size_t cellsToProcessX;
				40	if (mRestriction == nullptr) {
				41	cellsToProcessX = mSizeX;
				42	cellsToProcessY = mSizeY;
				43	} else {
				44	assert(mRestriction->endX > mRestriction->startX);
				45	assert(mRestriction->endY > mRestriction->startY);
				46	cellsToProcessX = mRestriction->endX - mRestriction->startX;
				47	cellsToProcessY = mRestriction->endY - mRestriction->startY;
				48	}
				49
				50	// We want rows as large as possible, as the SIMD code we have is more efficient with
				51	// large rows.
				52	mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
				53	// Once we know the number of tiles per row, we divide that row evenly. We round up to make
				54	// sure all cells are included in the last tile of the row.
				55	mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
				56
				57	// We do the same thing for the Y direction.
				58	size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
				59	mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
				60	mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
				61
				62	return mTilesPerRow * mTilesPerColumn;
				63	}
				64
				65	void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
				66	// Figure out the overall boundaries.
				67	size_t startWorkX;
				68	size_t startWorkY;
				69	size_t endWorkX;
				70	size_t endWorkY;
				71	if (mRestriction == nullptr) {
				72	startWorkX = 0;
				73	startWorkY = 0;
				74	endWorkX = mSizeX;
				75	endWorkY = mSizeY;
				76	} else {
				77	startWorkX = mRestriction->startX;
				78	startWorkY = mRestriction->startY;
				79	endWorkX = mRestriction->endX;
				80	endWorkY = mRestriction->endY;
				81	}
				82	// Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
				83	// first the X, Y coordinate of our tile in that grid.
				84	size_t tileIndexY = tileIndex / mTilesPerRow;
				85	size_t tileIndexX = tileIndex % mTilesPerRow;
				86	// Calculate the starting and ending point of that tile.
				87	size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
				88	size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
				89	size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
				90	size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
				91
				92	// Call the derived class to do the specific work.
Jean-Luc Brouillet	5d9c8b6	2021-03-10 15:25:06 -0800	[diff] [blame]	93	if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
				94	// When the tile covers entire rows, we can take advantage that some ops are not 2D.
				95	processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
				96	} else {
				97	processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
				98	}
Jean-Luc Brouillet	9b7aff5	2021-03-05 12:11:37 -0800	[diff] [blame]	99	}
				100
				101	TaskProcessor::TaskProcessor(unsigned int numThreads)
				102	: mUsesSimd{cpuSupportsSimd()},
				103	/* If the requested number of threads is 0, we'll decide based on the number of cores.
				104	* Through empirical testing, we've found that using more than 6 threads does not help.
				105	* There may be more optimal choices to make depending on the SoC but we'll stick to
				106	* this simple heuristic for now.
				107	*
				108	* We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
				109	* worker pool thread than the total number of threads.
				110	*/
				111	mNumberOfPoolThreads{numThreads ? numThreads - 1
				112	: std::min(6u, std::thread::hardware_concurrency() - 1)} {
				113	for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
				114	mPoolThreads.emplace_back(
				115	std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
				116	}
				117	}
				118
				119	TaskProcessor::~TaskProcessor() {
				120	{
				121	std::lock_guard<std::mutex> lock(mQueueMutex);
				122	mStopThreads = true;
				123	mWorkAvailableOrStop.notify_all();
				124	}
				125
				126	for (auto& thread : mPoolThreads) {
				127	thread.join();
				128	}
				129	}
				130
				131	void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
				132	if (threadIndex != 0) {
				133	// Set the name of the thread, except for thread 0, which is not part of the pool.
				134	// PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
				135	char name[16]{"RenderScToolkit"};
				136	prctl(PR_SET_NAME, name, 0, 0, 0);
				137	// ALOGI("Starting thread%d", threadIndex);
				138	}
				139
				140	std::unique_lock<std::mutex> lock(mQueueMutex);
				141	while (true) {
				142	mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
				143	return mStopThreads \|\| (mTilesNotYetStarted > 0) \|\|
				144	(returnWhenNoWork && (mTilesNotYetStarted == 0));
				145	});
				146	// ALOGI("Woke thread%d", threadIndex);
				147
				148	// This ScopedLockAssertion is to help the compiler when it checks thread annotations
				149	// to realize that we have the lock. It's however not completely true; we don't
				150	// hold the lock while processing the tile.
				151	// TODO Figure out how to fix that.
				152	android::base::ScopedLockAssertion lockAssert(mQueueMutex);
				153	if (mStopThreads \|\| (returnWhenNoWork && mTilesNotYetStarted == 0)) {
				154	break;
				155	}
				156
				157	while (mTilesNotYetStarted > 0 && !mStopThreads) {
				158	// This picks the tiles in decreasing order but that does not matter.
				159	int myTile = --mTilesNotYetStarted;
				160	mTilesInProcess++;
				161	lock.unlock();
				162	{
				163	// We won't be executing this code unless the main thread is
				164	// holding the mTaskMutex lock, which guards mCurrentTask.
				165	// The compiler can't figure this out.
				166	android::base::ScopedLockAssertion lockAssert(mTaskMutex);
				167	mCurrentTask->processTile(threadIndex, myTile);
				168	}
				169	lock.lock();
				170	mTilesInProcess--;
				171	if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
				172	mWorkIsFinished.notify_one();
				173	}
				174	}
				175	}
				176	// if (threadIndex != 0) {
				177	// ALOGI("Ending thread%d", threadIndex);
				178	// }
				179	}
				180
				181	void TaskProcessor::doTask(Task* task) {
				182	std::lock_guard<std::mutex> lockGuard(mTaskMutex);
				183	task->setUsesSimd(mUsesSimd);
				184	mCurrentTask = task;
				185	// Notify the thread pool of available work.
				186	startWork(task);
				187	// Start processing some of the tiles on the calling thread.
				188	processTilesOfWork(0, true);
				189	// Wait for all the pool workers to complete.
				190	waitForPoolWorkersToComplete();
				191	mCurrentTask = nullptr;
				192	}
				193
				194	void TaskProcessor::startWork(Task* task) {
				195	/**
				196	* The size in bytes that we're hoping each tile will be. If this value is too small,
				197	* we'll spend too much time in synchronization. If it's too large, some cores may be
				198	* idle while others still have a lot of work to do. Ideally, it would depend on the
				199	* device we're running. 16k is the same value used by RenderScript and seems reasonable
				200	* from ad-hoc tests.
				201	*/
				202	const size_t targetTileSize = 16 * 1024;
				203
				204	std::lock_guard<std::mutex> lock(mQueueMutex);
				205	assert(mTilesInProcess == 0);
				206	mTilesNotYetStarted = task->setTiling(targetTileSize);
				207	mWorkAvailableOrStop.notify_all();
				208	}
				209
				210	void TaskProcessor::waitForPoolWorkersToComplete() {
				211	std::unique_lock<std::mutex> lock(mQueueMutex);
				212	// The predicate, i.e. the lambda, will make sure that
				213	// we terminate even if the main thread calls this after
				214	// mWorkIsFinished is signaled.
				215	mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
				216	return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
				217	});
				218	}
				219
				220	} // namespace renderscript
				221	} // namespace android