/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 16 | |
| 17 | #include "TaskProcessor.h" |
| 18 | |
| 19 | #include <assert.h> |
| 20 | #include <sys/prctl.h> |
| 21 | |
Jean-Luc Brouillet | 5d9c8b6 | 2021-03-10 15:25:06 -0800 | [diff] [blame] | 22 | #include "RenderScriptToolkit.h" |
Jean-Luc Brouillet | 9b7aff5 | 2021-03-05 12:11:37 -0800 | [diff] [blame] | 23 | #include "Utils.h" |
| 24 | |
| 25 | #define LOG_TAG "renderscript.toolkit.TaskProcessor" |
| 26 | |
| 27 | namespace android { |
| 28 | namespace renderscript { |
| 29 | |
| 30 | int Task::setTiling(unsigned int targetTileSizeInBytes) { |
| 31 | // Empirically, values smaller than 1000 are unlikely to give good performance. |
| 32 | targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes); |
| 33 | const size_t cellSizeInBytes = |
| 34 | mVectorSize; // If we add float support, vectorSize * 4 for that. |
| 35 | const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes; |
| 36 | assert(targetCellsPerTile > 0); |
| 37 | |
| 38 | size_t cellsToProcessY; |
| 39 | size_t cellsToProcessX; |
| 40 | if (mRestriction == nullptr) { |
| 41 | cellsToProcessX = mSizeX; |
| 42 | cellsToProcessY = mSizeY; |
| 43 | } else { |
| 44 | assert(mRestriction->endX > mRestriction->startX); |
| 45 | assert(mRestriction->endY > mRestriction->startY); |
| 46 | cellsToProcessX = mRestriction->endX - mRestriction->startX; |
| 47 | cellsToProcessY = mRestriction->endY - mRestriction->startY; |
| 48 | } |
| 49 | |
| 50 | // We want rows as large as possible, as the SIMD code we have is more efficient with |
| 51 | // large rows. |
| 52 | mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile); |
| 53 | // Once we know the number of tiles per row, we divide that row evenly. We round up to make |
| 54 | // sure all cells are included in the last tile of the row. |
| 55 | mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow); |
| 56 | |
| 57 | // We do the same thing for the Y direction. |
| 58 | size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX); |
| 59 | mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile); |
| 60 | mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn); |
| 61 | |
| 62 | return mTilesPerRow * mTilesPerColumn; |
| 63 | } |
| 64 | |
| 65 | void Task::processTile(unsigned int threadIndex, size_t tileIndex) { |
| 66 | // Figure out the overall boundaries. |
| 67 | size_t startWorkX; |
| 68 | size_t startWorkY; |
| 69 | size_t endWorkX; |
| 70 | size_t endWorkY; |
| 71 | if (mRestriction == nullptr) { |
| 72 | startWorkX = 0; |
| 73 | startWorkY = 0; |
| 74 | endWorkX = mSizeX; |
| 75 | endWorkY = mSizeY; |
| 76 | } else { |
| 77 | startWorkX = mRestriction->startX; |
| 78 | startWorkY = mRestriction->startY; |
| 79 | endWorkX = mRestriction->endX; |
| 80 | endWorkY = mRestriction->endY; |
| 81 | } |
| 82 | // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify |
| 83 | // first the X, Y coordinate of our tile in that grid. |
| 84 | size_t tileIndexY = tileIndex / mTilesPerRow; |
| 85 | size_t tileIndexX = tileIndex % mTilesPerRow; |
| 86 | // Calculate the starting and ending point of that tile. |
| 87 | size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX; |
| 88 | size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY; |
| 89 | size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX); |
| 90 | size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY); |
| 91 | |
| 92 | // Call the derived class to do the specific work. |
Jean-Luc Brouillet | 5d9c8b6 | 2021-03-10 15:25:06 -0800 | [diff] [blame] | 93 | if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) { |
| 94 | // When the tile covers entire rows, we can take advantage that some ops are not 2D. |
| 95 | processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1); |
| 96 | } else { |
| 97 | processData(threadIndex, startCellX, startCellY, endCellX, endCellY); |
| 98 | } |
Jean-Luc Brouillet | 9b7aff5 | 2021-03-05 12:11:37 -0800 | [diff] [blame] | 99 | } |
| 100 | |
| 101 | TaskProcessor::TaskProcessor(unsigned int numThreads) |
| 102 | : mUsesSimd{cpuSupportsSimd()}, |
| 103 | /* If the requested number of threads is 0, we'll decide based on the number of cores. |
| 104 | * Through empirical testing, we've found that using more than 6 threads does not help. |
| 105 | * There may be more optimal choices to make depending on the SoC but we'll stick to |
| 106 | * this simple heuristic for now. |
| 107 | * |
| 108 | * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less |
| 109 | * worker pool thread than the total number of threads. |
| 110 | */ |
| 111 | mNumberOfPoolThreads{numThreads ? numThreads - 1 |
| 112 | : std::min(6u, std::thread::hardware_concurrency() - 1)} { |
| 113 | for (size_t i = 0; i < mNumberOfPoolThreads; i++) { |
| 114 | mPoolThreads.emplace_back( |
| 115 | std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false)); |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | TaskProcessor::~TaskProcessor() { |
| 120 | { |
| 121 | std::lock_guard<std::mutex> lock(mQueueMutex); |
| 122 | mStopThreads = true; |
| 123 | mWorkAvailableOrStop.notify_all(); |
| 124 | } |
| 125 | |
| 126 | for (auto& thread : mPoolThreads) { |
| 127 | thread.join(); |
| 128 | } |
| 129 | } |
| 130 | |
| 131 | void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) { |
| 132 | if (threadIndex != 0) { |
| 133 | // Set the name of the thread, except for thread 0, which is not part of the pool. |
| 134 | // PR_SET_NAME takes a maximum of 16 characters, including the terminating null. |
| 135 | char name[16]{"RenderScToolkit"}; |
| 136 | prctl(PR_SET_NAME, name, 0, 0, 0); |
| 137 | // ALOGI("Starting thread%d", threadIndex); |
| 138 | } |
| 139 | |
| 140 | std::unique_lock<std::mutex> lock(mQueueMutex); |
| 141 | while (true) { |
| 142 | mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) { |
| 143 | return mStopThreads || (mTilesNotYetStarted > 0) || |
| 144 | (returnWhenNoWork && (mTilesNotYetStarted == 0)); |
| 145 | }); |
| 146 | // ALOGI("Woke thread%d", threadIndex); |
| 147 | |
| 148 | // This ScopedLockAssertion is to help the compiler when it checks thread annotations |
| 149 | // to realize that we have the lock. It's however not completely true; we don't |
| 150 | // hold the lock while processing the tile. |
| 151 | // TODO Figure out how to fix that. |
| 152 | android::base::ScopedLockAssertion lockAssert(mQueueMutex); |
| 153 | if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) { |
| 154 | break; |
| 155 | } |
| 156 | |
| 157 | while (mTilesNotYetStarted > 0 && !mStopThreads) { |
| 158 | // This picks the tiles in decreasing order but that does not matter. |
| 159 | int myTile = --mTilesNotYetStarted; |
| 160 | mTilesInProcess++; |
| 161 | lock.unlock(); |
| 162 | { |
| 163 | // We won't be executing this code unless the main thread is |
| 164 | // holding the mTaskMutex lock, which guards mCurrentTask. |
| 165 | // The compiler can't figure this out. |
| 166 | android::base::ScopedLockAssertion lockAssert(mTaskMutex); |
| 167 | mCurrentTask->processTile(threadIndex, myTile); |
| 168 | } |
| 169 | lock.lock(); |
| 170 | mTilesInProcess--; |
| 171 | if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) { |
| 172 | mWorkIsFinished.notify_one(); |
| 173 | } |
| 174 | } |
| 175 | } |
| 176 | // if (threadIndex != 0) { |
| 177 | // ALOGI("Ending thread%d", threadIndex); |
| 178 | // } |
| 179 | } |
| 180 | |
| 181 | void TaskProcessor::doTask(Task* task) { |
| 182 | std::lock_guard<std::mutex> lockGuard(mTaskMutex); |
| 183 | task->setUsesSimd(mUsesSimd); |
| 184 | mCurrentTask = task; |
| 185 | // Notify the thread pool of available work. |
| 186 | startWork(task); |
| 187 | // Start processing some of the tiles on the calling thread. |
| 188 | processTilesOfWork(0, true); |
| 189 | // Wait for all the pool workers to complete. |
| 190 | waitForPoolWorkersToComplete(); |
| 191 | mCurrentTask = nullptr; |
| 192 | } |
| 193 | |
| 194 | void TaskProcessor::startWork(Task* task) { |
| 195 | /** |
| 196 | * The size in bytes that we're hoping each tile will be. If this value is too small, |
| 197 | * we'll spend too much time in synchronization. If it's too large, some cores may be |
| 198 | * idle while others still have a lot of work to do. Ideally, it would depend on the |
| 199 | * device we're running. 16k is the same value used by RenderScript and seems reasonable |
| 200 | * from ad-hoc tests. |
| 201 | */ |
| 202 | const size_t targetTileSize = 16 * 1024; |
| 203 | |
| 204 | std::lock_guard<std::mutex> lock(mQueueMutex); |
| 205 | assert(mTilesInProcess == 0); |
| 206 | mTilesNotYetStarted = task->setTiling(targetTileSize); |
| 207 | mWorkAvailableOrStop.notify_all(); |
| 208 | } |
| 209 | |
| 210 | void TaskProcessor::waitForPoolWorkersToComplete() { |
| 211 | std::unique_lock<std::mutex> lock(mQueueMutex); |
| 212 | // The predicate, i.e. the lambda, will make sure that |
| 213 | // we terminate even if the main thread calls this after |
| 214 | // mWorkIsFinished is signaled. |
| 215 | mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) { |
| 216 | return mTilesNotYetStarted == 0 && mTilesInProcess == 0; |
| 217 | }); |
| 218 | } |
| 219 | |
| 220 | } // namespace renderscript |
| 221 | } // namespace android |