| #include "caffe2/core/common_gpu.h" |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/cuda_rtc/common_rtc.h" |
| |
| namespace caffe2 { |
| namespace { |
| class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> { |
| public: |
| ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {} |
| |
| template <typename... Args> |
| string KernelName(Args... /*args*/) { |
| return name_; |
| } |
| |
| template <typename... Args> |
| string GetSource(Args... args); |
| |
| private: |
| string name_; |
| }; |
| |
| template <> |
| string ElementwiseRTCFunction::GetSource( |
| int input_size, |
| int output_size, |
| const string command_string) { |
| std::stringstream ss; |
| ss << "extern \"C\" __global__ void " << name_ |
| << "(const size_t nthreads, \n"; |
| // Insert the parameter list. |
| int remain_params = input_size + output_size; |
| for (int i = 0; i < input_size; ++i) { |
| ss << "const float* in" << i << ((remain_params--) ? ", \n" : ""); |
| } |
| for (int i = 0; i < output_size; ++i) { |
| ss << "float* out" << i << ((remain_params--) ? ", \n" : ""); |
| } |
| ss << ") {\n" |
| "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n" |
| "index < nthreads; index += blockDim.x * gridDim.x) {\n" |
| << command_string << "\n" |
| << "}\n}"; |
| return ss.str(); |
| } |
| } // namespace |
| |
| /** |
| * A GPU operator that can generate limited elementwise operations. |
| * |
| * ElementwiseRTCOp allows one to do a simple and limited thing: it takes in |
| * multiple inputs and multiple outputs, as well as a raw string argument |
| * rtc_src. The runtime then generates the following kernel code: |
| * |
| * __global__ void kernel_name(const size_t nthreads, ...) { |
| * for(int index = blockIdx.x * blockDim.x + threadIdx.x; |
| * index < nthreads; index += blockDim.x * gridDim.x) { |
| * rtc_src |
| * } |
| * } |
| * where the "..." part is auto generated, so one can refer to the input and |
| * output as in0, in1, ..., out0, out1... in the rtc_src string. |
| * |
| * For example, if one wants to do a vector multiplication, one can take two |
| * inputs and one outputs, and write rtc_src as |
| * out0[index] = in0[index] * in1[index]; |
| * |
| * This op is currently highly experimental. We do not have a gradient |
| * registered for it either. |
| */ |
| class ElementwiseRTCOp final : public Operator<CUDAContext> { |
| public: |
| ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<CUDAContext>(operator_def, ws) { |
| const string src = OperatorBase::GetSingleArgument<string>("rtc_src", ""); |
| CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size."); |
| func_.Compile(InputSize(), OutputSize(), src); |
| } |
| ~ElementwiseRTCOp() override {} |
| |
| bool RunOnDevice() override { |
| static_assert( |
| sizeof(void*) == sizeof(size_t), |
| "The argbuffer relies on the assumption that void* and " |
| "size_t have the same size."); |
| vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1); |
| size_t* argBuffer = argBuffer_vec.data(); |
| CAFFE_ENFORCE( |
| Input(0).numel() < std::numeric_limits<int>::max(), |
| "The kernel function currently only supports int index."); |
| argBuffer[0] = Input(0).numel(); |
| void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1); |
| for (int i = 0; i < InputSize(); ++i) { |
| ptr_buffer[i] = const_cast<float*>(Input(i).data<float>()); |
| } |
| for (int i = 0; i < OutputSize(); ++i) { |
| Output(i)->ResizeLike(Input(0)); |
| ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>(); |
| } |
| size_t argBufferSize = sizeof(argBuffer); |
| void* config[] = { |
| CU_LAUNCH_PARAM_BUFFER_POINTER, |
| argBuffer, |
| CU_LAUNCH_PARAM_BUFFER_SIZE, |
| &argBufferSize, |
| CU_LAUNCH_PARAM_END}; |
| func_.LaunchEx( |
| CAFFE_GET_BLOCKS(Input(0).numel()), |
| 1, |
| 1, |
| CAFFE_CUDA_NUM_THREADS, |
| 1, |
| 1, |
| 0, |
| context_.cuda_stream(), |
| config); |
| return true; |
| } |
| |
| private: |
| ElementwiseRTCFunction func_; |
| }; |
| |
| namespace { |
| REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp); |
| } |
| |
| } // namespace caffe2 |