#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/cuda_rtc/common_rtc.h"
namespace caffe2 {
namespace {
class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
public:
ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
template <typename... Args>
string KernelName(Args... /*args*/) {
return name_;
}
template <typename... Args>
string GetSource(Args... args);
private:
string name_;
};
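// The specialization below assembles the kernel source string. As an
// illustration (traced from the code, modulo whitespace), calling
// GetSource(2, 1, "out0[index] = in0[index] * in1[index];") yields a kernel
// of the form
//
//   extern "C" __global__ void <unique_name>(const size_t nthreads,
//       const float* in0, const float* in1, float* out0) {
//     for (int index = blockIdx.x * blockDim.x + threadIdx.x;
//          index < nthreads; index += blockDim.x * gridDim.x) {
//       out0[index] = in0[index] * in1[index];
//     }
//   }
//
// where <unique_name> is the per-instance name returned by GetUniqueName().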
template <>
string ElementwiseRTCFunction::GetSource(
int input_size,
int output_size,
const string command_string) {
std::stringstream ss;
ss << "extern \"C\" __global__ void " << name_
<< "(const size_t nthreads, \n";
// Insert the parameter list.
int remain_params = input_size + output_size;
for (int i = 0; i < input_size; ++i) {
// Append a comma only when more parameters follow; the last parameter must
// not carry a trailing comma, or the generated kernel would not compile.
ss << "const float* in" << i << ((--remain_params) ? ", \n" : "");
}
for (int i = 0; i < output_size; ++i) {
ss << "float* out" << i << ((--remain_params) ? ", \n" : "");
}
ss << ") {\n"
"for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"index < nthreads; index += blockDim.x * gridDim.x) {\n"
<< command_string << "\n"
<< "}\n}";
return ss.str();
}
} // namespace
/**
* A GPU operator that can generate limited elementwise operations.
*
* ElementwiseRTCOp allows one to do a simple and limited thing: it takes in
* multiple inputs and multiple outputs, as well as a raw string argument
* rtc_src. The runtime then generates the following kernel code:
*
* __global__ void kernel_name(const size_t nthreads, ...) {
* for(int index = blockIdx.x * blockDim.x + threadIdx.x;
* index < nthreads; index += blockDim.x * gridDim.x) {
* rtc_src
* }
* }
* where the "..." part is auto-generated, so one can refer to the inputs and
* outputs as in0, in1, ..., out0, out1, ... in the rtc_src string.
*
* For example, if one wants to do a vector multiplication, one can take two
* inputs and one output, and write rtc_src as
* out0[index] = in0[index] * in1[index];
*
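* As a usage sketch (the tensor names X, Y, Z are illustrative; the fields
* follow the OperatorDef protobuf, and the device option must select CUDA):
*
*   OperatorDef def;
*   def.set_type("ElementwiseRTC");
*   def.set_engine("NVRTC");
*   def.add_input("X");
*   def.add_input("Y");
*   def.add_output("Z");
*   Argument* arg = def.add_arg();
*   arg->set_name("rtc_src");
*   arg->set_s("out0[index] = in0[index] * in1[index];");
*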
* This op is currently highly experimental. We do not have a gradient
* registered for it either.
*/
class ElementwiseRTCOp final : public Operator<CUDAContext> {
public:
ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws) {
const string src = OperatorBase::GetSingleArgument<string>("rtc_src", "");
CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
func_.Compile(InputSize(), OutputSize(), src);
}
~ElementwiseRTCOp() override {}
bool RunOnDevice() override {
static_assert(
sizeof(void*) == sizeof(size_t),
"The argbuffer relies on the assumption that void* and "
"size_t have the same size.");
vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
size_t* argBuffer = argBuffer_vec.data();
CAFFE_ENFORCE(
Input(0).numel() < std::numeric_limits<int>::max(),
"The kernel function currently only supports int index.");
argBuffer[0] = Input(0).numel();
void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
for (int i = 0; i < InputSize(); ++i) {
ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
}
for (int i = 0; i < OutputSize(); ++i) {
Output(i)->ResizeLike(Input(0));
ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
}
// Total byte size of the packed argument buffer (not the size of the pointer).
size_t argBufferSize = argBuffer_vec.size() * sizeof(size_t);
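// Hand the packed buffer to the driver through the `extra` launch-parameter
// mechanism (CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE)
// instead of passing individual kernel arguments.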
void* config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER,
argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE,
&argBufferSize,
CU_LAUNCH_PARAM_END};
func_.LaunchEx(
CAFFE_GET_BLOCKS(Input(0).numel()),
1,
1,
CAFFE_CUDA_NUM_THREADS,
1,
1,
0,
context_.cuda_stream(),
config);
return true;
}
private:
ElementwiseRTCFunction func_;
};
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
} // namespace
} // namespace caffe2