caffe2/python/pybind_state_gpu.cc - platform/external/pytorch - Git at Google

 // Note(jiayq): the import_array function is done inside
 // caffe2_python.cc. Read
 // http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
 // for more details.

 #define NO_IMPORT_ARRAY

 #include "pybind_state.h"

 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>

 #ifdef CAFFE2_USE_CUDNN
 #include "caffe2/core/common_cudnn.h"
 #endif // CAFFE2_USE_CUDNN
 #include <c10/cuda/CUDAGuard.h>
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/operator_fallback_gpu.h"
 #include "caffe2/python/pybind_state_registry.h"

 #ifdef CAFFE2_USE_TRT
 #include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"
 #endif // CAFFE2_USE_TRT

 namespace caffe2 {
 namespace python {

 REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
 REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);

 REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
 REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);

 REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);

 namespace py = pybind11;

 void addCUDAGlobalMethods(py::module& m) {
   m.def("num_cuda_devices", &NumCudaDevices);
   m.def("get_cuda_version", &CudaVersion);
 #ifdef CAFFE2_USE_CUDNN
   m.def("get_cudnn_version", &cudnnCompiledVersion);
   m.attr("cudnn_convolution_fwd_algo_count") =
       py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
   m.attr("cudnn_convolution_bwd_data_algo_count") =
       py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
   m.attr("cudnn_convolution_bwd_filter_algo_count") =
       py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
 #else
   m.def("get_cudnn_version", []() { return static_cast<size_t>(0); });
   m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0);
   m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0);
   m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0);
 #endif
   m.def("get_gpu_memory_info", [](int device_id) {
     CUDAGuard guard(device_id);
     size_t device_free, device_total;
     CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
     return std::pair<size_t, size_t>{device_free, device_total};
   });
   m.def("get_cuda_peer_access_pattern", []() {
     std::vector<std::vector<bool>> pattern;
     CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
     return pattern;
   });
   m.def("get_device_properties", [](int deviceid) {
     auto& prop = GetDeviceProperty(deviceid);
     std::map<std::string, py::object> obj;
     obj["name"] = py::cast(prop.name);
     obj["major"] = py::cast(prop.major);
     obj["minor"] = py::cast(prop.minor);
     obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem);
     return obj;
   });
   m.def(
       "onnx_to_trt_op",
       [](const py::bytes& onnx_model_str,
          const std::unordered_map<std::string, std::vector<int>>&
              output_size_hints,
          int max_batch_size,
          int max_workspace_size,
          int verbosity,
          bool debug_builder) -> py::bytes {
 #ifdef CAFFE2_USE_TRT
         TensorRTTransformer t(
             max_batch_size, max_workspace_size, verbosity, debug_builder);
         auto op_def =
             t.BuildTrtOp(onnx_model_str.cast<std::string>(), output_size_hints);
         std::string out;
         op_def.SerializeToString(&out);
         return py::bytes(out);
 #else
         CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
 #endif // CAFFE2_USE_TRT
       });
   m.def(
       "transform_trt",
       [](const py::bytes& pred_net_str,
          const std::unordered_map<std::string, std::vector<int>>& shapes,
          int max_batch_size,
          int max_workspace_size,
          int verbosity,
          bool debug_builder,
          bool build_serializable_op) -> py::bytes {
 #ifdef CAFFE2_USE_TRT
         caffe2::NetDef pred_net;
         if (!ParseProtoFromLargeString(
                 pred_net_str.cast<std::string>(), &pred_net)) {
           LOG(ERROR) << "broken pred_net protobuf";
         }
         std::unordered_map<std::string, TensorShape> tensor_shapes;
         for (const auto& it : shapes) {
           tensor_shapes.emplace(
               it.first, CreateTensorShape(it.second, TensorProto::FLOAT));
         }
         TensorRTTransformer ts(
             max_batch_size,
             max_workspace_size,
             verbosity,
             debug_builder,
             build_serializable_op);
         ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes);
         std::string pred_net_str2;
         pred_net.SerializeToString(&pred_net_str2);
         return py::bytes(pred_net_str2);
 #else
         CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
 #endif // CAFFE2_USE_TRT
       });
 };

 void addCUDAObjectMethods(py::module& m) {
   py::class_<DLPackWrapper<CUDAContext>>(m, "DLPackTensorCUDA")
       .def_property_readonly(
           "data",
           [](DLPackWrapper<CUDAContext>* t) -> py::object {
             CAFFE_ENFORCE_EQ(
                 t->device_option.device_type(),
                 PROTO_CUDA,
                 "Expected CUDA device option for CUDA tensor");

             return t->data();
           },
           "Return DLPack tensor with tensor's data.")
       .def(
           "feed",
           [](DLPackWrapper<CUDAContext>* t, py::object obj) {
             CAFFE_ENFORCE_EQ(
                 t->device_option.device_type(),
                 PROTO_CUDA,
                 "Expected CUDA device option for CUDA tensor");
             t->feed(obj);
           },
           "Copy data from given DLPack tensor into this tensor.")
       .def_property_readonly(
           "_shape",
           [](const DLPackWrapper<CUDAContext>& t) { return t.tensor->sizes(); })
       .def(
           "_reshape",
           [](DLPackWrapper<CUDAContext>* t, std::vector<int64_t> dims) {
             t->tensor->Resize(dims);
           });
 }

 PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) {
   m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition";

   addGlobalMethods(m);
   addCUDAGlobalMethods(m);
   addObjectMethods(m);
   addCUDAObjectMethods(m);
   for (const auto& addition : PybindAdditionRegistry()->Keys()) {
     PybindAdditionRegistry()->Create(addition, m);
   }
 }
 } // namespace python
 } // namespace caffe2
	// Note(jiayq): the import_array function is done inside
	// caffe2_python.cc. Read
	// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
	// for more details.

	#define NO_IMPORT_ARRAY

	#include "pybind_state.h"

	#include <pybind11/pybind11.h>
	#include <pybind11/stl.h>

	#ifdef CAFFE2_USE_CUDNN
	#include "caffe2/core/common_cudnn.h"
	#endif // CAFFE2_USE_CUDNN
	#include <c10/cuda/CUDAGuard.h>
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/operators/operator_fallback_gpu.h"
	#include "caffe2/python/pybind_state_registry.h"

	#ifdef CAFFE2_USE_TRT
	#include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"
	#endif // CAFFE2_USE_TRT

	namespace caffe2 {
	namespace python {

	REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
	REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);

	REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
	REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);

	REGISTER_BLOB_FEEDER(CUDA, TensorFeeder<CUDAContext>);

	namespace py = pybind11;

	void addCUDAGlobalMethods(py::module& m) {
	m.def("num_cuda_devices", &NumCudaDevices);
	m.def("get_cuda_version", &CudaVersion);
	#ifdef CAFFE2_USE_CUDNN
	m.def("get_cudnn_version", &cudnnCompiledVersion);
	m.attr("cudnn_convolution_fwd_algo_count") =
	py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
	m.attr("cudnn_convolution_bwd_data_algo_count") =
	py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
	m.attr("cudnn_convolution_bwd_filter_algo_count") =
	py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
	#else
	m.def("get_cudnn_version", []() { return static_cast<size_t>(0); });
	m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0);
	m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0);
	m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0);
	#endif
	m.def("get_gpu_memory_info", [](int device_id) {
	CUDAGuard guard(device_id);
	size_t device_free, device_total;
	CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
	return std::pair<size_t, size_t>{device_free, device_total};
	});
	m.def("get_cuda_peer_access_pattern", []() {
	std::vector<std::vector<bool>> pattern;
	CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern));
	return pattern;
	});
	m.def("get_device_properties", [](int deviceid) {
	auto& prop = GetDeviceProperty(deviceid);
	std::map<std::string, py::object> obj;
	obj["name"] = py::cast(prop.name);
	obj["major"] = py::cast(prop.major);
	obj["minor"] = py::cast(prop.minor);
	obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem);
	return obj;
	});
	m.def(
	"onnx_to_trt_op",
	[](const py::bytes& onnx_model_str,
	const std::unordered_map<std::string, std::vector<int>>&
	output_size_hints,
	int max_batch_size,
	int max_workspace_size,
	int verbosity,
	bool debug_builder) -> py::bytes {
	#ifdef CAFFE2_USE_TRT
	TensorRTTransformer t(
	max_batch_size, max_workspace_size, verbosity, debug_builder);
	auto op_def =
	t.BuildTrtOp(onnx_model_str.cast<std::string>(), output_size_hints);
	std::string out;
	op_def.SerializeToString(&out);
	return py::bytes(out);
	#else
	CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
	#endif // CAFFE2_USE_TRT
	});
	m.def(
	"transform_trt",
	[](const py::bytes& pred_net_str,
	const std::unordered_map<std::string, std::vector<int>>& shapes,
	int max_batch_size,
	int max_workspace_size,
	int verbosity,
	bool debug_builder,
	bool build_serializable_op) -> py::bytes {
	#ifdef CAFFE2_USE_TRT
	caffe2::NetDef pred_net;
	if (!ParseProtoFromLargeString(
	pred_net_str.cast<std::string>(), &pred_net)) {
	LOG(ERROR) << "broken pred_net protobuf";
	}
	std::unordered_map<std::string, TensorShape> tensor_shapes;
	for (const auto& it : shapes) {
	tensor_shapes.emplace(
	it.first, CreateTensorShape(it.second, TensorProto::FLOAT));
	}
	TensorRTTransformer ts(
	max_batch_size,
	max_workspace_size,
	verbosity,
	debug_builder,
	build_serializable_op);
	ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes);
	std::string pred_net_str2;
	pred_net.SerializeToString(&pred_net_str2);
	return py::bytes(pred_net_str2);
	#else
	CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1");
	#endif // CAFFE2_USE_TRT
	});
	};

	void addCUDAObjectMethods(py::module& m) {
	py::class_<DLPackWrapper<CUDAContext>>(m, "DLPackTensorCUDA")
	.def_property_readonly(
	"data",
	[](DLPackWrapper<CUDAContext>* t) -> py::object {
	CAFFE_ENFORCE_EQ(
	t->device_option.device_type(),
	PROTO_CUDA,
	"Expected CUDA device option for CUDA tensor");

	return t->data();
	},
	"Return DLPack tensor with tensor's data.")
	.def(
	"feed",
	[](DLPackWrapper<CUDAContext>* t, py::object obj) {
	CAFFE_ENFORCE_EQ(
	t->device_option.device_type(),
	PROTO_CUDA,
	"Expected CUDA device option for CUDA tensor");
	t->feed(obj);
	},
	"Copy data from given DLPack tensor into this tensor.")
	.def_property_readonly(
	"_shape",
	[](const DLPackWrapper<CUDAContext>& t) { return t.tensor->sizes(); })
	.def(
	"_reshape",
	[](DLPackWrapper<CUDAContext>* t, std::vector<int64_t> dims) {
	t->tensor->Resize(dims);
	});
	}

	PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) {
	m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition";

	addGlobalMethods(m);
	addCUDAGlobalMethods(m);
	addObjectMethods(m);
	addCUDAObjectMethods(m);
	for (const auto& addition : PybindAdditionRegistry()->Keys()) {
	PybindAdditionRegistry()->Create(addition, m);
	}
	}
	} // namespace python
	} // namespace caffe2