| #pragma once |
| |
| #include <unordered_map> |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/init.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/memonger.h" |
| #include "caffe2/core/net.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/scope_guard.h" |
| #include "caffe2/core/tensor.h" |
| #include "caffe2/core/types.h" |
| #include "caffe2/core/workspace.h" |
| #include "caffe2/proto/caffe2_pb.h" |
| #include "caffe2/python/pybind_state_dlpack.h" |
| #include "caffe2/python/pybind_workspace.h" |
| |
| #include <pybind11/pybind11.h> |
| #include <pybind11/stl.h> |
| |
| #include <Python.h> |
| |
| #ifdef USE_NUMPY |
| |
| #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION |
| #define PY_ARRAY_UNIQUE_SYMBOL caffe2_python_ARRAY_API |
| #include <numpy/arrayobject.h> |
| |
| // Temporary solution for numpy < 1.7 versions: old macro, no promises. |
| // You're strongly advised to upgrade to >= 1.7. |
| #ifndef NPY_ARRAY_C_CONTIGUOUS |
| #define NPY_ARRAY_C_CONTIGUOUS NPY_C_CONTIGUOUS |
| #define PyArray_SetBaseObject(arr, x) (PyArray_BASE(arr) = (x)) |
| #endif |
| |
| #else |
| |
| struct PyArrayObject; // Forward declaring PyArrayObject for safety |
| |
| #endif // USE_NUMPY |
| |
| namespace caffe2 { |
| namespace python { |
| |
| namespace py = pybind11; |
| |
// Registers module-level methods shared by both CPU and GPU builds.
void addGlobalMethods(pybind11::module& m);
// Exposes the Workspace, Net and Blob classes to Python.
void addObjectMethods(pybind11::module& m);

// Returns the workspace currently active in the Python frontend.
Workspace* GetCurrentWorkspace();

// Get workspace by name. Returns nullptr if none exists by name.
Workspace* GetWorkspaceByName(const std::string& name);
| |
// Abstract interface for feeding a numpy array into a Blob. Concrete
// feeders are registered per device type via REGISTER_BLOB_FEEDER below
// and looked up with CreateFeeder.
class BlobFeederBase {
 public:
  virtual ~BlobFeederBase();
  // Copies `array` into `blob` for the device described by `option`.
  // When `in_place` is true, implementations reuse the blob's existing
  // tensor storage where possible instead of resetting the blob.
  virtual void Feed(
      const DeviceOption& option,
      PyArrayObject* array,
      Blob* blob,
      bool in_place = false) = 0;
};
| |
// Registry mapping a DeviceType to the BlobFeederBase implementation
// responsible for feeding numpy arrays onto that device.
C10_DECLARE_TYPED_REGISTRY(
    BlobFeederRegistry,
    DeviceType,
    BlobFeederBase,
    std::unique_ptr);
// Registers a feeder class for the given device type.
#define REGISTER_BLOB_FEEDER(device_type, ...) \
  C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__)
// Instantiates the feeder registered for `device_type`, which is given as a
// serialized DeviceTypeProto value and converted to a DeviceType first.
inline unique_ptr<BlobFeederBase> CreateFeeder(int device_type) {
  return BlobFeederRegistry()->Create(
      caffe2::ProtoToType(static_cast<DeviceTypeProto>(device_type)));
}
| |
static_assert(
    sizeof(int) == sizeof(int32_t),
    "We make an assumption that int is always int32 for numpy "
    "type mapping.");

// Maps a caffe2 TypeMeta to the matching NPY_* type enum; returns -1 for
// types with no numpy equivalent (callers check for -1, see FetchTensor).
int CaffeToNumpyType(const TypeMeta dtype);
// Maps an NPY_* type enum to a caffe2 TypeMeta; the result compares equal to
// ScalarType::Undefined for unsupported numpy types (see FeedTensor).
const TypeMeta NumpyTypeToCaffe(int numpy_type);
| |
| class TensorFetcher : public BlobFetcherBase { |
| public: |
| pybind11::object Fetch(const Blob& blob) override { |
| return FetchTensor(blob.Get<Tensor>(), true).obj; |
| } |
| |
| // Checks whether the data with type `dtype` needs to be copied in the context |
| // of `tensor` |
| bool NeedsCopy(const Tensor* tensor, const TypeMeta dtype) const { |
| #ifdef USE_NUMPY |
| return tensor->GetDeviceType() != CPU || |
| CaffeToNumpyType(dtype) == NPY_OBJECT; |
| #else |
| return tensor->GetDeviceType() != CPU; |
| #endif // USE_NUMPY |
| } |
| |
| FetchedBlob FetchTensor(const Tensor& tensor, bool force_copy) { |
| #ifdef USE_NUMPY |
| FetchedBlob result; |
| CAFFE_ENFORCE_GE(tensor.numel(), 0, "Trying to fetch uninitialized tensor"); |
| const int numpy_type = CaffeToNumpyType(tensor.dtype()); |
| CAFFE_ENFORCE( |
| numpy_type != -1, |
| "This tensor's data type is not supported: ", |
| tensor.dtype().name(), |
| "."); |
| std::vector<npy_intp> npy_dims; |
| for (const auto dim : tensor.sizes()) { |
| npy_dims.push_back(dim); |
| } |
| result.copied = force_copy || NeedsCopy(&tensor, tensor.dtype()); |
| void* outPtr; |
| if (result.copied) { |
| result.obj = py::reinterpret_steal<py::object>( |
| PyArray_SimpleNew(tensor.dim(), npy_dims.data(), numpy_type)); |
| outPtr = static_cast<void*>( |
| PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.obj.ptr()))); |
| } else { |
| outPtr = const_cast<Tensor&>(tensor).raw_mutable_data(); |
| result.obj = py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData( |
| tensor.dim(), npy_dims.data(), numpy_type, outPtr)); |
| } |
| |
| if (numpy_type == NPY_OBJECT) { |
| PyObject** outObj = reinterpret_cast<PyObject**>(outPtr); |
| auto* str = tensor.template data<std::string>(); |
| for (const auto i : c10::irange(tensor.numel())) { |
| outObj[i] = PyBytes_FromStringAndSize(str->data(), str->size()); |
| str++; |
| // cleanup on failure |
| if (outObj[i] == nullptr) { |
| for (const auto j : c10::irange(i)) { |
| Py_DECREF(outObj[j]); |
| } |
| CAFFE_THROW("Failed to allocate string for ndarray of strings."); |
| } |
| } |
| return result; |
| } |
| |
| if (result.copied) { |
| // TODO: use CUDAGuard here instead of context and employ explicit sync |
| // copy |
| auto context = CreateContext(tensor.GetDeviceType()); |
| context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); |
| context->FinishDeviceComputation(); |
| } |
| return result; |
| #else |
| CAFFE_THROW("Caffe2 was compiled without NumPy support."); |
| #endif // USE_NUMPY |
| } |
| }; |
| |
| template <class Context> |
| class TensorFeeder : public BlobFeederBase { |
| public: |
| Tensor FeedTensor(const DeviceOption& option, PyArrayObject* original_array) { |
| Tensor out; |
| FeedTensor(option, original_array, &out, false); |
| return out; |
| } |
| |
| void FeedTensor( |
| const DeviceOption& option, |
| PyArrayObject* original_array, |
| Tensor* out, |
| bool in_place) { |
| #ifdef USE_NUMPY |
| PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); |
| auto g = MakeGuard([&]() { Py_XDECREF(array); }); |
| |
| const auto npy_type = PyArray_TYPE(array); |
| const TypeMeta dtype = NumpyTypeToCaffe(npy_type); |
| CAFFE_ENFORCE( |
| dtype != ScalarType::Undefined, |
| "This numpy data type is not supported: ", |
| PyArray_TYPE(array), |
| "."); |
| Context context(option); |
| context.SwitchToDevice(); |
| // numpy requires long int as its dims. |
| int ndim = PyArray_NDIM(array); |
| npy_intp* npy_dims = PyArray_DIMS(array); |
| std::vector<int64_t> dims; |
| for (const auto i : c10::irange(ndim)) { |
| dims.push_back(npy_dims[i]); |
| } |
| |
| Tensor& tensor = *out; |
| if (in_place) { |
| tensor.Resize(dims); |
| } |
| // Now, copy the data to the tensor. |
| switch (npy_type) { |
| case NPY_OBJECT: { |
| PyObject** input = reinterpret_cast<PyObject**>(PyArray_DATA(array)); |
| if (!in_place) { |
| tensor = caffe2::empty( |
| dims, at::dtype<std::string>().device(Context::GetDeviceType())); |
| } |
| auto* outPtr = tensor.template mutable_data<std::string>(); |
| for (const auto i : c10::irange(tensor.numel())) { |
| char* str; |
| Py_ssize_t strSize; |
| if (PyBytes_Check(input[i])) { |
| CAFFE_ENFORCE( |
| PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, |
| "Had a PyBytes object but cannot convert it to a string."); |
| } else if (PyUnicode_Check(input[i])) { // string |
| str = |
| const_cast<char*>(PyUnicode_AsUTF8AndSize(input[i], &strSize)); |
| CAFFE_ENFORCE( |
| str, |
| "Had a PyUnicode object but cannot convert it to a string."); |
| } else { |
| CAFFE_THROW("Unsupported python object type passed into ndarray."); |
| } |
| outPtr[i] = std::string(str, strSize); |
| } |
| break; |
| } |
| case NPY_UNICODE: |
| CAFFE_THROW( |
| "You are feeding in a numpy array of unicode. Caffe2 C++ does not " |
| "support unicode yet. Please ensure that you are passing in bytes " |
| "instead of unicode strings."); |
| break; |
| default: |
| if (!in_place) { |
| tensor = caffe2::empty( |
| dims, at::dtype(dtype).device(Context::GetDeviceType())); |
| } else { |
| tensor.raw_mutable_data(dtype); |
| } |
| context.CopyBytesFromCPU( |
| tensor.numel() * dtype.itemsize(), |
| static_cast<void*>(PyArray_DATA(array)), |
| tensor.raw_mutable_data()); |
| } |
| context.FinishDeviceComputation(); |
| #else |
| CAFFE_THROW("Caffe2 compiled without NumPy support."); |
| #endif // USE_NUMPY |
| } |
| |
| virtual void Feed( |
| const DeviceOption& option, |
| PyArrayObject* original_array, |
| Blob* blob, |
| bool in_place) { |
| if (in_place) { |
| FeedTensor( |
| option, |
| original_array, |
| BlobGetMutableTensor(blob, OptionToDevice(option).type()), |
| true); |
| } else { |
| blob->Reset<Tensor>(new Tensor(FeedTensor(option, original_array))); |
| } |
| } |
| }; |
| |
namespace python_detail {
// A Python callable registered as a PythonOp implementation, plus whether it
// expects the Workspace pointer as a third argument (see PythonOpBase::
// RunOnDevice, which checks needs_workspace before calling py_func).
struct Func {
  py::object py_func;
  bool needs_workspace;
};

// Looks up the forward-op function registered under `token`.
const Func& getOpFunc(const std::string& token);

// Looks up the gradient-op function registered under `token`.
const Func& getGradientFunc(const std::string& token);

} // namespace python_detail
| |
// Operator whose implementation is a Python callable. The callable comes
// either from a pickled builder passed in `pickled_builder_arg_name` or from
// a registry lookup by the "token" argument (see getFunc).
// TODO: Remove template?
template <class Context, bool use_dlpack>
class PythonOpBase : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  // If a pickled builder argument is present, unpickles it (a 3-tuple of
  // func, args, kwargs) under the GIL and calls func(*args, **kwargs) to
  // obtain the callable; otherwise the "token" argument is resolved lazily
  // in RunOnDevice. Exactly one of the two must be provided.
  PythonOpBase(
      const OperatorDef& operator_def,
      Workspace* ws,
      const std::string& pickled_builder_arg_name)
      : Operator<Context>(operator_def, ws),
        ws_(ws),
        token_(OperatorBase::template GetSingleArgument<std::string>(
            "token",
            "")) {
    using namespace python_detail;
    auto pickled = OperatorBase::template GetSingleArgument<std::string>(
        pickled_builder_arg_name, "");
    CAFFE_ENFORCE(
        !pickled.empty() || !token_.empty(),
        "PythonOp requires either pickled_builder or token arg.");
    if (!pickled.empty()) {
      py::gil_scoped_acquire g;
      try {
        auto pickle =
            py::reinterpret_steal<py::object>(PyImport_ImportModule("pickle"));

        CAFFE_ENFORCE(pickle);
        auto loads = pickle.attr("loads").cast<py::object>();
        CAFFE_ENFORCE(loads);
        py::tuple builder_call;
        try {
          builder_call = loads(py::bytes(pickled)).cast<py::tuple>();
        } catch (const py::error_already_set& e) {
          // Payloads pickled by Python 2 may fail to decode under Python 3
          // with the default encoding; retry with latin1.
          LOG(INFO) << "Cannot unpickle python operator: " << e.what();
          LOG(INFO) << "Try latin1 encoding for python3 run";
          // to use the `_a` literal for arguments
          using namespace pybind11::literals;
          builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1")
                             .template cast<py::tuple>();
        }
        CAFFE_ENFORCE(builder_call);
        CAFFE_ENFORCE_EQ(py::len(builder_call), 3);
        auto func = builder_call[0].cast<py::object>();
        auto args = builder_call[1].cast<py::tuple>();
        auto kwargs = builder_call[2].cast<py::dict>();
        auto built_func = func(*args, **kwargs);
        CAFFE_ENFORCE(built_func);
        built_func_.reset(new Func{
            built_func,
            OperatorBase::template GetSingleArgument<bool>(
                "pass_workspace", false)});
      } catch (const py::error_already_set& e) {
        LOG(ERROR) << "Python exception encountered while creating PythonOp: "
                   << e.what();
        // Rethrow exception to preserve python exception type.
        throw;
      }
    }
  }

  // Wraps all input/output blobs as (CPU-only) Python objects — DLPack
  // wrappers or direct Tensor casts depending on `use_dlpack` — and invokes
  // the Python function under the GIL. Inputs must be CPU tensors.
  bool RunOnDevice() override final {
    // Prefer the unpickled builder result; otherwise resolve by token.
    auto* pyFunc = built_func_ ? built_func_.get() : &getFunc(token_);
    CAFFE_ENFORCE(pyFunc);
    {
      // Acquire GIL for call to Python runtime.
      py::gil_scoped_acquire g;

      DeviceOption cpu_option;
      cpu_option.set_device_type(PROTO_CPU);

      std::vector<py::object> inputs;
      inputs.reserve(InputSize());
      for (const auto i : c10::irange(InputSize())) {
        const auto* blob = &InputBlob(i);
        // Allow CPU tensors in addition to operator context's tensors
        py::object py_obj;
        CAFFE_ENFORCE(
            BlobIsTensorType(*blob, CPU),
            "We only allow input blob to be CPU Tensor");
        if (use_dlpack) {
          DLPackWrapper<CPUContext> wrapper(
              const_cast<Tensor*>(&(BlobGetTensor(*blob, CPU))), cpu_option);
          // copy wrapper
          py_obj = py::cast(wrapper, py::return_value_policy::copy);
        } else {
          py_obj = py::cast(
              &(BlobGetTensor(*blob, CPU)), py::return_value_policy::reference);
        }
        inputs.push_back(py_obj);
      }
      std::vector<py::object> outputs;
      outputs.reserve(OutputSize());
      for (const auto i : c10::irange(OutputSize())) {
        auto* blob = OutputBlob(i);

        // Python op is always used with CPUContext only and treats inputs and
        // outputs as CPU tensors, CUDA version of PythonOp is implemented
        // through GPUFallbackOp that copies input CUDA blobs to CPU and copies
        // outputs from CUDA to CPU.
        // GPUFallbackOp also allows keeping some of the output blobs on CPU
        // by specifying their indices explicitly in template parameters.

        // PythonDLPack op allows working CPU blobs only through DLPack tensors.
        // We don't have use cases of CUDA version yet, but if there is such use
        // case, we can use GPUFallbackOp to enable it.

        py::object py_obj;
        if (use_dlpack) {
          DLPackWrapper<CPUContext> wrapper(
              BlobGetMutableTensor(blob, CPU), cpu_option);
          py_obj = py::cast(wrapper, py::return_value_policy::copy);
        } else {
          py_obj = py::cast(
              BlobGetMutableTensor(blob, CPU),
              py::return_value_policy::reference);
        }
        outputs.push_back(py_obj);
      }

      try {
        // Functions built with "pass_workspace" also receive the Workspace.
        if (pyFunc->needs_workspace) {
          pyFunc->py_func(inputs, outputs, ws_);
        } else {
          pyFunc->py_func(inputs, outputs);
        }
      } catch (const py::error_already_set& e) {
        LOG(ERROR) << "Exception encountered running PythonOp function: "
                   << e.what();
        // Rethrow exception to preserve python exception type.
        throw;
      }
    }
    return true;
  }

  virtual ~PythonOpBase() {
    if (built_func_) {
      // since it may trigger python interpreter when refcount reaches zero
      py::gil_scoped_acquire g;
      built_func_.reset();
    }
  }

 protected:
  // Resolves the function registered under `token`; implemented by
  // PythonOp / PythonGradientOp against their respective registries.
  virtual const python_detail::Func& getFunc(const std::string& token) = 0;
  Workspace* ws_;

 private:
  // Value of the "token" operator argument (may be empty).
  const std::string token_;
  // Callable built from the pickled builder, if one was supplied.
  std::unique_ptr<python_detail::Func> built_func_;
};
| |
| template <class Context, bool use_dlpack> |
| class PythonOp : public PythonOpBase<Context, use_dlpack> { |
| public: |
| PythonOp(const OperatorDef& operator_def, Workspace* ws) |
| : PythonOpBase<Context, use_dlpack>(operator_def, ws, "pickled_builder") { |
| } |
| |
| protected: |
| const python_detail::Func& getFunc(const std::string& token) override { |
| return python_detail::getOpFunc(token); |
| } |
| }; |
| |
| template <class Context, bool use_dlpack> |
| class PythonGradientOp : public PythonOpBase<Context, use_dlpack> { |
| public: |
| PythonGradientOp(const OperatorDef& operator_def, Workspace* ws) |
| : PythonOpBase<Context, use_dlpack>( |
| operator_def, |
| ws, |
| "pickled_grad_builder") {} |
| |
| protected: |
| const python_detail::Func& getFunc(const std::string& token) override { |
| return python_detail::getGradientFunc(token); |
| } |
| }; |
| |
| } // namespace python |
| } // namespace caffe2 |