// caffe2/mpi/mpi_ops.h - platform/external/pytorch - Git at Google

 #ifndef CAFFE2_MPI_MPI_OPS_H_
 #define CAFFE2_MPI_MPI_OPS_H_

 #include <mpi.h>

 #include "caffe2/core/operator.h"
 #include "caffe2/mpi/mpi_common.h"

 namespace caffe2 {

 // TODO(jiayq): if needed, write up the use of color and key with MPI split.
 //
 // Operator that stores a newly default-constructed MPICommonWorldWrapper in
 // its first output blob. Currently, the operator simply creates a
 // communicator that has the same topology as the Caffe2 global communicator.
 template <class Context>
 class MPICreateCommonWorldOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   MPICreateCommonWorldOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws) {}

   // Hands a fresh wrapper to output blob 0 via Reset() and reports success.
   bool RunOnDevice() override {
     auto* world_blob = OperatorBase::Outputs()[0];
     world_blob->Reset(new MPICommonWorldWrapper());
     return true;
   }
 };

 // Broadcasts the bytes of an already-allocated output tensor with MPI_Bcast.
 // Input 0 is the MPICommonWorldWrapper providing the communicator; the "root"
 // argument (default 0) selects the broadcasting rank.
 template <class Context>
 class MPIBroadcastOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   MPIBroadcastOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         root_(OperatorBase::template GetSingleArgument<int>("root", 0)) {}
   ~MPIBroadcastOp() {}

   // Validates the output tensor, then broadcasts its raw buffer in place.
   bool RunOnDevice() override {
     const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
     MPI_Comm comm = world.comm();
     CAFFE_ENFORCE(
         OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()),
         "Output is of wrong type.");
     auto* output = Output(0);
     // The buffer is sent and received in place, so it must exist up front.
     CAFFE_ENFORCE(
         output->numel() > 0,
         "Broadcast op uses in-place operation so the output "
         "should be already allocated.");
     // The tensor is transferred as an untyped run of bytes.
     void* payload = output->raw_mutable_data();
     const auto payload_bytes = output->nbytes();
     MPI_CHECK(MPI_Bcast(
         payload, payload_bytes, MPIDataTypeWrapper<char>::type(), root_, comm));
     return true;
   }

  protected:
   // Rank that acts as the broadcast root ("root" argument).
   int root_;
 };

 // MPIReduceOp performs an MPI_Reduce over tensors of element type T.
 // Currently, only SUM is supported. Input 0 is the MPICommonWorldWrapper,
 // input 1 is the local tensor; output 0 is shaped like input 1. The "root"
 // argument (default 0) is passed to MPI_Reduce as the root rank.
 template <typename T, class Context>
 class MPIReduceOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   MPIReduceOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         root_(OperatorBase::template GetSingleArgument<int>("root", 0)) {}
   ~MPIReduceOp() {}

   // Sizes the output like the input and sum-reduces towards root_.
   bool RunOnDevice() override {
     const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
     MPI_Comm comm = world.comm();
     const auto& local = Input(1);
     auto* reduced = Output(0, local.sizes(), at::dtype<T>());
     // The const_cast only accommodates MPI's non-const send-buffer parameter.
     T* send_buf = const_cast<T*>(local.template data<T>());
     T* recv_buf = reduced->template mutable_data<T>();
     MPI_CHECK(MPI_Reduce(
         send_buf,
         recv_buf,
         local.numel(),
         MPIDataTypeWrapper<T>::type(),
         MPI_SUM,
         root_,
         comm));
     return true;
   }

  protected:
   // Root rank of the reduction ("root" argument).
   int root_;
 };

 // MPIAllgatherOp does MPIAllgather using MPI.
 //
 // Input 0 is the MPICommonWorldWrapper, input 1 the local tensor of element
 // type T. Output 0 has the input's shape with its first dimension multiplied
 // by the wrapper's size(), so the input must have at least one dimension.
 template <typename T, class Context>
 class MPIAllgatherOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   USE_SIMPLE_CTOR_DTOR(MPIAllgatherOp);

   // Resizes the output and gathers input.numel() elements from every rank.
   bool RunOnDevice() override {
     const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
     MPI_Comm comm = world.comm();
     auto& input = Input(1);
     auto* output = Output(0);
     vector<int64_t> output_dims = input.sizes().vec();
     // A zero-dimensional input has no leading dimension to scale; indexing
     // output_dims[0] on an empty vector would be undefined behaviour.
     CAFFE_ENFORCE(
         !output_dims.empty(),
         "MPIAllgather requires an input with at least one dimension.");
     output_dims[0] *= world.size();
     output->Resize(output_dims);
     MPI_CHECK(MPI_Allgather(
         const_cast<T*>(input.template data<T>()),
         input.numel(),
         MPIDataTypeWrapper<T>::type(),
         output->template mutable_data<T>(),
         input.numel(),
         MPIDataTypeWrapper<T>::type(),
         comm));
     return true;
   }
 };

 // MPIAllreduceOp performs an MPI_Allreduce over tensors of element type T.
 // Currently, only SUM is supported. Input 0 is the MPICommonWorldWrapper,
 // input 1 the local tensor; output 0 is shaped like input 1.
 template <typename T, class Context>
 class MPIAllreduceOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   USE_SIMPLE_CTOR_DTOR(MPIAllreduceOp);

   // Sum-reduces across the communicator, using MPI_IN_PLACE when the input
   // and output share the same buffer.
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
     const auto& input = Input(1);
     auto* output = Output(0, input.sizes(), at::dtype<T>());
     T* const recv_buf = output->template mutable_data<T>();
     const T* const send_buf = input.template data<T>();
     // In-place calls need special handling; otherwise the source is simply
     // the input buffer.
     void* const source = (recv_buf == send_buf)
         ? MPI_IN_PLACE
         : static_cast<void*>(const_cast<T*>(send_buf));
     MPI_CHECK(MPI_Allreduce(
         source,
         recv_buf,
         input.numel(),
         MPIDataTypeWrapper<T>::type(),
         MPI_SUM,
         comm));
     return true;
   }
 };

 // Sends the raw bytes of a tensor to another rank with MPI_Send.
 //
 // Inputs: COMM (MPICommonWorldWrapper), INPUT (tensor to send) and,
 // optionally, DST and TAG (CPU int tensors). The destination rank and tag
 // come either from the "dst"/"tag" arguments or, when four inputs are given,
 // from the DST/TAG blobs on every run. Only raw_buffer transfers are
 // supported.
 template <class Context>
 class MPISendTensorOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   MPISendTensorOp(const OperatorDef& def, Workspace* ws)
       : Operator<Context>(def, ws),
         OP_SINGLE_ARG(int, "dst", dst_, MPI_ANY_SOURCE),
         OP_SINGLE_ARG(int, "tag", tag_, MPI_ANY_TAG),
         OP_SINGLE_ARG(bool, "raw_buffer", raw_buffer_, false) {
     CAFFE_ENFORCE(raw_buffer_, "non-raw-buffer transfer not supported yet.");
     // MPI_ANY_SOURCE / MPI_ANY_TAG double as "not specified" markers here: a
     // send needs a concrete destination and tag from an argument or a blob.
     CAFFE_ENFORCE(
         dst_ != MPI_ANY_SOURCE || def.input_size() == 4,
         "You should explicitly specify the to rank either via "
         "argument or via input blobs.");
     CAFFE_ENFORCE(
         tag_ != MPI_ANY_TAG || def.input_size() == 4,
         "You should explicitly specify the tag either via "
         "argument or via input blobs.");
   }

   // Resolves destination and tag, then sends INPUT as MPI_CHAR bytes.
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     const auto& input = Input(INPUT);
     const bool routing_from_blobs = (InputSize() == 4);
     if (routing_from_blobs) {
       dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
       tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
     }
     if (!raw_buffer_) {
       CAFFE_NOT_IMPLEMENTED;
     }
     // We need to do a const cast to cope with the fact that, before OpenMPI
     // 1.7, MPI_Send expects a non-const pointer although it uses it in a
     // const way.
     void* payload = const_cast<void*>(input.raw_data());
     MPI_CHECK(MPI_Send(payload, input.nbytes(), MPI_CHAR, dst_, tag_, comm));
     return true;
   }

  protected:
   int dst_; // Destination rank.
   int tag_; // Message tag.
   bool raw_buffer_; // Must be true; other transfer modes are unimplemented.

   INPUT_TAGS(COMM, INPUT, DST, TAG);
 };

 // Receives raw bytes into an existing output tensor with MPI_Recv.
 //
 // Inputs: COMM (MPICommonWorldWrapper), INPUT and, optionally, SRC_IN and
 // TAG_IN (CPU int tensors) that override the "src"/"tag" arguments when four
 // inputs are given. Outputs: OUTPUT (the receive buffer), SRC_OUT and
 // TAG_OUT (CPU int tensors holding the source rank and tag reported by MPI).
 // Only raw_buffer transfers are supported.
 template <class Context>
 class MPIReceiveTensorOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   MPIReceiveTensorOp(const OperatorDef& def, Workspace* ws)
       : Operator<Context>(def, ws),
         OP_SINGLE_ARG(int, "src", src_, MPI_ANY_SOURCE),
         OP_SINGLE_ARG(int, "tag", tag_, MPI_ANY_TAG),
         OP_SINGLE_ARG(bool, "raw_buffer", raw_buffer_, false) {
     CAFFE_ENFORCE(raw_buffer_, "non-raw-buffer transfer not supported yet.");
   }

   // Receives up to OUTPUT's current byte size, then publishes the actual
   // source and tag taken from the MPI status.
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
     const bool routing_from_blobs = (InputSize() == 4);
     if (routing_from_blobs) {
       src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
       tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
     }
     if (!raw_buffer_) {
       CAFFE_NOT_IMPLEMENTED;
     }
     MPI_Status status;
     auto* output = Output(OUTPUT);
     void* recv_buf = output->raw_mutable_data();
     MPI_CHECK(MPI_Recv(
         recv_buf, output->nbytes(), MPI_CHAR, src_, tag_, comm, &status));

     // Report who actually sent the message and with which tag.
     auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
     src_out->Resize();
     src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
     auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
     tag_out->Resize();
     tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
     return true;
   }

  protected:
   int src_; // Source rank to receive from (defaults to MPI_ANY_SOURCE).
   int tag_; // Tag to match (defaults to MPI_ANY_TAG).
   bool raw_buffer_; // Must be true; other transfer modes are unimplemented.
   INPUT_TAGS(COMM, INPUT, SRC_IN, TAG_IN);
   OUTPUT_TAGS(OUTPUT, SRC_OUT, TAG_OUT);
 };

 } // namespace caffe2

 #endif // CAFFE2_MPI_MPI_OPS_H_
	#ifndef CAFFE2_MPI_MPI_OPS_H_
	#define CAFFE2_MPI_MPI_OPS_H_

	#include <mpi.h>

	#include "caffe2/core/operator.h"
	#include "caffe2/mpi/mpi_common.h"

	namespace caffe2 {

	// TODO(jiayq): if needed, write up the use of color and key with MPI split.
	//
	// Operator that stores a newly default-constructed MPICommonWorldWrapper in
	// its first output blob. Currently, the operator simply creates a
	// communicator that has the same topology as the Caffe2 global communicator.
	template <class Context>
	class MPICreateCommonWorldOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  MPICreateCommonWorldOp(const OperatorDef& operator_def, Workspace* ws)
	      : Operator<Context>(operator_def, ws) {}

	  // Hands a fresh wrapper to output blob 0 via Reset() and reports success.
	  bool RunOnDevice() override {
	    auto* world_blob = OperatorBase::Outputs()[0];
	    world_blob->Reset(new MPICommonWorldWrapper());
	    return true;
	  }
	};

	// Broadcasts the bytes of an already-allocated output tensor with MPI_Bcast.
	// Input 0 is the MPICommonWorldWrapper providing the communicator; the "root"
	// argument (default 0) selects the broadcasting rank.
	template <class Context>
	class MPIBroadcastOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  MPIBroadcastOp(const OperatorDef& operator_def, Workspace* ws)
	      : Operator<Context>(operator_def, ws),
	        root_(OperatorBase::template GetSingleArgument<int>("root", 0)) {}
	  ~MPIBroadcastOp() {}

	  // Validates the output tensor, then broadcasts its raw buffer in place.
	  bool RunOnDevice() override {
	    const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
	    MPI_Comm comm = world.comm();
	    CAFFE_ENFORCE(
	        OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()),
	        "Output is of wrong type.");
	    auto* output = Output(0);
	    // The buffer is sent and received in place, so it must exist up front.
	    CAFFE_ENFORCE(
	        output->numel() > 0,
	        "Broadcast op uses in-place operation so the output "
	        "should be already allocated.");
	    // The tensor is transferred as an untyped run of bytes.
	    void* payload = output->raw_mutable_data();
	    const auto payload_bytes = output->nbytes();
	    MPI_CHECK(MPI_Bcast(
	        payload, payload_bytes, MPIDataTypeWrapper<char>::type(), root_, comm));
	    return true;
	  }

	 protected:
	  // Rank that acts as the broadcast root ("root" argument).
	  int root_;
	};

	// MPIReduceOp performs an MPI_Reduce over tensors of element type T.
	// Currently, only SUM is supported. Input 0 is the MPICommonWorldWrapper,
	// input 1 is the local tensor; output 0 is shaped like input 1. The "root"
	// argument (default 0) is passed to MPI_Reduce as the root rank.
	template <typename T, class Context>
	class MPIReduceOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  MPIReduceOp(const OperatorDef& operator_def, Workspace* ws)
	      : Operator<Context>(operator_def, ws),
	        root_(OperatorBase::template GetSingleArgument<int>("root", 0)) {}
	  ~MPIReduceOp() {}

	  // Sizes the output like the input and sum-reduces towards root_.
	  bool RunOnDevice() override {
	    const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
	    MPI_Comm comm = world.comm();
	    const auto& local = Input(1);
	    auto* reduced = Output(0, local.sizes(), at::dtype<T>());
	    // The const_cast only accommodates MPI's non-const send-buffer parameter.
	    T* send_buf = const_cast<T*>(local.template data<T>());
	    T* recv_buf = reduced->template mutable_data<T>();
	    MPI_CHECK(MPI_Reduce(
	        send_buf,
	        recv_buf,
	        local.numel(),
	        MPIDataTypeWrapper<T>::type(),
	        MPI_SUM,
	        root_,
	        comm));
	    return true;
	  }

	 protected:
	  // Root rank of the reduction ("root" argument).
	  int root_;
	};

	// MPIAllgatherOp does MPIAllgather using MPI.
	//
	// Input 0 is the MPICommonWorldWrapper, input 1 the local tensor of element
	// type T. Output 0 has the input's shape with its first dimension multiplied
	// by the wrapper's size(), so the input must have at least one dimension.
	template <typename T, class Context>
	class MPIAllgatherOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  USE_SIMPLE_CTOR_DTOR(MPIAllgatherOp);

	  // Resizes the output and gathers input.numel() elements from every rank.
	  bool RunOnDevice() override {
	    const auto& world = OperatorBase::Input<MPICommonWorldWrapper>(0);
	    MPI_Comm comm = world.comm();
	    auto& input = Input(1);
	    auto* output = Output(0);
	    vector<int64_t> output_dims = input.sizes().vec();
	    // A zero-dimensional input has no leading dimension to scale; indexing
	    // output_dims[0] on an empty vector would be undefined behaviour.
	    CAFFE_ENFORCE(
	        !output_dims.empty(),
	        "MPIAllgather requires an input with at least one dimension.");
	    output_dims[0] *= world.size();
	    output->Resize(output_dims);
	    MPI_CHECK(MPI_Allgather(
	        const_cast<T*>(input.template data<T>()),
	        input.numel(),
	        MPIDataTypeWrapper<T>::type(),
	        output->template mutable_data<T>(),
	        input.numel(),
	        MPIDataTypeWrapper<T>::type(),
	        comm));
	    return true;
	  }
	};

	// MPIAllreduceOp performs an MPI_Allreduce over tensors of element type T.
	// Currently, only SUM is supported. Input 0 is the MPICommonWorldWrapper,
	// input 1 the local tensor; output 0 is shaped like input 1.
	template <typename T, class Context>
	class MPIAllreduceOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  USE_SIMPLE_CTOR_DTOR(MPIAllreduceOp);

	  // Sum-reduces across the communicator, using MPI_IN_PLACE when the input
	  // and output share the same buffer.
	  bool RunOnDevice() override {
	    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
	    const auto& input = Input(1);
	    auto* output = Output(0, input.sizes(), at::dtype<T>());
	    T* const recv_buf = output->template mutable_data<T>();
	    const T* const send_buf = input.template data<T>();
	    // In-place calls need special handling; otherwise the source is simply
	    // the input buffer.
	    void* const source = (recv_buf == send_buf)
	        ? MPI_IN_PLACE
	        : static_cast<void*>(const_cast<T*>(send_buf));
	    MPI_CHECK(MPI_Allreduce(
	        source,
	        recv_buf,
	        input.numel(),
	        MPIDataTypeWrapper<T>::type(),
	        MPI_SUM,
	        comm));
	    return true;
	  }
	};

	// Sends the raw bytes of a tensor to another rank with MPI_Send.
	//
	// Inputs: COMM (MPICommonWorldWrapper), INPUT (tensor to send) and,
	// optionally, DST and TAG (CPU int tensors). The destination rank and tag
	// come either from the "dst"/"tag" arguments or, when four inputs are given,
	// from the DST/TAG blobs on every run. Only raw_buffer transfers are
	// supported.
	template <class Context>
	class MPISendTensorOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  MPISendTensorOp(const OperatorDef& def, Workspace* ws)
	      : Operator<Context>(def, ws),
	        OP_SINGLE_ARG(int, "dst", dst_, MPI_ANY_SOURCE),
	        OP_SINGLE_ARG(int, "tag", tag_, MPI_ANY_TAG),
	        OP_SINGLE_ARG(bool, "raw_buffer", raw_buffer_, false) {
	    CAFFE_ENFORCE(raw_buffer_, "non-raw-buffer transfer not supported yet.");
	    // MPI_ANY_SOURCE / MPI_ANY_TAG double as "not specified" markers here: a
	    // send needs a concrete destination and tag from an argument or a blob.
	    CAFFE_ENFORCE(
	        dst_ != MPI_ANY_SOURCE || def.input_size() == 4,
	        "You should explicitly specify the to rank either via "
	        "argument or via input blobs.");
	    CAFFE_ENFORCE(
	        tag_ != MPI_ANY_TAG || def.input_size() == 4,
	        "You should explicitly specify the tag either via "
	        "argument or via input blobs.");
	  }

	  // Resolves destination and tag, then sends INPUT as MPI_CHAR bytes.
	  bool RunOnDevice() override {
	    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
	    auto& input = Input(INPUT);
	    if (InputSize() == 4) {
	      dst_ = OperatorBase::Input<Tensor>(DST, CPU).template data<int>()[0];
	      tag_ = OperatorBase::Input<Tensor>(TAG, CPU).template data<int>()[0];
	    }
	    if (raw_buffer_) {
	      // We need to do a const cast to cope with the fact that, before OpenMPI
	      // 1.7, MPI_Send expects a non-const pointer although it uses it in a
	      // const way.
	      MPI_CHECK(MPI_Send(
	          const_cast<void*>(input.raw_data()),
	          input.nbytes(),
	          MPI_CHAR,
	          dst_,
	          tag_,
	          comm));
	    } else {
	      CAFFE_NOT_IMPLEMENTED;
	    }
	    return true;
	  }

	 protected:
	  int dst_; // Destination rank.
	  int tag_; // Message tag.
	  bool raw_buffer_; // Must be true; other transfer modes are unimplemented.

	  INPUT_TAGS(COMM, INPUT, DST, TAG);
	};

	// Receives raw bytes into an existing output tensor with MPI_Recv.
	//
	// Inputs: COMM (MPICommonWorldWrapper), INPUT and, optionally, SRC_IN and
	// TAG_IN (CPU int tensors) that override the "src"/"tag" arguments when four
	// inputs are given. Outputs: OUTPUT (the receive buffer), SRC_OUT and
	// TAG_OUT (CPU int tensors holding the source rank and tag reported by MPI).
	// Only raw_buffer transfers are supported.
	template <class Context>
	class MPIReceiveTensorOp final : public Operator<Context> {
	 public:
	  USE_OPERATOR_CONTEXT_FUNCTIONS;
	  MPIReceiveTensorOp(const OperatorDef& def, Workspace* ws)
	      : Operator<Context>(def, ws),
	        OP_SINGLE_ARG(int, "src", src_, MPI_ANY_SOURCE),
	        OP_SINGLE_ARG(int, "tag", tag_, MPI_ANY_TAG),
	        OP_SINGLE_ARG(bool, "raw_buffer", raw_buffer_, false) {
	    CAFFE_ENFORCE(raw_buffer_, "non-raw-buffer transfer not supported yet.");
	  }

	  // Receives up to OUTPUT's current byte size, then publishes the actual
	  // source and tag taken from the MPI status.
	  bool RunOnDevice() override {
	    MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(COMM).comm();
	    const bool routing_from_blobs = (InputSize() == 4);
	    if (routing_from_blobs) {
	      src_ = OperatorBase::Input<Tensor>(SRC_IN, CPU).template data<int>()[0];
	      tag_ = OperatorBase::Input<Tensor>(TAG_IN, CPU).template data<int>()[0];
	    }
	    if (!raw_buffer_) {
	      CAFFE_NOT_IMPLEMENTED;
	    }
	    MPI_Status status;
	    auto* output = Output(OUTPUT);
	    void* recv_buf = output->raw_mutable_data();
	    MPI_CHECK(MPI_Recv(
	        recv_buf, output->nbytes(), MPI_CHAR, src_, tag_, comm, &status));

	    // Report who actually sent the message and with which tag.
	    auto* src_out = OperatorBase::Output<Tensor>(SRC_OUT, CPU);
	    src_out->Resize();
	    src_out->template mutable_data<int>()[0] = status.MPI_SOURCE;
	    auto* tag_out = OperatorBase::Output<Tensor>(TAG_OUT, CPU);
	    tag_out->Resize();
	    tag_out->template mutable_data<int>()[0] = status.MPI_TAG;
	    return true;
	  }

	 protected:
	  int src_; // Source rank to receive from (defaults to MPI_ANY_SOURCE).
	  int tag_; // Tag to match (defaults to MPI_ANY_TAG).
	  bool raw_buffer_; // Must be true; other transfer modes are unimplemented.
	  INPUT_TAGS(COMM, INPUT, SRC_IN, TAG_IN);
	  OUTPUT_TAGS(OUTPUT, SRC_OUT, TAG_OUT);
	};

	} // namespace caffe2

	#endif // CAFFE2_MPI_MPI_OPS_H_