| // conv_op_impl.h contains the templated implementations of the operators
| // declared in conv_op.h.
| #ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_ |
| #define CAFFE2_OPERATORS_CONV_OP_IMPL_H_ |
| |
| #include "caffe2/operators/conv_op.h" |
| |
| #include <array> |
| #include <vector> |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/flags.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/operators/conv_pool_op_base.h" |
| #include "caffe2/utils/eigen_utils.h" |
| #include "caffe2/utils/math.h" |
| |
| namespace caffe2 { |
| |
| template <typename T, class Context> |
| bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() { |
| const auto& X = Input(INPUT); |
| const auto& filter = Input(FILTER); |
| auto* Y = Output(0); |
| const int N = X.dim32(0); |
| const int C = X.dim32(1); |
| const int G = group_; |
| CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); |
| const int M = filter.dim32(0); |
| CAFFE_ENFORCE_EQ( |
| C, |
| filter.dim32(1) * G, |
| "Convolution op: input channels does not match: # of input channels ", |
| C, |
| " is not equal to kernel channels * group: ", |
| filter.dim32(1), |
| "*", |
| G); |
| CAFFE_ENFORCE_EQ( |
| M % G, 0, "The number of output channels is not divisible by group."); |
| |
| int kernel_size = 1; |
| for (std::size_t i = 0; i < kernel_.size(); ++i) { |
| CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); |
| kernel_size *= kernel_[i]; |
| } |
| ConvPoolOpBase<Context>::SetOutputSize(X, Y, M); |
| |
| if (N == 0) { |
| Y->template mutable_data<T>(); |
| return true; |
| } |
| |
| const vector<int> X_dims = GetDims(X); |
| const vector<int> Y_dims = GetDims(*Y); |
| const int X_HxW = X.numel() / (N * C); |
| const int Y_HxW = Y->numel() / (N * M); |
| const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend()); |
| vector<int> buffer_shape(Y_dims.size() + 1); |
| buffer_shape[0] = C * kernel_size; |
| std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin() + 1); |
| |
| const int buffer_size = C * kernel_size * Y_HxW; |
| |
| // The dimension of each kernel |
| const int kernel_dim = C / G * kernel_size; |
| const int X_stride = C * X_HxW; |
| const int Y_stride = M * Y_HxW; |
| const int filter_stride = filter.numel() / G; |
| |
| // The col buffer is stored in CHW order as well: the C * kernel_size
| // channel rows first, then the output height and width.
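| // Outline of the NCHW path below: for every image in the batch, im2col
| // expands the input into a (C * kernel_size) x Y_HxW matrix, and the
| // convolution reduces to a GEMM against the filter reshaped to
| // M x kernel_dim (one GEMM per group when G > 1):
| //   col_buffer: (C * kernel_size) x Y_HxW
| //   filter    : M x kernel_dim,  kernel_dim = C / G * kernel_size
| //   Y_n       : M x Y_HxW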
| const T* X_data = X.template data<T>(); |
| const T* filter_data = filter.template data<T>(); |
| const T* bias_data = nullptr; |
| if (InputSize() == 3) { |
| const auto& bias = Input(BIAS); |
| CAFFE_ENFORCE_EQ(bias.dim(), 1); |
| CAFFE_ENFORCE_EQ(bias.dim32(0), M); |
| bias_data = bias.template data<T>(); |
| ConvPoolOpBase<Context>::template SetBiasMultiplier<T>( |
| Y_HxW, &bias_multiplier_); |
| } |
| T* Y_data = Y->template mutable_data<T>(); |
| |
| // Shortcut for 1x1 conv with no padding and unit stride: im2col is the
| // identity, so the whole batch is handled by a single batched GEMM in
| // Run1x1ConvOnDeviceWithOrderNCHW.
| if (kernel_size == 1 && !HasPad() && !HasStride()) { |
| return Run1x1ConvOnDeviceWithOrderNCHW( |
| N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data); |
| } |
| |
| const auto func = [&](Tensor* col_buffer) { |
| col_buffer->Resize(buffer_shape); |
| T* col_buffer_data = col_buffer->template mutable_data<T>(); |
| // Im2Col, followed by gemm. |
| for (const auto image_id : c10::irange(N)) { |
| (void)image_id; // Suppress unused variable warning |
| if (kernel_.size() == 2) { |
| math::Im2Col<T, Context, StorageOrder::NCHW>( |
| C, |
| X_dims[0], |
| X_dims[1], |
| kernel_h(), |
| kernel_w(), |
| dilation_h(), |
| dilation_w(), |
| pad_t(), |
| pad_l(), |
| pad_b(), |
| pad_r(), |
| stride_h(), |
| stride_w(), |
| X_data, |
| col_buffer_data, |
| &context_); |
| } else { |
| math::Im2ColNd<T, Context, StorageOrder::NCHW>( |
| kernel_.size(), |
| C * X_HxW, |
| buffer_size, |
| img_shape.data(), |
| buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| X_data, |
| col_buffer_data, |
| &context_); |
| } |
| // Weight term |
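| // For G == 1 this is a single GEMM per image:
| //   Y_n (M x Y_HxW) = filter (M x kernel_dim) * col_buffer (kernel_dim x Y_HxW).
| // For G > 1 it becomes G independent GEMMs, one per group, expressed below
| // as a strided batched GEMM:
| //   Y_{n,g} (M/G x Y_HxW) = filter_g (M/G x kernel_dim) * col_g (kernel_dim x Y_HxW).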
| if (G == 1) { |
| math::Gemm<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| M, |
| Y_HxW, |
| kernel_dim, |
| 1.0f, |
| filter_data, |
| col_buffer_data, |
| 0.0f, |
| Y_data, |
| &context_); |
| } else { |
| math::GemmStridedBatched<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| G, |
| M / G, |
| Y_HxW, |
| kernel_dim, |
| 1.0f, |
| filter_data, |
| filter_stride, |
| col_buffer_data, |
| buffer_size / G, |
| 0.0f, |
| Y_data, |
| Y_stride / G, |
| &context_); |
| } |
| if (bias_data != nullptr) { |
| // The bias term can be applied outside the group loop, which is more
| // efficient.
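| // Rank-1 update: Y_n (M x Y_HxW) += bias (M x 1) * ones (1 x Y_HxW).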
| math::Gemm<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| M, |
| Y_HxW, |
| 1, |
| 1.0f, |
| bias_data, |
| bias_multiplier_.template data<T>(), |
| 1.0f, |
| Y_data, |
| &context_); |
| } |
| X_data += X_stride; |
| Y_data += Y_stride; |
| } |
| }; |
| if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) { |
| runWithSharedBuffer<Context>(ws_, func); |
| } else { |
| func(&col_buffer_); |
| } |
| return true; |
| } |
| |
| // The NHWC implementation.
| template <typename T, class Context> |
| bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() { |
| CAFFE_ENFORCE_LE( |
| kernel_.size(), |
| 3, |
| "Only 1-3d convolution is supported for NHWC storage type"); |
| const Tensor& X = Input(INPUT); |
| const auto& filter = Input(FILTER); |
| Tensor* Y = Output(0); |
| const int N = X.dim32(0), C = X.dim32(X.dim() - 1); |
| const int G = group_; |
| CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); |
| const int M = filter.dim32(0); |
| CAFFE_ENFORCE_EQ( |
| C, |
| filter.dim32(filter.dim() - 1) * G, |
| "Convolution op: input channels does not match: # of input channels ", |
| C, |
| " is not equal to kernel channels * group: ", |
| filter.dim32(filter.dim() - 1), |
| "*", |
| G); |
| CAFFE_ENFORCE_EQ( |
| M % G, 0, "The number of output channels is not divisible by group."); |
| |
| int kernel_size = 1; |
| for (std::size_t i = 0; i < kernel_.size(); ++i) { |
| CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); |
| kernel_size *= kernel_[i]; |
| } |
| ConvPoolOpBase<Context>::SetOutputSize(X, Y, M); |
| |
| if (N == 0) { |
| Y->template mutable_data<T>(); |
| return true; |
| } |
| |
| const vector<int> Y_dims = GetDims(*Y); |
| const int X_HxW = X.numel() / (N * C); |
| const int Y_HxW = Y->numel() / (N * M); |
| const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend()); |
| vector<int> buffer_shape(Y_dims.size() + 1); |
| std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin()); |
| buffer_shape.back() = C * kernel_size; |
| |
| const int buffer_size = C * kernel_size * Y_HxW; |
| |
| // The dimension of each kernel |
| const int kernel_dim = C / G * kernel_size; |
| // The offset corresponding to a single input image, and a single output |
| // image. |
| const int input_offset = X_HxW * C; |
| const int output_offset = Y->numel() / Y->dim32(0); |
| |
| // Y_HxW is the spatial size of the output image.
| // The col buffer is stored in HWC order as well: the output height and
| // width first, then the C * kernel_size channel columns.
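| // Outline of the NHWC path below: for every image, im2col expands the input
| // into a Y_HxW x (C * kernel_size) matrix, and the convolution then reduces
| // to one GEMM per group against the filter, which is stored as
| // M x (kh * kw * C / G) in NHWC order.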
| const T* X_data = X.template data<T>(); |
| const T* filter_data = filter.template data<T>(); |
| const T* bias_data = nullptr; |
| if (InputSize() == 3) { |
| const auto& bias = Input(BIAS); |
| CAFFE_ENFORCE_EQ(bias.dim(), 1); |
| CAFFE_ENFORCE_EQ(bias.dim32(0), M); |
| bias_data = bias.template data<T>(); |
| } |
| T* Y_data = Y->template mutable_data<T>(); |
| |
| // Specialized path for 1x1 convolution with stride 1 and pad 0: we can
| // skip im2col.
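| // In that case the whole batch collapses into per-group GEMMs over all
| // N * X_HxW pixels at once:
| //   Y_g (N * X_HxW x M/G) = X_g (N * X_HxW x C/G) * filter_g^T (C/G x M/G).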
| if (kernel_dim == (C / group_) && !HasPad() && !HasStride()) { |
| if (bias_data != nullptr) { |
| // For this specialized path, we need a bigger bias_multiplier_ because |
| // we're doing just 1 big GEMM. |
| ConvPoolOpBase<Context>::template SetBiasMultiplier<T>( |
| N * X_HxW, &bias_multiplier_); |
| } |
| return Run1x1ConvOnDeviceWithOrderNHWC( |
| N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data); |
| } |
| |
| if (bias_data != nullptr) { |
| ConvPoolOpBase<Context>::template SetBiasMultiplier<T>( |
| Y_HxW, &bias_multiplier_); |
| } |
| auto f = [&](Tensor* col_buffer) { |
| col_buffer->Resize(buffer_shape); |
| T* col_buffer_data = col_buffer->template mutable_data<T>(); |
| // Im2Col, followed by gemm. |
| for (const auto image_id : c10::irange(N)) { |
| (void)image_id; // Suppress unused variable warning |
| if (kernel_.size() <= 2) { |
| math::Im2Col<T, Context, StorageOrder::NHWC>( |
| C, |
| X.dim32(1), |
| kernel_.size() == 2 ? X.dim32(2) : 1, |
| kernel_h(), |
| kernel_.size() == 2 ? kernel_w() : 1, |
| dilation_h(), |
| kernel_.size() == 2 ? dilation_w() : 1, |
| pad_t(), |
| kernel_.size() == 2 ? pad_l() : 0, |
| kernel_.size() == 2 ? pad_b() : pad_l(), |
| kernel_.size() == 2 ? pad_r() : 0, |
| stride_h(), |
| kernel_.size() == 2 ? stride_w() : 1, |
| X_data, |
| col_buffer_data, |
| &context_, |
| group_); |
| } else { |
| math::Im2ColNd<T, Context, StorageOrder::NHWC>( |
| kernel_.size(), |
| C * X_HxW, |
| buffer_size, |
| img_shape.data(), |
| buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| X_data, |
| col_buffer_data, |
| &context_, |
| group_); |
| } |
| // Weight term |
| for (const auto group_id : c10::irange(group_)) { |
| // col_buffer_data in G (H W) (R S C/G) layout |
| // filter_data in G K/G (R S C/G) layout |
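| // Per-group GEMM:
| //   Y_{n,g} (Y_HxW x M/G) =
| //     col_g (Y_HxW x kernel_dim) * filter_g^T (kernel_dim x M/G),
| // written into the group's column block of the (Y_HxW x M) output.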
| math::GemmEx<T, Context>( |
| CblasNoTrans, |
| CblasTrans, |
| Y_HxW, |
| M / group_, |
| kernel_dim, |
| 1, |
| col_buffer_data + group_id * kernel_dim, |
| group_ * kernel_dim, |
| filter_data + group_id * (M / group_) * kernel_dim, |
| kernel_dim, |
| 0, |
| Y_data + group_id * (M / group_), |
| M, |
| &context_); |
| } |
| if (bias_data != nullptr) { |
| // Bias term |
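| // Rank-1 update: Y_n (Y_HxW x M) += ones (Y_HxW x 1) * bias (1 x M).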
| math::Gemm<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| Y_HxW, |
| M, |
| 1, |
| 1, |
| bias_multiplier_.template data<T>(), |
| bias_data, |
| 1, |
| Y_data, |
| &context_); |
| } |
| X_data += input_offset; |
| Y_data += output_offset; |
| } |
| }; |
| if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) { |
| runWithSharedBuffer<Context>(ws_, f); |
| } else { |
| f(&col_buffer_); |
| } |
| return true; |
| } |
| |
| template <typename T, class Context> |
| bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNCHW( |
| const int N, |
| const int C, |
| const int HxW, |
| const int M, |
| const T* X, |
| const T* filter, |
| const T* bias, |
| T* Y) { |
| const int G = group_; |
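| // With a 1x1 kernel, unit stride and no padding, im2col is the identity, so
| // each image is a plain matrix product. For G == 1 the filter (M x C) is
| // shared across the batch (stride 0) and
| //   Y_n (M x HxW) = filter (M x C) * X_n (C x HxW)
| // is computed as one strided batched GEMM over the N images. For G > 1 the
| // N * G per-group products are dispatched via the pointer-array batched GEMM
| // below.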
| if (G == 1) { |
| math::GemmStridedBatched<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| N, |
| M, |
| HxW, |
| C, |
| 1.0f, |
| filter, |
| 0, |
| X, |
| C * HxW, |
| 0.0f, |
| Y, |
| M * HxW, |
| &context_); |
| } else { |
| const int batch_size = N * G; |
| const int D_X = C / G; |
| const int D_Y = M / G; |
| const int X_stride = D_X * HxW; |
| const int W_stride = D_Y * D_X; |
| const int Y_stride = D_Y * HxW; |
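| // Each of the N * G problems computes
| //   Y_{n,g} (D_Y x HxW) = filter_g (D_Y x D_X) * X_{n,g} (D_X x HxW),
| // with the same filter pointer reused across images within a group.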
| std::vector<const T*> X_ptr(N * G); |
| std::vector<const T*> W_ptr(N * G); |
| std::vector<T*> Y_ptr(N * G); |
| for (const auto i : c10::irange(N)) { |
| for (const auto j : c10::irange(G)) { |
| const int index = i * G + j; |
| X_ptr[index] = X + index * X_stride; |
| W_ptr[index] = filter + j * W_stride; |
| Y_ptr[index] = Y + index * Y_stride; |
| } |
| } |
| math::GemmBatched<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| batch_size, |
| D_Y, |
| HxW, |
| D_X, |
| 1.0f, |
| W_ptr.data(), |
| X_ptr.data(), |
| 0.0f, |
| Y_ptr.data(), |
| &context_); |
| } |
| if (bias != nullptr) { |
| const T* bias_multiplier_data = bias_multiplier_.template data<T>(); |
| math::GemmStridedBatched<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| N, |
| M, |
| HxW, |
| 1, |
| 1.0f, |
| bias, |
| 0, |
| bias_multiplier_data, |
| 0, |
| 1.0f, |
| Y, |
| M * HxW, |
| &context_); |
| } |
| return true; |
| } |
| |
| template <typename T, class Context> |
| bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC( |
| const int N, |
| const int C, |
| const int HxW, |
| const int M, |
| const T* X, |
| const T* filter, |
| const T* bias, |
| T* Y) { |
| const int G = group_; |
| const int kernel_dim = C / G; |
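| // One GEMM per group over all N * HxW pixels at once:
| //   Y_g (N * HxW x M/G) = X_g (N * HxW x C/G) * filter_g^T (C/G x M/G),
| // where X is viewed as an (N * HxW) x C matrix and each group reads its
| // C/G-wide column slice.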
| for (const auto group_id : c10::irange(group_)) { |
| math::GemmEx<T, Context>( |
| CblasNoTrans, |
| CblasTrans, |
| N * HxW, |
| M / group_, |
| kernel_dim, |
| 1.0f, |
| X + group_id * kernel_dim, |
| C, |
| filter + group_id * (M / group_) * kernel_dim, |
| kernel_dim, |
| 0.0f, |
| Y + group_id * (M / group_), |
| M, |
| &context_); |
| } |
| if (bias != nullptr) { |
| const T* bias_multiplier_data = bias_multiplier_.template data<T>(); |
| math::Gemm<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| N * HxW, |
| M, |
| 1, |
| 1.0f, |
| bias_multiplier_data, |
| bias, |
| 1.0f, |
| Y, |
| &context_); |
| } |
| return true; |
| } |
| |
| template <typename T, class Context> |
| bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() { |
| auto& X = Input(INPUT); |
| auto& filter = Input(FILTER); |
| auto& dY = Input(OUTPUT_GRAD); |
| |
| const int N = X.dim32(0), C = X.dim32(1); |
| |
| const vector<int> input_dims = this->GetDims(X); |
| const int input_image_size = this->GetDimsSize(X); |
| |
| const vector<int> output_dims = this->GetDims(dY); |
| // The output image size is the spatial size of the output. |
| const int output_image_size = this->GetDimsSize(dY); |
| |
| ConvPoolOpBase<Context>::ComputePads(input_dims); |
| CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); |
| const int M = filter.dim32(0); |
| CAFFE_ENFORCE_EQ(C, filter.dim32(1) * group_); |
| |
| int kernel_dims_size = 1; |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (const auto i : c10::irange(kernel_.size())) { |
| CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); |
| kernel_dims_size *= kernel_[i]; |
| } |
| |
| CAFFE_ENFORCE_EQ(M % group_, 0); |
| auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>()); |
| // The dimension of each kernel |
| const int kernel_dim = C / group_ * kernel_dims_size; |
| // The col buffer is stored in CHW order as well - kernel_dim, and the height |
| // and width. |
| vector<int> img_shape; |
| img_shape.assign(X.sizes().begin() + 1, X.sizes().end()); |
| vector<int> col_buffer_shape; |
| col_buffer_shape.push_back(C / group_ * kernel_dims_size); |
| col_buffer_shape.insert( |
| col_buffer_shape.end(), output_dims.begin(), output_dims.end()); |
| vector<int64_t> col_buffer_shape_64; |
| std::copy( |
| col_buffer_shape.cbegin(), |
| col_buffer_shape.cend(), |
| std::back_inserter(col_buffer_shape_64)); |
| ReinitializeTensor( |
| &col_buffer_, |
| col_buffer_shape_64, |
| at::dtype<T>().device(Context::GetDeviceType())); |
| |
| if (kernel_.size() != 2) { |
| // TODO: make SetDeviceTensor accept vector<int64_t>.
| SetDeviceTensor(img_shape, &img_shape_device_); |
| SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_); |
| } |
| |
| const int col_buffer_size = |
| (C / group_) * kernel_dims_size * output_image_size; |
| const T* Xdata = X.template data<T>(); |
| const T* filter_data = filter.template data<T>(); |
| const T* dYdata = dY.template data<T>(); |
| T* col_buffer_data = col_buffer_.template mutable_data<T>(); |
| T* dfilter_data = dfilter->template mutable_data<T>(); |
| |
| // Pre-setting the gradients to zero. |
| math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_); |
| |
| T* dbias_data = nullptr; |
| if (!no_bias_) { |
| auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>()); |
| // Note: bias_multiplier_ is reinitialized unconditionally; its existing
| // size is not checked.
| ReinitializeTensor( |
| &bias_multiplier_, |
| vector<int64_t>(1, output_image_size), |
| at::dtype<T>().device(Context::GetDeviceType())); |
| math::Set<T, Context>( |
| output_image_size, |
| static_cast<T>(1), |
| bias_multiplier_.template mutable_data<T>(), |
| &context_); |
| dbias_data = dbias->template mutable_data<T>(); |
| math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_); |
| } |
| |
| if (N == 0) { |
| if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { |
| auto* dX = Output( |
| no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, |
| X.sizes(), |
| at::dtype<T>()); |
| dX->template mutable_data<T>(); |
| } |
| return true; |
| } |
| |
| // The offset corresponding to a single input image, and a single output |
| // image. |
| const int input_offset = C / group_ * input_image_size; |
| const int output_offset = dY.numel() / dY.dim32(0) / group_; |
| const int filter_offset = filter.numel() / group_; |
| for (const auto image_id : c10::irange(N)) { |
| (void)image_id; // Suppress unused variable warning |
| for (const auto group_id : c10::irange(group_)) { |
| // When we compute the gradient with respect to the filters, we need to do |
| // im2col to allow gemm-type computation. |
| if (kernel_.size() == 2) { |
| math::Im2Col<T, Context, StorageOrder::NCHW>( |
| C / group_, |
| input_dims[0], |
| input_dims[1], |
| kernel_h(), |
| kernel_w(), |
| dilation_h(), |
| dilation_w(), |
| pad_t(), |
| pad_l(), |
| pad_b(), |
| pad_r(), |
| stride_h(), |
| stride_w(), |
| Xdata + group_id * input_offset, |
| col_buffer_data, |
| &context_); |
| } else { |
| math::Im2ColNd<T, Context, StorageOrder::NCHW>( |
| kernel_.size(), |
| input_offset, |
| col_buffer_size, |
| img_shape.data(), |
| col_buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| Xdata + group_id * input_offset, |
| col_buffer_data, |
| &context_); |
| } |
| // Gradient with respect to filter. |
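| // dfilter_g (M/G x kernel_dim) +=
| //   dY_{n,g} (M/G x output_image_size) * col_buffer^T (output_image_size x kernel_dim),
| // accumulated over the N images.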
| math::Gemm<T, Context>( |
| CblasNoTrans, |
| CblasTrans, |
| M / group_, |
| kernel_dim, |
| output_image_size, |
| 1, |
| dYdata + group_id * output_offset, |
| col_buffer_data, |
| 1, |
| dfilter_data + group_id * filter_offset, |
| &context_); |
| } |
| if (!no_bias_) { |
| // The gradient with respect to bias can be computed independently of the
| // group.
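| // dbias (M) += dY_n (M x output_image_size) * ones (output_image_size),
| // accumulated over the N images.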
| math::Gemv<T, Context>( |
| CblasNoTrans, |
| M, |
| output_image_size, |
| 1, |
| dYdata, |
| bias_multiplier_.template data<T>(), |
| 1, |
| dbias_data, |
| &context_); |
| } |
| Xdata += input_offset * group_; |
| dYdata += output_offset * group_; |
| } |
| if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { |
| // Compute the gradient w.r.t. the input. |
| |
| auto* dX = Output( |
| no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); |
| T* dXdata = dX->template mutable_data<T>(); |
| dYdata = dY.template data<T>(); |
| for (const auto image_id : c10::irange(N)) { |
| (void)image_id; // Suppress unused variable warning |
| for (const auto group_id : c10::irange(group_)) { |
| // Compute gradient into col_buffer. |
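| // col_buffer (kernel_dim x output_image_size) =
| //   filter_g^T (kernel_dim x M/G) * dY_{n,g} (M/G x output_image_size),
| // then col2im scatter-adds the columns back into dX.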
| math::Gemm<T, Context>( |
| CblasTrans, |
| CblasNoTrans, |
| kernel_dim, |
| output_image_size, |
| M / group_, |
| 1, |
| filter_data + group_id * filter_offset, |
| dYdata, |
| 0, |
| col_buffer_data, |
| &context_); |
| if (kernel_.size() == 2) { |
| math::Col2Im<T, Context, StorageOrder::NCHW>( |
| C / group_, |
| input_dims[0], |
| input_dims[1], |
| kernel_h(), |
| kernel_w(), |
| dilation_h(), |
| dilation_w(), |
| pad_t(), |
| pad_l(), |
| pad_b(), |
| pad_r(), |
| stride_h(), |
| stride_w(), |
| col_buffer_data, |
| dXdata, |
| &context_); |
| } else { |
| math::Col2ImNd<T, Context, StorageOrder::NCHW>( |
| kernel_.size(), |
| input_offset, |
| col_buffer_size, |
| img_shape.data(), |
| col_buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| col_buffer_data, |
| dXdata, |
| &context_); |
| } |
| dXdata += input_offset; |
| dYdata += output_offset; |
| } |
| } |
| } |
| return true; |
| } |
| |
| template <typename T, class Context> |
| bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() { |
| auto& X = Input(INPUT); |
| auto& filter = Input(FILTER); |
| auto& dY = Input(OUTPUT_GRAD); |
| |
| const int N = X.dim32(0), C = X.dim32(X.dim() - 1); |
| |
| const vector<int> input_dims = this->GetDims(X); |
| const int input_image_size = this->GetDimsSize(X); |
| |
| const vector<int> output_dims = this->GetDims(dY); |
| // The output image size is the spatial size of the output. |
| const int output_image_size = this->GetDimsSize(dY); |
| |
| ConvPoolOpBase<Context>::ComputePads(input_dims); |
| CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); |
| const int M = filter.dim32(0); |
| CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_); |
| |
| int kernel_dims_size = 1; |
| for (const auto i : c10::irange(kernel_.size())) { |
| CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); |
| kernel_dims_size *= kernel_[i]; |
| } |
| |
| CAFFE_ENFORCE_EQ(M % group_, 0); |
| auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>()); |
| // The dimension of each kernel |
| const int kernel_dim = C / group_ * kernel_dims_size; |
| |
| // The col buffer is stored in HWC order as well - the height and width, and |
| // kernel_dim. |
| vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend()); |
| vector<int> col_buffer_shape(output_dims.size() + 1); |
| std::copy(output_dims.cbegin(), output_dims.cend(), col_buffer_shape.begin()); |
| col_buffer_shape.back() = C * kernel_dims_size; |
| vector<int64_t> col_buffer_shape_64; |
| std::copy( |
| col_buffer_shape.cbegin(), |
| col_buffer_shape.cend(), |
| std::back_inserter(col_buffer_shape_64)); |
| ReinitializeTensor( |
| &col_buffer_, |
| col_buffer_shape_64, |
| at::dtype<T>().device(Context::GetDeviceType())); |
| |
| if (kernel_.size() != 2) { |
| SetDeviceTensor(img_shape, &img_shape_device_); |
| SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_); |
| } |
| |
| const int col_buffer_size = C * kernel_dims_size * output_image_size; |
| const T* Xdata = X.template data<T>(); |
| const T* const filter_data = filter.template data<T>(); |
| const T* const dYdata = dY.template data<T>(); |
| T* col_buffer_data = col_buffer_.template mutable_data<T>(); |
| T* dfilter_data = dfilter->template mutable_data<T>(); |
| |
| // Pre-setting the gradients to zero. |
| math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_); |
| |
| T* dbias_data = nullptr; |
| if (!no_bias_) { |
| auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>()); |
| dbias_data = dbias->template mutable_data<T>(); |
| math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_); |
| // Note: bias_multiplier_ is reinitialized unconditionally; its existing
| // size is not checked.
| ReinitializeTensor( |
| &bias_multiplier_, |
| vector<int64_t>(1, output_image_size), |
| at::dtype<T>().device(Context::GetDeviceType())); |
| math::Set<T, Context>( |
| output_image_size, |
| static_cast<T>(1), |
| bias_multiplier_.template mutable_data<T>(), |
| &context_); |
| } |
| |
| if (N == 0) { |
| if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { |
| auto* dX = Output( |
| no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, |
| X.sizes(), |
| at::dtype<T>()); |
| dX->template mutable_data<T>(); |
| } |
| return true; |
| } |
| |
| // The offset corresponding to a single input image, and a single output |
| // image. |
| const size_t input_offset = C * input_image_size; |
| const size_t output_offset = dY.numel() / dY.dim32(0); |
| for (const auto image_id : c10::irange(N)) { |
| // When we compute the gradient with respect to the filters, we need to do |
| // im2col to allow gemm-type computation. |
| if (kernel_.size() <= 2) { |
| math::Im2Col<T, Context, StorageOrder::NHWC>( |
| C, |
| X.size(1), |
| kernel_.size() == 2 ? X.dim32(2) : 1, |
| kernel_h(), |
| kernel_.size() == 2 ? kernel_w() : 1, |
| dilation_h(), |
| kernel_.size() == 2 ? dilation_w() : 1, |
| pad_t(), |
| kernel_.size() == 2 ? pad_l() : 0, |
| kernel_.size() == 2 ? pad_b() : pad_l(), |
| kernel_.size() == 2 ? pad_r() : 0, |
| stride_h(), |
| kernel_.size() == 2 ? stride_w() : 1, |
| Xdata, |
| col_buffer_data, |
| &context_, |
| group_); |
| } else { |
| math::Im2ColNd<T, Context, StorageOrder::NHWC>( |
| kernel_.size(), |
| C * input_image_size, |
| col_buffer_size, |
| img_shape.data(), |
| col_buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| Xdata, |
| col_buffer_data, |
| &context_, |
| group_); |
| } |
| // Gradient with respect to filter. |
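| // Per group: dfilter_g (M/G x kernel_dim) +=
| //   dY_{n,g}^T (M/G x output_image_size) * col_g (output_image_size x kernel_dim),
| // accumulated over the N images.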
| for (const auto group_id : c10::irange(group_)) { |
| math::GemmEx<T, Context>( |
| CblasTrans, |
| CblasNoTrans, |
| M / group_, |
| kernel_dim, |
| output_image_size, |
| 1, |
| dYdata + output_offset * image_id + group_id * (M / group_), |
| M, |
| col_buffer_data + group_id * kernel_dim, |
| group_ * kernel_dim, |
| 1, |
| dfilter_data + group_id * (M / group_) * kernel_dim, |
| kernel_dim, |
| &context_); |
| } |
| if (!no_bias_) { |
| // Gradient with respect to bias |
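| // dbias (M) += dY_n^T (M x output_image_size) * ones (output_image_size),
| // accumulated over the N images.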
| math::Gemv<T, Context>( |
| CblasTrans, |
| output_image_size, |
| M, |
| 1, |
| dYdata + output_offset * image_id, |
| bias_multiplier_.template data<T>(), |
| 1, |
| dbias_data, |
| &context_); |
| } |
| Xdata += input_offset; |
| } // for each image |
| |
| if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { |
| // Compute the gradient w.r.t. the input. |
| |
| auto* dX = Output( |
| no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); |
| T* dXdata = dX->template mutable_data<T>(); |
| for (const auto image_id : c10::irange(N)) { |
| // Compute gradient into col_buffer. |
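| // Per group: col_g (output_image_size x kernel_dim) =
| //   dY_{n,g} (output_image_size x M/G) * filter_g (M/G x kernel_dim),
| // then col2im scatter-adds the columns back into dX.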
| for (const auto group_id : c10::irange(group_)) { |
| math::GemmEx<T, Context>( |
| CblasNoTrans, |
| CblasNoTrans, |
| output_image_size, |
| kernel_dim, |
| M / group_, |
| 1, |
| dYdata + output_offset * image_id + group_id * (M / group_), |
| M, |
| filter_data + group_id * (M / group_) * kernel_dim, |
| kernel_dim, |
| 0, |
| col_buffer_data + group_id * kernel_dim, |
| group_ * kernel_dim, |
| &context_); |
| } |
| if (kernel_.size() <= 2) { |
| math::Col2Im<T, Context, StorageOrder::NHWC>( |
| C, |
| X.size(1), |
| kernel_.size() == 2 ? X.dim32(2) : 1, |
| kernel_h(), |
| kernel_.size() == 2 ? kernel_w() : 1, |
| dilation_h(), |
| kernel_.size() == 2 ? dilation_w() : 1, |
| pad_t(), |
| kernel_.size() == 2 ? pad_l() : 0, |
| kernel_.size() == 2 ? pad_b() : pad_l(), |
| kernel_.size() == 2 ? pad_r() : 0, |
| stride_h(), |
| kernel_.size() == 2 ? stride_w() : 1, |
| col_buffer_data, |
| dXdata, |
| &context_, |
| group_); |
| } else { |
| math::Col2ImNd<T, Context, StorageOrder::NHWC>( |
| kernel_.size(), |
| C * input_image_size, |
| col_buffer_size, |
| img_shape.data(), |
| col_buffer_shape.data(), |
| kernel_.data(), |
| stride_.data(), |
| dilation_.data(), |
| pads_.data(), |
| col_buffer_data, |
| dXdata, |
| &context_, |
| group_); |
| } |
| dXdata += input_offset; |
| } // for each image |
| } |
| return true; |
| } |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_ |