// caffe2/quantization/server/im2col_dnnlowp.h - platform/external/pytorch - Git at Google

 #pragma once

 #ifdef _OPENMP
 #include <omp.h>
 #endif

 #include "caffe2/core/operator.h"
 #include "caffe2/utils/math.h"
 #include "caffe2/utils/math/utils.h"

 namespace caffe2 {

 namespace math {

 /**
  * Unfolds one NCHW image into column form, writing zero_point wherever a
  * kernel tap lands outside the image.
  *
  * data_col receives channels * kernel_h * kernel_w rows, each holding
  * output_h * output_w entries, stored back to back.
  *
  * One of three code paths is taken depending on the arguments:
  *  - no padding and unit dilation: source rows are copied with memcpy;
  *  - symmetric padding (pad_l == pad_r and pad_t == pad_b): the output is
  *    written sequentially;
  *  - otherwise: a generic indexed loop.
  */
 template <typename T>
 static void Im2ColNCHW(
     const int channels,
     const int height,
     const int width,
     const int kernel_h,
     const int kernel_w,
     const int dilation_h,
     const int dilation_w,
     const int pad_t,
     const int pad_l,
     const int pad_b,
     const int pad_r,
     const int stride_h,
     const int stride_w,
     const T* data_im,
     T* data_col,
     CPUContext* /*context*/,
     const T& zero_point = 0) {
   // Extent covered by the dilated kernel, and the resulting output size.
   const int span_h = dilation_h * (kernel_h - 1) + 1;
   const int span_w = dilation_w * (kernel_w - 1) + 1;
   const int output_h = (height + pad_b + pad_t - span_h) / stride_h + 1;
   const int output_w = (width + pad_l + pad_r - span_w) / stride_w + 1;

   const bool unit_dilation = dilation_h == 1 && dilation_w == 1;
   const bool no_padding =
       pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0;

   // Path 1: no padding, no dilation. Every output row is a (possibly
   // strided) copy of a source row, so zero_point is never needed.
   if (unit_dilation && no_padding) {
     const int taps = kernel_h * kernel_w;
     const int plane = output_h * output_w;
     const int row_count = channels * taps;
     for (int row = 0; row < row_count; ++row) {
       const int src_channel = row / taps;
       const int tap = row % taps;
       const int kh = tap / kernel_w;
       const int kw = tap % kernel_w;
       // Output rows are ordered (channel, kh, kw), i.e. by `row` itself.
       T* dst_row = data_col + row * plane;
       const T* src_plane = data_im + src_channel * (height * width);
       for (int y = 0; y < output_h; ++y) {
         const T* src = src_plane + ((y * stride_h + kh) * width + kw);
         T* dst = dst_row + y * output_w;
         if (stride_w == 1) {
           // Contiguous in the source: one copy covers the whole row.
           memcpy(dst, src, sizeof(T) * output_w);
           continue;
         }
         for (int x = 0; x < output_w; ++x) {
           memcpy(dst + x, src + x * stride_w, sizeof(T));
         }
       }
     }
     return;
   }

   // Path 2: symmetric padding. The output is produced strictly in order, so
   // a single running write pointer is enough.
   // From Intel, https://github.com/BVLC/caffe/pull/3536
   if (pad_t == pad_b && pad_l == pad_r) {
     const int plane_size = height * width;
     const T* plane = data_im;
     T* out = data_col;
     for (int channels_left = channels; channels_left-- != 0;
          plane += plane_size) {
       for (int kr = 0; kr < kernel_h; ++kr) {
         for (int kc = 0; kc < kernel_w; ++kc) {
           int in_row = kr * dilation_h - pad_t;
           for (int rows_left = output_h; rows_left != 0;
                --rows_left, in_row += stride_h) {
             if (utils::IsAGeZeroAndALtB(in_row, height)) {
               int in_col = kc * dilation_w - pad_l;
               for (int cols_left = output_w; cols_left != 0;
                    --cols_left, in_col += stride_w) {
                 *out++ = utils::IsAGeZeroAndALtB(in_col, width)
                     ? plane[in_row * width + in_col]
                     : zero_point;
               }
             } else {
               // The whole source row lies in the padding.
               for (int cols_left = output_w; cols_left != 0; --cols_left) {
                 *out++ = zero_point;
               }
             }
           }
         }
       }
     }
     return;
   }

   // Path 3: generic fallback with explicit index arithmetic.
   const int col_channels = channels * kernel_h * kernel_w;
   for (int col_channel = 0; col_channel < col_channels; ++col_channel) {
     const int kw = col_channel % kernel_w;
     const int kh = (col_channel / kernel_w) % kernel_h;
     const int src_channel = col_channel / kernel_h / kernel_w;
     for (int oh = 0; oh < output_h; ++oh) {
       const int ih = oh * stride_h - pad_t + kh * dilation_h;
       const bool row_inside = ih >= 0 && ih < height;
       const int dst_base = (col_channel * output_h + oh) * output_w;
       for (int ow = 0; ow < output_w; ++ow) {
         const int iw = ow * stride_w - pad_l + kw * dilation_w;
         const bool inside = row_inside && iw >= 0 && iw < width;
         data_col[dst_base + ow] = inside
             ? data_im[(src_channel * height + ih) * width + iw]
             : zero_point;
       }
     }
   }
 }

 /**
  * N-dimensional im2col for a single image in channels-first layout,
  * writing zero_point for every kernel tap that falls outside the image.
  *
  * @param N            number of spatial axes
  * @param col_size     total number of elements written to Y_data
  * @param img_shape    image shape; entries 1..N are read as the spatial sizes
  * @param col_shape    column shape; entry 0 is the number of output rows and
  *                     entries 1..N are passed to IncreaseIndexInDims
  * @param kernel_shape kernel size per spatial axis (N entries)
  * @param stride       stride per spatial axis (N entries)
  * @param dilation     dilation per spatial axis (N entries)
  * @param pad          padding per spatial axis; only the first N entries are
  *                     read here
  * @param X_data       input image
  * @param Y_data       output column buffer
  * @param zero_point   value stored for positions that lie in the padding
  */
 template <typename T>
 static void Im2ColNdNCHW(
     const int N,
     const int /* img_size*/,
     const int col_size,
     const int* img_shape,
     const int* col_shape,
     const int* kernel_shape,
     const int* stride,
     const int* dilation,
     const int* pad,
     const T* X_data,
     T* Y_data,
     CPUContext* /* context */,
     const T& zero_point = 0) {
   const int outer_size = col_shape[0];
   // An empty column buffer has nothing to fill. Returning here also keeps
   // the inner_size computation below from dividing by zero.
   if (outer_size <= 0) {
     return;
   }
   const int inner_size = col_size / outer_size;
   const int kernel_size = std::accumulate(
       kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
   // d_offset: kernel tap coordinates of the current output row.
   // d_iter: output position coordinates, advanced once per written element.
   std::vector<int> d_offset(N, 0);
   std::vector<int> d_iter(N, 0);
   for (const auto i : c10::irange(outer_size)) {
     // Loop over spatial axes in reverse order to compute a per-axis offset.
     int offset = i;
     for (int d_i = N - 1; d_i >= 0; --d_i) {
       d_offset[d_i] = offset % kernel_shape[d_i];
       offset /= kernel_shape[d_i];
     }
     for (const auto j : c10::irange(inner_size)) {
       // Loop over spatial axes in forward order to compute the indices in the
       // image and column, and whether the index lies in the padding.
       const int col_index = i * inner_size + j;
       int img_index = i / kernel_size;
       bool is_padding = false;
       for (const auto d_i : c10::irange(N)) {
         const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
             d_offset[d_i] * dilation[d_i];
         is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1];
         img_index = img_index * img_shape[d_i + 1] + d_img;
       }
       // img_index is only dereferenced when every axis is inside the image.
       Y_data[col_index] = is_padding ? zero_point : X_data[img_index];
       // NOTE(review): d_iter is never reset between output rows; this
       // presumably relies on it wrapping back to all zeros after inner_size
       // steps - confirm against utils::IncreaseIndexInDims.
       utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
     }
   }
 }

 /**
  * Unfolds one NHWC image into column form for a grouped convolution.
  *
  * The layout of the result is N H W G R S C/G.
  * Note that groups are pulled out to an outer dimension so that we can use
  * GEMMs efficiently.
  *
  * Kernel taps that fall outside the image are filled with zero_point. When
  * built with OpenMP, output rows are distributed across threads unless the
  * caller is already inside a parallel region.
  */
 template <typename T>
 static void Im2ColNHWC(
     const int channels,
     const int height,
     const int width,
     const int kernel_h,
     const int kernel_w,
     const int dilation_h,
     const int dilation_w,
     const int pad_t,
     const int pad_l,
     const int pad_b,
     const int pad_r,
     const int stride_h,
     const int stride_w,
     const T* data_im,
     T* data_col,
     CPUContext* /*context*/,
     const int groups,
     const T& zero_point) {
   // Extent covered by the dilated kernel along each axis.
   const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
   const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

   const int out_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
   const int out_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

 #ifdef _OPENMP
 #pragma omp parallel for if (!omp_in_parallel())
 #endif
   for (int oh = 0; oh < out_h; ++oh) {
     const int first_row = oh * stride_h - pad_t;
     // Output block for the first pixel of this output row; advanced by one
     // patch (kernel_h * kernel_w * channels values) per output pixel.
     T* patch = data_col + oh * out_w * kernel_h * kernel_w * channels;
     for (int ow = 0; ow < out_w; ++ow) {
       const int first_col = ow * stride_w - pad_l;
       int r = 0;
       for (int ih = first_row; ih < first_row + dkernel_h;
            ih += dilation_h, ++r) {
         const bool row_inside = ih >= 0 && ih < height;
         int s = 0;
         for (int iw = first_col; iw < first_col + dkernel_w;
              iw += dilation_w, ++s) {
           const bool inside = row_inside && iw >= 0 && iw < width;
           for (int g = 0; g < groups; ++g) {
             // Evaluated inside the group loop so that groups == 0 never
             // reaches the division.
             const int group_channels = channels / groups;
             T* dst =
                 patch + ((g * kernel_h + r) * kernel_w + s) * group_channels;
             if (inside) {
               memcpy(
                   dst,
                   data_im + (ih * width + iw) * channels + g * group_channels,
                   sizeof(T) * group_channels);
             } else {
               // Tap lies in the padding: fill with the quantized zero.
               for (int i = 0; i < group_channels; ++i) {
                 dst[i] = zero_point;
               }
             }
           }
         } // for each iw
       } // for each ih
       patch += kernel_h * kernel_w * channels;
     } // for each output pixel
   } // for each output row
 }

 /**
  * Unfolds one 3D (frames x height x width) channels-last volume into column
  * form for a grouped convolution.
  *
  * The layout of the result is N T H W G Q R S C/G.
  * Note that groups are pulled out to an outer dimension so that we can use
  * GEMMs efficiently.
  *
  * Kernel taps that fall outside the volume are filled with zero_point. When
  * built with OpenMP, output frames are distributed across threads unless the
  * caller is already inside a parallel region.
  */
 template <typename T>
 static void Im2Col3DNHWC(
     const int channels,
     const int num_frames,
     const int height,
     const int width,
     const int kernel_t,
     const int kernel_h,
     const int kernel_w,
     const int dilation_t,
     const int dilation_h,
     const int dilation_w,
     const int pad_p, // previous frame
     const int pad_t, // top
     const int pad_l, // left
     const int pad_n, // next frame
     const int pad_b, // bottom
     const int pad_r, // right
     const int stride_t,
     const int stride_h,
     const int stride_w,
     const T* data_im,
     T* data_col,
     CPUContext* /*context*/,
     const int groups,
     const T& zero_point) {
   // Extent covered by the dilated kernel along each axis.
   const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
   const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
   const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

   const int out_t = (num_frames + pad_p + pad_n - dkernel_t) / stride_t + 1;
   const int out_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
   const int out_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

 #ifdef _OPENMP
 #pragma omp parallel for if (!omp_in_parallel())
 #endif
   for (int ot = 0; ot < out_t; ++ot) {
     const int first_frame = ot * stride_t - pad_p;
     for (int oh = 0; oh < out_h; ++oh) {
       const int first_row = oh * stride_h - pad_t;
       // Output block for the first pixel of this (frame, row); advanced by
       // one patch per output pixel.
       T* patch = data_col +
           (ot * out_h + oh) * out_w * kernel_t * kernel_h * kernel_w *
               channels;
       for (int ow = 0; ow < out_w; ++ow) {
         const int first_col = ow * stride_w - pad_l;
         int q = 0;
         for (int it = first_frame; it < first_frame + dkernel_t;
              it += dilation_t, ++q) {
           const bool frame_inside = it >= 0 && it < num_frames;
           int r = 0;
           for (int ih = first_row; ih < first_row + dkernel_h;
                ih += dilation_h, ++r) {
             const bool row_inside = frame_inside && ih >= 0 && ih < height;
             int s = 0;
             for (int iw = first_col; iw < first_col + dkernel_w;
                  iw += dilation_w, ++s) {
               const bool inside = row_inside && iw >= 0 && iw < width;
               for (int g = 0; g < groups; ++g) {
                 // Evaluated inside the group loop so that groups == 0 never
                 // reaches the division.
                 const int group_channels = channels / groups;
                 T* dst = patch +
                     (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
                         group_channels;
                 if (inside) {
                   memcpy(
                       dst,
                       data_im + ((it * height + ih) * width + iw) * channels +
                           g * group_channels,
                       sizeof(T) * group_channels);
                 } else {
                   // Tap lies in the padding: fill with the quantized zero.
                   for (int i = 0; i < group_channels; ++i) {
                     dst[i] = zero_point;
                   }
                 }
               }
             } // for each iw
           } // for each ih
         } // for each it
         patch += kernel_t * kernel_h * kernel_w * channels;
       } // for each output pixel
     } // for each output row
   } // for each output frame
 }

 } // namespace math

 } // namespace caffe2
	#pragma once

	#ifdef _OPENMP
	#include <omp.h>
	#endif

	#include "caffe2/core/operator.h"
	#include "caffe2/utils/math.h"
	#include "caffe2/utils/math/utils.h"

	namespace caffe2 {

	namespace math {

	/**
	 * Unfolds one NCHW image into column form, writing zero_point wherever a
	 * kernel tap lands outside the image.
	 *
	 * data_col receives channels * kernel_h * kernel_w rows, each holding
	 * output_h * output_w entries, stored back to back.
	 *
	 * One of three code paths is taken: a memcpy-based path when there is no
	 * padding and no dilation, a sequential-write path when padding is
	 * symmetric, and a generic indexed loop otherwise.
	 */
	template <typename T>
	static void Im2ColNCHW(
	    const int channels,
	    const int height,
	    const int width,
	    const int kernel_h,
	    const int kernel_w,
	    const int dilation_h,
	    const int dilation_w,
	    const int pad_t,
	    const int pad_l,
	    const int pad_b,
	    const int pad_r,
	    const int stride_h,
	    const int stride_w,
	    const T* data_im,
	    T* data_col,
	    CPUContext* /*context*/,
	    const T& zero_point = 0) {
	  const int output_h =
	      (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
	      1;
	  const int output_w =
	      (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
	      1;

	  // Fast path for zero padding and no dilation
	  // From Torch, THNN_(unfolded_copy)
	  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
	      pad_t == 0 && pad_b == 0) {
	    for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
	      // Decompose the output row index into (channel, kh, kw).
	      const auto nip = k / (kernel_h * kernel_w);
	      const auto rest = k % (kernel_h * kernel_w);
	      const auto kh = rest / kernel_w;
	      const auto kw = rest % kernel_w;
	      auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
	          kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
	      const auto* src = data_im + nip * (height * width);
	      for (const auto y : c10::irange(output_h)) {
	        const auto iy = y * stride_h + kh;
	        const auto ix = kw;
	        if (stride_w == 1) {
	          // Contiguous in the source: one copy covers the whole row.
	          memcpy(
	              dst + (y * output_w),
	              src + (iy * width + ix),
	              sizeof(T) * output_w);
	        } else {
	          for (const auto x : c10::irange(output_w)) {
	            memcpy(
	                dst + (y * output_w + x),
	                src + (iy * width + ix + x * stride_w),
	                sizeof(T));
	          }
	        }
	      }
	    }
	    return;
	  }

	  // Fast path for equal padding
	  if (pad_l == pad_r && pad_t == pad_b) {
	    // From Intel, https://github.com/BVLC/caffe/pull/3536
	    const int pad_h = pad_t;
	    const int pad_w = pad_l;
	    const int channel_size = height * width;
	    for (int channel = channels; channel--; data_im += channel_size) {
	      for (const auto kernel_row : c10::irange(kernel_h)) {
	        for (const auto kernel_col : c10::irange(kernel_w)) {
	          int input_row = -pad_h + kernel_row * dilation_h;
	          for (int output_rows = output_h; output_rows; output_rows--) {
	            if (!utils::IsAGeZeroAndALtB(input_row, height)) {
	              // The whole source row lies in the padding.
	              for (int output_cols = output_w; output_cols; output_cols--) {
	                *(data_col++) = zero_point;
	              }
	            } else {
	              int input_col = -pad_w + kernel_col * dilation_w;
	              for (int output_col = output_w; output_col; output_col--) {
	                if (utils::IsAGeZeroAndALtB(input_col, width)) {
	                  *(data_col++) = data_im[input_row * width + input_col];
	                } else {
	                  *(data_col++) = zero_point;
	                }
	                input_col += stride_w;
	              }
	            }
	            input_row += stride_h;
	          }
	        }
	      }
	    }
	    return;
	  }

	  // Baseline
	  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
	  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

	  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
	  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

	  int channels_col = channels * kernel_h * kernel_w;
	  for (const auto c : c10::irange(channels_col)) {
	    int w_offset = c % kernel_w;
	    int h_offset = (c / kernel_w) % kernel_h;
	    int c_im = c / kernel_h / kernel_w;
	    for (const auto h : c10::irange(height_col)) {
	      for (const auto w : c10::irange(width_col)) {
	        int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
	        int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
	        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
	          data_col[(c * height_col + h) * width_col + w] =
	              data_im[(c_im * height + h_pad) * width + w_pad];
	        else
	          data_col[(c * height_col + h) * width_col + w] = zero_point;
	      }
	    }
	  }
	}

	/**
	 * N-dimensional im2col for a single image in channels-first layout,
	 * writing zero_point for every kernel tap that falls outside the image.
	 *
	 * @param N            number of spatial axes
	 * @param col_size     total number of elements written to Y_data
	 * @param img_shape    image shape; entries 1..N are read as the spatial sizes
	 * @param col_shape    column shape; entry 0 is the number of output rows and
	 *                     entries 1..N are passed to IncreaseIndexInDims
	 * @param kernel_shape kernel size per spatial axis (N entries)
	 * @param stride       stride per spatial axis (N entries)
	 * @param dilation     dilation per spatial axis (N entries)
	 * @param pad          padding per spatial axis; only the first N entries are
	 *                     read here
	 * @param X_data       input image
	 * @param Y_data       output column buffer
	 * @param zero_point   value stored for positions that lie in the padding
	 */
	template <typename T>
	static void Im2ColNdNCHW(
	    const int N,
	    const int /* img_size*/,
	    const int col_size,
	    const int* img_shape,
	    const int* col_shape,
	    const int* kernel_shape,
	    const int* stride,
	    const int* dilation,
	    const int* pad,
	    const T* X_data,
	    T* Y_data,
	    CPUContext* /* context */,
	    const T& zero_point = 0) {
	  const int outer_size = col_shape[0];
	  // An empty column buffer has nothing to fill. Returning here also keeps
	  // the inner_size computation below from dividing by zero.
	  if (outer_size <= 0) {
	    return;
	  }
	  const int inner_size = col_size / outer_size;
	  const int kernel_size = std::accumulate(
	      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
	  // d_offset: kernel tap coordinates of the current output row.
	  // d_iter: output position coordinates, advanced once per written element.
	  std::vector<int> d_offset(N, 0);
	  std::vector<int> d_iter(N, 0);
	  for (const auto i : c10::irange(outer_size)) {
	    // Loop over spatial axes in reverse order to compute a per-axis offset.
	    int offset = i;
	    for (int d_i = N - 1; d_i >= 0; --d_i) {
	      d_offset[d_i] = offset % kernel_shape[d_i];
	      offset /= kernel_shape[d_i];
	    }
	    for (const auto j : c10::irange(inner_size)) {
	      // Loop over spatial axes in forward order to compute the indices in the
	      // image and column, and whether the index lies in the padding.
	      const int col_index = i * inner_size + j;
	      int img_index = i / kernel_size;
	      bool is_padding = false;
	      for (const auto d_i : c10::irange(N)) {
	        const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
	            d_offset[d_i] * dilation[d_i];
	        is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1];
	        img_index = img_index * img_shape[d_i + 1] + d_img;
	      }
	      // img_index is only dereferenced when every axis is inside the image.
	      Y_data[col_index] = is_padding ? zero_point : X_data[img_index];
	      // NOTE(review): d_iter is never reset between output rows; this
	      // presumably relies on it wrapping back to all zeros after inner_size
	      // steps - confirm against utils::IncreaseIndexInDims.
	      utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
	    }
	  }
	}

	/**
	 * Unfolds one NHWC image into column form for a grouped convolution.
	 *
	 * The layout of the result is N H W G R S C/G.
	 * Note that groups are pulled out to an outer dimension so that we can use
	 * GEMMs efficiently.
	 *
	 * Kernel taps that fall outside the image are filled with zero_point. When
	 * built with OpenMP, output rows are distributed across threads unless the
	 * caller is already inside a parallel region.
	 */
	template <typename T>
	static void Im2ColNHWC(
	    const int channels,
	    const int height,
	    const int width,
	    const int kernel_h,
	    const int kernel_w,
	    const int dilation_h,
	    const int dilation_w,
	    const int pad_t,
	    const int pad_l,
	    const int pad_b,
	    const int pad_r,
	    const int stride_h,
	    const int stride_w,
	    const T* data_im,
	    T* data_col,
	    CPUContext* /*context*/,
	    const int groups,
	    const T& zero_point) {
	  // Extent covered by the dilated kernel along each axis.
	  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
	  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

	  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
	  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

	#ifdef _OPENMP
	#pragma omp parallel for if (!omp_in_parallel())
	#endif
	  for (int h = 0; h < height_col; ++h) {
	    int h_pad = -pad_t + h * stride_h;
	    // Output block for the first pixel of this output row; advanced by one
	    // patch per output pixel at the bottom of the w loop.
	    T* data_col_temp =
	        data_col + h * width_col * kernel_h * kernel_w * channels;
	    int w_pad = -pad_l;
	    for (C10_UNUSED const auto w : c10::irange(width_col)) {
	      int r = 0;
	      for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
	        int s = 0;
	        for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
	          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
	            for (const auto g : c10::irange(groups)) {
	              memcpy(
	                  data_col_temp +
	                      ((g * kernel_h + r) * kernel_w + s) * (channels / groups),
	                  data_im + (ih * width + iw) * channels +
	                      g * (channels / groups),
	                  sizeof(T) * (channels / groups));
	            }
	          } else {
	            // Tap lies in the padding: fill with zero_point.
	            for (const auto g : c10::irange(groups)) {
	              for (int i = 0; i < channels / groups; ++i) {
	                data_col_temp
	                    [(((g * kernel_h + r) * kernel_w) + s) *
	                         (channels / groups) +
	                     i] = zero_point;
	              }
	            }
	          }
	        } // for each iw
	      } // for each ih
	      data_col_temp += kernel_h * kernel_w * channels;
	      w_pad += stride_w;
	    } // for each output pixel
	  } // for each image row
	}

	/**
	 * Unfolds one 3D (frames x height x width) channels-last volume into column
	 * form for a grouped convolution.
	 *
	 * The layout of the result is N T H W G Q R S C/G.
	 * Note that groups are pulled out to an outer dimension so that we can use
	 * GEMMs efficiently.
	 *
	 * Kernel taps that fall outside the volume are filled with zero_point. When
	 * built with OpenMP, output frames are distributed across threads unless the
	 * caller is already inside a parallel region.
	 */
	template <typename T>
	static void Im2Col3DNHWC(
	    const int channels,
	    const int num_frames,
	    const int height,
	    const int width,
	    const int kernel_t,
	    const int kernel_h,
	    const int kernel_w,
	    const int dilation_t,
	    const int dilation_h,
	    const int dilation_w,
	    const int pad_p, // previous frame
	    const int pad_t, // top
	    const int pad_l, // left
	    const int pad_n, // next frame
	    const int pad_b, // bottom
	    const int pad_r, // right
	    const int stride_t,
	    const int stride_h,
	    const int stride_w,
	    const T* data_im,
	    T* data_col,
	    CPUContext* /*context*/,
	    const int groups,
	    const T& zero_point) {
	  // Extent covered by the dilated kernel along each axis.
	  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
	  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
	  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

	  int frame_col = (num_frames + pad_p + pad_n - dkernel_t) / stride_t + 1;
	  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
	  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

	#ifdef _OPENMP
	#pragma omp parallel for if (!omp_in_parallel())
	#endif
	  for (int t = 0; t < frame_col; ++t) {
	    int t_pad = -pad_p + t * stride_t;
	    for (const auto h : c10::irange(height_col)) {
	      int h_pad = -pad_t + h * stride_h;
	      // Output block for the first pixel of this (frame, row); advanced by
	      // one patch per output pixel at the bottom of the w loop.
	      T* data_col_temp = data_col +
	          (t * height_col + h) * width_col * kernel_t * kernel_h * kernel_w *
	              channels;
	      for (const auto w : c10::irange(width_col)) {
	        int w_pad = -pad_l + w * stride_w;
	        int q = 0;
	        for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
	          int r = 0;
	          for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
	            int s = 0;
	            for (int iw = w_pad; iw < w_pad + dkernel_w;
	                 iw += dilation_w, ++s) {
	              if (it >= 0 && it < num_frames && ih >= 0 && ih < height &&
	                  iw >= 0 && iw < width) {
	                for (const auto g : c10::irange(groups)) {
	                  memcpy(
	                      data_col_temp +
	                          (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
	                              (channels / groups),
	                      data_im + ((it * height + ih) * width + iw) * channels +
	                          g * (channels / groups),
	                      sizeof(T) * (channels / groups));
	                }
	              } else {
	                // Tap lies in the padding: fill with zero_point.
	                for (const auto g : c10::irange(groups)) {
	                  for (int i = 0; i < channels / groups; ++i) {
	                    data_col_temp
	                        [((((g * kernel_t + q) * kernel_h + r) * kernel_w) +
	                          s) *
	                             (channels / groups) +
	                         i] = zero_point;
	                  }
	                }
	              }
	            } // for each iw
	          } // for each ih
	        } // for each it
	        data_col_temp += kernel_t * kernel_h * kernel_w * channels;
	      } // for each output pixel
	    } // for each image row
	  } // for each frame
	}

	} // namespace math

	} // namespace caffe2