blob: ec6738303a3fc8a235ad5b7e0053c4d8c3d2f8b4 [file] [log] [blame]
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
#define TRANSPOSE_CONFIG {true, false, false, false}
using namespace torch::jit::fuser::cuda;
struct TransposeConfig {
bool input1_transpose_axes = false;
bool input2_transpose_axes = false;
bool intermediate_transpose_axes = false;
bool output_transpose_axes = false;
};
std::vector<at::Tensor> generateInputs(
DataType dtype,
int num_dims,
std::pair<int, int> axes,
int perm_size,
int innerdim_size,
bool input1_transpose_axes,
bool input2_transpose_axes,
bool non_vectorize_offset = false,
int iter_size = 32) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
std::vector<int64_t> transpose_shape(num_dims, iter_size);
transpose_shape[axes.second] = innerdim_size;
transpose_shape[axes.first] = perm_size;
std::vector<int64_t> non_transpose_shape(num_dims, iter_size);
non_transpose_shape[axes.first] = innerdim_size;
non_transpose_shape[axes.second] = perm_size;
// TensorType: Concrete, Contig, Symbolic
// Vectorization | Unroll - Add 1 to sizes
// Shift axis by 1 to disable vectorize loads
if (non_vectorize_offset) {
for (auto idx : c10::irange(transpose_shape.size())) {
transpose_shape[idx] += 1;
}
for (auto idx : c10::irange(non_transpose_shape.size())) {
non_transpose_shape[idx] += 1;
}
}
auto optionalTransposeSize =
[&transpose_shape, &non_transpose_shape](bool transpose_tensor) {
return (transpose_tensor) ? transpose_shape : non_transpose_shape;
};
at::Tensor aten_input1 =
at::randn(optionalTransposeSize(input1_transpose_axes), options);
at::Tensor aten_input2 =
at::randn(optionalTransposeSize(input2_transpose_axes), options);
return {aten_input1, aten_input2};
}
//------------------------------------------------------------------------------
static void setupTranspose(
Fusion* fusion,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
FusionGuard fg(fusion);
auto optionalTranspose = [axes](TensorView* tv, bool is_transpose) {
return (is_transpose) ? transpose(tv, axes.first, axes.second) : tv;
};
auto input1 = makeContigTensor(num_dims, dtype);
auto input2 = makeContigTensor(num_dims, dtype);
fusion->addInput(input1);
fusion->addInput(input2);
auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes);
auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes);
auto intermediate = add(ot_input1, ot_input2);
auto ot_intermediate =
optionalTranspose(intermediate, tc.intermediate_transpose_axes);
auto output = relu(ot_intermediate);
auto ot_output = optionalTranspose(output, tc.output_transpose_axes);
fusion->addOutput(ot_output);
}
static void NvFuserScheduler_Transpose(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
std::vector<c10::IValue> fuser_inputs = {at_input1, at_input2};
runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_SQUARE_RUN( \
TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Args({9, 2408}) \
->Args({16, 512}) \
->Args({18, 96}) \
->Args({24, 96}) \
->Args({24, 256}) \
->Args({24, 512}) \
->Args({32, 27}) \
->Args({32, 96}) \
->Args({32, 288}) \
->Args({32, 864}) \
->Args({40, 120}) \
->Args({48, 128}) \
->Args({48, 256}) \
->Args({49, 512}) \
->Args({49, 1024}) \
->Args({49, 2048}) \
->Args({49, 4608}) \
->Args({64, 64}) \
->Args({64, 96}) \
->Args({64, 128}) \
->Args({64, 147}) \
->Args({64, 192}) \
->Args({64, 256}) \
->Args({64, 288}) \
->Args({64, 512}) \
->Args({80, 64}) \
->Args({81, 1728}) \
->Args({83, 1728}) \
->Args({96, 864}) \
->Args({100, 1280}) \
->Args({100, 4032}) \
->Args({120, 40}) \
->Args({128, 128}) \
->Args({128, 512}) \
->Args({128, 1152}) \
->Args({192, 128}) \
->Args({192, 256}) \
->Args({192, 720}) \
->Args({192, 768}) \
->Args({192, 1120}) \
->Args({192, 1728}) \
->Args({196, 256}) \
->Args({196, 512}) \
->Args({196, 1024}) \
->Args({196, 2304}) \
->Args({256, 256}) \
->Args({256, 1024}) \
->Args({256, 2304}) \
->Args({284, 512}) \
->Args({320, 1280}) \
->Args({320, 1728}) \
->Args({324, 2592}) \
->Args({361, 768}) \
->Args({361, 1120}) \
->Args({384, 2}) \
->Args({384, 32}) \
->Args({384, 128}) \
->Args({384, 256}) \
->Args({384, 512}) \
->Args({384, 1280}) \
->Args({384, 2592}) \
->Args({384, 4032}) \
->Args({448, 1280}) \
->Args({480, 16}) \
->Args({480, 256}) \
->Args({512, 2}) \
->Args({512, 16}) \
->Args({512, 128}) \
->Args({512, 256}) \
->Args({512, 1024}) \
->Args({512, 2048}) \
->Args({512, 3072}) \
->Args({512, 4608}) \
->Args({784, 40}) \
->Args({784, 120}) \
->Args({784, 128}) \
->Args({784, 1152}) \
->Args({1001, 2408}) \
->Args({1024, 16}) \
->Args({1024, 256}) \
->Args({1024, 512}) \
->Args({1024, 1024}) \
->Args({1024, 3072}) \
->Args({1369, 192}) \
->Args({1369, 256}) \
->Args({1369, 288}) \
->Args({2048, 512}) \
->Args({2048, 1024}) \
->Args({2250, 27}) \
->Args({3072, 512}) \
->Args({3072, 1024}) \
->Args({3136, 64}) \
->Args({5329, 720}) \
->Args({5625, 64}) \
->Args({12544, 147}) \
->Args({22201, 288}) \
->Unit(benchmark::kMicrosecond)
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Ranges({{2, 256 * 256}, {160, 320}}) \
->Unit(benchmark::kMicrosecond) \
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
static void Baseline_Transpose(
benchmark::State& benchmark_state,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
auto optionalTransposeAten = [&axes](at::Tensor x, bool is_transpose) {
return (is_transpose) ? at::transpose(x, axes.first, axes.second) : x;
};
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto at_ot_input1 =
optionalTransposeAten(at_input1, tc.input1_transpose_axes);
auto at_ot_input2 =
optionalTransposeAten(at_input2, tc.input2_transpose_axes);
auto at_intermediate = add(at_ot_input1, at_ot_input2);
auto at_ot_intermediate =
optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes);
auto at_output = relu(at_ot_intermediate);
auto at_ot_output =
optionalTransposeAten(at_output, tc.output_transpose_axes);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
cudaDeviceSynchronize();
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(at_input1.numel() * 3 * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
static void Baseline_Transpose_fp32_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Float,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
static void Baseline_Transpose_fp16_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Half,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------