blob: 58fc057bd85fbb5c4727ed0429c2a7a7cea87c50 [file] [log] [blame]
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// TODO: add LSTM function to composite operations
// Function Signature: cy, hy = lstm(x, cx)
static void setupFusion(Fusion* fusion) {
FusionGuard fg(fusion);
TensorView* tvs[16];
for (size_t i = 0; i < 16; i++) {
tvs[i] = makeContigTensor(2, DataType::Float);
fusion->addInput(tvs[i]);
}
const auto cx = makeContigTensor(2, DataType::Float);
fusion->addInput(cx);
const auto in_x = add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]);
const auto forget_x = add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]);
const auto cell_x = add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]);
const auto out_x = add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]);
auto lstm_result = lstm(cx, in_x, forget_x, cell_x, out_x);
fusion->addOutput(lstm_result.cell);
fusion->addOutput(lstm_result.hidden);
}
static std::vector<c10::IValue> setupInputs(
int hidden_features,
int batch_size) {
at::manual_seed(0);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
const at::Tensor large_tensor0 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor1 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor2 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor3 =
at::randn({batch_size, hidden_features * 4}, options);
const auto chunked0 = large_tensor0.chunk(4, 1);
const auto chunked1 = large_tensor1.chunk(4, 1);
const auto chunked2 = large_tensor2.chunk(4, 1);
const auto chunked3 = large_tensor3.chunk(4, 1);
std::vector<c10::IValue> inputs;
inputs.insert(inputs.end(), chunked0.begin(), chunked0.end());
inputs.insert(inputs.end(), chunked1.begin(), chunked1.end());
inputs.insert(inputs.end(), chunked2.begin(), chunked2.end());
inputs.insert(inputs.end(), chunked3.begin(), chunked3.end());
const auto at_cx = at::randn({batch_size, hidden_features}, options);
inputs.push_back(at_cx);
return inputs;
}
//------------------------------------------------------------------------------
static void LstmCell_SetupFusion(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
Fusion fusion;
setupFusion(&fusion);
}
}
BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Fusion fusion;
setupFusion(&fusion);
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
benchmark_state.ResumeTiming();
// Auto-schedule
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
}
}
BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_Lower(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
GpuLower gpu_lower(&fusion);
}
}
BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_Compile(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion);
}
}
BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_GpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setMeasureKernelTimeFlag(true);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
clearL2Cache();
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
benchmark_state.SetIterationTime(executor.kernelTimeMs() / 1000.0);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_CpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);