// Source blob: fd628a163abce1148e83d464f917bd1a2970c7a1
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Builds a layer-norm backward fusion, runs it once through a newly created
// FusionExecutorCache, and returns the kernel runtime reported for that run.
//
// Parameters:
//   fusion_ptr  - fusion to populate; ownership is moved into `fec`.
//   fec         - out: receives the newly created FusionExecutorCache.
//   aten_inputs - out: receives the six ATen inputs in the same order the
//                 fusion inputs are registered (grad_out, input, mean, rstd,
//                 weight, bias).
//   shape       - sizes of the input and grad_out tensors.
//   norm_shape  - sizes of the normalized dimensions (the last
//                 norm_shape.size() dimensions of `shape`); must not have
//                 more dimensions than `shape`.
//
// Returns the value of fec->getMostRecentKernelRuntime() after the run.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  // kM - kN is unsigned: without this check a norm_shape with more dimensions
  // than shape would wrap around to a huge outer-dimension count.
  TORCH_INTERNAL_ASSERT(
      kN <= kM,
      "norm_shape has ",
      kN,
      " dimensions but shape only has ",
      kM);
  const size_t kOuterNumDims = kM - kN;

  // Shape used for mean/rstd: the leading (non-normalized) sizes are kept and
  // every normalized dimension is collapsed to 1.
  std::vector<int64_t> outer_shape(kM, 1);
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape[idx] = shape[idx];
  }

  // Fusion inputs. The registration order below must match `aten_inputs`.
  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // Request all three gradients (input, weight, bias) via the output mask.
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});
  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  // Random float tensors on CUDA device 0 for the ATen side.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);
  auto at_weight = c10::optional<at::Tensor>(aten_weight);
  auto at_bias = c10::optional<at::Tensor>(aten_bias);

  // The ATen forward pass is run only to obtain the mean and rstd tensors
  // that the backward fusion consumes; its normalized output is not needed.
  const float kEps = 1e-5;
  auto aten_results =
      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};

  // Run once so the cache has a "most recent" kernel runtime to hand back;
  // the produced outputs themselves are not used.
  fec->runFusionWithInputs(aten_inputs);
  return fec->getMostRecentKernelRuntime();
}
// Shared driver for the layer-norm backward benchmarks: times repeated
// FusionExecutorCache::runFusionWithInputs calls on a fixed-shape fusion
// after disableKernelLaunch() has been called on the cache.
//
// Parameters:
//   benchmark_state                - Google Benchmark state driving the loop.
//   disable_launch_parameter_cache - when true, disableLaunchParamCache() is
//                                    called on the cache before timing starts.
void LayerNormBackward_ShapeInference_Base(
    benchmark::State& benchmark_state,
    bool disable_launch_parameter_cache) {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Filled in by getLayerBackwardNormRuntime below.
  std::unique_ptr<FusionExecutorCache> executor_cache;
  std::vector<at::IValue> inputs;

  std::vector<int64_t> input_shape = {20, 100, 35, 67};
  std::vector<int64_t> normalized_shape = {67};

  // Builds the fusion and performs one initial run.
  auto runtime = getLayerBackwardNormRuntime(
      std::move(fusion_ptr),
      executor_cache,
      inputs,
      input_shape,
      normalized_shape);

  // The runtime from the initial run must yield heuristics for these inputs.
  KernelArgumentHolder args =
      KernelArgumentHolder::createKernelArgumentHolder(inputs);
  TORCH_INTERNAL_ASSERT(runtime->getMaybeHeuristicsFor(args).has_value());

  // One more untimed run after enabling profiling and disabling kernel
  // launches.
  executor_cache->profile(true);
  executor_cache->disableKernelLaunch();
  executor_cache->runFusionWithInputs(inputs);

  if (disable_launch_parameter_cache) {
    executor_cache->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Timed region: this call is what the benchmark measures.
    executor_cache->runFusionWithInputs(inputs);
  }
}
// Backward layer-norm benchmark that runs the shared driver with the launch
// parameter cache disabled.
static void LayerNormBackward_ShapeInference(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}
// Backward layer-norm baseline that runs the shared driver without disabling
// the launch parameter cache.
static void LayerNormBackward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}
// Builds a layer-norm forward fusion (no weight, no bias, eps = 1e-5), runs
// it once through a newly created FusionExecutorCache, and returns the kernel
// runtime reported for that run.
//
// Parameters:
//   fusion_ptr  - fusion to populate; ownership is moved into `fec`.
//   fec         - out: receives the newly created FusionExecutorCache.
//   aten_inputs - out: receives the single random input tensor.
//   shape       - sizes of the input tensor.
//   norm_shape  - normalized shape handed to layer_norm.
//
// Returns the value of fec->getMostRecentKernelRuntime() after the run.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion* const fusion = fusion_ptr.get();

  // Epsilon scalar is created before the input tensor, as in the original
  // construction order.
  const float epsilon = 1e-5;
  Double* epsilon_val = IrBuilder::create<Double>(epsilon);

  auto in_tv = makeSymbolicTensor(shape.size());
  fusion->addInput(in_tv);

  // nullptr weight and bias: plain normalization without an affine step.
  auto norm = layer_norm(in_tv, norm_shape, nullptr, nullptr, epsilon_val);
  fusion->addOutput(norm.output);
  fusion->addOutput(norm.mean);
  fusion->addOutput(norm.invstd);

  // Single random float input on CUDA device 0.
  const auto tensor_opts =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor random_input = at::randn(shape, tensor_opts);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {random_input};

  // Initial run so the cache has a "most recent" kernel runtime to return.
  auto first_run_outputs = fec->runFusionWithInputs(aten_inputs);
  return fec->getMostRecentKernelRuntime();
}
// Shared driver for the layer-norm forward benchmarks: times repeated
// FusionExecutorCache::runFusionWithInputs calls on a fixed-shape fusion
// after disableKernelLaunch() has been called on the cache.
//
// Parameters:
//   benchmark_state            - Google Benchmark state driving the loop.
//   disable_launch_param_cache - when true, disableLaunchParamCache() is
//                                called on the cache before timing starts.
void LayerNormForward_ShapeInferenceBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Filled in by getLayerForwardNormRuntime below.
  std::unique_ptr<FusionExecutorCache> executor_cache;
  std::vector<at::IValue> inputs;

  std::vector<int64_t> input_shape = {20, 100, 35, 67};
  std::vector<int64_t> normalized_shape = {67};

  // Builds the fusion and performs one initial run.
  auto runtime = getLayerForwardNormRuntime(
      std::move(fusion_ptr),
      executor_cache,
      inputs,
      input_shape,
      normalized_shape);

  // The runtime from the initial run must yield heuristics for these inputs.
  KernelArgumentHolder args =
      KernelArgumentHolder::createKernelArgumentHolder(inputs);
  TORCH_INTERNAL_ASSERT(runtime->getMaybeHeuristicsFor(args).has_value());

  // One more untimed run after enabling profiling and disabling kernel
  // launches.
  executor_cache->profile(true);
  executor_cache->disableKernelLaunch();
  executor_cache->runFusionWithInputs(inputs);

  if (disable_launch_param_cache) {
    executor_cache->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Timed region: this call is what the benchmark measures.
    executor_cache->runFusionWithInputs(inputs);
  }
}
// Forward layer-norm baseline that runs the shared driver without disabling
// the launch parameter cache.
static void LayerNormForward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}
// Forward layer-norm benchmark that runs the shared driver with the launch
// parameter cache disabled.
static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}
// Benchmark registrations; every benchmark reports in microseconds.
// Variants that call disableLaunchParamCache() before the timed loop:
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
// Baselines that leave the launch parameter cache untouched:
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);