Update Cutlass to v2.11 (#94188)
Now that we are on CUDA 11+ exclusively, we can update NVIDIA's Cutlass to v2.11. We also had to remove the CUDA build flag "-D__CUDA_NO_HALF_CONVERSIONS__", since Cutlass no longer builds with it defined.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94188
Approved by: https://github.com/ezyang, https://github.com/jansel
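
For context (not part of the patch), here is a minimal sketch of what the removed flag controls. When `__CUDA_NO_HALF_CONVERSIONS__` is defined, cuda_fp16.h omits `__half`'s implicit conversions to and from float/double/integer types; with the flag gone those conversions become visible again, which is presumably what Cutlass depends on, and is also why a few mixed `Half`/`__half` expressions in our own kernels now need the explicit `static_cast<__half>(...)` / `Half(...)` disambiguations in the hunks below. The kernel name and values here are purely illustrative.

```cuda
// Illustrative sketch (not part of this patch) of what the flag controls.
#include <cuda_fp16.h>

__global__ void half_conversion_demo() {
  // Explicit conversion intrinsics work regardless of the macro.
  __half a = __float2half(3.0f);
  float  f = __half2float(a);
  (void)f;

#ifndef __CUDA_NO_HALF_CONVERSIONS__
  // Only compiles when the macro is NOT defined (the state after this PR):
  // cuda_fp16.h then provides __half's implicit converting constructor from
  // float and its operator float(), so plain assignments compile.
  __half b = 2.0f;
  float  g = b;
  (void)g;
#endif
}
```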
diff --git a/BUILD.bazel b/BUILD.bazel
index 843b27a..88ba8d6 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -414,7 +414,6 @@
torch_cuda_half_options = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
- "-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
index e1b9f38..ec7292f 100644
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -49,14 +49,14 @@
if (low_byte && index < (numel - 1)) {
__half2 value2;
- value2.x = value;
+ value2.x = static_cast<__half>(value);
value2.y = __int2half_rz(0);
atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
} else if (!low_byte && index > 0) {
__half2 value2;
value2.x = __int2half_rz(0);
- value2.y = value;
+ value2.y = static_cast<__half>(value);
atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
} else {
diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu
index aa1644c..d6d7e8a 100644
--- a/aten/src/ATen/test/cuda_half_test.cu
+++ b/aten/src/ATen/test/cuda_half_test.cu
@@ -21,7 +21,7 @@
__half a = __float2half(3.0f);
__half b = __float2half(2.0f);
- __half c = a - Half(b);
+ __half c = Half(a) - Half(b);
assert(static_cast<Half>(c) == Half(1.0));
// asserting if the functions used on
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 0012d26..8c46203 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1653,7 +1653,6 @@
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"
- " -D__CUDA_NO_HALF_CONVERSIONS__"
" -D__CUDA_NO_HALF2_OPERATORS__"
" -D__CUDA_NO_BFLOAT16_CONVERSIONS__")
diff --git a/third_party/cutlass b/third_party/cutlass
index b72cbf9..66d9cdd 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit b72cbf957df8cf84a6d0ff91c190ad51a9c1d24a
+Subproject commit 66d9cddc832c1cdc2b30a8755274f7f74640cfe6
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 11b233f..54e7fa9 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -225,7 +225,6 @@
COMMON_NVCC_FLAGS = [
'-D__CUDA_NO_HALF_OPERATORS__',
- '-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
'--expt-relaxed-constexpr'