[MPS] Fix overflow in cumsum when dtype is bool (#125318)

`cumsum` and `cumprod` were (are?) buggy on MPS: https://github.com/pytorch/pytorch/blob/c8d2a55273757c90989fde7c6f05e957aba9a238/aten/src/ATen/native/mps/operations/UnaryOps.mm#L435-L436

An existing workaround casts the input to int32 before performing the op, preventing overflow for the affected narrow integral types (int8 and int16).

It turns out this issue also affects boolean types:

```python
import torch
print(torch.ones(128, dtype=torch.bool, device="mps").cumsum(0)[-1])
# tensor(-128, device='mps:0')
```
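
For comparison, the same call on CPU yields the expected value, since `cumsum` accumulates bool inputs in int64 there:

```python
import torch
print(torch.ones(128, dtype=torch.bool).cumsum(0)[-1])
# tensor(128)
```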

In this PR I'm adding logic to also cast bool inputs to int32 before `cumsum` and `cumprod`, even though the output of the latter is guaranteed not to overflow with bools (a running product of bools never exceeds 1). I'm also adding a test to prevent regressions.
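
In Python terms, the workaround amounts to something like the sketch below; `cumsum_mps_safe` is a hypothetical helper name used only for illustration, and the actual change is the one-flag flip in the Objective-C++ dispatch shown in the diff:

```python
import torch

def cumsum_mps_safe(t: torch.Tensor, dim: int) -> torch.Tensor:
    """Hypothetical sketch of the cast-to-int32 workaround; the real
    logic lives in aten/src/ATen/native/mps/operations/UnaryOps.mm."""
    if t.dtype in (torch.bool, torch.int8, torch.int16):
        # Accumulate in int32 so partial sums cannot wrap around in the
        # narrow input type; bool results are promoted to int64 to match CPU.
        out_dtype = torch.int64 if t.dtype == torch.bool else t.dtype
        return t.to(torch.int32).cumsum(dim).to(out_dtype)
    return t.cumsum(dim)
```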

Fixes #96614, #106112, #109166

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125318
Approved by: https://github.com/malfet
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index c76a60e..7709c79 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -434,7 +434,7 @@
 
   // issue #103810551: cumsum / cumprod are broken for int8, int16 and as chances for overflow are pretty high, cast to
   // int32 fixed in macOS 13.3
-  bool castInputData = (isIntegralType(input.scalar_type(), false) && input.scalar_type() != ScalarType::Int &&
+  bool castInputData = (isIntegralType(input.scalar_type(), true) && input.scalar_type() != ScalarType::Int &&
                         input.scalar_type() != ScalarType::Long);
 
   TORCH_CHECK(macOS13_3_plus || input.scalar_type() != ScalarType::Long,
diff --git a/test/test_mps.py b/test/test_mps.py
index 9582831..38fea5b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -156,11 +156,6 @@
         # On the backward pass for `sort` both are used (values and indices), thus resulting in a mismatch between CPU and MPS.
         # Running `msort` with stable `sort` passes.
         'msort': [torch.float16],
-
-        # See https://github.com/pytorch/pytorch/issues/106112 for more information
-        'cumprod': [torch.float32, torch.float16],
-        # See https://github.com/pytorch/pytorch/issues/109166 for more information
-        'masked.cumprod': [torch.float16],
     }
 
     SKIPLIST_GRAD = {
@@ -4273,6 +4268,13 @@
             self.assertEqual(e_string, "MPS does not support cumsum_out_mps op with int64 input." +
                              " Support has been added in macOS 13.3")
 
+    def test_cumsum_bool(self):
+        a = torch.ones(2**16, dtype=torch.bool)
+        t_cpu = a.cumsum(0)
+        t_mps = a.to("mps").cumsum(0)
+
+        self.assertEqual(t_cpu, t_mps)
+
     def test_cumsum_minus_one_axis(self):
         def helper(dtype):
             # Test with axis -1