[MPS] Adding xfaillist with all categories of failures. (#96176)
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96176
Approved by: https://github.com/malfet
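For context, below is a minimal sketch (not part of this patch) of the pattern the new `mps_ops_modifier` / `mps_ops_grad_modifier` helpers in the diff follow: each failure category is a dict keyed by `op.name + op.variant_test_name` mapping to the dtypes expected to fail (`None` meaning all dtypes), and matching ops get an `unittest.expectedFailure` decorator attached before being yielded to the `@ops(...)` decorator. The `Dummy*` names and `DUMMY_XFAILLIST` are hypothetical stand-ins for the real OpInfo/DecorateInfo machinery and the category dictionaries added here.

```python
import unittest
from dataclasses import dataclass, field

import torch


@dataclass
class DummyDecorateInfo:
    # Stand-in for torch.testing's DecorateInfo
    decorator: object
    dtypes: list = None  # None means "expected to fail for every dtype"


@dataclass
class DummyOpInfo:
    # Stand-in for an OpInfo entry from op_db
    name: str
    variant_test_name: str = ''
    decorators: list = field(default_factory=list)


DUMMY_XFAILLIST = {
    # key is op.name + op.variant_test_name -> dtypes expected to fail
    'pow': [torch.float16],
    'cumprod': None,
}


def dummy_ops_modifier(ops):
    # Same shape as mps_ops_modifier / mps_ops_grad_modifier: look each op up in
    # a category dict, attach an expectedFailure decorator for the listed dtypes,
    # then yield the (possibly decorated) op.
    for op in ops:
        key = op.name + op.variant_test_name
        if key in DUMMY_XFAILLIST:
            op.decorators.append(
                DummyDecorateInfo(unittest.expectedFailure,
                                  dtypes=DUMMY_XFAILLIST[key]))
        yield op


if __name__ == '__main__':
    ops = [DummyOpInfo('pow'), DummyOpInfo('add')]
    for op in dummy_ops_modifier(ops):
        print(op.name, [(d.decorator, d.dtypes) for d in op.decorators])
```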
diff --git a/test/test_mps.py b/test/test_mps.py
index 6abfb45..b1138dc 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10,7 +10,6 @@
import subprocess
import tempfile
import os
-import pprint
import copy
import gc
import torch
@@ -23,7 +22,7 @@
from torch.testing._internal import opinfo
from torch.testing._internal.common_utils import \
(gradcheck, gradgradcheck, run_tests, TestCase, download_file, IS_CI, NoTest,
- TEST_WITH_UBSAN, dtype_abbrs, skipIfSlowGradcheckEnv, TEST_WITH_ASAN, suppress_warnings)
+ TEST_WITH_UBSAN, skipIfSlowGradcheckEnv, TEST_WITH_ASAN, suppress_warnings)
from torch.testing import make_tensor
from torch.testing._comparison import TensorLikePair
from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
@@ -58,97 +57,118 @@
)
)
-def mps_ops_modifier(ops):
- # Those ops worked on MacOS12, but broken on MacOS13, see https://github.com/pytorch/pytorch/issues/85758
- MACOS_13_X_XFAILLIST = {
- 'masked.softmax': [torch.float32],
+def mps_ops_grad_modifier(ops):
+ XFAILLIST_GRAD = {
+ # Top 60
+ # CPU: empty returns all zeros, which mismatches the MPS
+ # allocation (MacOS 13). According to
+ # https://pytorch.org/docs/2.0/generated/torch.empty.html
+ # PyTorch's `empty` returns a tensor filled with uninitialized data.
+ 'empty': [torch.float16, torch.float32],
+
+ # CPU Error: RuntimeError: "addmv_impl_cpu" not implemented for 'Half'
+ 'addr': [torch.float16],
+
+ # Unimplemented ops
+ '__getitem__': [torch.float16],
+ 'prod': [torch.float32], # The operator 'aten::cumprod.out' is not implemented
+ 'sgn': [torch.float16, torch.float32],
+ '_segment_reduce': [torch.float16, torch.float32],
+ 'unfold_copy': [torch.float16, torch.float32], # unfold_backward is not implemented
+ 'unfold': [torch.float16, torch.float32],
+ 'trace': [torch.float32], # missing in-place aten::index_fill_.int_Tensor
+ 'sparse.mmreduce': [torch.float32], # csr not supported
+ 'unique_consecutive': [torch.float16, torch.float32],
+ 'special_modified_bessel_i0': [torch.float16, torch.float32],
+ 'scalar_tensor': [torch.float16, torch.float32],
+ 'cdist': [torch.float32],
+ 'masked.scatter': [torch.float16, torch.float32],
+
+ # Correctness issues
+ 'atanh': [torch.float32],
+
+ # Random output
+ 'exponential': [torch.float16, torch.float32],
+
+ # CPU errors
+ # derivative for aten::floor_divide is not implemented on CPU
+ 'floor_divide': [torch.float16, torch.float32],
+ # derivative for aten::narrow_copy is not implemented on CPU
+ 'narrow_copy': [torch.float16, torch.float32],
+ # RuntimeError: "log_vml_cpu" not implemented for 'Half'
+ '__rpow__': [torch.float16],
+ 'pow': [torch.float16],
+ # 'bool' object is not iterable
+ 'allclose': [torch.float16, torch.float32],
+ 'equal': [torch.float16, torch.float32],
+ # "mse_backward_cpu_out" not implemented for 'Half'
+ 'nn.functional.mse_loss': [torch.float16],
+ # "smooth_l1_backward_cpu_out" not implemented for 'Half'
+ 'nn.functional.smooth_l1_loss': [torch.float16],
+ # cpu error: grad requires non-empty inputs
+ 'randn': [torch.float16, torch.float32],
+ 'signal.windows.bartlett': [torch.float32],
+ 'signal.windows.blackman': [torch.float32],
+ 'signal.windows.cosine': [torch.float32],
+ 'signal.windows.exponential': [torch.float32],
+ 'signal.windows.gaussian': [torch.float32],
+ 'signal.windows.general_cosine': [torch.float32],
+ 'signal.windows.general_hamming': [torch.float32],
+ 'signal.windows.hamming': [torch.float32],
+ 'signal.windows.hann': [torch.float32],
+ 'signal.windows.kaiser': [torch.float32],
+ 'signal.windows.nuttall': [torch.float32],
+ 'empty_permuted': [torch.float16, torch.float32],
+ 'eye': [torch.float16, torch.float32],
+
+ # trunc_tensor not working properly for float16
+ 'divtrunc_rounding': [torch.float16],
+ 'fmod': [torch.float16],
+ }
+
+ MACOS_12_3_XFAILLIST_GRAD = {
+ # Unsupported Border padding mode; the forward pass succeeds because it falls back to CPU
+ 'grid_sampler_2d': [torch.float32],
+ # Unimplemented
+ 'logaddexp2': [torch.float32],
+
+ # The result of pow(9, 8) is 43046716, whereas it should be 43046721.
+ # Fixed in macOS 13; no error is raised.
+ '__rpow__': [torch.float32],
+ 'pow': [torch.float32],
+ }
+
+ MACOS_BEFORE_13_3_XFAILLIST_GRAD = {
+ # Failures due to precision issues (due to fast-math). These have been fixed in MacOS 13.3+
'masked.softmin': [torch.float32],
+ 'masked.softmax': [torch.float32],
'masked.log_softmax': [torch.float32],
- }
- MACOS_12_X_XFAILLIST = {
- '__radd__': [torch.uint8],
- '__rdiv__': [torch.uint8],
- '__rmul__': [torch.uint8],
- '__rpow__': [torch.uint8],
- 'abs': [torch.uint8],
- 'acos': [torch.uint8],
- 'acosh': [torch.uint8],
- 'add': [torch.uint8],
- 'asin': [torch.uint8],
- 'asinh': [torch.uint8],
- 'atan': [torch.uint8],
- 'atanh': [torch.uint8],
- 'cos': [torch.uint8],
- 'cosh': [torch.uint8],
- 'deg2rad': [torch.uint8],
- 'diff': [torch.uint8],
- 'equal': [torch.uint8],
- 'erf': [torch.uint8],
- 'exp2': [torch.uint8],
- 'exp': [torch.uint8],
- 'fmax': [torch.uint8],
- 'fmin': [torch.uint8],
- 'fmod': [torch.uint8],
- 'isclose': [torch.uint8],
- 'isnan': [torch.uint8],
- 'kron': [torch.uint8],
- 'log10': [torch.uint8],
- 'log1p': [torch.uint8],
- 'log2': [torch.uint8],
- 'log': [torch.uint8],
- 'logical_and': [torch.uint8],
- 'logical_or': [torch.uint8],
- 'logical_xor': [torch.uint8],
- 'logit': [torch.uint8],
- 'masked.mean': [torch.uint8],
- 'masked.std': [torch.uint8],
- 'masked.var': [torch.uint8],
- 'nn.functional.avg_pool1d': [torch.int64],
- 'nn.functional.avg_pool2d': [torch.int64],
- 'nn.functional.cosine_embedding_loss': [torch.uint8],
- 'nn.functional.poisson_nll_loss': [torch.uint8],
- 'nn.functional.softsign': [torch.uint8],
- 'nn.functional.tanhshrink': [torch.uint8],
- 'pow': [torch.int16, torch.int64, torch.uint8],
- 'rad2deg': [torch.uint8],
- 'reciprocal': [torch.uint8],
- 'remainder': [torch.uint8],
- 'rsqrt': [torch.uint8],
- 'sigmoid': [torch.uint8],
- 'sign': [torch.uint8],
- 'sin': [torch.uint8],
- 'sinh': [torch.uint8],
- 'special.ndtr': [torch.uint8],
- 'sqrt': [torch.uint8],
- 'sub': [torch.uint8],
- 'tan': [torch.uint8],
- 'tanh': [torch.uint8],
- 'true_divide': [torch.uint8],
- 'xlogy': [torch.uint8],
- # Weird
- 'square': [torch.uint8, torch.bool, torch.int16, torch.int32, torch.int64],
+
+ # Unsupported Border padding mode; the forward pass succeeds because it falls back to CPU
+ 'grid_sampler_2d': [torch.float32],
+
+ # Same issue as `argsort` and `sort` with duplicate elements (undefined behaviour).
+ # The forward pass passes since `msort` returns only the values (not the indices), and those match the CPU.
+ # On the backward pass for `sort`, both values and indices are used, resulting in a mismatch between CPU and MPS.
+ # Running `msort` with stable `sort` passes.
+ 'msort': [torch.float16],
+
+ # The result of pow(9, 8) is 43046716, whereas it should be 43046721.
+ # Fixed in macOS 13; no error is raised.
+ 'pow': [torch.float32],
+ '__rpow__': [torch.float32],
}
+ XPASSLIST_GRAD = {
+ 'nn.functional.pairwise_distance': [torch.float16],
+ }
- # Those ops are not expected to work
- XFAILLIST = {
- '__rpow__': [torch.int16, torch.int32, torch.int64],
- 'chalf': None,
- # Unsupported dtypes
- 'dot': [torch.int64],
- 'index_add': [torch.int64],
- 'nn.functional.conv1d': [torch.int64],
- 'nn.functional.conv2d': [torch.int64],
- 'nn.functional.conv_transpose1d': [torch.int64],
- 'nn.functional.conv_transpose2d': [torch.int64],
- # 'remainder': [torch.int64],
- 'sigmoid': [torch.int64],
- # failures due to lack of op implementation on MPS backend
- 'put': None,
- # Weird
- 'byte': [torch.float16, torch.float32],
- 'nn.functional.adaptive_avg_pool1d': [torch.float32],
- 'nn.functional.adaptive_avg_pool2d': [torch.float32],
+ MACOS_13_3_XFAILLIST_GRAD = {
+ # Same issue as `argsort` and `sort` with duplicate elements (undefined behaviour).
+ # The forward pass passes since `msort` returns only the values (not the indices), and those match the CPU.
+ # On the backward pass for `sort`, both values and indices are used, resulting in a mismatch between CPU and MPS.
+ # Running `msort` with stable `sort` passes.
+ 'msort': [torch.float16],
}
def addDecorator(op, d) -> None:
@@ -157,19 +177,571 @@
for op in ops:
key = op.name + op.variant_test_name
- if key in XFAILLIST:
+ if key in XFAILLIST_GRAD:
addDecorator(op, DecorateInfo(
unittest.expectedFailure,
- dtypes=XFAILLIST[key]))
+ dtypes=XFAILLIST_GRAD[key]))
- if key in MACOS_13_X_XFAILLIST and torch.backends.mps.is_macos13_or_newer():
+ if key in XPASSLIST_GRAD:
+ addDecorator(op, DecorateInfo(
+ unittest.skip,
+ dtypes=XPASSLIST_GRAD[key]))
+
+ if key in MACOS_12_3_XFAILLIST_GRAD and (not torch.backends.mps.is_macos13_or_newer()):
addDecorator(op, DecorateInfo(
unittest.expectedFailure,
- dtypes=MACOS_13_X_XFAILLIST[key]))
- if key in MACOS_12_X_XFAILLIST and not torch.backends.mps.is_macos13_or_newer():
+ dtypes=MACOS_12_3_XFAILLIST_GRAD[key]))
+
+ if key in MACOS_BEFORE_13_3_XFAILLIST_GRAD and (torch.backends.mps.is_macos13_or_newer() and product_version < 13.3):
addDecorator(op, DecorateInfo(
unittest.expectedFailure,
- dtypes=MACOS_12_X_XFAILLIST[key]))
+ dtypes=MACOS_BEFORE_13_3_XFAILLIST_GRAD[key]))
+
+ if key in MACOS_13_3_XFAILLIST_GRAD and (product_version >= 13.3):
+ addDecorator(op, DecorateInfo(
+ unittest.expectedFailure,
+ dtypes=MACOS_13_3_XFAILLIST_GRAD[key]))
+ yield op
+
+def mps_ops_modifier(ops):
+ # These ops worked on MacOS 12 but are broken on MacOS 13, see https://github.com/pytorch/pytorch/issues/85758
+ MACOS_12_3_XFAILLIST = {
+ # Top 60
+ # expected failures
+ # The result of pow(9, 8) is 43046716, whereas it should be 43046721.
+ # Fixed in macOS 13.3. Currently no error is raised.
+ 'pow': [torch.int16, torch.int64, torch.uint8, torch.int8],
+ # expected failures
+ '__rpow__': [torch.uint8, torch.int8],
+
+ # Failures due to precision issues (due to fast-math). These have been fixed in MacOS 13.3+
+ 'cdist': [torch.float32],
+ 'tan': [torch.uint8, torch.float32],
+
+ # Data type support starts from macOS 13
+ 'nn.functional.avg_pool1d': [torch.int64],
+ 'nn.functional.avg_pool2d': [torch.int64],
+ 'nn.functional.local_response_norm': [torch.int64],
+ '__radd__': [torch.uint8],
+ '__rdiv__': [torch.uint8],
+ '__rmul__': [torch.uint8],
+ 'abs': [torch.uint8],
+ 'acos': [torch.uint8],
+ 'acosh': [torch.uint8],
+ 'add': [torch.uint8],
+ 'asin': [torch.uint8],
+ 'asinh': [torch.uint8],
+ 'atan': [torch.uint8],
+ 'atanh': [torch.uint8],
+ 'ceil': [torch.uint8],
+ 'corrcoef': [torch.uint8],
+ 'cos': [torch.uint8],
+ 'cosh': [torch.uint8],
+ 'cov': [torch.uint8],
+ 'cumulative_trapezoid': [torch.uint8],
+ 'deg2rad': [torch.uint8],
+ 'diff': [torch.uint8],
+ 'eq': [torch.uint8],
+ 'equal': [torch.uint8],
+ 'erf': [torch.uint8],
+ 'exp2': [torch.uint8],
+ 'exp': [torch.uint8],
+ 'expm1': [torch.uint8],
+ 'floor': [torch.uint8],
+ 'fmax': [torch.uint8],
+ 'fmin': [torch.uint8],
+ 'fmod': [torch.uint8],
+ 'ge': [torch.uint8],
+ 'gt': [torch.uint8],
+ 'isclose': [torch.uint8],
+ 'isnan': [torch.uint8],
+ 'kron': [torch.uint8],
+ 'le': [torch.uint8],
+ 'log10': [torch.uint8],
+ 'log1p': [torch.uint8],
+ 'log2': [torch.uint8],
+ 'log': [torch.uint8],
+ 'logical_and': [torch.uint8],
+ 'logical_or': [torch.uint8],
+ 'logical_xor': [torch.uint8],
+ 'logit': [torch.uint8],
+ 'lt': [torch.uint8],
+ 'masked.mean': [torch.uint8],
+ 'masked.std': [torch.uint8],
+ 'masked.var': [torch.uint8],
+ 'maximum': [torch.uint8],
+ 'minimum': [torch.uint8],
+ 'mul': [torch.uint8],
+ 'ne': [torch.uint8],
+ 'neg': [torch.uint8],
+ 'nn.functional.cosine_embedding_loss': [torch.uint8],
+ 'nn.functional.margin_ranking_loss': [torch.uint8],
+ 'nn.functional.poisson_nll_loss': [torch.uint8],
+ 'nn.functional.softsign': [torch.uint8],
+ 'nn.functional.tanhshrink': [torch.uint8],
+ 'nn.functional.triplet_margin_loss': [torch.uint8],
+ 'nn.functional.triplet_margin_with_distance_loss': [torch.uint8],
+ 'nn.functional.pairwise_distance': [torch.uint8, torch.float16],
+ 'outer': [torch.uint8],
+ 'rad2deg': [torch.uint8],
+ 'reciprocal': [torch.uint8],
+ 'remainder': [torch.uint8],
+ 'round': [torch.uint8],
+ 'rsqrt': [torch.uint8],
+ 'sigmoid': [torch.uint8],
+ 'sign': [torch.uint8],
+ 'signbit': [torch.uint8],
+ 'sin': [torch.uint8],
+ 'sinh': [torch.uint8],
+ 'special.ndtr': [torch.uint8],
+ 'sqrt': [torch.uint8],
+ 'sub': [torch.uint8],
+ 'tanh': [torch.uint8],
+ 'trapezoid': [torch.uint8],
+ 'trapz': [torch.uint8],
+ 'true_divide': [torch.uint8],
+ 'trunc': [torch.uint8],
+ 'xlogy': [torch.uint8],
+ 'minbinary': [torch.uint8],
+ 'maxbinary': [torch.uint8],
+ 'divtrunc_rounding': [torch.uint8],
+ 'divfloor_rounding': [torch.uint8],
+ 'divno_rounding_mode': [torch.uint8],
+ 'floor_divide': [torch.uint8],
+ 'ldexp': [torch.uint8],
+ # square internally calls into power and type casts to int64, which is supported starting from macOS 13
+ 'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+
+ # CPU does not return nan for x/0.0
+ 'atan2': [torch.bool, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ # These fill tensors with uninitialized data, causing a mismatch with the CPU
+ 'empty_permuted': [torch.bool, torch.float16, torch.float32, torch.int16,
+ torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'empty': [torch.bool, torch.float16, torch.float32, torch.int16,
+ torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'dist': [torch.float16], # cpu result off, showing inf values
+ }
+
+ MACOS_BEFORE_13_3_XFAILLIST = {
+ # Failures due to precision issues (due to fast-math). These have been fixed in MacOS 13.3+
+ 'tan': [torch.float32],
+ 'cdist': [torch.float32],
+
+ # CPU Error: CPU does not return nan for x/0.0
+ 'atan2': [torch.bool, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+
+ # The tests below pass on macOS 12 as they fall back to CPU
+ # Argsort case using duplicate indices (undefined behaviour):
+ #  - CPU output: tensor([2546, 6917, 3181, ..., 7128, 5133,   30], device='cpu')
+ #  - MPS output: tensor([2546, 6917, 3181, ..., 7128,   30, 5133], device='mps:0')
+ # The elements at indices 30 and 5133 are equal.
+ # Since the CPU does not use argsort with stable=True, these cases result in undefined behaviour.
+ 'argsort': [torch.float16, torch.int8, torch.uint8, torch.bool],
+ # Same issue as `argsort` with duplicate indices. This test checks both the sorted values and the indices.
+ # The values of the sorted tensor match the CPU, but the returned indices are undefined behaviour.
+ 'sort': [torch.int8, torch.uint8, torch.bool, torch.float16],
+ # Unsupported dtypes
+ 'cumsum': [torch.int64],
+ 'cumulative_trapezoid': [torch.int64],
+ 'masked.cumsum': [torch.int64],
+ }
+
+ MACOS_13_3_XFAILLIST = {
+ # Before macOS 13.3 this falls back to CPU, and the forward pass passes
+ 'grid_sampler_2d': [torch.float32], # Unsupported Border padding mode
+
+ # Failure due to a precision issue for fp16;
+ # on both CPU and MPS some test cases may produce an inf result
+ # 'nn.functional.pairwise_distance': [torch.float16],
+
+ # The tests below pass on macOS 12 as they fall back to CPU
+ # Argsort case using duplicate indices (undefined behaviour):
+ #  - CPU output: tensor([2546, 6917, 3181, ..., 7128, 5133,   30], device='cpu')
+ #  - MPS output: tensor([2546, 6917, 3181, ..., 7128,   30, 5133], device='mps:0')
+ # The elements at indices 30 and 5133 are equal.
+ # Since the CPU does not use argsort with stable=True, these cases result in undefined behaviour.
+ 'argsort': [torch.float16, torch.int8, torch.uint8, torch.bool],
+ # Same issue as `argsort` with duplicate indices. This test checks both the sorted values and the indices.
+ # The values of the sorted tensor match the CPU, but the returned indices are undefined behaviour.
+ 'sort': [torch.int8, torch.uint8, torch.bool, torch.float16],
+ }
+
+ # Those ops are not expected to work
+ UNIMPLEMENTED_XFAILLIST = {
+ # Failures due to lack of op implementation on MPS backend
+ 'login': None,
+ 'log_sigmoid': None,
+ 'log_sigmoid_forward': None,
+ 'linalg.eig': None,
+ 'linalg.eigvals': None,
+ 'fft.fft': None,
+ 'fft.fft2': None,
+ 'fft.fftn': None,
+ 'fft.hfft': None,
+ 'fft.hfft2': None,
+ 'fft.hfftn': None,
+ 'fft.ifft': None,
+ 'fft.ifft2': None,
+ 'fft.ifftn': None,
+ 'fft.ihfft': None,
+ 'fft.ihfft2': None,
+ 'fft.ihfftn': None,
+ 'fft.irfft': None,
+ 'fft.irfft2': None,
+ 'fft.irfftn': None,
+ 'fft.rfft': None,
+ 'fft.rfft2': None,
+ 'fft.rfftn': None,
+ 'put': None,
+ 'stft': None,
+ 'nn.functional.conv_transpose3d': None,
+ 'rounddecimals_neg_3': None,
+ 'rounddecimals_3': None,
+ 'rounddecimals_0': None,
+ '__rsub__': None,
+ 'aminmax': None,
+ 'angle': None,
+ 'bucketize': None,
+ 'cauchy_': None,
+ 'cauchy': None,
+ 'cholesky': None,
+ 'cholesky_inverse': None,
+ 'cholesky_solve': None,
+ 'cummax': None,
+ 'cummin': None,
+ 'cumprod': None,
+ 'digamma': None,
+ 'erfc': None,
+ 'erfinv': None,
+ 'frexp': None,
+ 'gcd': None,
+ 'geqrf': None,
+ 'nn.functional.grid_sample': None, # Unsupported Border padding mode
+ 'heaviside': None,
+ 'histc': None,
+ 'histogram': None,
+ 'histogramdd': None,
+ 'i0': None,
+ 'igamma': None,
+ 'igammac': None,
+ 'index_copy': None,
+ 'index_fill': None,
+ 'index_reduce': None,
+ 'isin': None,
+ 'isneginf': None,
+ 'isposinf': None,
+ 'kthvalue': None,
+ 'lcm': None,
+ 'lerp': None,
+ 'lgamma': None,
+ 'linalg.cholesky': None,
+ 'linalg.cholesky_ex': None,
+ 'linalg.cond': None,
+ 'linalg.detsingular': None,
+ 'linalg.det': None,
+ 'linalg.eigh': None,
+ 'linalg.eigvalsh': None,
+ 'linalg.householder_product': None,
+ 'linalg.ldl_factor': None,
+ 'linalg.ldl_factor_ex': None,
+ 'linalg.ldl_solve': None,
+ 'linalg.lstsq': None,
+ 'linalg.lstsqgrad_oriented': None,
+ 'linalg.lu': None,
+ 'linalg.lu_factor': None,
+ 'linalg.lu_factor_ex': None,
+ 'linalg.lu_solve': None,
+ 'linalg.matrix_norm': [torch.float32],
+ 'linalg.norm': [torch.float32],
+ 'linalg.normsubgradients_at_zero': [torch.float32],
+ 'linalg.qr': None,
+ 'linalg.slogdet': None,
+ 'linalg.solve': None,
+ 'linalg.solve_ex': None,
+ 'linalg.svdvals': None,
+ 'linalg.tensorsolve': None,
+ 'linalg.vander': None,
+ 'linalg.vecdot': None,
+ 'logcumsumexp': None,
+ 'logdet': None,
+ 'lu': None,
+ 'lu_solve': None,
+ 'lu_unpack': None,
+ 'masked.cumprod': None,
+ 'masked.median': None,
+ 'matrix_exp': None,
+ 'mode': None,
+ 'mvlgamma': None,
+ 'mvlgammamvlgamma_p_1': None,
+ 'mvlgammamvlgamma_p_3': None,
+ 'mvlgammamvlgamma_p_5': None,
+ 'nanquantile': None,
+ 'nanmedian': None,
+ 'native_dropout_backward': None,
+ 'nextafter': None,
+ 'normnuc': None,
+ 'nn.functional.fractional_max_pool2d': None,
+ 'nn.functional.fractional_max_pool3d': None,
+ 'nn.functional.adaptive_avg_pool3d': None,
+ 'nn.functional.adaptive_max_pool3d': None,
+ 'nn.functional.interpolatearea': None,
+ 'nn.functional.interpolatebicubic': None,
+ 'nn.functional.interpolatelinear': None,
+ 'nn.functional.interpolatetrilinear': None,
+ 'nn.functional.max_unpool1dgrad': None,
+ 'nn.functional.max_unpool2dgrad': None,
+ 'nn.functional.max_unpool3dgrad': None,
+ 'nn.functional.avg_pool3d': None,
+ 'nn.functional.ctc_loss': None,
+ 'nn.functional.embedding_bag': None,
+ 'nn.functional.hardshrink': None,
+ 'nn.functional.max_pool3d': None,
+ 'nn.functional.max_unpool1d': None,
+ 'nn.functional.max_unpool2d': None,
+ 'nn.functional.max_unpool3d': None,
+ 'nn.functional.mish': None,
+ 'nn.functional.multi_margin_loss': None,
+ 'nn.functional.multilabel_margin_loss': None,
+ 'nn.functional.pdist': None,
+ 'nn.functional.rrelu': None,
+ 'nn.functional.softshrink': None,
+ 'nn.functional.norm': None,
+ 'ormqr': None,
+ 'pca_lowrank': None,
+ 'pinverse': None,
+ 'polar': None,
+ 'polygamma': None,
+ 'polygammapolygamma_n_0': None,
+ 'polygammapolygamma_n_1': None,
+ 'polygammapolygamma_n_2': None,
+ 'polygammapolygamma_n_3': None,
+ 'polygammapolygamma_n_4': None,
+ 'qr': None,
+ 'quantile': None,
+ 'renorm': None,
+ 'rsub': None,
+ 'scatter_reduceamax': None,
+ 'scatter_reduceamin': None,
+ 'scatter_reducemin': None,
+ 'scatter_reducemean': None,
+ 'scatter_reduceprod': None,
+ 'scatter_reducesum': None,
+ 'searchsorted': None,
+ 'segment_reduce': None,
+ '_segment.reduce': None,
+ 'segment.reduce': None,
+ 'segment_reduce_offsets': None,
+ '_segment_reduce_offsets': None,
+ '_segment_reduce_lengths': None,
+ '_segment_reducelengths': None,
+ '_segment_reduceoffsets': None,
+ 'sinc': None,
+ 'sparse.mm': None,
+ 'sparse.mmreduce': None,
+ 'special.airy_ai': None,
+ 'special.bessel_j0': None,
+ 'special.bessel_j1': None,
+ 'special.bessel_y0': None,
+ 'special.bessel_y1': None,
+ 'special.chebyshev_polynomial_t': None,
+ 'special.chebyshev_polynomial_u': None,
+ 'special.entr': None,
+ 'special.erfcx': None,
+ 'special.hermite_polynomial_h': None,
+ 'special.hermite_polynomial_he': None,
+ 'special.i0e': None,
+ 'special.i1': None,
+ 'special.i1e': None,
+ 'special.laguerre_polynomial_l': None,
+ 'special.log_ndtr': None,
+ 'special.modified_bessel_i0': None,
+ 'special.modified_bessel_i1': None,
+ 'special.modified_bessel_k0': None,
+ 'special.modified_bessel_k1': None,
+ 'special.ndtri': None,
+ 'special.polygamma': None,
+ 'special.polygammaspecial_polygamma_n_0': None,
+ 'special.scaled_modified_bessel_k0': None,
+ 'special.scaled_modified_bessel_k1': None,
+ 'special.spherical_bessel_j0': None,
+ 'special.xlog1py': None,
+ 'special.zeta': None,
+ 'std_mean': None,
+ 'std_meanunbiased': None,
+ 'svd_lowrank': None,
+ 'symeig': None,
+ 'take': None,
+ 'to': None,
+ 'to_sparse': None,
+ 'unique': None,
+ 'vdot': None,
+ 'view_as_complex': None,
+ 'segment_reduce': None,
+ 'segment_reduce_': None,
+ '_segment_reduce_lengths': None,
+ '_upsample_bilinear2d_aa': None,
+ 'geometric' : None,
+ 'geometric_': None,
+ 'log_normal_': None,
+ 'log_normal': None,
+ 'bfloat16': None,
+ 'cdouble': None,
+ 'cfloat': None,
+ 'complex': None,
+ 'double': None,
+ 'chalf': None,
+ 'nn.functional.softminwith_dtype': None,
+ 'log_softmaxwith_dtype': None,
+ 'softmaxwith_dtype': None,
+ 'float_power': None,
+ 'full_like': None,
+ 'linalg.matrix_rank': None,
+ 'linalg.matrix_rankhermitian': None,
+ 'linalg.pinv': None,
+ 'linalg.pinvhermitian': None,
+
+ # MPS: input sizes must be divisible by output sizes
+ 'nn.functional.adaptive_avg_pool1d': None,
+ 'nn.functional.adaptive_avg_pool2d': None,
+
+ # Unsupported dtypes
+ # bmm is not supported for integral types
+ 'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ # Cannot convert an MPS Tensor to float64 dtype. The tensor's
+ # input data is created with double in common_methods_invocations.py
+ 'nn.functional.batch_norm': [torch.float32],
+ 'ones_like': None,
+ 'zeros_like': None,
+
+ # Convolution for integral types is not supported on MPS
+ 'nn.functional.conv1d': [torch.int64],
+ 'nn.functional.conv2d': [torch.int64],
+ 'nn.functional.conv_transpose1d': [torch.int64],
+ 'nn.functional.conv_transpose2d': [torch.int64],
+
+ # Unsupported dtypes
+ 'dot': [torch.int64],
+ 'index_add': [torch.int64],
+ 'log1p': [torch.int64],
+ 'sigmoid': [torch.int64],
+ 'atan2': [torch.int64],
+
+ # GEMM on MPS is not supported for integral types
+ 'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ '__rmatmul__': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'mm': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'inner': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'mat': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'mv': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+
+ # new_zeros/new_ones: Cannot convert an MPS Tensor to float64 dtype as
+ # the MPS framework doesn't support float64
+ 'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ # returned output on CPU is float64
+ 'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+
+ # trunc_tensor not working properly for float16
+ 'divtrunc_rounding': [torch.float16],
+ 'fmod': [torch.float16],
+ }
+
+ UNDEFINED_XFAILLIST = {
+ # Top 60 operators
+ # topk fails with duplicate indices
+ 'topk': [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+
+ # Failures due to the random output these ops generate using the
+ # Philox engine, causing a mismatch with CPU results
+ 'multinomial': [torch.float32], # random results
+ 'uniform': [torch.float16, torch.float32],
+ 'rand_like': [torch.float16, torch.float32],
+ 'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'randn_like': [torch.float16, torch.float32],
+ 'bernoulli': [torch.float32],
+ 'exponential': [torch.float16, torch.float32],
+ 'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
+ 'normal': [torch.float16, torch.float32, torch.float16, torch.float32],
+ 'normalin_place': [torch.float16, torch.float32],
+ 'normalnumber_mean': [torch.float16, torch.float32],
+ 'nn.functional.alpha_dropout': [torch.float32],
+ 'nn.functional.dropout': [torch.float32],
+ 'nn.functional.dropout2d': [torch.float32],
+ 'nn.functional.dropout3d': [torch.float32],
+
+ # These fill tensors with uninitialized data, causing a mismatch with the CPU
+ 'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ 'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
+ # 'empty': [torch.int8],
+ 'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16,
+ torch.int32, torch.int64, torch.uint8, torch.int8],
+ # Duplicate indices are used in the test case (undefined behaviour)
+ 'index_put': None,
+ # raising zero to negative integer powers is undefined
+ '__rpow__': [torch.int8, torch.int16, torch.int32, torch.int64],
+ 'resize_': [torch.float16, torch.float32],
+ 'resize_as_': [torch.float16, torch.float32],
+
+ # CPU Errors:
+ 'addr': [torch.bool, torch.int16, torch.int32,
+ torch.int64, torch.uint8, torch.int8], # "addmv_impl_cpu" not implemented for 'Half'
+ 'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16,
+ torch.int32, torch.int64, torch.uint8, torch.int8], # cpu result off, showing random values
+ 'as_strided_partial_views': [torch.bool, torch.float16, torch.float32, torch.int16,
+ torch.int32, torch.int64, torch.uint8, torch.int8], # cpu result off, showing random values
+
+ # random results
+ # mps vs cpu:
+ # Mismatched elements: 40 / 96 (41.7%)
+ # Greatest absolute difference: 17.892311096191406 at index (1, 0, 2) (up to 1e-05 allowed)
+ # Greatest relative difference: inf at index (1, 0, 0) (up to 1.3e-06 allowed)
+ # cuda(2.0.0.dev20230301+cu117) vs cpu:
+ # Mismatched elements: 56 / 96 (58.3%)
+ # Greatest absolute difference: 17.892311096191406 at index (1, 0, 2) (up to 1e-05 allowed)
+ # Greatest relative difference: inf at index (1, 0, 0) (up to 1.3e-06 allowed)
+ 'nn.functional.scaled_dot_product_attention': [torch.float32],
+
+ # Failures because casting a negative float to uint8 is undefined
+ 'byte': [torch.float16, torch.float32],
+ }
+
+ def addDecorator(op, d) -> None:
+ op.decorators = list(op.decorators) if op.decorators is not None else []
+ op.decorators.append(d)
+
+ for op in ops:
+ key = op.name + op.variant_test_name
+ for xfaillist in [UNIMPLEMENTED_XFAILLIST, UNDEFINED_XFAILLIST]:
+ if key in xfaillist:
+ addDecorator(op, DecorateInfo(
+ unittest.expectedFailure,
+ dtypes=xfaillist[key]))
+
+ if key in MACOS_BEFORE_13_3_XFAILLIST and (torch.backends.mps.is_macos13_or_newer() and product_version < 13.3):
+ addDecorator(op, DecorateInfo(
+ unittest.expectedFailure,
+ dtypes=MACOS_BEFORE_13_3_XFAILLIST[key]))
+
+ if key in MACOS_13_3_XFAILLIST and (product_version >= 13.3):
+ addDecorator(op, DecorateInfo(
+ unittest.expectedFailure,
+ dtypes=MACOS_13_3_XFAILLIST[key]))
+
+ if key in MACOS_12_3_XFAILLIST and (not torch.backends.mps.is_macos13_or_newer()):
+ addDecorator(op, DecorateInfo(
+ unittest.expectedFailure,
+ dtypes=MACOS_12_3_XFAILLIST[key]))
yield op
# Same logic as test_cuda.py
@@ -9349,7 +9921,6 @@
for test_options in self.LSTM_TEST_CASES:
self._lstm_helper(num_layers=num_layers, dtype=dtype, device=device, backward=True, **test_options)
-
def test_RNN_cell_no_broadcasting(self):
def test(cell_module, input, hx, input_size, hidden_size):
cell = cell_module(input_size, hidden_size, device='mps')
@@ -9555,6 +10126,8 @@
for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]:
del MPS_DTYPES[MPS_DTYPES.index(t)]
+MPS_GRAD_DTYPES = [torch.float32, torch.float16]
+
class TestConsistency(TestCaseMPS):
# TODO: This is only used while some ops are being added.
@@ -9562,629 +10135,34 @@
# This can be generated automatically in the `new_mps_allowlist.txt` file
# by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU`
# You most likely do NOT want to modify this manually
- ALLOWLIST_OP = {
- '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'],
- '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'],
- '__rmatmul__': ['f32'],
- '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
- '__rpow__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.log_softmax': ['f32'],
- 'masked.logaddexp': ['f32'],
- 'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.norm': ['f16', 'f32'],
- 'masked.normalize': ['f16', 'f32'],
- 'masked.softmax': ['f32'],
- 'masked.softmin': ['f32'],
- 'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.var': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
- 'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'addbmm': ['f32'],
- 'addcdiv': ['f32'],
- 'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'addmm': ['f32'],
- 'addmv': ['f32'],
- 'addr': ['f32'],
- 'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'allclose': ['f16', 'f32'],
- 'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'amix': ['f32'],
- 'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'atan2': ['f32'],
- 'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'baddbmm': ['f32'],
- 'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'],
- 'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'],
- 'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'bmm': ['f32'],
- 'broadcast_shapes': ['f32'],
- 'byte': None,
- 'cat': None,
- 'ceil': ['f32', 'int32', 'int64', 'f16'],
- 'chalf': None,
- 'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'clamp_min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'corrcoef': ['f32'],
- 'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
- 'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
- 'cov': ['f32'],
- 'cumsum': ['i8', 'b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'diag': ['f32', 'i32'],
- 'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'diagflat': ['f32', 'i32'],
- 'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'dist': ['f32'],
- 'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'einsum': ['f32'],
- 'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
- 'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
- 'floor_divide': ['f32', 'f16'],
- 'fmax': ['b8', 'f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
- 'fmin': ['b8', 'f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
- 'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
- 'frac': ['f16', 'f32'],
- 'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'gradient': ['f16', 'f32', 'i16'],
- 'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'hypot': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'linalg.matrix_norm': ['f16'],
- 'linalg.matrix_power': ['f32'],
- 'linalg.svd': ['f32'],
- 'linalg.vector_norm': ['f16', 'f32'],
- 'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'log_softmax': ['f32'],
- 'logaddexp': ['f16', 'f32'],
- 'logaddexp2': ['f16', 'f32'],
- 'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'logit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'long': None,
- 'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked_scatter': ['i8', 'b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'matmul': ['f32'],
- 'mm': ['f32'],
- 'mv': ['f32'],
- 'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'nn.functional.adaptive_max_pool1d': ['f32'],
- 'nn.functional.adaptive_max_pool2d': ['f32'],
- 'nn.functional.adaptive_avg_pool1d': ['f32'],
- 'nn.functional.adaptive_avg_pool2d': ['f32'],
- 'nn.functional.avg_pool1d': ['f32', 'i64'],
- 'nn.functional.avg_pool2d': ['f32', 'i64'],
- 'nn.functional.binary_cross_entropy': ['f32'],
- 'nn.functional.binary_cross_entropy_with_logits': ['f32'],
- 'nn.functional.celu': ['f32'],
- 'nn.functional.conv1d': ['f32'],
- 'nn.functional.conv2d': ['f32'],
- 'nn.functional.conv_transpose1d': ['f32'],
- 'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.cosine_similarity': ['f32'],
- 'nn.functional.elu': ['f32'],
- 'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.embedding': ['f16', 'f32'],
- 'nn.functional.gaussian_nll_loss': ['f32'],
- 'nn.functional.glu': ['f32'],
- 'nn.functional.group_norm': ['f32'],
- 'nn.functional.hardsigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
- 'nn.functional.hinge_embedding_loss': ['f32'],
- 'nn.functional.huber_loss': ['f16', 'f32'],
- 'nn.functional.instance_norm': ['f32'],
- 'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
- 'nn.functional.l1_loss': ['f16', 'f32'],
- 'nn.functional.leaky_relu': ['f32'],
- 'nn.functional.linear': ['f32'],
- 'nn.functional.local_response_norm': ['f32'],
- 'nn.functional.logsigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
- 'nn.functional.max_pool1d': ['f32'],
- 'nn.functional.max_pool2d': ['f32'],
- 'max_pool2d_with_indices_backward': ['f32'],
- 'nn.functional.mse_loss': ['f16', 'f32'],
- 'nn.functional.nll_loss': ['f32'],
- 'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.padreflect': ['f32'],
- 'nn.functional.padreplicate': ['f32'],
- # TODO: add f16 test case after solve the accuracy issue,
- # see https://github.com/pytorch/pytorch/pull/95166#issuecomment-1439359181.
- 'nn.functional.pairwise_distance': ['f32', 'i16', 'i32', 'i64'],
- 'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
- 'nn.functional.prelu': ['f32'],
- 'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.selu': ['f32'],
- 'nn.functional.silu': ['f32'],
- 'nn.functional.smooth_l1_loss': ['f16', 'f32'],
- 'nn.functional.soft_margin_loss': ['f32'],
- 'nn.functional.softmin': ['f32'],
- 'nn.functional.softplus': ['f32'],
- 'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
- 'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
- 'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
- 'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'],
- 'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64'],
- 'nn.functional.upsample_bilinear': ['f32'],
- 'nn.functional.upsample_nearest': ['f32'],
- 'norm': ['f32', 'f16'],
- 'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'pow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'put': None,
- 'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
- 'remainder' : None,
- 'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
- 'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
- 'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'select_scatter': None,
- 'sgn': None,
- 'short': None,
- 'sigmoid': None,
- 'sign': None,
- 'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'slice_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'softmax': ['f32'],
- 'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'svd': ['f32'],
- 't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'tan': ['b8', 'i16', 'i32', 'u8'],
- 'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
- 'tensordot': ['f32'],
- 'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'topk': ['f32', 'f16'],
- 'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
- 'sort': ['f32', 'i16', 'i32', 'i64'],
- 'argsort': ['f32', 'i16', 'i32', 'i64'],
- 'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'tril_indices': ['i32', 'i64'],
- 'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'triu_indices': ['i32', 'i64'],
- 'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'trunc': ['f32'],
- 'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
- 'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'std': ['f16', 'f32'],
- 'var': ['f16', 'f32'],
- 'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'mean': ['f16', 'f32'],
- 'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'native_layer_norm': ['torch.float32'],
- 'nn.functional.layer_norm': ['torch.float32'],
- 'nn.functional.bilinear': ['f32'],
- 'linalg.solve_triangular': ['f32'],
- 'triangular_solve': ['f32'],
- 'trace': None,
- '_native_batch_norm_legit': ['f32'],
- 'native_batch_norm': ['f32'],
- 'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'linalg.inv': ['f32'],
- 'linalg.inv_ex': ['f32'],
- 'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- 'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
- }
-
-
- ALLOWLIST_OP_GRAD = {
- '__radd__': ['f16', 'f32'],
- '__rdiv__': ['f16', 'f32'],
- '__rmatmul__': ['f32'],
- '__rmul__': ['f16', 'f32'],
- '__rpow__': ['f32'],
- 'masked.log_softmax': ['f32'],
- 'masked.logaddexp': ['f32'],
- 'masked.softmax': ['f32'],
- 'masked.softmin': ['f32'],
- 'masked.std': ['f32'],
- 'masked_scatter': ['f16', 'f32'],
- 'abs': ['f16', 'f32'],
- 'acos': ['f32'],
- 'acosh': ['f32'],
- 'add': ['f16', 'f32'],
- 'addbmm': ['f32'],
- 'addcdiv': ['f32'],
- 'addcmul': ['f32'],
- 'addmm': ['f32'],
- 'addmv': ['f32'],
- 'addr': ['f32'],
- 'all': ['f16', 'f32'],
- 'any': ['f16', 'f32'],
- 'arange': ['f16', 'f32'],
- 'argmax': ['f16', 'f32'],
- 'argmin': ['f16', 'f32'],
- 'asin': ['f32'],
- 'asinh': ['f32'],
- 'atan': ['f32'],
- 'atan2': ['f32'],
- 'atleast_1d': ['f16', 'f32'],
- 'atleast_2d': ['f16', 'f32'],
- 'atleast_3d': ['f16', 'f32'],
- 'baddbmm': ['f32'],
- 'block_diag': ['f16', 'f32'],
- 'bmm': ['f32'],
- 'broadcast_shapes': ['f32'],
- 'ceil': ['f32'],
- 'chunk': ['f16', 'f32'],
- 'clone': ['f16', 'f32'],
- 'column_stack': ['f16', 'f32'],
- 'conj': ['f16', 'f32'],
- 'conj_physical': ['f16', 'f32'],
- 'contiguous': ['f16', 'f32'],
- 'copysign': ['f16', 'f32'],
- 'corrcoef': ['f32'],
- 'cos': ['f32'],
- 'cosh': ['f32'],
- 'cumsum': ['f16', 'f32'],
- 'deg2rad': ['f16', 'f32'],
- 'diag': ['f32'],
- 'diag_embed': ['f16', 'f32'],
- 'diagflat': ['f32'],
- 'diagonal_scatter': ['f16', 'f32'],
- 'diff': ['f16', 'f32'],
- 'dist': ['f32'],
- 'dot': ['f32'],
- 'einsum': ['f32'],
- 'erf': ['f32'],
- 'exp': ['f32'],
- 'exp2': ['f16', 'f32'],
- 'fill': ['f16', 'f32'],
- 'flatten': ['f16', 'f32'],
- 'flip': ['f16', 'f32'],
- 'fliplr': ['f16', 'f32'],
- 'flipud': ['f16', 'f32'],
- 'float': ['f32'],
- 'floor': ['f32'],
- 'fmax': ['f16', 'f32'],
- 'fmin': ['f16', 'f32'],
- 'gradient': ['f32'],
- 'half': ['f16'],
- 'hstack': ['f16', 'f32'],
- 'hypot': ['f16', 'f32'],
- 'index_select': ['f16', 'f32'],
- 'index_add': ['f16', 'f32'],
- 'isclose': ['f16', 'f32'],
- 'isfinite': ['f16', 'f32'],
- 'isinf': ['f16', 'f32'],
- 'isnan': ['f16', 'f32'],
- 'isreal': ['f16', 'f32'],
- 'kron': ['f32'],
- 'linalg.matrix_norm': ['f16'],
- 'linalg.svd': ['f32'],
- 'linspace': ['f16', 'f32'],
- 'log': ['f32'],
- 'log10': ['f32'],
- 'log1p': ['f32'],
- 'log2': ['f32'],
- 'log_softmax': ['f32'],
- 'logaddexp': ['f32'],
- 'logical_not': ['f16', 'f32'],
- 'logit': ['f16', 'f32'],
- 'logspace': ['f32'],
- 'matmul': ['f32'],
- 'mm': ['f32'],
- 'mv': ['f32'],
- 'neg': ['f16', 'f32'],
- 'nn.functional.adaptive_max_pool1d': ['f32'],
- 'nn.functional.adaptive_max_pool2d': ['f32'],
- 'nn.functional.adaptive_avg_pool1d': ['f32'],
- 'nn.functional.adaptive_avg_pool2d': ['f32'],
- 'nn.functional.avg_pool1d': ['f32'],
- 'nn.functional.avg_pool2d': ['f32'],
- 'nn.functional.binary_cross_entropy': ['f32'],
- 'nn.functional.celu': ['f32'],
- 'nn.functional.conv1d': ['f32'],
- 'nn.functional.conv2d': ['f32'],
- 'nn.functional.conv_transpose1d': ['f32'],
- 'nn.functional.cosine_embedding_loss': ['f32'],
- 'nn.functional.elu': ['f32'],
- 'nn.functional.feature_alpha_dropout': ['f16', 'f32'],
- 'nn.functional.glu': ['f32'],
- 'nn.functional.hardsigmoid': ['f16', 'f32'],
- 'nn.functional.hardtanh': ['f32'],
- 'nn.functional.hinge_embedding_loss': ['f32'],
- 'nn.functional.huber_loss': ['f16', 'f32'],
- 'nn.functional.instance_norm': ['f32'],
- 'nn.functional.kl_div': ['f32'],
- 'nn.functional.l1_loss': ['f16', 'f32'],
- 'nn.functional.leaky_relu': ['f32'],
- 'nn.functional.local_response_norm': ['f32'],
- 'nn.functional.logsigmoid': ['f16', 'f32'],
- 'nn.functional.margin_ranking_loss': ['f32'],
- 'nn.functional.max_pool1d': ['f32'],
- 'nn.functional.max_pool2d': ['f32'],
- 'nn.functional.mse_loss': ['f32'],
- 'nn.functional.nll_loss': ['f32'],
- 'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
- # TODO: add f16 test case after solve the accuracy issue,
- # see https://github.com/pytorch/pytorch/pull/95166#issuecomment-1439359181.
- 'nn.functional.pairwise_distance': ['f32'],
- 'nn.functional.poisson_nll_loss': ['f32'],
- 'nn.functional.relu': ['f32'],
- 'nn.functional.relu6': ['f32'],
- 'nn.functional.selu': ['f32'],
- 'nn.functional.silu': ['f32'],
- 'nn.functional.soft_margin_loss': ['f32'],
- 'nn.functional.softmin': ['f32'],
- 'nn.functional.softplus': ['f32'],
- 'nn.functional.softsign': ['f16', 'f32'],
- 'nn.functional.smooth_l1_loss': ['f32'],
- 'nn.functional.threshold': ['f32'],
- 'nn.functional.triplet_margin_loss': ['f32'],
- 'nn.functional.triplet_margin_with_distance_loss': ['f32'],
- 'nn.functional.upsample_bilinear': ['f32'],
- 'norm': ['f32', 'f16'],
- 'positive': ['f16', 'f32'],
- 'pow': ['f32'],
- 'rad2deg': ['f16', 'f32'],
- 'real': ['f16', 'f32'],
- 'reciprocal': ['f16', 'f32'],
- 'repeat': ['f16', 'f32'],
- 'repeat_interleave': ['f16', 'f32'],
- 'resolve_conj': ['f16', 'f32'],
- 'resolve_neg': ['f16', 'f32'],
- 'roll': ['f16', 'f32'],
- 'round': ['f32'],
- 'rsqrt': ['f32'],
- 'select_scatter': ['f16', 'f32'],
- 'sign': ['f16', 'f32'],
- 'sin': ['f32'],
- 'sinh': ['f32'],
- 'slice_scatter': ['f16', 'f32'],
- 'softmax': ['f32'],
- 'split': ['f16', 'f32'],
- 'sqrt': ['f32'],
- 'square': ['f16', 'f32'],
- 'squeeze': ['f16', 'f32'],
- 'stack': ['f16', 'f32'],
- 'sub': ['f32'],
- 'sum_to_size': ['f16', 'f32'],
- 'svd': ['f32'],
- 't': ['f16', 'f32'],
- 'tanh': ['f32'],
- 'tensordot': ['f32'],
- 'tile': ['f16', 'f32'],
- 'tril': ['f16', 'f32'],
- 'triu': ['f16', 'f32'],
- 'true_divide': ['f16', 'f32'],
- 'trunc': ['f32'],
- 'unbind': ['f16', 'f32'],
- 'unflatten': ['f16', 'f32'],
- 'unsqueeze': ['f16', 'f32'],
- 'view': ['f16', 'f32'],
- 'view_as': ['f16', 'f32'],
- 'vsplit': ['f16', 'f32'],
- 'vstack': ['f16', 'f32'],
- 'xlogy': ['f16', 'f32'],
- 'zero_': ['f16', 'f32'],
- 'linalg.solve_triangular': ['f32'],
- 'triangular_solve': ['f32'],
- '_native_batch_norm_legit': ['f32'],
- 'native_batch_norm': ['f32'],
- 'native_layer_norm': ['f32'],
- 'nn.functional.gelu': ['f32'],
- 'nn.functional.bilinear': ['f32'],
- 'nn.functional.prelu': ['f32'],
- }
-
- # These ops that are problematic. So never run them even when
- # generating the new allowlist.
- # If the dtype list is None, all dtypes are excluded.
- # All the entries in this list should be removed
- BLOCKLIST = {
- # Functions that hang
- 'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
- # + forward when requires_grad=True or running backward
- 'masked.mean': [torch.bool, torch.float16],
- 'masked.prod': [torch.bool],
- 'masked.sum': [torch.bool],
-
- # Functions that hard crash
- 'std': [torch.float16],
- 'stft': [torch.float32], 'var': [torch.float16],
- # + forward when requires_grad=True or running backward
- 'nn.functional.embedding': [torch.float32, torch.float16],
-
- 'as_strided_scatter': [torch.uint8],
- 'atan2': [torch.int64],
- 'bfloat16': None,
- 'block_diag': [torch.uint8],
- 'diag_embed': [torch.uint8],
- 'diagonal_scatter': [torch.uint8],
- 'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
- 'nn.functional.local_response_norm': [torch.int64],
- 'nn.functional.padcircular': [torch.uint8],
-
-
-
- # These were moved from ALLOWLIST to BLOCK as they are not working
- # locally
- 'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
- '__radd__': ['torch.bool', 'torch.uint8'],
- '__rmul__': ['torch.uint8'],
- 'neg': ['torch.uint8'],
- 'add': ['torch.bool', 'torch.uint8'],
- 'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
- 'diag': ['torch.int64'],
- 'diagflat': ['torch.int64'],
-
- # Functions that are flaky
- # These are detected as "ok" by the expect case but actually fail to run sometimes
- 'as_strided': None,
- 'broadcast_tensors': None,
- 'broadcast': None,
- 'broadcast_to': None,
- 'diagonal': None,
- 'divfloor_rounding': None,
- 'divno_rounding_mode': None,
- 'divtrunc_rounding': None,
- 'dsplit': None,
- 'hsplit': None,
- 'empty': None,
- 'expand_as': None,
- 'expand': None,
- 'ge': None,
- 'ne': None,
- 'le': None,
- 'lt': None,
- 'gt': None,
- 'transpose': None,
- 'splitlist_args': None,
- 'select': None,
- 'reshape': None,
- 'reshape_as': None,
- 'permute': None,
- 'norm': None,
- 'nn.functional.pixel_unshuffle': None,
- 'nn.functional.pixel_shuffle': None,
- 'nn.functional.cross_entropy': None,
- 'nn.functional.one_hot': None,
- 'narrow': None,
- 'movedim': None,
- 'minreduction_with_dim': None,
- 'minreduction_no_dim': None,
- 'minbinary': None,
- 'meshgridvariadic_tensors': None,
- 'meshgridlist_of_tensors': None,
- 'maxreduction_with_dim': None,
- 'maxreduction_no_dim': None,
- 'maxbinary': None,
- 'maximum': None,
- 'minimum': None,
- 'outer': None,
- 'softmaxwith_dtype': None,
- 'rounddecimals_neg_3': None,
- 'rounddecimals_3': None,
- 'rounddecimals_0': None,
- 'normnuc': None,
- 'nn.functional.softminwith_dtype': None,
- 'nn.functional.feature_alpha_dropoutwith_train': None,
- 'log_softmaxwith_dtype': None,
- 'split_with_sizes': None,
- 'trapezoid': None,
- 'eq': None,
- 'mul': None,
- 'cartesian_prod': None,
- 'bool': None,
- 'inner': None,
- 'dstack': None,
- 'take_along_dim': None,
- }
FP16_LOW_PRECISION_LIST = {
'add', 'sub', 'div',
'__rdiv__', '__rmul__',
'nn.functional.huber_loss',
'true_divide', 'kron',
- 'gradient', 'var', 'std',
+ 'gradient', 'var', 'std', 'ldexp',
'linalg.vector_norm',
- 'masked.sum', 'masked.std',
- 'masked.var',
+ 'addr', 'var_mean',
+ 'var_mean_unbiased',
+
+ # for macOS 12
+ 'masked.normalize', 'masked.sum', 'masked.var',
+ 'outer',
+ 'sum_to_size', 'sum',
+ 'mul',
+ 'nansum', 'nanmean',
+ 'norm',
+ }
+
+ FP32_LOW_PRECISION_LIST = {
+ # conv2d and conv_transpose2d results have a very small
+ # difference compared to CPU/CUDA, so we use looser tolerances for FP32
+ 'nn.functional.conv2d',
+ 'nn.functional.conv_transpose2d',
+ 'matmul', '__rmatmul__',
+ 'linalg.multi_dot',
+ 'addbmm',
}
# Used for accept mode only
@@ -10195,29 +10173,60 @@
def test_output_match(self, device, dtype, op):
self.assertEqual(device, "cpu")
key = op.name + op.variant_test_name
+ run_grad_test = True
- if key in self.BLOCKLIST:
- if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]:
- self.skipTest(f"Running test with {op.name} hangs so skipping")
+ def get_samples():
+ return op.sample_inputs(device, dtype, requires_grad=(dtype.is_floating_point or dtype.is_complex))
+ cpu_samples = get_samples()
- # Make this an expecttest manually
- # When this env variable is set, generate a new ALLOWLIST_OP
- # that reflects the current state of what passes or not
- if os.environ.get("EXPECTTEST_ACCEPT", None) == "1":
- generate_new_truth = True
- else:
- generate_new_truth = False
+ all_backward_pass = True
+ for cpu_sample in cpu_samples:
+ #
+ # Forward check
+ #
+ mps_sample = cpu_sample.transform(
+ lambda x: x.detach().to("mps").requires_grad_(x.requires_grad) if isinstance(x, torch.Tensor) else x)
+
+ cpu_args = [cpu_sample.input] + list(cpu_sample.args)
+ cpu_kwargs = cpu_sample.kwargs
+ mps_args = [mps_sample.input] + list(mps_sample.args)
+ mps_kwargs = mps_sample.kwargs
+
+ # for tensor_split(), the second tensor arg ("tensor_indices_or_sections") must be on CPU only
+ if (op.name == "tensor_split" and isinstance(mps_args[1], torch.Tensor)):
+ mps_args[1] = cpu_args[1]
+
+ cpu_out = op(*cpu_args, **cpu_kwargs)
+ mps_out = op(*mps_args, **mps_kwargs)
+
+ if (op.name in self.FP32_LOW_PRECISION_LIST) and dtype == torch.float32:
+ atol = 1e-4
+ rtol = 3e-5
+ elif op.name in self.FP16_LOW_PRECISION_LIST and dtype == torch.float16:
+ atol = 1e-2
+ rtol = 1e-2
+ elif op.name == "masked.mean":
+ atol = 7e-4
+ rtol = 2e-3
+ elif op.name == "native_layer_norm":
+ atol = 1e-4
+ rtol = 1.3e-5
+ elif op.name in ["pow", "__rpow__"]:
+ atol = 1e-6
+ rtol = 4e-6
+ else:
+ atol = None
+ rtol = None
+
+ self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
+
+
+ @ops(mps_ops_grad_modifier(copy.deepcopy(op_db)), allowed_dtypes=MPS_GRAD_DTYPES)
+ def test_output_grad_match(self, device, dtype, op):
+ self.assertEqual(device, "cpu")
+ key = op.name + op.variant_test_name
run_grad_test = True
- if not generate_new_truth:
- if op.name not in self.ALLOWLIST_OP:
- self.skipTest(f"{op.name} is not in the allow list for test on MPS")
- elif self.ALLOWLIST_OP[op.name] is not None:
- if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]:
- self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded")
-
- if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]:
- run_grad_test = False
def get_samples():
return op.sample_inputs(device, dtype, requires_grad=(dtype.is_floating_point or dtype.is_complex))
@@ -10234,8 +10243,6 @@
mps_sample = cpu_sample.transform(
lambda x: x.detach().to("mps").requires_grad_(x.requires_grad) if isinstance(x, torch.Tensor) else x)
- # TODO: This checks only the function variant. We should also check the method and inplace version
- # when they exist
cpu_args = [cpu_sample.input] + list(cpu_sample.args)
cpu_kwargs = cpu_sample.kwargs
mps_args = [mps_sample.input] + list(mps_sample.args)
@@ -10248,21 +10255,26 @@
cpu_out = op(*cpu_args, **cpu_kwargs)
mps_out = op(*mps_args, **mps_kwargs)
- if op.name == "nn.functional.conv2d" and dtype == torch.float32:
+ if (op.name in self.FP32_LOW_PRECISION_LIST) and dtype == torch.float32:
atol = 1e-4
rtol = 3e-5
- elif op.name in self.FP16_LOW_PRECISION_LIST and dtype == torch.float16:
+ elif op.name == "nn.functional.conv2d" or op.name == "linalg.multi_dot" and dtype == torch.float32:
+ atol = 1e-4
+ rtol = 3e-5
+ elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16:
atol = 1e-2
rtol = 1e-2
- elif op.name == "masked.mean":
+ elif (op.name == "masked.mean"):
atol = 7e-4
rtol = 2e-3
- elif op.name == "native_layer_norm":
+ elif (op.name == "native_layer_norm"):
atol = 1e-4
rtol = 1.3e-5
- elif op.name in ["pow", "__rpow__"]:
- atol = 1e-6
- rtol = 4e-6
+ elif op.name == "norm" and dtype == torch.float16:
+ atol = 7e-4
+ rtol = 1.5e-3
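+            # torch.unique with sorted=False does not guarantee element order, so skip direct CPU/MPS comparison.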
+ elif op.name == "unique" and cpu_kwargs["sorted"] is False:
+ continue
else:
atol = None
rtol = None
@@ -10270,82 +10282,44 @@
self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
except Exception as e:
- if any(s in str(e).lower() for s in ["int64", "float16", "div truc rounding"]):
- self.skipTest(f"Expected Runtime Error: {str(e)}")
-
- if not generate_new_truth:
- raise e
+ raise e
forward_failed = True
all_forward_pass = False
- if not (dtype.is_floating_point or dtype.is_complex):
- # Maybe we should error here instead?
- continue
-
#
# Backward check
#
+ if forward_failed:
+ # We would've failed immediately anyway, but this error is clearer
+ # We error instead of continuing so that all_backward_pass would not be True
+ raise RuntimeError("Forward pass already failed")
- # Skip the grad test if it is not part of the allow list
- if not generate_new_truth and not run_grad_test:
- # TODO: maybe there is a way to print only when we have -v
- # if i == 0:
- # print(f"Skipping gradient check because {op.name} is not on the allow list")
+ cpu_out = (cpu_out,) if isinstance(cpu_out, torch.Tensor) else tuple(cpu_out)
+ mps_out = (mps_out,) if isinstance(mps_out, torch.Tensor) else tuple(mps_out)
+
+ def req_grad(t):
+ return isinstance(t, torch.Tensor) and t.requires_grad
+
+ diff_cpu_out = tuple(t for t in cpu_out if req_grad(t))
+ diff_mps_out = tuple(t for t in mps_out if req_grad(t))
+ diff_cpu_arg = tuple(t for t in pytree.tree_flatten((cpu_args, cpu_kwargs))[0] if req_grad(t))
+ diff_mps_arg = tuple(t for t in pytree.tree_flatten((mps_args, mps_kwargs))[0] if req_grad(t))
+ self.assertEqual(len(diff_cpu_out), len(diff_mps_out))
+ self.assertEqual(len(diff_cpu_arg), len(diff_mps_arg))
+
+ if len(diff_cpu_out) == 0:
continue
+ # rand_like does not work with certain dtypes, so cast to double and cast back
+ cpu_grad_outputs = tuple(torch.rand_like(t.to(dtype=torch.double)).to(dtype=dtype) for t in diff_cpu_out)
+ mps_grad_outputs = tuple(t.to("mps") for t in cpu_grad_outputs)
- try:
- if forward_failed:
- # We would've failed immediately anyway, but this error is clearer
- # We error instead of continuing so that all_backward_pass would not be True
- raise RuntimeError("Forward pass already failed")
+            # Compare gradients computed on MPS against CPU, using a random grad_output vector.
+            # Sometimes when the derivative is 0, the graph is simply not created;
+            # allow_unused is needed in those cases.
+ cpu_grad_inputs = torch.autograd.grad(diff_cpu_out, diff_cpu_arg, grad_outputs=cpu_grad_outputs, allow_unused=True)
+ mps_grad_inputs = torch.autograd.grad(diff_mps_out, diff_mps_arg, grad_outputs=mps_grad_outputs, allow_unused=True)
- cpu_out = (cpu_out,) if isinstance(cpu_out, torch.Tensor) else tuple(cpu_out)
- mps_out = (mps_out,) if isinstance(mps_out, torch.Tensor) else tuple(mps_out)
-
- def req_grad(t):
- return isinstance(t, torch.Tensor) and t.requires_grad
-
- diff_cpu_out = tuple(t for t in cpu_out if req_grad(t))
- diff_mps_out = tuple(t for t in mps_out if req_grad(t))
- diff_cpu_arg = tuple(t for t in pytree.tree_flatten((cpu_args, cpu_kwargs))[0] if req_grad(t))
- diff_mps_arg = tuple(t for t in pytree.tree_flatten((mps_args, mps_kwargs))[0] if req_grad(t))
- self.assertEqual(len(diff_cpu_out), len(diff_mps_out))
- self.assertEqual(len(diff_cpu_arg), len(diff_mps_arg))
-
- if len(diff_cpu_out) == 0:
- continue
- # rand_like does not work with certain dtypes, so cast to double and cast back
- cpu_grad_outputs = tuple(torch.rand_like(t.to(dtype=torch.double)).to(dtype=dtype) for t in diff_cpu_out)
- mps_grad_outputs = tuple(t.to("mps") for t in cpu_grad_outputs)
-
- # Compare computed gradients with cpu given random grad_output vector
- # Sometimes when the derivative is 0, we just don't bother creating the graph
- # allow_unused is needed in those cases.
- cpu_grad_inputs = torch.autograd.grad(diff_cpu_out, diff_cpu_arg, grad_outputs=cpu_grad_outputs, allow_unused=True)
- mps_grad_inputs = torch.autograd.grad(diff_mps_out, diff_mps_arg, grad_outputs=mps_grad_outputs, allow_unused=True)
-
- self.assertEqual(cpu_grad_inputs, mps_grad_inputs, atol=atol, rtol=rtol)
- except Exception as e:
- if not generate_new_truth:
- raise e
- all_backward_pass = False
-
- if all_forward_pass and generate_new_truth:
- if dtype_abbrs[dtype] not in self.NEW_ALLOW_LIST[op.name]:
- self.NEW_ALLOW_LIST[op.name].append(dtype_abbrs[dtype])
- # We could write it only once. But I don't know how to detect that the current test is the last one
- # So each test append to the dict and write it.
- with open("new_mps_allowlist.txt", "w") as f:
- pprint.pprint(self.NEW_ALLOW_LIST, stream=f)
-
- if all_backward_pass and generate_new_truth and dtype.is_floating_point:
- if dtype_abbrs[dtype] not in self.NEW_ALLOW_LIST_GRAD[op.name]:
- self.NEW_ALLOW_LIST_GRAD[op.name].append(dtype_abbrs[dtype])
- # We could write it only once. But I don't know how to detect that the current test is the last one
- # So each test append to the dict and write it.
- with open("new_mps_allowlist_grad.txt", "w") as f:
- pprint.pprint(self.NEW_ALLOW_LIST_GRAD, stream=f)
-
+ self.assertEqual(cpu_grad_inputs, mps_grad_inputs, atol=atol, rtol=rtol)
# Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS
@skipIfSlowGradcheckEnv