Add Half support for maxpool on CPU (#98819)

### Testing

Single socket (28 cores):

shape | fp32 forward / ms | fp16 forward / ms | bf16 forward / ms | fp32 backward / ms | fp16 backward / ms | bf16 backward / ms
-- | -- | -- | -- | -- | -- | --
size: (1, 56, 264, 264), kernel: 3, stride: 1, mem_format: contig | 4.12895 | 6.9669 | 5.30297 | 0.55775 | 1.98917 | 0.72233
size: (1, 56, 264, 264), kernel: 3, stride: 1, mem_format: CL | 0.85093 | 1.88813 | 1.38063 | 5.5742 | 36.5086 | 10.58552
size: (32, 16, 200, 200), kernel: 3, stride: 1, mem_format: contig | 22.37212 | 37.90383 | 30.94482 | 6.85868 | 10.6116 | 3.9993
size: (32, 16, 200, 200), kernel: 3, stride: 1, mem_format: CL | 5.41658 | 4.71098 | 4.66578 | 6.69875 | 14.7171 | 5.1167
size: (32, 32, 100, 100), kernel: 3, stride: 1, mem_format: contig | 10.69831 | 18.0468 | 13.71657 | 2.61192 | 4.96172 | 1.68635
size: (32, 32, 100, 100), kernel: 3, stride: 1, mem_format: CL | 2.52637 | 2.0096 | 2.0055 | 2.60314 | 7.2093 | 2.49843
size: (4, 19, 10, 16, 16), kernel: 3, stride: 1, mem_format: contig | 0.47605 | 0.88398 | 0.65326 | 0.06525 | 0.115489 | 0.0674
size: (4, 19, 10, 16, 16), kernel: 3, stride: 1, mem_format: CL3d | 0.10902 | 0.25293 | 0.157475 | 0.11386 | 0.53319 | 0.17836

Single core:

shape | fp32 forward / ms | fp16 forward / ms | bf16 forward / ms | fp32 backward / ms | fp16 backward / ms | bf16 backward / ms
-- | -- | -- | -- | -- | -- | --
size: (1, 56, 264, 264), kernel: 3, stride: 1, mem_format: contig | 90.9809 | 163.473 | 126.1276 | 6.57721 | 41.40833 | 11.82505
size: (1, 56, 264, 264), kernel: 3, stride: 1, mem_format: CL | 9.88405 | 38.39137 | 29.62069 | 7.10636 | 36.97535 | 11.0525
size: (32, 16, 200, 200), kernel: 3, stride: 1, mem_format: contig | 476.782 | 855.4769 | 648.2248 | 46.6488 | 219.2586 | 67.10599
size: (32, 16, 200, 200), kernel: 3, stride: 1, mem_format: CL | 80.29271 | 91.33854 | 87.80345 | 48.81692 | 203.9974 | 63.39004
size: (32, 32, 100, 100), kernel: 3, stride: 1, mem_format: contig | 235.2113 | 419.0799 | 315.4284 | 20.6049 | 107.1524 | 32.39169
size: (32, 32, 100, 100), kernel: 3, stride: 1, mem_format: CL | 29.47653 | 33.54905 | 32.82823 | 22.59674 | 98.5586 | 30.05763
size: (4, 19, 10, 16, 16), kernel: 3, stride: 1, mem_format: contig | 7.90684 | 13.9208 | 10.03272 | 0.23725 | 1.35269 | 0.41728
size: (4, 19, 10, 16, 16), kernel: 3, stride: 1, mem_format: CL3d | 2.33638 | 3.36894 | 2.64635 | 0.26535 | 1.244 | 0.38895

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98819
Approved by: https://github.com/mingfeima, https://github.com/mikaylagawarecki
diff --git a/test/test_mps.py b/test/test_mps.py index 7a06efc..1159e8d 100644 --- a/test/test_mps.py +++ b/test/test_mps.py
@@ -10599,6 +10599,9 @@ 'nn.functional.triplet_margin_loss', 'nn.functional.triplet_margin_with_distance_loss', 'round', 'xlogy', 'addcmul', + 'nn.functional.max_pool2d', + 'nn.functional.gelu', + 'nn.functional.glu', # for macOS 12 'masked.normalize', 'masked.sum', 'masked.var', @@ -10756,10 +10759,6 @@ cpu_grad_inputs = torch.autograd.grad(diff_cpu_out, diff_cpu_arg, grad_outputs=cpu_grad_outputs, allow_unused=True) mps_grad_inputs = torch.autograd.grad(diff_mps_out, diff_mps_arg, grad_outputs=mps_grad_outputs, allow_unused=True) - if op.name in ["nn.functional.gelu", "nn.functional.glu"] and dtype == torch.float16: - atol = 1e-3 - rtol = 1e-3 - self.assertEqual(cpu_grad_inputs, mps_grad_inputs, atol=atol, rtol=rtol)