pl/math: Add scalar atan and set fenv in Neon atan

The simplest way to set fenv in Neon atan is by using a scalar
fallback for under/overflow cases, however this routine did not have a
scalar counterpart so we add a new one, based on the same algorithm
and polynomial as the vector variants, and accurate to 2.5 ULP. This
is now used as the fallback for all lanes, when any lane of the Neon
input is special.
diff --git a/pl/math/atan_2u5.c b/pl/math/atan_2u5.c
new file mode 100644
index 0000000..99fea0f
--- /dev/null
+++ b/pl/math/atan_2u5.c
@@ -0,0 +1,73 @@
+/*
+ * Double-precision atan(x) function.
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "atan_common.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define PiOver2 0x1.921fb54442d18p+0
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
+#define OneTop 0x3ff
+
+/* Fast implementation of double-precision atan.
+   Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
+   z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
+   atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+			     want 0x1.9225645bdd7c3p-1.  */
+double
+atan (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t sign = ix & ~AbsMask;
+  uint64_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 52;
+
+  if (unlikely (ia12 >= BigBound || ia12 < TinyBound))
+    {
+      if (ia12 < TinyBound)
+	/* Avoid underflow by returning x.  */
+	return x;
+      if (ia > 0x7ff0000000000000)
+	/* Propagate NaN.  */
+	return __math_invalid (x);
+      /* atan(x) rounds to PiOver2 for large x.  */
+      return asdouble (asuint64 (PiOver2) ^ sign);
+    }
+
+  double z, az, shift;
+  if (ia12 >= OneTop)
+    {
+      /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x).  */
+      z = -1.0 / x;
+      shift = PiOver2;
+      /* Use absolute value only when needed (odd powers of z).  */
+      az = -fabs (z);
+    }
+  else
+    {
+      /* For x < 1, approximate atan(x) directly.  */
+      z = x;
+      shift = 0;
+      az = asdouble (ia);
+    }
+
+  /* Calculate polynomial, shift + z + z^3 * P(z^2).  */
+  double y = eval_poly (z, az, shift);
+  /* Copy sign.  */
+  return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, atan, -10.0, 10.0)
+PL_TEST_ULP (atan, 1.78)
+PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000)
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index 44cbc73..05fa306 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -27,6 +27,7 @@
 
 double acosh (double);
 double asinh (double);
+double atan (double);
 double atan2 (double, double);
 double cbrt (double);
 double cosh (double);
diff --git a/pl/math/test/testcases/directed/atan.tst b/pl/math/test/testcases/directed/atan.tst
new file mode 100644
index 0000000..5716276
--- /dev/null
+++ b/pl/math/test/testcases/directed/atan.tst
@@ -0,0 +1,22 @@
+; atan.tst
+;
+; Copyright 1999-2022, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=atan op1=00000000.00000000 result=00000000.00000000 errno=0
+func=atan op1=80000000.00000000 result=80000000.00000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
+
+func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0
+func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0
diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c
index 43b4abd..92479ab 100644
--- a/pl/math/v_atan_2u5.c
+++ b/pl/math/v_atan_2u5.c
@@ -15,6 +15,8 @@
 
 #define PiOver2 v_f64 (0x1.921fb54442d18p+0)
 #define AbsMask v_u64 (0x7fffffffffffffff)
+#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)).  */
+#define BigBound 0x434	/* top12(asuint64(0x1p53)).  */
 
 /* Fast implementation of vector atan.
    Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
@@ -24,11 +26,20 @@
 VPCS_ATTR
 v_f64_t V_NAME (atan) (v_f64_t x)
 {
-  /* No need to trigger special case. Small cases, infs and nans
-     are supported by our approximation technique.  */
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
   v_u64_t ix = v_as_u64_f64 (x);
   v_u64_t sign = ix & ~AbsMask;
 
+#if WANT_SIMD_EXCEPT
+  v_u64_t ia12 = (ix >> 52) & 0x7ff;
+  v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound);
+  /* If any lane is special, fall back to the scalar routine for all lanes.  */
+  if (unlikely (v_any_u64 (special)))
+    return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
+#endif
+
   /* Argument reduction:
      y := arctan(x) for x < 1
      y := pi/2 + arctan(-1/x) for x > 1
@@ -46,16 +57,18 @@
 
   /* y = atan(x) if x>0, -atan(-x) otherwise.  */
   y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
-
   return y;
 }
 VPCS_ALIAS
 
 PL_SIG (V, D, 1, atan, -10.0, 10.0)
 PL_TEST_ULP (V_NAME (atan), 1.78)
-PL_TEST_INTERVAL (V_NAME (atan), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME (atan), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan), 1e6, 1e32, 40000)
+PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000)
+
 #endif