Revert "[atomic] Make pointer get op relaxed instead of acquire"

This reverts commit b1e5650c67266dc158f22355fed206cd1c413f70.

After lots of head-scratching and finally finding the only truly
readable source to be the good old:

  https://www.kernel.org/doc/Documentation/memory-barriers.txt

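I've convinced myself that we need at least consume memory ordering on
get(); this revert restores acquire, which is stronger and covers it.
The memory barrier in a load belongs after the load, not before it.
The barrier-based fallbacks below still issue it before the load; that
needs fixing, and I'll do that separately.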
diff --git a/src/hb-atomic-private.hh b/src/hb-atomic-private.hh
index c860582..02cf6f3 100644
--- a/src/hb-atomic-private.hh
+++ b/src/hb-atomic-private.hh
@@ -40,19 +40,20 @@
 /* We need external help for these */
 
 #if defined(hb_atomic_int_impl_add) \
+ && defined(hb_atomic_ptr_impl_get) \
  && defined(hb_atomic_ptr_impl_cmpexch)
 
 /* Defined externally, i.e. in config.h; must have typedef'ed hb_atomic_int_impl_t as well. */
 
 
-#elif !defined(HB_NO_MT) && defined(__ATOMIC_RELAXED)
+#elif !defined(HB_NO_MT) && defined(__ATOMIC_ACQUIRE)
 
 /* C++11-style GCC primitives. */
 
 typedef int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		__atomic_fetch_add (&(AI), (V), __ATOMIC_ACQ_REL)
 
-#define hb_atomic_ptr_impl_get(P)		__atomic_load_n ((P), __ATOMIC_RELAXED)
+#define hb_atomic_ptr_impl_get(P)		__atomic_load_n ((P), __ATOMIC_ACQUIRE)
 static inline bool
 _hb_atomic_ptr_impl_cmplexch (const void **P, const void *O_, const void *N)
 {
@@ -70,7 +71,7 @@
 typedef int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		(reinterpret_cast<std::atomic<int> *> (&AI)->fetch_add ((V), std::memory_order_acq_rel))
 
-#define hb_atomic_ptr_impl_get(P)		(reinterpret_cast<std::atomic<void*> *> (P)->load (std::memory_order_relaxed))
+#define hb_atomic_ptr_impl_get(P)		(reinterpret_cast<std::atomic<void*> *> (P)->load (std::memory_order_acquire))
 static inline bool
 _hb_atomic_ptr_impl_cmplexch (const void **P, const void *O_, const void *N)
 {
@@ -84,9 +85,22 @@
 
 #include <windows.h>
 
+/* MinGW has a convoluted history of supporting MemoryBarrier properly, so
+ * wrap it: call MemoryBarrier () when it is defined as a macro, otherwise
+ * use an InterlockedExchange on a dummy local to stand in for the barrier. */
+static inline void _HBMemoryBarrier (void) {
+#if !defined(MemoryBarrier)
+  long dummy = 0;
+  InterlockedExchange (&dummy, 1);
+#else
+  MemoryBarrier ();
+#endif
+}
+
 typedef LONG hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		InterlockedExchangeAdd (&(AI), (V))
 
+#define hb_atomic_ptr_impl_get(P)		(_HBMemoryBarrier (), (void *) *(P))
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)	(InterlockedCompareExchangePointer ((void **) (P), (void *) (N), (void *) (O)) == (void *) (O))
 
 
@@ -95,6 +109,7 @@
 typedef int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		__sync_fetch_and_add (&(AI), (V))
 
+#define hb_atomic_ptr_impl_get(P)		(void *) (__sync_synchronize (), *(P))
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)	__sync_bool_compare_and_swap ((P), (O), (N))
 
 
@@ -104,9 +119,10 @@
 #include <mbarrier.h>
 
 typedef unsigned int hb_atomic_int_impl_t;
-#define hb_atomic_int_impl_add(AI, V)		(({__machine_rw_barrier ();}), atomic_add_int_nv (&(AI), (V)) - (V))
+#define hb_atomic_int_impl_add(AI, V)		( ({__machine_rw_barrier ();}), atomic_add_int_nv (&(AI), (V)) - (V))
 
-#define hb_atomic_ptr_impl_cmpexch(P,O,N)	(({__machine_rw_barrier ();}), atomic_cas_ptr ((void **) (P), (void *) (O), (void *) (N)) == (void *) (O) ? true : false)
+#define hb_atomic_ptr_impl_get(P)		( ({__machine_rw_barrier ();}), (void *) *(P))
+#define hb_atomic_ptr_impl_cmpexch(P,O,N)	( ({__machine_rw_barrier ();}), atomic_cas_ptr ((void **) (P), (void *) (O), (void *) (N)) == (void *) (O) ? true : false)
 
 
 #elif !defined(HB_NO_MT) && defined(__APPLE__)
@@ -122,6 +138,7 @@
 typedef int32_t hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		(OSAtomicAdd32Barrier ((V), &(AI)) - (V))
 
+#define hb_atomic_ptr_impl_get(P)		(OSMemoryBarrier (), (void *) *(P))
 #if (MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_4 || __IPHONE_VERSION_MIN_REQUIRED >= 20100)
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)	OSAtomicCompareAndSwapPtrBarrier ((void *) (O), (void *) (N), (void **) (P))
 #else
@@ -154,6 +171,7 @@
 typedef int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)           _hb_fetch_and_add (&(AI), (V))
 
+#define hb_atomic_ptr_impl_get(P)               (__sync(), (void *) *(P))
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)       _hb_compare_and_swaplp ((long*)(P), (long)(O), (long)(N))
 
 #elif !defined(HB_NO_MT)
@@ -163,6 +181,7 @@
 typedef volatile int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		(((AI) += (V)) - (V))
 
+#define hb_atomic_ptr_impl_get(P)		((void *) *(P))
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)	(* (void * volatile *) (P) == (void *) (O) ? (* (void * volatile *) (P) = (void *) (N), true) : false)
 
 
@@ -171,16 +190,13 @@
 typedef int hb_atomic_int_impl_t;
 #define hb_atomic_int_impl_add(AI, V)		(((AI) += (V)) - (V))
 
+#define hb_atomic_ptr_impl_get(P)		((void *) *(P))
 #define hb_atomic_ptr_impl_cmpexch(P,O,N)	(* (void **) (P) == (void *) (O) ? (* (void **) (P) = (void *) (N), true) : false)
 
 
 #endif
 
 
-#ifndef hb_atomic_ptr_impl_get
-#define hb_atomic_ptr_impl_get(P)		((void *) *(P))
-#endif
-
 #ifndef HB_ATOMIC_INT_INIT
 #define HB_ATOMIC_INT_INIT(V)          {V}
 #endif