Use hashing ideas from Carbon's hashtable in absl hashing:
- Use a byte swap instead of mixing pointers twice (see the sketch below).
- Change the order of branches to check len<=8 first.
- In the len<=16 case, do one multiply to mix the data instead of the logic from go/absl-hash-rl (an instruction sequence previously tuned with reinforcement learning).
- Add special handling for the len<=32 case on 64-bit architectures.

Note that we optimize for latency at the expense of some hash quality. We had to lower the hash-quality expectations in some test cases, but load tests show performance wins.
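As a reading aid, a minimal standalone sketch of the byte-swap mixing idea from the first bullet (not the absl source; it assumes GCC/Clang's __builtin_bswap64): the multiply pushes entropy toward the high bits, and the byte swap moves it back into the low bits that Swisstable probes.

    #include <cstdint>

    // Sketch of the WeakMix idea: one multiply by an odd constant, then a
    // byte swap so the low bits inherit the well-mixed high bits. kMul is
    // the 64-bit constant introduced by this change.
    inline uint64_t WeakMixSketch(uint64_t n) {
      constexpr uint64_t kMul = 0xdcb22ca68cb134ed;
      return __builtin_bswap64(n * kMul);  // GCC/Clang builtin
    }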

PiperOrigin-RevId: 710774806
Change-Id: Idc0fd3a9f77dae5f649db138d0ee7225b0a02fd3
diff --git a/absl/hash/BUILD.bazel b/absl/hash/BUILD.bazel
index 4e4f0bd..66ad2ad 100644
--- a/absl/hash/BUILD.bazel
+++ b/absl/hash/BUILD.bazel
@@ -86,15 +86,10 @@
         ":hash_testing",
         ":spy_hash_state",
         "//absl/base:config",
-        "//absl/base:core_headers",
-        "//absl/container:btree",
-        "//absl/container:flat_hash_map",
         "//absl/container:flat_hash_set",
-        "//absl/container:node_hash_map",
-        "//absl/container:node_hash_set",
         "//absl/memory",
         "//absl/meta:type_traits",
-        "//absl/numeric:int128",
+        "//absl/numeric:bits",
         "//absl/strings:cord_test_helpers",
         "//absl/strings:string_view",
         "//absl/types:optional",
diff --git a/absl/hash/CMakeLists.txt b/absl/hash/CMakeLists.txt
index 99d6fa1..2281d7b 100644
--- a/absl/hash/CMakeLists.txt
+++ b/absl/hash/CMakeLists.txt
@@ -68,18 +68,13 @@
   COPTS
     ${ABSL_TEST_COPTS}
   DEPS
-    absl::btree
+    absl::bits
     absl::cord_test_helpers
-    absl::core_headers
-    absl::flat_hash_map
     absl::flat_hash_set
     absl::hash
     absl::hash_testing
-    absl::int128
     absl::memory
     absl::meta
-    absl::node_hash_map
-    absl::node_hash_set
     absl::optional
     absl::spy_hash_state
     absl::string_view
diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc
index ed31a2d..0f22983 100644
--- a/absl/hash/hash_test.cc
+++ b/absl/hash/hash_test.cc
@@ -44,6 +44,7 @@
 #include "absl/hash/internal/spy_hash_state.h"
 #include "absl/memory/memory.h"
 #include "absl/meta/type_traits.h"
+#include "absl/numeric/bits.h"
 #include "absl/strings/cord_test_helpers.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
@@ -163,9 +164,11 @@
       std::make_tuple(&i, ptr, nullptr, ptr + 1, n)));
 }
 
+// The test is flaky in ASan and on Apple platforms.
+#if !defined(ABSL_HAVE_ADDRESS_SANITIZER) && !defined(__APPLE__)
 TEST(HashValueTest, PointerAlignment) {
-  // We want to make sure that pointer alignment will not cause bits to be
-  // stuck.
+  // We want to make sure that pointer alignment will not cause too many bits to
+  // be stuck.
 
   constexpr size_t kTotalSize = 1 << 20;
   std::unique_ptr<char[]> data(new char[kTotalSize]);
@@ -189,9 +192,12 @@
     // Limit the scope to the bits we would be using for Swisstable.
     constexpr size_t kMask = (1 << (kLog2NumValues + 7)) - 1;
     size_t stuck_bits = (~bits_or | bits_and) & kMask;
-    EXPECT_EQ(stuck_bits, 0u) << "0x" << std::hex << stuck_bits;
+    // Test that there are fewer than 3 stuck bits. Sometimes we see a
+    // stuck_bits value of 0x3.
+    EXPECT_LT(absl::popcount(stuck_bits), 3) << "0x" << std::hex << stuck_bits;
   }
 }
+#endif  // !defined(ABSL_HAVE_ADDRESS_SANITIZER) && !defined(__APPLE__)
 
 TEST(HashValueTest, PointerToMember) {
   struct Bass {
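
A brief aside between files: the relaxed assertion above operates on a "stuck bits" metric. A hedged sketch, with std::popcount standing in for absl::popcount (the real test accumulates bits_or/bits_and over hashes of pointers at varying alignments):

    #include <bit>
    #include <cstdint>

    // A bit is "stuck" if it is 0 in every hash (clear in bits_or) or 1 in
    // every hash (set in bits_and); mask limits the check to the bits
    // Swisstable actually uses.
    int CountStuckBits(const uint64_t* hashes, int n, uint64_t mask) {
      uint64_t bits_or = 0;
      uint64_t bits_and = ~uint64_t{0};
      for (int i = 0; i < n; ++i) {
        bits_or |= hashes[i];
        bits_and &= hashes[i];
      }
      return std::popcount((~bits_or | bits_and) & mask);
    }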
diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc
index 60d3e7f..e0a8ea9 100644
--- a/absl/hash/internal/hash.cc
+++ b/absl/hash/internal/hash.cc
@@ -29,9 +29,10 @@
 uint64_t MixingHashState::CombineLargeContiguousImpl32(
     uint64_t state, const unsigned char* first, size_t len) {
   while (len >= PiecewiseChunkSize()) {
-    state = Mix(state,
-                hash_internal::CityHash32(reinterpret_cast<const char*>(first),
-                                          PiecewiseChunkSize()));
+    state = Mix(
+        state ^ hash_internal::CityHash32(reinterpret_cast<const char*>(first),
+                                          PiecewiseChunkSize()),
+        kMul);
     len -= PiecewiseChunkSize();
     first += PiecewiseChunkSize();
   }
@@ -43,7 +44,7 @@
 uint64_t MixingHashState::CombineLargeContiguousImpl64(
     uint64_t state, const unsigned char* first, size_t len) {
   while (len >= PiecewiseChunkSize()) {
-    state = Mix(state, Hash64(first, PiecewiseChunkSize()));
+    state = Mix(state ^ Hash64(first, PiecewiseChunkSize()), kMul);
     len -= PiecewiseChunkSize();
     first += PiecewiseChunkSize();
   }
@@ -54,22 +55,13 @@
 
 ABSL_CONST_INIT const void* const MixingHashState::kSeed = &kSeed;
 
-// The salt array used by LowLevelHash. This array is NOT the mechanism used to
-// make absl::Hash non-deterministic between program invocations.  See `Seed()`
-// for that mechanism.
-//
-// Any random values are fine. These values are just digits from the decimal
-// part of pi.
-// https://en.wikipedia.org/wiki/Nothing-up-my-sleeve_number
-constexpr uint64_t kHashSalt[5] = {
-    uint64_t{0x243F6A8885A308D3}, uint64_t{0x13198A2E03707344},
-    uint64_t{0xA4093822299F31D0}, uint64_t{0x082EFA98EC4E6C89},
-    uint64_t{0x452821E638D01377},
-};
+#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
+constexpr uint64_t MixingHashState::kStaticRandomData[];
+#endif
 
 uint64_t MixingHashState::LowLevelHashImpl(const unsigned char* data,
                                            size_t len) {
-  return LowLevelHashLenGt16(data, len, Seed(), kHashSalt);
+  return LowLevelHashLenGt16(data, len, Seed(), &kStaticRandomData[0]);
 }
 
 }  // namespace hash_internal
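
An aside on the fold step above: both loops now compute Mix(state ^ chunk_hash, kMul). A hedged, self-contained sketch, using GCC/Clang's unsigned __int128 where the real code uses absl::uint128 on platforms without a 128-bit intrinsic:

    #include <cstdint>

    // Sketch (not the absl source): xor the chunk hash into the running
    // state, do one 64x64->128 multiply by kMul, then xor-fold the halves.
    inline uint64_t MixSketch(uint64_t lhs, uint64_t rhs) {
      unsigned __int128 m = static_cast<unsigned __int128>(lhs) * rhs;
      return static_cast<uint64_t>(m) ^ static_cast<uint64_t>(m >> 64);
    }

    inline uint64_t CombineChunkSketch(uint64_t state, uint64_t chunk_hash) {
      constexpr uint64_t kMul = 0xdcb22ca68cb134ed;  // constant from this change
      return MixSketch(state ^ chunk_hash, kMul);
    }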
diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
index d8300da..f70170b 100644
--- a/absl/hash/internal/hash.h
+++ b/absl/hash/internal/hash.h
@@ -59,6 +59,7 @@
 #include <vector>
 
 #include "absl/base/attributes.h"
+#include "absl/base/internal/endian.h"
 #include "absl/base/internal/unaligned_access.h"
 #include "absl/base/optimization.h"
 #include "absl/base/port.h"
@@ -457,10 +458,9 @@
                                                              T ptr) {
   auto v = reinterpret_cast<uintptr_t>(ptr);
   // Due to alignment, pointers tend to have low bits as zero, and the next few
-  // bits follow a pattern since they are also multiples of some base value.
-  // Mixing the pointer twice helps prevent stuck low bits for certain alignment
-  // values.
-  return H::combine(std::move(hash_state), v, v);
+  // bits follow a pattern since they are also multiples of some base value. The
+  // byte swap in WeakMix helps ensure we still have good entropy in low bits.
+  return H::combine(std::move(hash_state), v);
 }
 
 // AbslHashValue() for hashing nullptr_t
@@ -1021,9 +1021,16 @@
   using uint128 = absl::uint128;
 #endif  // ABSL_HAVE_INTRINSIC_INT128
 
+  // Random data taken from the hexadecimal digits of Pi's fractional component.
+  // https://en.wikipedia.org/wiki/Nothing-up-my-sleeve_number
+  ABSL_CACHELINE_ALIGNED static constexpr uint64_t kStaticRandomData[] = {
+      0x243f'6a88'85a3'08d3, 0x1319'8a2e'0370'7344, 0xa409'3822'299f'31d0,
+      0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377,
+  };
+
   static constexpr uint64_t kMul =
   sizeof(size_t) == 4 ? uint64_t{0xcc9e2d51}
-                      : uint64_t{0x9ddfea08eb382d69};
+                      : uint64_t{0xdcb22ca68cb134ed};
 
   template <typename T>
   struct FitsIn64Bits : std::integral_constant<bool, sizeof(T) <= 8> {};
@@ -1061,7 +1068,7 @@
   template <typename T, absl::enable_if_t<IntegralFastPath<T>::value, int> = 0>
   static size_t hash(T value) {
     return static_cast<size_t>(
-        Mix(Seed(), static_cast<std::make_unsigned_t<T>>(value)));
+        WeakMix(Seed() ^ static_cast<std::make_unsigned_t<T>>(value)));
   }
 
   // Overload of MixingHashState::hash()
@@ -1121,6 +1128,49 @@
                                         std::integral_constant<int, 8>
                                         /* sizeof_size_t */);
 
+  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t CombineSmallContiguousImpl(
+      uint64_t state, const unsigned char* first, size_t len) {
+    ABSL_ASSUME(len <= 8);
+    uint64_t v;
+    if (len >= 4) {
+      v = Read4To8(first, len);
+    } else if (len > 0) {
+      v = Read1To3(first, len);
+    } else {
+      // Empty ranges have no effect.
+      return state;
+    }
+    return WeakMix(state ^ v);
+  }
+
+  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t CombineContiguousImpl9to16(
+      uint64_t state, const unsigned char* first, size_t len) {
+    ABSL_ASSUME(len >= 9);
+    ABSL_ASSUME(len <= 16);
+    // Note: any time one half of the mix function becomes zero it will fail to
+    // incorporate any bits from the other half. However, only 1 in 2^64 values
+    // on each side achieves this, and only when the size is exactly 16 -- for
+    // smaller sizes there is an overlapping byte that makes
+    // this impossible unless the seed is *also* incredibly unlucky.
+    auto p = Read9To16(first, len);
+    return Mix(state ^ p.first, kMul ^ p.second);
+  }
+
+  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t CombineContiguousImpl17to32(
+      uint64_t state, const unsigned char* first, size_t len) {
+    ABSL_ASSUME(len >= 17);
+    ABSL_ASSUME(len <= 32);
+    // Do two mixes of overlapping 16-byte ranges in parallel to minimize
+    // latency.
+    const uint64_t m0 =
+        Mix(Read8(first) ^ kStaticRandomData[1], Read8(first + 8) ^ state);
+
+    const unsigned char* tail_16b_ptr = first + (len - 16);
+    const uint64_t m1 = Mix(Read8(tail_16b_ptr) ^ kStaticRandomData[3],
+                            Read8(tail_16b_ptr + 8) ^ state);
+    return m0 ^ m1;
+  }
+
   // Slow dispatch path for calls to CombineContiguousImpl with a size argument
   // larger than PiecewiseChunkSize().  Has the same effect as calling
   // CombineContiguousImpl() repeatedly with the chunk stride size.
@@ -1136,8 +1186,8 @@
   // are in .second.
   static std::pair<uint64_t, uint64_t> Read9To16(const unsigned char* p,
                                                  size_t len) {
-    uint64_t low_mem = absl::base_internal::UnalignedLoad64(p);
-    uint64_t high_mem = absl::base_internal::UnalignedLoad64(p + len - 8);
+    uint64_t low_mem = Read8(p);
+    uint64_t high_mem = Read8(p + len - 8);
 #ifdef ABSL_IS_LITTLE_ENDIAN
     uint64_t most_significant = high_mem;
     uint64_t least_significant = low_mem;
@@ -1148,7 +1198,24 @@
     return {least_significant, most_significant};
   }
 
+  // Reads 8 bytes from p.
+  static uint64_t Read8(const unsigned char* p) {
+    // Suppress erroneous array bounds errors on GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+    return absl::base_internal::UnalignedLoad64(p);
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+  }
+
   // Reads 4 to 8 bytes from p. Zero pads to fill uint64_t.
+  // TODO(b/384509507): consider optimizing this by not requiring the output to
+  // be equivalent to an integer load for 4/8 bytes. Currently, we rely on this
+  // behavior for the HashConsistentAcrossIntTypes test case. Ditto for
+  // Read1To3.
   static uint64_t Read4To8(const unsigned char* p, size_t len) {
     uint32_t low_mem = absl::base_internal::UnalignedLoad32(p);
     uint32_t high_mem = absl::base_internal::UnalignedLoad32(p + len - 4);
@@ -1183,20 +1250,24 @@
                                  (significant2 << ((len - 1) * 8)));
   }
 
-  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t Mix(uint64_t state, uint64_t v) {
+  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t Mix(uint64_t lhs, uint64_t rhs) {
     // Though the 128-bit product on AArch64 needs two instructions, it is
     // still a good balance between speed and hash quality.
     using MultType =
         absl::conditional_t<sizeof(size_t) == 4, uint64_t, uint128>;
-    // We do the addition in 64-bit space to make sure the 128-bit
-    // multiplication is fast. If we were to do it as MultType the compiler has
-    // to assume that the high word is non-zero and needs to perform 2
-    // multiplications instead of one.
-    MultType m = state + v;
-    m *= kMul;
+    MultType m = lhs;
+    m *= rhs;
     return static_cast<uint64_t>(m ^ (m >> (sizeof(m) * 8 / 2)));
   }
 
+  // Slightly lower latency than Mix, but with lower quality. The byte swap
+  // helps ensure that low bits still have high quality.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE static uint64_t WeakMix(uint64_t n) {
+    // WeakMix doesn't work well on 32-bit platforms, so just use Mix.
+    if (sizeof(size_t) < 8) return Mix(n, kMul);
+    return absl::gbswap_64(n * kMul);
+  }
+
   // An extern to avoid bloat on a direct call to LowLevelHash() with fixed
   // values for both the seed and salt parameters.
   static uint64_t LowLevelHashImpl(const unsigned char* data, size_t len);
@@ -1248,21 +1319,15 @@
     std::integral_constant<int, 4> /* sizeof_size_t */) {
   // For large values we use CityHash, for small ones we just use a
   // multiplicative hash.
-  uint64_t v;
-  if (len > 8) {
-    if (ABSL_PREDICT_FALSE(len > PiecewiseChunkSize())) {
-      return CombineLargeContiguousImpl32(state, first, len);
-    }
-    v = hash_internal::CityHash32(reinterpret_cast<const char*>(first), len);
-  } else if (len >= 4) {
-    v = Read4To8(first, len);
-  } else if (len > 0) {
-    v = Read1To3(first, len);
-  } else {
-    // Empty ranges have no effect.
-    return state;
+  if (len <= 8) {
+    return CombineSmallContiguousImpl(state, first, len);
   }
-  return Mix(state, v);
+  if (ABSL_PREDICT_TRUE(len <= PiecewiseChunkSize())) {
+    return Mix(state ^ hash_internal::CityHash32(
+                           reinterpret_cast<const char*>(first), len),
+               kMul);
+  }
+  return CombineLargeContiguousImpl32(state, first, len);
 }
 
 // Overload of MixingHashState::CombineContiguousImpl()
@@ -1271,38 +1336,19 @@
     std::integral_constant<int, 8> /* sizeof_size_t */) {
   // For large values we use LowLevelHash or CityHash depending on the platform,
   // for small ones we just use a multiplicative hash.
-  uint64_t v;
-  if (len > 16) {
-    if (ABSL_PREDICT_FALSE(len > PiecewiseChunkSize())) {
-      return CombineLargeContiguousImpl64(state, first, len);
-    }
-    v = Hash64(first, len);
-  } else if (len > 8) {
-    // This hash function was constructed by the ML-driven algorithm discovery
-    // using reinforcement learning. We fed the agent lots of inputs from
-    // microbenchmarks, SMHasher, low hamming distance from generated inputs and
-    // picked up the one that was good on micro and macrobenchmarks.
-    auto p = Read9To16(first, len);
-    uint64_t lo = p.first;
-    uint64_t hi = p.second;
-    // Rotation by 53 was found to be most often useful when discovering these
-    // hashing algorithms with ML techniques.
-    lo = absl::rotr(lo, 53);
-    state += kMul;
-    lo += state;
-    state ^= hi;
-    uint128 m = state;
-    m *= lo;
-    return static_cast<uint64_t>(m ^ (m >> 64));
-  } else if (len >= 4) {
-    v = Read4To8(first, len);
-  } else if (len > 0) {
-    v = Read1To3(first, len);
-  } else {
-    // Empty ranges have no effect.
-    return state;
+  if (len <= 8) {
+    return CombineSmallContiguousImpl(state, first, len);
   }
-  return Mix(state, v);
+  if (len <= 16) {
+    return CombineContiguousImpl9to16(state, first, len);
+  }
+  if (len <= 32) {
+    return CombineContiguousImpl17to32(state, first, len);
+  }
+  if (ABSL_PREDICT_TRUE(len <= PiecewiseChunkSize())) {
+    return Mix(state ^ Hash64(first, len), kMul);
+  }
+  return CombineLargeContiguousImpl64(state, first, len);
 }
 
 struct AggregateBarrier {};
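
Closing aside: a hedged sketch of the new 17..32-byte fast path. The windows [0, 16) and [len - 16, len) overlap but together cover every byte, and the two Mix calls have no data dependency, so they can execute in parallel. Read8Sketch is a hypothetical memcpy-based stand-in for the patch's unaligned load, and the salts are the pi-digit constants from kStaticRandomData.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    static uint64_t Read8Sketch(const unsigned char* p) {
      uint64_t v;
      std::memcpy(&v, p, sizeof(v));  // unaligned-safe 8-byte load
      return v;
    }

    static uint64_t MixSketch(uint64_t lhs, uint64_t rhs) {
      unsigned __int128 m = static_cast<unsigned __int128>(lhs) * rhs;
      return static_cast<uint64_t>(m) ^ static_cast<uint64_t>(m >> 64);
    }

    // Two overlapping 16-byte windows, mixed independently and xor'd together.
    static uint64_t Hash17To32Sketch(uint64_t state, const unsigned char* p,
                                     size_t len) {
      constexpr uint64_t kSalt1 = 0x13198a2e03707344;
      constexpr uint64_t kSalt3 = 0x082efa98ec4e6c89;
      const uint64_t m0 = MixSketch(Read8Sketch(p) ^ kSalt1,
                                    Read8Sketch(p + 8) ^ state);
      const unsigned char* tail = p + (len - 16);
      const uint64_t m1 = MixSketch(Read8Sketch(tail) ^ kSalt3,
                                    Read8Sketch(tail + 8) ^ state);
      return m0 ^ m1;
    }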