/*
 * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHARED_TASKQUEUE_INLINE_HPP
#define SHARE_GC_SHARED_TASKQUEUE_INLINE_HPP

#include "gc/shared/taskqueue.hpp"

#include "logging/log.hpp"
#include "logging/logStream.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/atomic.hpp"
#include "runtime/orderAccess.hpp"
#include "utilities/debug.hpp"
#include "utilities/ostream.hpp"
#include "utilities/stack.inline.hpp"
template <class T, MEMFLAGS F>
inline GenericTaskQueueSet<T, F>::GenericTaskQueueSet(uint n) : _n(n) {
  typedef T* GenericTaskQueuePtr;
  _queues = NEW_C_HEAP_ARRAY(GenericTaskQueuePtr, n, F);
  for (uint i = 0; i < n; i++) {
    _queues[i] = nullptr;
  }
}

template <class T, MEMFLAGS F>
inline GenericTaskQueueSet<T, F>::~GenericTaskQueueSet() {
  FREE_C_HEAP_ARRAY(T*, _queues);
}
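
// A minimal construction sketch (illustrative, not part of this file's API):
// the element type and typedef names below are placeholders following the
// typedef conventions in taskqueue.hpp; register_queue() is declared there,
// and "num_workers" is a hypothetical worker count.
//
//   typedef GenericTaskQueue<ScannerTask, mtGC> ScannerTasksQueue;
//   typedef GenericTaskQueueSet<ScannerTasksQueue, mtGC> ScannerTasksQueueSet;
//
//   ScannerTasksQueueSet* qset = new ScannerTasksQueueSet(num_workers);
//   for (uint i = 0; i < num_workers; i++) {
//     qset->register_queue(i, new ScannerTasksQueue());
//   }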

#if TASKQUEUE_STATS
template<class T, MEMFLAGS F>
void GenericTaskQueueSet<T, F>::print_taskqueue_stats_hdr(outputStream* const st, const char* label) {
  st->print_cr("GC Task Stats %s", label);
  st->print("thr "); TaskQueueStats::print_header(1, st); st->cr();
  st->print("--- "); TaskQueueStats::print_header(2, st); st->cr();
}

template<class T, MEMFLAGS F>
void GenericTaskQueueSet<T, F>::print_taskqueue_stats(outputStream* const st, const char* label) {
  print_taskqueue_stats_hdr(st, label);

  TaskQueueStats totals;
  const uint n = size();
  for (uint i = 0; i < n; ++i) {
    st->print("%3u ", i); queue(i)->stats.print(st); st->cr();
    totals += queue(i)->stats;
  }
  st->print_raw("tot "); totals.print(st); st->cr();

  DEBUG_ONLY(totals.verify());
}

template<class T, MEMFLAGS F>
void GenericTaskQueueSet<T, F>::reset_taskqueue_stats() {
  const uint n = size();
  for (uint i = 0; i < n; ++i) {
    queue(i)->stats.reset();
  }
}

template <class T, MEMFLAGS F>
inline void GenericTaskQueueSet<T, F>::print_and_reset_taskqueue_stats(const char* label) {
  if (!log_is_enabled(Trace, gc, task, stats)) {
    return;
  }
  Log(gc, task, stats) log;
  ResourceMark rm;
  LogStream ls(log.trace());

  print_taskqueue_stats(&ls, label);
  reset_taskqueue_stats();
}
#endif // TASKQUEUE_STATS

template<class E, MEMFLAGS F, unsigned int N>
inline GenericTaskQueue<E, F, N>::GenericTaskQueue() :
  _elems(ArrayAllocator<E>::allocate(N, F)),
  _last_stolen_queue_id(InvalidQueueId),
  _seed(17 /* random number */) {}

template<class E, MEMFLAGS F, unsigned int N>
inline GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
  ArrayAllocator<E>::free(_elems, N);
}

template<class E, MEMFLAGS F, unsigned int N> inline bool
GenericTaskQueue<E, F, N>::push(E t) {
  uint localBot = bottom_relaxed();
  assert(localBot < N, "_bottom out of range.");
  idx_t top = age_top_relaxed();
  uint dirty_n_elems = dirty_size(localBot, top);
  // A dirty_size of N-1 cannot happen in push. Considering only push:
  // (1) dirty_n_elems is initially 0.
  // (2) push adds an element iff dirty_n_elems < max_elems(), which is N - 2.
  // (3) only a push that adds an element can increase dirty_n_elems (by one).
  // => dirty_n_elems <= N - 2, by induction
  // => dirty_n_elems < N - 1, invariant
  //
  // A pop_global that is concurrent with push cannot produce a state where
  // dirty_size == N-1. pop_global only removes an element if clean_size() > 0,
  // so it can't underflow to -1 (== N-1) in combination with push.
  assert(dirty_n_elems <= max_elems(), "n_elems out of range.");
  if (dirty_n_elems < max_elems()) {
    _elems[localBot] = t;
    release_set_bottom(increment_index(localBot));
    TASKQUEUE_STATS_ONLY(stats.record_push());
    return true;
  }
  return false; // Queue is full.
}
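
// Worked example of the size arithmetic above (illustrative; dirty_size()
// is (bot - top) & MOD_N_MASK with MOD_N_MASK == N - 1, per taskqueue.hpp).
// Take N == 8 with bottom == 2 and top == 6, i.e. bottom has wrapped:
//
//   dirty_n_elems = (2 - 6) & 7 == 4
//
// which is below max_elems() == N - 2 == 6, so this push stores into
// _elems[2] and advances bottom to 3.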

template <class E, MEMFLAGS F, unsigned int N>
inline bool OverflowTaskQueue<E, F, N>::push(E t) {
  if (!taskqueue_t::push(t)) {
    overflow_stack()->push(t);
    TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->size()));
  }
  return true;
}

template <class E, MEMFLAGS F, unsigned int N>
inline bool OverflowTaskQueue<E, F, N>::try_push_to_taskqueue(E t) {
  return taskqueue_t::push(t);
}

// pop_local_slow() is invoked by the owning thread when it is trying to
// get the last task in the queue. It competes with pop_global(), which is
// used by other threads. The tag in "age" is incremented whenever the
// queue becomes empty: either here, when this thread gets the last task,
// or in pop_global(), when the queue wraps around (top == 0 and
// pop_global() succeeds; see pop_global()).
template<class E, MEMFLAGS F, unsigned int N>
bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
  // This queue was observed to contain exactly one element; either this
  // thread will claim it, or a competing "pop_global" will. In either case,
  // the queue will be logically empty afterwards. Create a new Age value
  // that represents the empty queue for the given value of "bottom". (We
  // must also increment "tag" because of the case where "bottom == 1",
  // "top == 0". A pop_global could read the queue element in that case,
  // then have the owner thread do a pop followed by another push. Without
  // the incrementing of "tag", the pop_global's CAS could succeed,
  // allowing it to believe it has claimed the stale element.)
  Age newAge((idx_t)localBot, (idx_t)(oldAge.tag() + 1));
  // Perhaps a competing pop_global has already incremented "top", in which
  // case it wins the element.
  if (localBot == oldAge.top()) {
    // No competing pop_global has yet incremented "top"; we'll try to
    // install new_age, thus claiming the element.
    Age tempAge = cmpxchg_age(oldAge, newAge);
    if (tempAge == oldAge) {
      // We win.
      assert_not_underflow(localBot, age_top_relaxed());
      TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
      return true;
    }
  }
  // We lose; a competing pop_global got the element. The queue is now empty,
  // but top is greater than bottom. Install newAge to restore the canonical
  // representation of the empty queue.
  set_age_relaxed(newAge);
  assert_not_underflow(localBot, age_top_relaxed());
  return false;
}
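
// A concrete trace of the ABA hazard that the tag increment prevents
// (indices illustrative). Start with bottom == 1, top == 0, tag == t:
//
//   Thief:  pop_global reads age == (0, t) and reads _elems[0].
//   Owner:  pop_local decrements bottom to 0, takes the slow path, and
//           CASes age from (0, t) to (0, t+1); the queue is now empty.
//   Owner:  push writes _elems[0] and sets bottom back to 1.
//   Thief:  cmpxchg of age from (0, t) to (1, t) fails; tag is now t+1.
//
// Without the tag increment, the owner's slow path would leave age at
// (0, t), the thief's cmpxchg would succeed, and the thief would process
// the stale value it read earlier while the newly pushed element was
// silently skipped (top would move past it).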

template<class E, MEMFLAGS F, unsigned int N> inline bool
GenericTaskQueue<E, F, N>::pop_local(E& t, uint threshold) {
  uint localBot = bottom_relaxed();
  // The dirty size computed here cannot be N-1. That state can only arise
  // from the decrement of bottom below, combined with a concurrent pop_global
  // taking the last element; in that case pop_local_slow() resets the age to
  // the canonical empty state before this method returns, so the next call
  // (which is sequential, since this is pop_local) never observes it.
  uint dirty_n_elems = dirty_size(localBot, age_top_relaxed());
  assert_not_underflow(dirty_n_elems);
  if (dirty_n_elems <= threshold) return false;
  localBot = decrement_index(localBot);
  set_bottom_relaxed(localBot);
  // This is necessary to prevent any read below from being reordered
  // before the store just above.
  OrderAccess::fence();
  t = _elems[localBot];
  // This is a second read of "age"; the read for dirty_size() above is the
  // first.
  // If there's still at least one element in the queue, based on the
  // "_bottom" and "age" we've read, then there can be no interference with
  // a "pop_global" operation, and we're done.
  idx_t tp = age_top_relaxed();
  if (clean_size(localBot, tp) > 0) {
    assert_not_underflow(localBot, tp);
    TASKQUEUE_STATS_ONLY(stats.record_pop());
    return true;
  } else {
    // Otherwise, the queue contained exactly one element; we take the slow
    // path.

    // The barrier is required to prevent reordering the two reads of age:
    // one is the age() below, and the other is age_top() above the if-stmt.
    // The algorithm may fail if age() reads an older value than age_top().
    OrderAccess::loadload();
    return pop_local_slow(localBot, age_relaxed());
  }
}
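
// A typical owner-side drain loop (sketch; "do_task" is a hypothetical
// processing routine, "queue" a queue owned by the calling thread, and
// threshold is assumed to default to 0 so the queue is drained completely):
//
//   ScannerTask task;
//   while (queue->pop_local(task)) {
//     do_task(task);
//   }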

template <class E, MEMFLAGS F, unsigned int N>
bool OverflowTaskQueue<E, F, N>::pop_overflow(E& t)
{
  if (overflow_empty()) return false;
  t = overflow_stack()->pop();
  return true;
}
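
// Sketch of a combined drain for OverflowTaskQueue (pattern assumed, not
// mandated by this file; "do_task" is hypothetical). The overflow stack is
// emptied alongside the fixed-size queue:
//
//   ScannerTask task;
//   while (queue->pop_overflow(task) || queue->pop_local(task)) {
//     do_task(task);
//   }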

// A pop_global operation may read an element that is being concurrently
// written by a push operation. The pop_global operation will not use
// such an element, returning failure instead. But the concurrent read
// and write place requirements on the element type.
//
// Strictly, such concurrent reads and writes are undefined behavior.
// We ignore that. Instead we require that whatever value tearing may
// occur as a result is benign. A trivially copyable type (C++14 3.9/9)
// satisfies the requirement. But we might use classes such as oop that
// are not trivially copyable (in some build configurations). Such
// classes need to be carefully examined with this requirement in mind.
//
// The sequence where such a read/write collision can arise is as follows.
// Assume there is one value in the queue, so bottom == top+1.
// (1) Thief is doing a pop_global. It has read age and bottom, and its
// captured (localBottom - oldAge.top) == 1.
// (2) Owner does a pop_local and wins the race for that element. It
// decrements bottom and increments the age tag.
// (3) Owner starts a push, writing elems[bottom]. At the same time, Thief
// reads elems[oldAge.top]. The owner's bottom == the thief's oldAge.top.
// (4) Thief will discard the read value, because its cmpxchg of age will fail.
template<class E, MEMFLAGS F, unsigned int N>
typename GenericTaskQueue<E, F, N>::PopResult GenericTaskQueue<E, F, N>::pop_global(E& t) {
  Age oldAge = age_relaxed();

  // Architectures with non-multi-copy-atomic memory model require a
  // full fence here to guarantee that bottom is not older than age,
  // which is crucial for the correctness of the algorithm.
  //
  // We need a full fence here for this case:
  //
  // Thread1: set bottom (push)
  // Thread2: read age, read bottom, set age (pop_global)
  // Thread3: read age, read bottom (pop_global)
  //
  // The requirement is that Thread3 must never read an older bottom
  // value than Thread2 after Thread3 has seen the age value from
  // Thread2.
  OrderAccess::loadload_for_IRIW();

  uint localBot = bottom_acquire();
  uint n_elems = clean_size(localBot, oldAge.top());
  if (n_elems == 0) {
    return PopResult::Empty;
  }

  t = _elems[oldAge.top()];
  // Increment top; if it wraps, also increment tag, to distinguish it
  // from any recent _age for the same top() index.
  idx_t new_top = increment_index(oldAge.top());
  idx_t new_tag = oldAge.tag() + ((new_top == 0) ? 1 : 0);
  Age newAge(new_top, new_tag);
  Age resAge = cmpxchg_age(oldAge, newAge);

  // Note that using "bottom" here might fail, since a pop_local might
  // have decremented it.
  assert_not_underflow(localBot, newAge.top());
  return resAge == oldAge ? PopResult::Success : PopResult::Contended;
}
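
// Example of the tag increment on wrap (illustrative): with N == 8 and
// oldAge == (top == 7, tag == t), increment_index(7) yields new_top == 0,
// so newAge == (0, t+1). A later age with top == 0 is thereby distinct
// from any earlier age that also had top == 0 but tag == t.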

inline int randomParkAndMiller(int* seed0) {
  const int a = 16807;
  const int m = 2147483647;
  const int q = 127773;  /* m div a */
  const int r = 2836;    /* m mod a */
  STATIC_ASSERT(sizeof(int) == 4);
  int seed = *seed0;
  int hi = seed / q;
  int lo = seed % q;
  int test = a * lo - r * hi;
  if (test > 0) {
    seed = test;
  } else {
    seed = test + m;
  }
  *seed0 = seed;
  return seed;
}
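
// randomParkAndMiller computes seed' = (16807 * seed) mod (2^31 - 1) in
// 32-bit arithmetic using Schrage's decomposition: with m = a*q + r,
// q = m / a and r = m % a (so r < q),
//
//   (a * seed) mod m == a*(seed mod q) - r*(seed / q)   [+ m if negative]
//
// Worked step from the initial seed of 17: hi == 0 and lo == 17, so
// test == 16807 * 17 - 2836 * 0 == 285719 > 0, and 285719 is the next seed.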

template<class E, MEMFLAGS F, unsigned int N>
int GenericTaskQueue<E, F, N>::next_random_queue_id() {
  return randomParkAndMiller(&_seed);
}

template<class T, MEMFLAGS F>
typename GenericTaskQueueSet<T, F>::PopResult GenericTaskQueueSet<T, F>::steal_best_of_2(uint queue_num, E& t) {
  T* const local_queue = queue(queue_num);
  if (_n > 2) {
    uint k1 = queue_num;

    if (local_queue->is_last_stolen_queue_id_valid()) {
      k1 = local_queue->last_stolen_queue_id();
      assert(k1 != queue_num, "Should not be the same");
    } else {
      while (k1 == queue_num) {
        k1 = local_queue->next_random_queue_id() % _n;
      }
    }

    uint k2 = queue_num;
    while (k2 == queue_num || k2 == k1) {
      k2 = local_queue->next_random_queue_id() % _n;
    }
    // Sample both and try the larger.
    uint sz1 = queue(k1)->size();
    uint sz2 = queue(k2)->size();

    uint sel_k = 0;
    PopResult suc = PopResult::Empty;

    if (sz2 > sz1) {
      sel_k = k2;
      suc = queue(k2)->pop_global(t);
      TASKQUEUE_STATS_ONLY(local_queue->record_steal_attempt(suc);)
    } else if (sz1 > 0) {
      sel_k = k1;
      suc = queue(k1)->pop_global(t);
      TASKQUEUE_STATS_ONLY(local_queue->record_steal_attempt(suc);)
    }

    if (suc == PopResult::Success) {
      local_queue->set_last_stolen_queue_id(sel_k);
    } else {
      local_queue->invalidate_last_stolen_queue_id();
    }

    return suc;
  } else if (_n == 2) {
    // Just try the other one.
    uint k = (queue_num + 1) % 2;
    PopResult res = queue(k)->pop_global(t);
    TASKQUEUE_STATS_ONLY(local_queue->record_steal_attempt(res);)
    return res;
  } else {
    assert(_n == 1, "can't be zero.");
    TASKQUEUE_STATS_ONLY(local_queue->record_steal_attempt(PopResult::Empty);)
    return PopResult::Empty;
  }
}

template<class T, MEMFLAGS F>
bool GenericTaskQueueSet<T, F>::steal(uint queue_num, E& t) {
  uint const num_retries = 2 * _n;

  TASKQUEUE_STATS_ONLY(uint contended_in_a_row = 0;)
  for (uint i = 0; i < num_retries; i++) {
    PopResult sr = steal_best_of_2(queue_num, t);
    if (sr == PopResult::Success) {
      return true;
    } else if (sr == PopResult::Contended) {
      TASKQUEUE_STATS_ONLY(
        contended_in_a_row++;
        queue(queue_num)->stats.record_contended_in_a_row(contended_in_a_row);
      )
    } else {
      assert(sr == PopResult::Empty, "must be");
      TASKQUEUE_STATS_ONLY(contended_in_a_row = 0;)
    }
  }
  return false;
}
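
// A typical thief-side loop (sketch; the termination protocol is omitted,
// and "do_task", "qset", "queue", and "worker_id" are hypothetical). A
// worker drains its own queue between successful steals:
//
//   ScannerTask task;
//   while (qset->steal(worker_id, task)) {
//     do_task(task);
//     while (queue->pop_local(task)) {
//       do_task(task);
//     }
//   }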

template<class E, MEMFLAGS F, unsigned int N>
template<class Fn>
inline void GenericTaskQueue<E, F, N>::iterate(Fn fn) {
  uint iters = size();
  uint index = bottom_relaxed();
  for (uint i = 0; i < iters; ++i) {
    index = decrement_index(index);
    fn(_elems[index]);
  }
}
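
// Example use of iterate() (sketch): counting elements without consuming
// them. Elements are visited from bottom towards top, most recently pushed
// first; callers are assumed to ensure no concurrent pushes or pops occur.
//
//   uint count = 0;
//   queue->iterate([&](ScannerTask t) { count++; });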

#endif // SHARE_GC_SHARED_TASKQUEUE_INLINE_HPP