| // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // profiler.h: a simple sampling profiler that's always just one #include away! |
| // |
| // Overview |
| // ======== |
| // |
| // This profiler only samples a pseudo-stack, not the actual call stack. |
| // The code to be profiled needs to be instrumented with |
| // pseudo-stack "labels", see ScopedProfilingLabel. |
| // Using pseudo-stacks allows this profiler to be very simple, low-overhead, |
| // portable, and independent of compilation details such as function inlining |
| // and frame pointers. The granularity of instrumentation can be freely chosen, |
| // and it is possible to get some annotate-like detail, i.e. detail within one |
| // function without splitting it into multiple functions. |
| // |
| // This profiler should remain small and simple; its key feature is to fit in |
| // a single header file so that there should never be a reason to refrain |
| // from profiling. More complex and feature-rich alternatives are |
| // readily available. This one offers a strict superset of its |
| // functionality: https://github.com/bgirard/GeckoProfiler, including |
| // intertwining pseudostacks with real call stacks, more annotation options, |
| // and advanced visualization. |
| // |
| // Usage |
| // ===== |
| // |
| // 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is |
| // not enabled, profiling instrumentation from instrumentation.h |
| // (ScopedProfilingLabel, RegisterCurrentThreadForProfiling) |
| // is still defined but does nothing. On the other hand, |
| // when profiling is not enabled, it is an error to #include the |
| // present file. |
| // |
| // 1. Each thread can opt in to profiling by calling |
| // RegisterCurrentThreadForProfiling() defined in instrumentation.h. |
| // This can be done at any time, before or during profiling. |
| // No sample will be collected from a thread until |
| // it has called RegisterCurrentThreadForProfiling(). |
| // |
| // 2. Instrument your code to be profiled with ScopedProfilingLabel, |
| // which is a RAII helper defined in instrumentation.h. The identifier |
| // names (some_label, etc) do not matter; what will show up |
| // in the profile is the string passed to the constructor, which |
| // must be a literal string. See the full example below. |
| // |
| // Note: the overhead of ScopedProfilingLabel is zero when not |
| // enabling profiling (when not defining GEMMLOWP_PROFILING). |
| // |
| // 3. Use the profiler.h interface to control profiling. There are two |
| // functions: StartProfiling() and FinishProfiling(). They must be |
| // called on the same thread. FinishProfiling() prints the profile |
| // on stdout. |
| // |
| // Full example |
| // ============ |
| /* |
| #define GEMMLOWP_PROFILING |
| #include "profiling/instrumentation.h" |
| using namespace gemmlowp; |
| |
| const int iters = 100000000; |
| volatile int i; |
| |
| void Bar() { |
| ScopedProfilingLabel label("Bar"); |
| for (i = 0; i < iters; i++) {} |
| } |
| |
| void Foo() { |
| ScopedProfilingLabel label("Foo"); |
| for (i = 0; i < iters; i++) {} |
| Bar(); |
| } |
| |
| void Init() { |
| RegisterCurrentThreadForProfiling(); |
| } |
| |
| #include "profiling/profiler.h" |
| |
| int main() { |
| Init(); |
| StartProfiling(); |
| Foo(); |
| FinishProfiling(); |
| } |
| * |
| * Output: |
| * |
| gemmlowp profile (1 threads, 304 samples) |
| 100.00% Foo |
| 51.32% other |
| 48.68% Bar |
| 0.00% other (outside of any label) |
| */ |
| // |
| // Interpreting results |
| // ==================== |
| // |
| // Each node shows the absolute percentage, among all the samples, |
| // of the number of samples that recorded the given pseudo-stack. |
| // The percentages are *NOT* relative to the parent node. In addition |
| // to your own labels, you will also see 'other' nodes that collect |
| // the remainder of samples under the parent node that didn't fall into |
| // any of the labelled child nodes. Example: |
| // |
| // 20% Foo |
| // 12% Bar |
| // 6% Xyz |
| // 2% other |
| // |
| // This means that 20% of all labels were under Foo, of which 12%/20%==60% |
| // were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not |
| // under either Bar or Xyz. |
| // |
| // Typically, one wants to keep adding ScopedProfilingLabel's until |
| // the 'other' nodes show low percentages. |
| // |
| // Interpreting results with multiple threads |
| // ========================================== |
| // |
| // At each sample, each thread registered for profiling gets sampled once. |
| // So if there is one "main thread" spending its time in MainFunc() and |
| // 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the |
| // samples will be in WorkerFunc, so the profile will look like this: |
| // |
| // 80% WorkerFunc |
| // 20% MainFunc |
| |
| #ifndef GEMMLOWP_PROFILING_PROFILER_H_ |
| #define GEMMLOWP_PROFILING_PROFILER_H_ |
| |
| #ifndef GEMMLOWP_PROFILING |
| #error Profiling is not enabled! |
| #endif |
| |
| #include <vector> |
| |
| #include "instrumentation.h" |
| |
| namespace gemmlowp { |
| |
| // A tree view of a profile. |
| class ProfileTreeView { |
| struct Node { |
| std::vector<Node*> children; |
| const char* label; |
| std::size_t weight; |
| Node() : label(nullptr), weight(0) {} |
| ~Node() { |
| for (auto child : children) { |
| delete child; |
| } |
| } |
| }; |
| |
| static bool CompareNodes(Node* n1, Node* n2) { |
| return n1->weight > n2->weight; |
| } |
| |
| Node root_; |
| |
| void PrintNode(const Node* node, int level) const { |
| if (level) { |
| for (int i = 1; i < level; i++) { |
| printf(" "); |
| } |
| printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); |
| } |
| for (auto child : node->children) { |
| PrintNode(child, level + 1); |
| } |
| } |
| |
| static void AddStackToNode(const ProfilingStack& stack, Node* node, |
| std::size_t level) { |
| node->weight++; |
| if (stack.size == level) { |
| return; |
| } |
| Node* child_to_add_to = nullptr; |
| for (auto child : node->children) { |
| if (child->label == stack.labels[level]) { |
| child_to_add_to = child; |
| break; |
| } |
| } |
| if (!child_to_add_to) { |
| child_to_add_to = new Node; |
| child_to_add_to->label = stack.labels[level]; |
| node->children.push_back(child_to_add_to); |
| } |
| AddStackToNode(stack, child_to_add_to, level + 1); |
| return; |
| } |
| |
| void AddStack(const ProfilingStack& stack) { |
| AddStackToNode(stack, &root_, 0); |
| } |
| |
| void AddOtherChildrenToNode(Node* node) { |
| std::size_t top_level_children_weight = 0; |
| for (auto c : node->children) { |
| AddOtherChildrenToNode(c); |
| top_level_children_weight += c->weight; |
| } |
| if (top_level_children_weight) { |
| Node* other_child = new Node; |
| other_child->label = |
| node == &root_ ? "other (outside of any label)" : "other"; |
| other_child->weight = node->weight - top_level_children_weight; |
| node->children.push_back(other_child); |
| } |
| } |
| |
| void AddOtherNodes() { AddOtherChildrenToNode(&root_); } |
| |
| void SortNode(Node* node) { |
| std::sort(node->children.begin(), node->children.end(), CompareNodes); |
| for (auto child : node->children) { |
| SortNode(child); |
| } |
| } |
| |
| void Sort() { SortNode(&root_); } |
| |
| public: |
| explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { |
| for (auto stack : stacks) { |
| AddStack(stack); |
| } |
| AddOtherNodes(); |
| Sort(); |
| } |
| |
| void Print() const { |
| printf("\n"); |
| printf("gemmlowp profile (%d threads, %d samples)\n", |
| static_cast<int>(ThreadsUnderProfiling().size()), |
| static_cast<int>(root_.weight)); |
| PrintNode(&root_, 0); |
| printf("\n"); |
| } |
| }; |
| |
| // This function is the only place that determines our sampling frequency. |
| inline void WaitOneProfilerTick() { |
| static const int millisecond = 1000000; |
| |
| #if defined __arm__ || defined __aarch64__ |
| // Reduced sampling frequency on mobile devices helps limit time and memory |
| // overhead there. |
| static const int interval = 10 * millisecond; |
| #else |
| static const int interval = 1 * millisecond; |
| #endif |
| |
| timespec ts; |
| ts.tv_sec = 0; |
| ts.tv_nsec = interval; |
| nanosleep(&ts, nullptr); |
| } |
| |
| // This is how we track whether we've already started profiling, |
| // to guard against misuse of the API. |
| inline bool& IsProfiling() { |
| static bool b; |
| return b; |
| } |
| |
| // This is how we tell the profiler thread to finish. |
| inline bool& ProfilerThreadShouldFinish() { |
| static bool b; |
| return b; |
| } |
| |
| // The profiler thread. See ProfilerThreadFunc. |
| inline pthread_t& ProfilerThread() { |
| static pthread_t t; |
| return t; |
| } |
| |
| // Records a stack from a running thread. |
| // The tricky part is that we're not interrupting the thread. |
| // This is OK because we're looking at a pseudo-stack of labels, |
| // not at the real thread stack, and if the pseudo-stack changes |
| // while we're recording it, we are OK with getting either the |
| // old or the new stack. Note that ProfilingStack::Pop |
| // only decrements the size, and doesn't null the popped label, |
| // so if we're concurrently recording it, it shouldn't change |
| // under our feet until another label is pushed, at which point |
| // we are OK with getting either this new label or the old one. |
| // In the end, the key atomicity property that we are relying on |
| // here is that pointers are changed atomically, and the labels |
| // are pointers (to literal strings). |
| inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) { |
| ScopedLock sl(thread->stack.lock); |
| assert(!dst->size); |
| while (dst->size < thread->stack.size) { |
| dst->labels[dst->size] = thread->stack.labels[dst->size]; |
| dst->size++; |
| } |
| } |
| |
| // The profiler thread's entry point. |
| // Note that a separate thread is to be started each time we call |
| // StartProfiling(), and finishes when we call FinishProfiling(). |
| // So here we only need to handle the recording and reporting of |
| // a single profile. |
| inline void* ProfilerThreadFunc(void*) { |
| assert(ProfilerThread() == pthread_self()); |
| |
| // Since we only handle one profile per profiler thread, the |
| // profile data (the array of recorded stacks) can be a local variable here. |
| std::vector<ProfilingStack> stacks; |
| |
| while (!ProfilerThreadShouldFinish()) { |
| WaitOneProfilerTick(); |
| { |
| ScopedLock sl(GlobalMutexes::Profiler()); |
| for (auto t : ThreadsUnderProfiling()) { |
| ProfilingStack s; |
| RecordStack(t, &s); |
| stacks.push_back(s); |
| } |
| } |
| } |
| |
| // Profiling is finished and we now report the results. |
| ProfileTreeView(stacks).Print(); |
| |
| return nullptr; |
| } |
| |
| // Starts recording samples. |
| inline void StartProfiling() { |
| ScopedLock sl(GlobalMutexes::Profiler()); |
| ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); |
| IsProfiling() = true; |
| ProfilerThreadShouldFinish() = false; |
| pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); |
| } |
| |
| // Stops recording samples, and prints a profile tree-view on stdout. |
| inline void FinishProfiling() { |
| { |
| ScopedLock sl(GlobalMutexes::Profiler()); |
| ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); |
| // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, |
| // as the scary comments below should make clear. |
| // Should we use a condition variable? |
| ProfilerThreadShouldFinish() = true; |
| } // must release the lock here to avoid deadlock with profiler thread. |
| pthread_join(ProfilerThread(), nullptr); |
| IsProfiling() = false; // yikes, this should be guarded by the lock! |
| } |
| |
| } // namespace gemmlowp |
| |
| #endif // GEMMLOWP_PROFILING_PROFILER_H_ |