| // Copyright 2014 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // instrumentation.h: contains the definitions needed to |
| // instrument code for profiling: |
| // ScopedProfilingLabel, RegisterCurrentThreadForProfiling. |
| // |
| // profiler.h is only needed to drive the profiler: |
| // StartProfiling, FinishProfiling. |
| // |
| // See the usage example in profiler.h. |
| |
| #ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_ |
| #define GEMMLOWP_PROFILING_INSTRUMENTATION_H_ |
| |
#include <pthread.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
| |
| #ifdef GEMMLOWP_PROFILING |
| #include <set> |
| #include <cstdio> |
| #include <cstring> |
| #endif |
| |
| // We should always use C++11 thread_local; unfortunately that |
| // isn't fully supported on Apple yet. |
| #ifdef __APPLE__ |
| #define GEMMLOWP_THREAD_LOCAL static __thread |
| #else |
| #define GEMMLOWP_THREAD_LOCAL thread_local |
| #endif |
| |
| namespace gemmlowp { |
| |
// Assertion that stays active in release builds (unlike assert()):
// if |condition| is false, prints |msg| to stderr and aborts the process.
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (condition) {
    return;
  }
  fprintf(stderr, "gemmlowp error: %s\n", msg);
  abort();
}
| |
// To be used as template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads, all need to lock it.
// Declaration only, never defined: it serves purely as a type tag
// selecting which GlobalLock instantiation (and thus which mutex) to use.
struct ProfilerLockId;
| |
// A very plain global lock. Templated in LockId so we can have multiple
// independent locks, one per LockId type (each instantiation owns its
// own function-local mutex).
template <typename LockId>
class GlobalLock {
 public:
  static void Lock() { pthread_mutex_lock(MutexInstance()); }
  static void Unlock() { pthread_mutex_unlock(MutexInstance()); }

 private:
  // Lazily-referenced singleton mutex for this LockId instantiation.
  static pthread_mutex_t* MutexInstance() {
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    return &mutex;
  }
};
| |
| // A very simple RAII helper to lock and unlock a GlobalLock |
| template <typename LockId> |
| struct AutoGlobalLock { |
| AutoGlobalLock() { GlobalLock<LockId>::Lock(); } |
| ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); } |
| }; |
| |
// MemoryBarrier is purely a compile-time thing; it tells two things
// to the compiler:
//   1) It prevents reordering code across it
//      (thanks to the 'volatile' after 'asm')
//   2) It requires the compiler to assume that any value previously
//      read from memory, may have changed. Thus it offers an alternative
//      to using 'volatile' variables.
// NOTE(review): no CPU fence instruction is emitted, so this does not by
// itself guarantee ordering as observed by other cores; presumably the
// profiler's sampling tolerates the resulting races — confirm against
// the sampling code in profiler.h.
// This uses the GCC/Clang extended-asm extension and will not build on
// compilers without it (e.g. MSVC).
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }
| |
| // Profiling definitions. Two paths: when profiling is enabled, |
| // and when profiling is disabled. |
| #ifdef GEMMLOWP_PROFILING |
| // This code path is when profiling is enabled. |
| |
| // A pseudo-call-stack. Contrary to a real call-stack, this only |
| // contains pointers to literal strings that were manually entered |
| // in the instrumented code (see ScopedProfilingLabel). |
// A pseudo-call-stack. Contrary to a real call-stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
//
// This struct is written on the owning thread and read asynchronously by
// the profiler thread; the MemoryBarrier() calls below only constrain the
// compiler (see MemoryBarrier), so reads by the sampler are inherently racy.
struct ProfilingStack {
  // 15 label slots + the size field = 16 words, keeping sizeof a power of
  // two (see the static_assert below).
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  LabelsArrayType labels;  // labels[0..size-1] are live; higher slots may be stale.
  std::size_t size;        // current stack depth.

  // Zero-initializes all slots and the size. Relies on the struct holding
  // only pointers and an integer, so an all-zero byte pattern is valid.
  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }

  // Pushes a label. Aborts on overflow even in release builds.
  // The new label is stored *before* size is incremented, with barriers in
  // between — presumably so a concurrent sampler that sees the new size
  // also sees the label; confirm against the profiler's sampling logic.
  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  // Pops the top label. Aborts on underflow even in release builds.
  // Note: the popped slot is left in place (stale) rather than cleared.
  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  // Replaces the top label in place; requires a non-empty stack
  // (debug-only assert).
  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    assert(size);
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  // Whole-struct byte copy; valid for the same reason as the memset above.
  ProfilingStack& operator=(const ProfilingStack& other) {
    memcpy(this, &other, sizeof(ProfilingStack));
    return *this;
  }

  // Whole-struct byte comparison.
  // NOTE(review): this also compares stale slots above 'size', so two
  // logically-equal stacks could compare unequal — appears acceptable for
  // the profiler's bucketing of samples; confirm.
  bool operator==(const ProfilingStack& other) const {
    return !memcmp(this, &other, sizeof(ProfilingStack));
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");
| |
struct ThreadInfo;

// The global set of threads being profiled. Accesses are serialized by
// GlobalLock<ProfilerLockId> (see RegisterCurrentThreadForProfiling and
// ThreadInfo::ThreadExitCallback).
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> threads;
  return threads;
}
| |
// Per-thread profiling state: the thread's pseudo-call-stack, plus the
// machinery to get a callback when the thread exits.
struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  ThreadInfo() {
    // Create a TLS key solely for its destructor: pthreads invokes
    // ThreadExitCallback at thread exit on the non-null value stored below.
    // NOTE(review): this consumes one pthread key per profiled thread, and
    // keys are a limited process-wide resource (PTHREAD_KEYS_MAX) —
    // presumably fine for the small thread counts profiled here; confirm.
    pthread_key_create(&key, ThreadExitCallback);
    // Store 'this' so the callback receives this ThreadInfo as its argument.
    pthread_setspecific(key, this);
  }

  // Runs at thread exit: deregisters the thread from profiling (under the
  // profiler global lock) and releases the pthread key.
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};
| |
| inline ThreadInfo& ThreadLocalThreadInfo() { |
| GEMMLOWP_THREAD_LOCAL ThreadInfo i; |
| return i; |
| } |
| |
| // ScopedProfilingLabel is how one instruments code for profiling |
| // with this profiler. Construct local ScopedProfilingLabel variables, |
| // passing a literal string describing the local code. Profile |
| // samples will then be annotated with this label, while it is in scope |
| // (whence the name --- also known as RAII). |
| // See the example in profiler.h. |
| class ScopedProfilingLabel { |
| ProfilingStack* profiling_stack_; |
| |
| public: |
| explicit ScopedProfilingLabel(const char* label) |
| : profiling_stack_(&ThreadLocalThreadInfo().stack) { |
| profiling_stack_->Push(label); |
| } |
| |
| ~ScopedProfilingLabel() { profiling_stack_->Pop(); } |
| |
| void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); } |
| }; |
| |
// To be called once on each thread to be profiled.
// Adds the calling thread's ThreadInfo to the set of threads sampled by
// the profiler, under the profiler global lock.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}
| |
| #else // not GEMMLOWP_PROFILING |
| // This code path is when profiling is disabled. |
| |
// When profiling is disabled, ScopedProfilingLabel is an empty shell:
// both the constructor and Update() are no-ops that compile away, so
// instrumented code carries zero runtime overhead.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char*) {}
  void Update(const char*) {}
};
| |
| inline void RegisterCurrentThreadForProfiling() {} |
| |
| #endif |
| |
| } // end namespace gemmlowp |
| |
| #endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_ |