Add GPU draw stats to gfxinfo and GraphicsStatsService

Test: ran dumpsys with gfxinfo and graphicsstats
Change-Id: Id9950de87dc4343c6878baa6a6dd42fbc8aeddef
diff --git a/libs/hwui/FrameInfo.cpp b/libs/hwui/FrameInfo.cpp
index 71cc9a8..0698775 100644
--- a/libs/hwui/FrameInfo.cpp
+++ b/libs/hwui/FrameInfo.cpp
@@ -37,13 +37,14 @@
         "FrameCompleted",
         "DequeueBufferDuration",
         "QueueBufferDuration",
+        "GpuCompleted",
 };
 
 static_assert((sizeof(FrameInfoNames) / sizeof(FrameInfoNames[0])) ==
                       static_cast<int>(FrameInfoIndex::NumIndexes),
               "size mismatch: FrameInfoNames doesn't match the enum!");
 
-static_assert(static_cast<int>(FrameInfoIndex::NumIndexes) == 16,
+static_assert(static_cast<int>(FrameInfoIndex::NumIndexes) == 17,
               "Must update value in FrameMetrics.java#FRAME_STATS_COUNT (and here)");
 
 void FrameInfo::importUiThreadInfo(int64_t* info) {
diff --git a/libs/hwui/FrameInfo.h b/libs/hwui/FrameInfo.h
index b75192f..51674fb 100644
--- a/libs/hwui/FrameInfo.h
+++ b/libs/hwui/FrameInfo.h
@@ -51,6 +51,8 @@
     DequeueBufferDuration,
     QueueBufferDuration,
 
+    GpuCompleted,
+
     // Must be the last value!
     // Also must be kept in sync with FrameMetrics.java#FRAME_STATS_COUNT
     NumIndexes
@@ -143,6 +145,13 @@
         return duration(FrameInfoIndex::IntendedVsync, FrameInfoIndex::FrameCompleted);
     }
 
+    inline int64_t gpuDrawTime() const {
+        // GPU start time is approximated to the moment before swapBuffer is invoked.
+        // We could add an EGLSyncKHR fence at the beginning of the frame, but that is an overhead.
+        int64_t endTime = get(FrameInfoIndex::GpuCompleted);
+        return endTime > 0 ? endTime - get(FrameInfoIndex::SwapBuffers) : -1;
+    }
+
     inline int64_t& set(FrameInfoIndex index) { return mFrameInfo[static_cast<int>(index)]; }
 
     inline int64_t get(FrameInfoIndex index) const {
diff --git a/libs/hwui/JankTracker.cpp b/libs/hwui/JankTracker.cpp
index 53c5ad8..eae3584 100644
--- a/libs/hwui/JankTracker.cpp
+++ b/libs/hwui/JankTracker.cpp
@@ -232,5 +232,13 @@
                                                     : FrameInfoIndex::IntendedVsync;
 }
 
+void JankTracker::finishGpuDraw(const FrameInfo& frame) {
+    int64_t totalGPUDrawTime = frame.gpuDrawTime();
+    if (totalGPUDrawTime >= 0) {
+        mData->reportGPUFrame(totalGPUDrawTime);
+        (*mGlobalData)->reportGPUFrame(totalGPUDrawTime);
+    }
+}
+
 } /* namespace uirenderer */
 } /* namespace android */
diff --git a/libs/hwui/JankTracker.h b/libs/hwui/JankTracker.h
index 110211e..08059268 100644
--- a/libs/hwui/JankTracker.h
+++ b/libs/hwui/JankTracker.h
@@ -58,6 +58,7 @@
 
     FrameInfo* startFrame() { return &mFrames.next(); }
     void finishFrame(const FrameInfo& frame);
+    void finishGpuDraw(const FrameInfo& frame);
 
     void dumpStats(int fd) { dumpData(fd, &mDescription, mData.get()); }
     void dumpFrames(int fd);
diff --git a/libs/hwui/ProfileData.cpp b/libs/hwui/ProfileData.cpp
index c7f9232..7921662 100644
--- a/libs/hwui/ProfileData.cpp
+++ b/libs/hwui/ProfileData.cpp
@@ -98,6 +98,10 @@
     if (mStatStartTime > other.mStatStartTime || mStatStartTime == 0) {
         mStatStartTime = other.mStatStartTime;
     }
+    for (size_t i = 0; i < other.mGPUFrameCounts.size(); i++) {
+        mGPUFrameCounts[i] >>= divider;
+        mGPUFrameCounts[i] += other.mGPUFrameCounts[i];
+    }
 }
 
 void ProfileData::dump(int fd) const {
@@ -117,6 +121,14 @@
     histogramForEach([fd](HistogramEntry entry) {
         dprintf(fd, " %ums=%u", entry.renderTimeMs, entry.frameCount);
     });
+    dprintf(fd, "\n50th gpu percentile: %ums", findGPUPercentile(50));
+    dprintf(fd, "\n90th gpu percentile: %ums", findGPUPercentile(90));
+    dprintf(fd, "\n95th gpu percentile: %ums", findGPUPercentile(95));
+    dprintf(fd, "\n99th gpu percentile: %ums", findGPUPercentile(99));
+    dprintf(fd, "\nGPU HISTOGRAM:");
+    histogramGPUForEach([fd](HistogramEntry entry) {
+        dprintf(fd, " %ums=%u", entry.renderTimeMs, entry.frameCount);
+    });
 }
 
 uint32_t ProfileData::findPercentile(int percentile) const {
@@ -140,6 +152,7 @@
 void ProfileData::reset() {
     mJankTypeCounts.fill(0);
     mFrameCounts.fill(0);
+    mGPUFrameCounts.fill(0);
     mSlowFrameCounts.fill(0);
     mTotalFrameCount = 0;
     mJankFrameCount = 0;
@@ -167,5 +180,40 @@
     }
 }
 
+uint32_t ProfileData::findGPUPercentile(int percentile) const {
+    uint32_t totalGPUFrameCount = 0;  // this is usually mTotalFrameCount - 3.
+    for (int i = mGPUFrameCounts.size() - 1; i >= 0; i--) {
+        totalGPUFrameCount += mGPUFrameCounts[i];
+    }
+    int pos = percentile * totalGPUFrameCount / 100;
+    int remaining = totalGPUFrameCount - pos;
+    for (int i = mGPUFrameCounts.size() - 1; i >= 0; i--) {
+        remaining -= mGPUFrameCounts[i];
+        if (remaining <= 0) {
+            return GPUFrameTimeForFrameCountIndex(i);
+        }
+    }
+    return 0;
+}
+
+uint32_t ProfileData::GPUFrameTimeForFrameCountIndex(uint32_t index) {
+    return index != 25 ? index + 1 : 4950;
+}
+
+void ProfileData::reportGPUFrame(int64_t duration) {
+    uint32_t index = static_cast<uint32_t>(ns2ms(duration));
+    if (index > 25) {
+        index = 25;
+    }
+
+    mGPUFrameCounts[index]++;
+}
+
+void ProfileData::histogramGPUForEach(const std::function<void(HistogramEntry)>& callback) const {
+    for (size_t i = 0; i < mGPUFrameCounts.size(); i++) {
+        callback(HistogramEntry{GPUFrameTimeForFrameCountIndex(i), mGPUFrameCounts[i]});
+    }
+}
+
 } /* namespace uirenderer */
 } /* namespace android */
\ No newline at end of file
diff --git a/libs/hwui/ProfileData.h b/libs/hwui/ProfileData.h
index 564920b..ccbffc6 100644
--- a/libs/hwui/ProfileData.h
+++ b/libs/hwui/ProfileData.h
@@ -54,8 +54,10 @@
     void mergeWith(const ProfileData& other);
     void dump(int fd) const;
     uint32_t findPercentile(int percentile) const;
+    uint32_t findGPUPercentile(int percentile) const;
 
     void reportFrame(int64_t duration);
+    void reportGPUFrame(int64_t duration);
     void reportJank() { mJankFrameCount++; }
     void reportJankType(JankType type) { mJankTypeCounts[static_cast<int>(type)]++; }
 
@@ -69,15 +71,21 @@
         uint32_t frameCount;
     };
     void histogramForEach(const std::function<void(HistogramEntry)>& callback) const;
+    void histogramGPUForEach(const std::function<void(HistogramEntry)>& callback) const;
 
     constexpr static int HistogramSize() {
         return std::tuple_size<decltype(ProfileData::mFrameCounts)>::value +
                std::tuple_size<decltype(ProfileData::mSlowFrameCounts)>::value;
     }
 
+    constexpr static int GPUHistogramSize() {
+        return std::tuple_size<decltype(ProfileData::mGPUFrameCounts)>::value;
+    }
+
     // Visible for testing
     static uint32_t frameTimeForFrameCountIndex(uint32_t index);
     static uint32_t frameTimeForSlowFrameCountIndex(uint32_t index);
+    static uint32_t GPUFrameTimeForFrameCountIndex(uint32_t index);
 
 private:
     // Open our guts up to unit tests
@@ -88,6 +96,9 @@
     std::array<uint32_t, 57> mFrameCounts;
     // Holds a histogram of frame times in 50ms increments from 150ms to 5s
     std::array<uint16_t, 97> mSlowFrameCounts;
+    // Holds a histogram of GPU draw times in 1ms increments. Frames longer than 25ms are placed in
+    // last bucket.
+    std::array<uint32_t, 26> mGPUFrameCounts;
 
     uint32_t mTotalFrameCount;
     uint32_t mJankFrameCount;
diff --git a/libs/hwui/protos/graphicsstats.proto b/libs/hwui/protos/graphicsstats.proto
index 1226d44..0cd5c62 100644
--- a/libs/hwui/protos/graphicsstats.proto
+++ b/libs/hwui/protos/graphicsstats.proto
@@ -46,6 +46,9 @@
 
     // The frame time histogram for the package
     repeated GraphicsStatsHistogramBucketProto histogram = 6;
+
+    // The gpu frame time histogram for the package
+    repeated GraphicsStatsHistogramBucketProto gpu_histogram = 7;
 }
 
 message GraphicsStatsJankSummaryProto {
diff --git a/libs/hwui/renderthread/CanvasContext.cpp b/libs/hwui/renderthread/CanvasContext.cpp
index d19351b..88a0c6e 100644
--- a/libs/hwui/renderthread/CanvasContext.cpp
+++ b/libs/hwui/renderthread/CanvasContext.cpp
@@ -147,6 +147,7 @@
         mNativeSurface = new ReliableSurface{std::move(surface)};
         // TODO: Fix error handling & re-shorten timeout
         mNativeSurface->setDequeueTimeout(4000_ms);
+        mNativeSurface->enableFrameTimestamps(true);
     } else {
         mNativeSurface = nullptr;
     }
@@ -294,6 +295,7 @@
     // just keep using the previous frame's structure instead
     if (!wasSkipped(mCurrentFrameInfo)) {
         mCurrentFrameInfo = mJankTracker.startFrame();
+        mLast4FrameInfos.next().first = mCurrentFrameInfo;
     }
     mCurrentFrameInfo->importUiThreadInfo(uiFrameInfo);
     mCurrentFrameInfo->set(FrameInfoIndex::SyncQueued) = syncQueued;
@@ -445,7 +447,7 @@
                                       mContentDrawBounds, mOpaque, mLightInfo, mRenderNodes,
                                       &(profiler()));
 
-    int64_t frameCompleteNr = mFrameCompleteCallbacks.size() ? getFrameNumber() : -1;
+    int64_t frameCompleteNr = getFrameNumber();
 
     waitOnFences();
 
@@ -500,11 +502,13 @@
         }
         mCurrentFrameInfo->set(FrameInfoIndex::DequeueBufferDuration) = swap.dequeueDuration;
         mCurrentFrameInfo->set(FrameInfoIndex::QueueBufferDuration) = swap.queueDuration;
+        mLast4FrameInfos[-1].second = frameCompleteNr;
         mHaveNewSurface = false;
         mFrameNumber = -1;
     } else {
         mCurrentFrameInfo->set(FrameInfoIndex::DequeueBufferDuration) = 0;
         mCurrentFrameInfo->set(FrameInfoIndex::QueueBufferDuration) = 0;
+        mLast4FrameInfos[-1].second = -1;
     }
 
     // TODO: Use a fence for real completion?
@@ -537,6 +541,19 @@
         mFrameMetricsReporter->reportFrameMetrics(mCurrentFrameInfo->data());
     }
 
+    if (mLast4FrameInfos.size() == mLast4FrameInfos.capacity()) {
+        // By looking 4 frames back, we guarantee all SF stats are available. There are at
+        // most 3 buffers in BufferQueue. Surface object keeps stats for the last 8 frames.
+        FrameInfo* forthBehind = mLast4FrameInfos.front().first;
+        int64_t composedFrameId = mLast4FrameInfos.front().second;
+        nsecs_t acquireTime = -1;
+        mNativeSurface->getFrameTimestamps(composedFrameId, nullptr, &acquireTime, nullptr, nullptr,
+            nullptr, nullptr, nullptr, nullptr, nullptr);
+        // Ignore default -1, NATIVE_WINDOW_TIMESTAMP_INVALID and NATIVE_WINDOW_TIMESTAMP_PENDING
+        forthBehind->set(FrameInfoIndex::GpuCompleted) = acquireTime > 0 ? acquireTime : -1;
+        mJankTracker.finishGpuDraw(*forthBehind);
+    }
+
     GpuMemoryTracker::onFrameCompleted();
 }
 
diff --git a/libs/hwui/renderthread/CanvasContext.h b/libs/hwui/renderthread/CanvasContext.h
index f9b9310..8a76d6b 100644
--- a/libs/hwui/renderthread/CanvasContext.h
+++ b/libs/hwui/renderthread/CanvasContext.h
@@ -29,6 +29,7 @@
 #include "RenderNode.h"
 #include "renderthread/RenderTask.h"
 #include "renderthread/RenderThread.h"
+#include "utils/RingBuffer.h"
 
 #include <SkBitmap.h>
 #include <SkRect.h>
@@ -41,6 +42,7 @@
 #include <future>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace android {
@@ -260,6 +262,7 @@
     std::vector<sp<RenderNode>> mRenderNodes;
 
     FrameInfo* mCurrentFrameInfo = nullptr;
+    RingBuffer<std::pair<FrameInfo*, int64_t>, 4> mLast4FrameInfos;
     std::string mName;
     JankTracker mJankTracker;
     FrameInfoVisualizer mProfiler;
diff --git a/libs/hwui/renderthread/ReliableSurface.h b/libs/hwui/renderthread/ReliableSurface.h
index 41fc35e..7f1a078 100644
--- a/libs/hwui/renderthread/ReliableSurface.h
+++ b/libs/hwui/renderthread/ReliableSurface.h
@@ -49,6 +49,21 @@
         return ret;
     }
 
+    status_t getFrameTimestamps(uint64_t frameNumber,
+            nsecs_t* outRequestedPresentTime, nsecs_t* outAcquireTime,
+            nsecs_t* outLatchTime, nsecs_t* outFirstRefreshStartTime,
+            nsecs_t* outLastRefreshStartTime, nsecs_t* outGlCompositionDoneTime,
+            nsecs_t* outDisplayPresentTime, nsecs_t* outDequeueReadyTime,
+            nsecs_t* outReleaseTime) {
+        return mSurface->getFrameTimestamps(frameNumber, outRequestedPresentTime, outAcquireTime,
+            outLatchTime, outFirstRefreshStartTime, outLastRefreshStartTime,
+            outGlCompositionDoneTime, outDisplayPresentTime, outDequeueReadyTime, outReleaseTime);
+    }
+
+    void enableFrameTimestamps(bool enable) {
+        return mSurface->enableFrameTimestamps(enable);
+    }
+
 private:
     const sp<Surface> mSurface;
 
diff --git a/libs/hwui/service/GraphicsStatsService.cpp b/libs/hwui/service/GraphicsStatsService.cpp
index 8a16b20..8b5912b 100644
--- a/libs/hwui/service/GraphicsStatsService.cpp
+++ b/libs/hwui/service/GraphicsStatsService.cpp
@@ -40,6 +40,7 @@
 static_assert(sizeof(sCurrentFileVersion) == sHeaderSize, "Header size is wrong");
 
 constexpr int sHistogramSize = ProfileData::HistogramSize();
+constexpr int sGPUHistogramSize = ProfileData::GPUHistogramSize();
 
 static bool mergeProfileDataIntoProto(protos::GraphicsStatsProto* proto,
                                       const std::string& package, int64_t versionCode,
@@ -211,6 +212,37 @@
         bucket->set_frame_count(bucket->frame_count() + entry.frameCount);
         index++;
     });
+    if (hitMergeError) return false;
+    // fill in GPU frame time histogram
+    creatingHistogram = false;
+    if (proto->gpu_histogram_size() == 0) {
+        proto->mutable_gpu_histogram()->Reserve(sGPUHistogramSize);
+        creatingHistogram = true;
+    } else if (proto->gpu_histogram_size() != sGPUHistogramSize) {
+        ALOGE("GPU histogram size mismatch, proto is %d expected %d", proto->gpu_histogram_size(),
+              sGPUHistogramSize);
+        return false;
+    }
+    index = 0;
+    data->histogramGPUForEach([&](ProfileData::HistogramEntry entry) {
+        if (hitMergeError) return;
+
+        protos::GraphicsStatsHistogramBucketProto* bucket;
+        if (creatingHistogram) {
+            bucket = proto->add_gpu_histogram();
+            bucket->set_render_millis(entry.renderTimeMs);
+        } else {
+            bucket = proto->mutable_gpu_histogram(index);
+            if (bucket->render_millis() != static_cast<int32_t>(entry.renderTimeMs)) {
+                ALOGW("GPU frame time mistmatch %d vs. %u", bucket->render_millis(),
+                      entry.renderTimeMs);
+                hitMergeError = true;
+                return;
+            }
+        }
+        bucket->set_frame_count(bucket->frame_count() + entry.frameCount);
+        index++;
+    });
     return !hitMergeError;
 }
 
@@ -226,6 +258,22 @@
     return 0;
 }
 
+static int32_t findGPUPercentile(protos::GraphicsStatsProto* proto, int percentile) {
+    uint32_t totalGPUFrameCount = 0;  // this is usually  proto->summary().total_frames() - 3.
+    for (auto it = proto->gpu_histogram().rbegin(); it != proto->gpu_histogram().rend(); ++it) {
+        totalGPUFrameCount += it->frame_count();
+    }
+    int32_t pos = percentile * totalGPUFrameCount / 100;
+    int32_t remaining = totalGPUFrameCount - pos;
+    for (auto it = proto->gpu_histogram().rbegin(); it != proto->gpu_histogram().rend(); ++it) {
+        remaining -= it->frame_count();
+        if (remaining <= 0) {
+            return it->render_millis();
+        }
+    }
+    return 0;
+}
+
 void dumpAsTextToFd(protos::GraphicsStatsProto* proto, int fd) {
     // This isn't a full validation, just enough that we can deref at will
     if (proto->package_name().empty() || !proto->has_summary()) {
@@ -255,6 +303,14 @@
     for (const auto& it : proto->histogram()) {
         dprintf(fd, " %dms=%d", it.render_millis(), it.frame_count());
     }
+    dprintf(fd, "\n50th gpu percentile: %dms", findGPUPercentile(proto, 50));
+    dprintf(fd, "\n90th gpu percentile: %dms", findGPUPercentile(proto, 90));
+    dprintf(fd, "\n95th gpu percentile: %dms", findGPUPercentile(proto, 95));
+    dprintf(fd, "\n99th gpu percentile: %dms", findGPUPercentile(proto, 99));
+    dprintf(fd, "\nGPU HISTOGRAM:");
+    for (const auto& it : proto->gpu_histogram()) {
+        dprintf(fd, " %dms=%d", it.render_millis(), it.frame_count());
+    }
     dprintf(fd, "\n");
 }