Add MicroProfiler for low-overhead profiling of JSC/bridge performance

Summary:
We have a lot of small-ish calls into JSC and within the bridge that add up during TTI (time to interactive). This gives us a way to measure their cost in aggregate with reasonable accuracy.

From the comments:

MicroProfiler is a performance profiler for measuring the cumulative impact of
a large number of small-ish calls. This is normally a problem for standard profilers
like Systrace because the overhead of the profiler itself skews the timings you
are able to collect. This is especially a problem when doing nested calls to
profiled functions, as the parent calls will contain the overhead of their profiling
plus the overhead of all their children's profiling.

MicroProfiler attempts to be low overhead by 1) aggregating timings in memory and
2) trying to remove estimated profiling overhead from the returned timings.

To remove estimated overhead, at the beginning of each trace we calculate the
average cost of profiling a no-op code section, as well as the average cost of
invoking the system clock. The former is subtracted out for each child profiler
section that is invoked within a parent profiler section. The latter is
subtracted from each section, child or not.
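
Concretely, the correction applied to each profiled name when the report is printed (see printReport() in MicroProfiler.cpp below) is roughly:

  // Per thread and per profiled name, restating the arithmetic in printReport():
  overhead      = clockOverhead * calls + profileSectionOverhead * childProfileSections;
  correctedTime = totalTime - overhead;  // reported along with correctedTime / calls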

The usage is similar to Systrace: you put a MICRO_PROFILER_SECTION in the block you want to profile and C++ RAII will handle timing it.
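
For example, a hypothetical call site could look like the sketch below. The JSC_CALL_FUNCTION name is made up for illustration (this commit only ships the __INTERNAL_BENCHMARK_* names in the MicroProfilerName enum), and WITH_MICRO_PROFILER must be defined for the macro to expand to anything:

  #include <microprofiler/MicroProfiler.h>

  namespace facebook {
  namespace react {

  void callFunction() {
    // Times everything until this scope exits via the RAII MicroProfilerSection.
    MICRO_PROFILER_SECTION(JSC_CALL_FUNCTION);  // hypothetical enum value
    // ... work being measured ...
  }

  } }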

After MicroProfiler::stopProfiling() is called, a table of tracing data is emitted
to glog (which shows up in logcat on Android).

Differential Revision: D3635319

fbshipit-source-id: 01390b8ac76a68dd425cba2adfdde6e4957440cc
Andy Street 2016-08-02 07:32:59 -07:00 committed by Facebook Github Bot 3
parent 8258366a3d
commit 0418353249
4 changed files with 323 additions and 0 deletions


@@ -166,6 +166,7 @@ react_library(
   ] + REACT_LIBRARY_EXTRA_COMPILER_FLAGS,
   deps = [
     '//xplat/fbsystrace:fbsystrace',
+    react_native_xplat_target('microprofiler:microprofiler'),
   ],
   visibility = [ 'PUBLIC' ],
 )


@@ -0,0 +1,23 @@
cxx_library(
  name = 'microprofiler',
  force_static = True,
  compiler_flags = [
    '-Wall',
    '-Werror',
    '-std=c++11',
    '-fexceptions',
  ],
  exported_headers = [
    'MicroProfiler.h',
  ],
  srcs = [
    'MicroProfiler.cpp',
  ],
  header_namespace = 'microprofiler',
  deps = [
    '//xplat/third-party/glog:glog',
  ],
  visibility = [
    'PUBLIC',
  ],
)


@@ -0,0 +1,220 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include <algorithm>
#include <chrono>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include <time.h>

#include <glog/logging.h>

#include "MicroProfiler.h"

// iOS doesn't support 'thread_local'. If we reimplement this to use pthread_setspecific
// we can get rid of this
#define MICRO_PROFILER_STUB_IMPLEMENTATION defined(__APPLE__)

namespace facebook {
namespace react {

#if !MICRO_PROFILER_STUB_IMPLEMENTATION

// Per-thread aggregation of profiling data. Instances register themselves with
// the global ProfilingImpl on construction and deregister on destruction.
struct TraceData {
  TraceData();
  ~TraceData();

  void addTime(MicroProfilerName name, uint_fast64_t time, uint_fast32_t childProfileSections);

  std::thread::id threadId_;
  uint_fast64_t startTime_;
  std::atomic_uint_fast64_t times_[MicroProfilerName::__LENGTH__] = {};
  std::atomic_uint_fast32_t calls_[MicroProfilerName::__LENGTH__] = {};
  std::atomic_uint_fast32_t childProfileSections_[MicroProfilerName::__LENGTH__] = {};
};

// Global profiling state shared by all threads.
struct ProfilingImpl {
  std::mutex mutex_;
  std::vector<TraceData*> allTraceData_;
  bool isProfiling_ = false;
  uint_fast64_t startTime_;
  uint_fast64_t endTime_;
  uint_fast64_t clockOverhead_;
  uint_fast64_t profileSectionOverhead_;
};

static ProfilingImpl profiling;
thread_local TraceData myTraceData;
thread_local uint_fast32_t profileSections = 0;

static uint_fast64_t nowNs() {
  struct timespec time;
  clock_gettime(CLOCK_REALTIME, &time);
  return uint_fast64_t(1000000000) * time.tv_sec + time.tv_nsec;
}

static uint_fast64_t diffNs(uint_fast64_t start, uint_fast64_t end) {
  return end - start;
}

static std::string formatTimeNs(uint_fast64_t timeNs) {
  std::ostringstream out;
  out.precision(2);
  if (timeNs < 1000) {
    out << timeNs << "ns";
  } else if (timeNs < 1000000) {
    out << timeNs / 1000.0 << "us";
  } else {
    out << std::fixed << timeNs / 1000000.0 << "ms";
  }
  return out.str();
}

MicroProfilerSection::MicroProfilerSection(MicroProfilerName name) :
    isProfiling_(profiling.isProfiling_),
    name_(name),
    startNumProfileSections_(profileSections) {
  if (!isProfiling_) {
    return;
  }
  profileSections++;
  startTime_ = nowNs();
}

MicroProfilerSection::~MicroProfilerSection() {
  if (!isProfiling_ || !profiling.isProfiling_) {
    return;
  }
  auto endTime = nowNs();
  auto endNumProfileSections = profileSections;
  myTraceData.addTime(name_, endTime - startTime_, endNumProfileSections - startNumProfileSections_ - 1);
}

TraceData::TraceData() :
    threadId_(std::this_thread::get_id()) {
  std::lock_guard<std::mutex> lock(profiling.mutex_);
  profiling.allTraceData_.push_back(this);
}

TraceData::~TraceData() {
  std::lock_guard<std::mutex> lock(profiling.mutex_);
  auto& infos = profiling.allTraceData_;
  infos.erase(std::remove(infos.begin(), infos.end(), this), infos.end());
}

void TraceData::addTime(MicroProfilerName name, uint_fast64_t time, uint_fast32_t childProfileSections) {
  times_[name] += time;
  calls_[name]++;
  childProfileSections_[name] += childProfileSections;
}

static void printReport() {
  LOG(ERROR) << "======= MICRO PROFILER REPORT =======";
  LOG(ERROR) << "- Total Time: " << formatTimeNs(diffNs(profiling.startTime_, profiling.endTime_));
  LOG(ERROR) << "- Clock Overhead: " << formatTimeNs(profiling.clockOverhead_);
  LOG(ERROR) << "- Profiler Section Overhead: " << formatTimeNs(profiling.profileSectionOverhead_);
  for (auto info : profiling.allTraceData_) {
    LOG(ERROR) << "--- Thread ID 0x" << std::hex << info->threadId_ << " ---";
    for (int i = 0; i < MicroProfilerName::__LENGTH__; i++) {
      if (info->times_[i] > 0) {
        auto totalTime = info->times_[i].load();
        auto calls = info->calls_[i].load();
        // Subtract the estimated cost of our own clock reads and of any nested
        // profiler sections before reporting.
        auto clockOverhead = profiling.clockOverhead_ * calls + profiling.profileSectionOverhead_ * info->childProfileSections_[i].load();
        if (totalTime < clockOverhead) {
          LOG(ERROR) << "- " << MicroProfiler::profilingNameToString(static_cast<MicroProfilerName>(i)) << ": "
              << "ERROR: Total time was " << totalTime << "ns but clock overhead was calculated to be " << clockOverhead << "ns!";
        } else {
          auto correctedTime = totalTime - clockOverhead;
          auto timePerCall = correctedTime / calls;
          LOG(ERROR) << "- " << MicroProfiler::profilingNameToString(static_cast<MicroProfilerName>(i)) << ": "
              << formatTimeNs(correctedTime) << " (" << calls << " calls, " << formatTimeNs(timePerCall) << "/call)";
        }
      }
    }
  }
}

static void clearProfiling() {
  CHECK(!profiling.isProfiling_) << "Trying to clear profiling but profiling was already started!";
  for (auto info : profiling.allTraceData_) {
    for (unsigned int i = 0; i < MicroProfilerName::__LENGTH__; i++) {
      info->times_[i] = 0;
      info->calls_[i] = 0;
      info->childProfileSections_[i] = 0;
    }
  }
}

// Estimates the average cost of reading the system clock so it can be
// subtracted from every profiled section.
static uint_fast64_t calculateClockOverhead() {
  int numCalls = 1000000;
  uint_fast64_t start = nowNs();
  for (int i = 0; i < numCalls; i++) {
    nowNs();
  }
  uint_fast64_t end = nowNs();
  return (end - start) / numCalls;
}

// Estimates the average cost of a no-op profiler section so it can be
// subtracted from parent sections that contain profiled children.
static uint_fast64_t calculateProfileSectionOverhead() {
  int numCalls = 1000000;
  uint_fast64_t start = nowNs();
  profiling.isProfiling_ = true;
  for (int i = 0; i < numCalls; i++) {
    MICRO_PROFILER_SECTION(static_cast<MicroProfilerName>(0));
  }
  uint_fast64_t end = nowNs();
  profiling.isProfiling_ = false;
  return (end - start) / numCalls;
}

void MicroProfiler::startProfiling() {
  CHECK(!profiling.isProfiling_) << "Trying to start profiling but profiling was already started!";
  profiling.clockOverhead_ = calculateClockOverhead();
  profiling.profileSectionOverhead_ = calculateProfileSectionOverhead();

  std::lock_guard<std::mutex> lock(profiling.mutex_);
  clearProfiling();
  profiling.startTime_ = nowNs();
  profiling.isProfiling_ = true;
}

void MicroProfiler::stopProfiling() {
  CHECK(profiling.isProfiling_) << "Trying to stop profiling but profiling hasn't been started!";
  profiling.isProfiling_ = false;
  profiling.endTime_ = nowNs();

  std::lock_guard<std::mutex> lock(profiling.mutex_);
  printReport();
  clearProfiling();
}

bool MicroProfiler::isProfiling() {
  return profiling.isProfiling_;
}

void MicroProfiler::runInternalBenchmark() {
  MicroProfiler::startProfiling();
  for (int i = 0; i < 1000000; i++) {
    MICRO_PROFILER_SECTION_NAMED(outer, __INTERNAL_BENCHMARK_OUTER);
    {
      MICRO_PROFILER_SECTION_NAMED(inner, __INTERNAL_BENCHMARK_INNER);
    }
  }
  MicroProfiler::stopProfiling();
}

#else

void MicroProfiler::startProfiling() {
  CHECK(false) << "This platform has a stub implementation of the micro profiler and cannot collect traces";
}

void MicroProfiler::stopProfiling() {
}

bool MicroProfiler::isProfiling() {
  return false;
}

void MicroProfiler::runInternalBenchmark() {
}

#endif

} }


@@ -0,0 +1,79 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include <atomic>
#include <cstdint>
#include <stdexcept>
#include <thread>

// #define WITH_MICRO_PROFILER 1

#ifdef WITH_MICRO_PROFILER
#define MICRO_PROFILER_SECTION(name) MicroProfilerSection __b(name)
#define MICRO_PROFILER_SECTION_NAMED(var_name, name) MicroProfilerSection var_name(name)
#else
#define MICRO_PROFILER_SECTION(name)
#define MICRO_PROFILER_SECTION_NAMED(var_name, name)
#endif

namespace facebook {
namespace react {

// Add new trace section names here. __LENGTH__ must remain the last entry,
// since it sizes the per-thread aggregation arrays.
enum MicroProfilerName {
  __INTERNAL_BENCHMARK_INNER,
  __INTERNAL_BENCHMARK_OUTER,
  __LENGTH__,
};

/**
 * MicroProfiler is a performance profiler for measuring the cumulative impact of
 * a large number of small-ish calls. This is normally a problem for standard profilers
 * like Systrace because the overhead of the profiler itself skews the timings you
 * are able to collect. This is especially a problem when doing nested calls to
 * profiled functions, as the parent calls will contain the overhead of their profiling
 * plus the overhead of all their children's profiling.
 *
 * MicroProfiler attempts to be low overhead by 1) aggregating timings in memory and
 * 2) trying to remove estimated profiling overhead from the returned timings.
 *
 * To remove estimated overhead, at the beginning of each trace we calculate the
 * average cost of profiling a no-op code section, as well as the average cost of
 * invoking the system clock. The former is subtracted out for each child profiler
 * section that is invoked within a parent profiler section. The latter is
 * subtracted from each section, child or not.
 *
 * After MicroProfiler::stopProfiling() is called, a table of tracing data is emitted
 * to glog (which shows up in logcat on Android).
 */
struct MicroProfiler {
  static const char* profilingNameToString(MicroProfilerName name) {
    switch (name) {
      case __INTERNAL_BENCHMARK_INNER:
        return "__INTERNAL_BENCHMARK_INNER";
      case __INTERNAL_BENCHMARK_OUTER:
        return "__INTERNAL_BENCHMARK_OUTER";
      case __LENGTH__:
        throw std::runtime_error("__LENGTH__ has no name");
      default:
        throw std::runtime_error("Trying to convert unknown MicroProfilerName to string");
    }
  }

  static void startProfiling();
  static void stopProfiling();
  static bool isProfiling();
  static void runInternalBenchmark();
};

// RAII guard created by the MICRO_PROFILER_SECTION macros: records the time
// between its construction and destruction into the current thread's trace data.
class MicroProfilerSection {
public:
  MicroProfilerSection(MicroProfilerName name);
  ~MicroProfilerSection();

private:
  bool isProfiling_;
  MicroProfilerName name_;
  uint_fast64_t startTime_;
  uint_fast32_t startNumProfileSections_;
};

} }