1/* 2 * Copyright 2016 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "GrCaps.h" 9#include "GrContextFactory.h" 10#include "Benchmark.h" 11#include "ResultsWriter.h" 12#include "SkCommandLineFlags.h" 13#include "SkOSFile.h" 14#include "SkStream.h" 15#include "SkSurface.h" 16#include "SkTime.h" 17#include "SkTLList.h" 18#include "SkThreadUtils.h" 19#include "Stats.h" 20#include "Timer.h" 21#include "VisualSKPBench.h" 22#include "gl/GrGLDefines.h" 23#include "../private/SkMutex.h" 24#include "../private/SkSemaphore.h" 25#include "../private/SkGpuFenceSync.h" 26 27// posix only for now 28#include <unistd.h> 29#include <sys/types.h> 30#include <sys/wait.h> 31 32/* 33 * This is an experimental GPU only benchmarking program. The initial implementation will only 34 * support SKPs. 35 */ 36 37// To get image decoders linked in we have to do the below magic 38#include "SkForceLinking.h" 39#include "SkImageDecoder.h" 40__SK_FORCE_IMAGE_DECODER_LINKING; 41 42static const int kAutoTuneLoops = 0; 43 44static const int kDefaultLoops = 45#ifdef SK_DEBUG 46 1; 47#else 48 kAutoTuneLoops; 49#endif 50 51static SkString loops_help_txt() { 52 SkString help; 53 help.printf("Number of times to run each bench. Set this to %d to auto-" 54 "tune for each bench. Timings are only reported when auto-tuning.", 55 kAutoTuneLoops); 56 return help; 57} 58 59DEFINE_string(skps, "skps", "Directory to read skps from."); 60DEFINE_string2(match, m, nullptr, 61 "[~][^]substring[$] [...] of GM name to run.\n" 62 "Multiple matches may be separated by spaces.\n" 63 "~ causes a matching bench to always be skipped\n" 64 "^ requires the start of the bench to match\n" 65 "$ requires the end of the bench to match\n" 66 "^ and $ requires an exact match\n" 67 "If a bench does not match any list entry,\n" 68 "it is skipped unless some list entry starts with ~"); 69DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag."); 70DEFINE_int32(samples, 10, "Number of samples to measure for each bench."); 71DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this."); 72DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str()); 73DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU."); 74DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs."); 75DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together"); 76DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process"); 77 78static SkString humanize(double ms) { 79 return HumanizeMs(ms); 80} 81#define HUMANIZE(ms) humanize(ms).c_str() 82 83namespace kilobench { 84class BenchmarkStream { 85public: 86 BenchmarkStream() : fCurrentSKP(0) { 87 for (int i = 0; i < FLAGS_skps.count(); i++) { 88 if (SkStrEndsWith(FLAGS_skps[i], ".skp")) { 89 fSKPs.push_back() = FLAGS_skps[i]; 90 } else { 91 SkOSFile::Iter it(FLAGS_skps[i], ".skp"); 92 SkString path; 93 while (it.next(&path)) { 94 fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[0], path.c_str()); 95 } 96 } 97 } 98 } 99 100 Benchmark* next() { 101 Benchmark* bench = nullptr; 102 // skips non matching benches 103 while ((bench = this->innerNext()) && 104 (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) || 105 !bench->isSuitableFor(Benchmark::kGPU_Backend))) { 106 delete bench; 107 } 108 return bench; 109 } 110 111private: 112 static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) { 113 // Not strictly necessary, as it will be checked again later, 114 // but helps to avoid a lot of pointless work if we're going to skip it. 115 if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) { 116 return false; 117 } 118 119 SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path)); 120 if (stream.get() == nullptr) { 121 SkDebugf("Could not read %s.\n", path); 122 return false; 123 } 124 125 pic->reset(SkPicture::CreateFromStream(stream.get())); 126 if (pic->get() == nullptr) { 127 SkDebugf("Could not read %s as an SkPicture.\n", path); 128 return false; 129 } 130 return true; 131 } 132 133 Benchmark* innerNext() { 134 // Render skps 135 while (fCurrentSKP < fSKPs.count()) { 136 const SkString& path = fSKPs[fCurrentSKP++]; 137 SkAutoTUnref<SkPicture> pic; 138 if (!ReadPicture(path.c_str(), &pic)) { 139 continue; 140 } 141 142 SkString name = SkOSPath::Basename(path.c_str()); 143 return new VisualSKPBench(name.c_str(), pic.get()); 144 } 145 146 return nullptr; 147 } 148 149 SkTArray<SkString> fSKPs; 150 int fCurrentSKP; 151}; 152 153struct GPUTarget { 154 void setup() { 155 fGL->makeCurrent(); 156 // Make sure we're done with whatever came before. 157 SK_GL(*fGL, Finish()); 158 } 159 160 SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; } 161 162 void endTiming(bool usePlatformSwapBuffers) { 163 if (fGL) { 164 SK_GL(*fGL, Flush()); 165 if (usePlatformSwapBuffers) { 166 fGL->swapBuffers(); 167 } else { 168 fGL->waitOnSyncOrSwap(); 169 } 170 } 171 } 172 void finish() { 173 SK_GL(*fGL, Finish()); 174 } 175 176 bool needsFrameTiming(int* maxFrameLag) const { 177 if (!fGL->getMaxGpuFrameLag(maxFrameLag)) { 178 // Frame lag is unknown. 179 *maxFrameLag = FLAGS_gpuFrameLag; 180 } 181 return true; 182 } 183 184 bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText, 185 GrContextFactory::GLContextType ctxType, 186 GrContextFactory::GLContextOptions ctxOptions, int numSamples) { 187 GrContext* context = factory->get(ctxType, ctxOptions); 188 int maxRTSize = context->caps()->maxRenderTargetSize(); 189 SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize), 190 SkTMin(bench->getSize().fY, maxRTSize), 191 kN32_SkColorType, kPremul_SkAlphaType); 192 uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag : 193 0; 194 SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType); 195 fSurface.reset(SkSurface::NewRenderTarget(context, 196 SkBudgeted::kNo, info, 197 numSamples, &props)); 198 fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext; 199 if (!fSurface.get()) { 200 return false; 201 } 202 203 // Kilobench should only be used on platforms with fence sync support 204 SkASSERT(fGL->fenceSyncSupport()); 205 return true; 206 } 207 208 SkCanvas* getCanvas() const { 209 if (!fSurface.get()) { 210 return nullptr; 211 } 212 return fSurface->getCanvas(); 213 } 214 215 bool capturePixels(SkBitmap* bmp) { 216 SkCanvas* canvas = this->getCanvas(); 217 if (!canvas) { 218 return false; 219 } 220 bmp->setInfo(canvas->imageInfo()); 221 if (!canvas->readPixels(bmp, 0, 0)) { 222 SkDebugf("Can't read canvas pixels.\n"); 223 return false; 224 } 225 return true; 226 } 227 228 SkGLContext* gl() { return fGL; } 229 230private: 231 SkGLContext* fGL; 232 SkAutoTDelete<SkSurface> fSurface; 233}; 234 235static bool write_canvas_png(GPUTarget* target, const SkString& filename) { 236 237 if (filename.isEmpty()) { 238 return false; 239 } 240 if (target->getCanvas() && 241 kUnknown_SkColorType == target->getCanvas()->imageInfo().colorType()) { 242 return false; 243 } 244 245 SkBitmap bmp; 246 247 if (!target->capturePixels(&bmp)) { 248 return false; 249 } 250 251 SkString dir = SkOSPath::Dirname(filename.c_str()); 252 if (!sk_mkdir(dir.c_str())) { 253 SkDebugf("Can't make dir %s.\n", dir.c_str()); 254 return false; 255 } 256 SkFILEWStream stream(filename.c_str()); 257 if (!stream.isValid()) { 258 SkDebugf("Can't write %s.\n", filename.c_str()); 259 return false; 260 } 261 if (!SkImageEncoder::EncodeStream(&stream, bmp, SkImageEncoder::kPNG_Type, 100)) { 262 SkDebugf("Can't encode a PNG.\n"); 263 return false; 264 } 265 return true; 266} 267 268static int detect_forever_loops(int loops) { 269 // look for a magic run-forever value 270 if (loops < 0) { 271 loops = SK_MaxS32; 272 } 273 return loops; 274} 275 276static int clamp_loops(int loops) { 277 if (loops < 1) { 278 SkDebugf("ERROR: clamping loops from %d to 1. " 279 "There's probably something wrong with the bench.\n", loops); 280 return 1; 281 } 282 if (loops > FLAGS_maxLoops) { 283 SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops); 284 return FLAGS_maxLoops; 285 } 286 return loops; 287} 288 289static double now_ms() { return SkTime::GetNSecs() * 1e-6; } 290 291struct TimingThread { 292 TimingThread(SkGLContext* mainContext) 293 : fFenceSync(mainContext->fenceSync()) 294 , fMainContext(mainContext) 295 , fDone(false) {} 296 297 static void Loop(void* data) { 298 TimingThread* timingThread = reinterpret_cast<TimingThread*>(data); 299 timingThread->timingLoop(); 300 } 301 302 // To ensure waiting for the sync actually does something, we check to make sure the we exceed 303 // some small value 304 const double kMinElapsed = 1e-6; 305 bool sanity(double start) const { 306 double elapsed = now_ms() - start; 307 return elapsed > kMinElapsed; 308 } 309 310 void waitFence(SkPlatformGpuFence sync) { 311 SkDEBUGCODE(double start = now_ms()); 312 fFenceSync->waitFence(sync, false); 313 SkASSERT(sanity(start)); 314 } 315 316 void timingLoop() { 317 // Create a context which shares display lists with the main thread 318 SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard, 319 fMainContext)); 320 glContext->makeCurrent(); 321 322 // Basic timing methodology is: 323 // 1) Wait on semaphore until main thread indicates its time to start timing the frame 324 // 2) Wait on frame start sync, record time. This is start of the frame. 325 // 3) Wait on semaphore until main thread indicates its time to finish timing the frame 326 // 4) Wait on frame end sync, record time. FrameEndTime - FrameStartTime = frame time 327 // 5) Wait on semaphore until main thread indicates we should time the next frame or quit 328 while (true) { 329 fSemaphore.wait(); 330 331 // get start sync 332 SkPlatformGpuFence startSync = this->popStartSync(); 333 334 // wait on sync 335 this->waitFence(startSync); 336 double start = kilobench::now_ms(); 337 338 // do we want to sleep here? 339 // wait for end sync 340 fSemaphore.wait(); 341 342 // get end sync 343 SkPlatformGpuFence endSync = this->popEndSync(); 344 345 // wait on sync 346 this->waitFence(endSync); 347 double elapsed = kilobench::now_ms() - start; 348 349 // No mutex needed, client won't touch timings until we're done 350 fTimings.push_back(elapsed); 351 352 // clean up fences 353 fFenceSync->deleteFence(startSync); 354 fFenceSync->deleteFence(endSync); 355 356 fSemaphore.wait(); 357 if (this->isDone()) { 358 break; 359 } 360 } 361 } 362 363 void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); } 364 365 SkPlatformGpuFence popStartSync() { 366 return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); 367 } 368 369 void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); } 370 371 SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); } 372 373 void setDone() { 374 SkAutoMutexAcquire done(fDoneMutex); 375 fDone = true; 376 fSemaphore.signal(); 377 } 378 379 typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue; 380 381 void pushSync(SyncQueue* queue, SkMutex* mutex) { 382 SkAutoMutexAcquire am(mutex); 383 *queue->addToHead() = fFenceSync->insertFence(); 384 fSemaphore.signal(); 385 } 386 387 SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) { 388 SkAutoMutexAcquire am(mutex); 389 SkPlatformGpuFence sync = *queue->head(); 390 queue->popHead(); 391 return sync; 392 } 393 394 bool isDone() { 395 SkAutoMutexAcquire am1(fFrameStartSyncsMutex); 396 SkAutoMutexAcquire done(fDoneMutex); 397 if (fDone && fFrameStartSyncs.isEmpty()) { 398 return true; 399 } else { 400 return false; 401 } 402 } 403 404 const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; } 405 406private: 407 SkGpuFenceSync* fFenceSync; 408 SkSemaphore fSemaphore; 409 SkMutex fFrameStartSyncsMutex; 410 SyncQueue fFrameStartSyncs; 411 SkMutex fFrameEndSyncsMutex; 412 SyncQueue fFrameEndSyncs; 413 SkTArray<double> fTimings; 414 SkMutex fDoneMutex; 415 SkGLContext* fMainContext; 416 bool fDone; 417}; 418 419static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) { 420 SkCanvas* canvas = target->getCanvas(); 421 canvas->clear(SK_ColorWHITE); 422 bench->preDraw(canvas); 423 424 if (timingThread) { 425 timingThread->pushStartSync(); 426 } 427 double start = now_ms(); 428 canvas = target->beginTiming(canvas); 429 bench->draw(loops, canvas); 430 canvas->flush(); 431 target->endTiming(timingThread ? true : false); 432 433 double elapsed = now_ms() - start; 434 if (timingThread) { 435 timingThread->pushEndSync(); 436 timingThread->setDone(); 437 } 438 bench->postDraw(canvas); 439 return elapsed; 440} 441 442// TODO For now we don't use the background timing thread to tune loops 443static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) { 444 // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs. 445 int loops = bench->calculateLoops(FLAGS_loops); 446 if (kAutoTuneLoops == loops) { 447 loops = 1; 448 double elapsed = 0; 449 do { 450 if (1<<30 == loops) { 451 // We're about to wrap. Something's wrong with the bench. 452 loops = 0; 453 break; 454 } 455 loops *= 2; 456 // If the GPU lets frames lag at all, we need to make sure we're timing 457 // _this_ round, not still timing last round. 458 for (int i = 0; i < maxGpuFrameLag; i++) { 459 elapsed = time(loops, bench, target, nullptr); 460 } 461 } while (elapsed < FLAGS_gpuMs); 462 463 // We've overshot at least a little. Scale back linearly. 464 loops = (int)ceil(loops * FLAGS_gpuMs / elapsed); 465 loops = clamp_loops(loops); 466 467 // Make sure we're not still timing our calibration. 468 target->finish(); 469 } else { 470 loops = detect_forever_loops(loops); 471 } 472 473 // Pretty much the same deal as the calibration: do some warmup to make 474 // sure we're timing steady-state pipelined frames. 475 for (int i = 0; i < maxGpuFrameLag - 1; i++) { 476 time(loops, bench, target, nullptr); 477 } 478 479 return loops; 480} 481 482struct AutoSetupContextBenchAndTarget { 483 AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) { 484 GrContextOptions grContextOpts; 485 fCtxFactory.reset(new GrContextFactory(grContextOpts)); 486 487 SkAssertResult(fTarget.init(bench, fCtxFactory, false, 488 GrContextFactory::kNative_GLContextType, 489 GrContextFactory::kNone_GLContextOptions, 0)); 490 491 fCanvas = fTarget.getCanvas(); 492 fTarget.setup(); 493 494 bench->perCanvasPreDraw(fCanvas); 495 fTarget.needsFrameTiming(&fMaxFrameLag); 496 } 497 498 int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); } 499 500 double timeSample(int loops, TimingThread* timingThread) { 501 for (int i = 0; i < fMaxFrameLag; i++) { 502 time(loops, fBenchmark, &fTarget, timingThread); 503 } 504 505 return time(loops, fBenchmark, &fTarget, timingThread) / loops; 506 } 507 508 void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); } 509 510 SkAutoTDelete<GrContextFactory> fCtxFactory; 511 GPUTarget fTarget; 512 SkCanvas* fCanvas; 513 Benchmark* fBenchmark; 514 int fMaxFrameLag; 515}; 516 517int setup_loops(Benchmark* bench) { 518 AutoSetupContextBenchAndTarget ascbt(bench); 519 int loops = ascbt.getLoops(); 520 ascbt.teardownBench(); 521 522 if (!FLAGS_writePath.isEmpty() && FLAGS_writePath[0]) { 523 SkString pngFilename = SkOSPath::Join(FLAGS_writePath[0], "gpu"); 524 pngFilename = SkOSPath::Join(pngFilename.c_str(), bench->getUniqueName()); 525 pngFilename.append(".png"); 526 write_canvas_png(&ascbt.fTarget, pngFilename); 527 } 528 return loops; 529} 530 531struct Sample { 532 double fCpu; 533 double fGpu; 534}; 535 536Sample time_sample(Benchmark* bench, int loops) { 537 AutoSetupContextBenchAndTarget ascbt(bench); 538 539 Sample sample; 540 if (FLAGS_useBackgroundThread) { 541 TimingThread timingThread(ascbt.fTarget.gl()); 542 SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread)); 543 nativeThread->start(); 544 sample.fCpu = ascbt.timeSample(loops, &timingThread); 545 nativeThread->join(); 546 547 // return the min 548 double min = SK_ScalarMax; 549 for (int i = 0; i < timingThread.timings().count(); i++) { 550 min = SkTMin(min, timingThread.timings()[i]); 551 } 552 sample.fGpu = min; 553 } else { 554 sample.fCpu = ascbt.timeSample(loops, nullptr); 555 } 556 557 ascbt.teardownBench(); 558 559 return sample; 560} 561 562} // namespace kilobench 563 564static const int kOutResultSize = 1024; 565 566void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) { 567 SkString newName(name); 568 newName.appendf("_%s", mod); 569 Stats stats(samples); 570 const double stddev_percent = 100 * sqrt(stats.var) / stats.mean; 571 SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n" 572 , loops 573 , HUMANIZE(stats.min) 574 , HUMANIZE(stats.median) 575 , HUMANIZE(stats.mean) 576 , HUMANIZE(stats.max) 577 , stddev_percent 578 , stats.plot.c_str() 579 , "gpu" 580 , newName.c_str() 581 ); 582} 583 584int kilobench_main() { 585 kilobench::BenchmarkStream benchStream; 586 587 SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n", 588 FLAGS_samples, "samples"); 589 590 int descriptors[2]; 591 if (pipe(descriptors) != 0) { 592 SkFAIL("Failed to open a pipe\n"); 593 } 594 595 while (Benchmark* b = benchStream.next()) { 596 SkAutoTDelete<Benchmark> bench(b); 597 598 int loops = 1; 599 SkTArray<double> cpuSamples; 600 SkTArray<double> gpuSamples; 601 for (int i = 0; i < FLAGS_samples + 1; i++) { 602 // We fork off a new process to setup the grcontext and run the test while we wait 603 if (FLAGS_useMultiProcess) { 604 int childPid = fork(); 605 if (childPid > 0) { 606 char result[kOutResultSize]; 607 if (read(descriptors[0], result, kOutResultSize) < 0) { 608 SkFAIL("Failed to read from pipe\n"); 609 } 610 611 // if samples == 0 then parse # of loops 612 // else parse float 613 if (i == 0) { 614 sscanf(result, "%d", &loops); 615 } else { 616 sscanf(result, "%lf %lf", &cpuSamples.push_back(), 617 &gpuSamples.push_back()); 618 } 619 620 // wait until exit 621 int status; 622 waitpid(childPid, &status, 0); 623 } else if (0 == childPid) { 624 char result[kOutResultSize]; 625 if (i == 0) { 626 sprintf(result, "%d", kilobench::setup_loops(bench)); 627 } else { 628 kilobench::Sample sample = kilobench::time_sample(bench, loops); 629 sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu); 630 } 631 632 // Make sure to write the null terminator 633 if (write(descriptors[1], result, strlen(result) + 1) < 0) { 634 SkFAIL("Failed to write to pipe\n"); 635 } 636 return 0; 637 } else { 638 SkFAIL("Fork failed\n"); 639 } 640 } else { 641 if (i == 0) { 642 loops = kilobench::setup_loops(bench); 643 } else { 644 kilobench::Sample sample = kilobench::time_sample(bench, loops); 645 cpuSamples.push_back(sample.fCpu); 646 gpuSamples.push_back(sample.fGpu); 647 } 648 } 649 } 650 651 printResult(cpuSamples, loops, bench->getUniqueName(), "cpu"); 652 if (FLAGS_useBackgroundThread) { 653 printResult(gpuSamples, loops, bench->getUniqueName(), "gpu"); 654 } 655 } 656 return 0; 657} 658 659#if !defined SK_BUILD_FOR_IOS 660int main(int argc, char** argv) { 661 SkCommandLineFlags::Parse(argc, argv); 662 return kilobench_main(); 663} 664#endif 665