1/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "GrCaps.h"
9#include "GrContextFactory.h"
10#include "Benchmark.h"
11#include "ResultsWriter.h"
12#include "SkCommandLineFlags.h"
13#include "SkOSFile.h"
14#include "SkStream.h"
15#include "SkSurface.h"
16#include "SkTime.h"
17#include "SkTLList.h"
18#include "SkThreadUtils.h"
19#include "Stats.h"
20#include "Timer.h"
21#include "VisualSKPBench.h"
22#include "gl/GrGLDefines.h"
23#include "../private/SkMutex.h"
24#include "../private/SkSemaphore.h"
25#include "../private/SkGpuFenceSync.h"
26
27// posix only for now
28#include <unistd.h>
29#include <sys/types.h>
30#include <sys/wait.h>
31
32/*
33 * This is an experimental GPU only benchmarking program.  The initial implementation will only
34 * support SKPs.
35 */
36
37// To get image decoders linked in we have to do the below magic
38#include "SkForceLinking.h"
39#include "SkImageDecoder.h"
40__SK_FORCE_IMAGE_DECODER_LINKING;
41
42static const int kAutoTuneLoops = 0;
43
44static const int kDefaultLoops =
45#ifdef SK_DEBUG
46    1;
47#else
48    kAutoTuneLoops;
49#endif
50
51static SkString loops_help_txt() {
52    SkString help;
53    help.printf("Number of times to run each bench. Set this to %d to auto-"
54                "tune for each bench. Timings are only reported when auto-tuning.",
55                kAutoTuneLoops);
56    return help;
57}
58
59DEFINE_string(skps, "skps", "Directory to read skps from.");
60DEFINE_string2(match, m, nullptr,
61               "[~][^]substring[$] [...] of GM name to run.\n"
62               "Multiple matches may be separated by spaces.\n"
63               "~ causes a matching bench to always be skipped\n"
64               "^ requires the start of the bench to match\n"
65               "$ requires the end of the bench to match\n"
66               "^ and $ requires an exact match\n"
67               "If a bench does not match any list entry,\n"
68               "it is skipped unless some list entry starts with ~");
69DEFINE_int32(gpuFrameLag, 5, "If unknown, estimated maximum number of frames GPU allows to lag.");
70DEFINE_int32(samples, 10, "Number of samples to measure for each bench.");
71DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
72DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
73DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
74DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
75DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
76DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");
77
78static SkString humanize(double ms) {
79    return HumanizeMs(ms);
80}
81#define HUMANIZE(ms) humanize(ms).c_str()
82
83namespace kilobench {
84class BenchmarkStream {
85public:
86    BenchmarkStream() : fCurrentSKP(0) {
87        for (int i = 0; i < FLAGS_skps.count(); i++) {
88            if (SkStrEndsWith(FLAGS_skps[i], ".skp")) {
89                fSKPs.push_back() = FLAGS_skps[i];
90            } else {
91                SkOSFile::Iter it(FLAGS_skps[i], ".skp");
92                SkString path;
93                while (it.next(&path)) {
94                    fSKPs.push_back() = SkOSPath::Join(FLAGS_skps[0], path.c_str());
95                }
96            }
97        }
98    }
99
100    Benchmark* next() {
101        Benchmark* bench = nullptr;
102        // skips non matching benches
103        while ((bench = this->innerNext()) &&
104               (SkCommandLineFlags::ShouldSkip(FLAGS_match, bench->getUniqueName()) ||
105                !bench->isSuitableFor(Benchmark::kGPU_Backend))) {
106            delete bench;
107        }
108        return bench;
109    }
110
111private:
112    static bool ReadPicture(const char* path, SkAutoTUnref<SkPicture>* pic) {
113        // Not strictly necessary, as it will be checked again later,
114        // but helps to avoid a lot of pointless work if we're going to skip it.
115        if (SkCommandLineFlags::ShouldSkip(FLAGS_match, path)) {
116            return false;
117        }
118
119        SkAutoTDelete<SkStream> stream(SkStream::NewFromFile(path));
120        if (stream.get() == nullptr) {
121            SkDebugf("Could not read %s.\n", path);
122            return false;
123        }
124
125        pic->reset(SkPicture::CreateFromStream(stream.get()));
126        if (pic->get() == nullptr) {
127            SkDebugf("Could not read %s as an SkPicture.\n", path);
128            return false;
129        }
130        return true;
131    }
132
133    Benchmark* innerNext() {
134        // Render skps
135        while (fCurrentSKP < fSKPs.count()) {
136            const SkString& path = fSKPs[fCurrentSKP++];
137            SkAutoTUnref<SkPicture> pic;
138            if (!ReadPicture(path.c_str(), &pic)) {
139                continue;
140            }
141
142            SkString name = SkOSPath::Basename(path.c_str());
143            return new VisualSKPBench(name.c_str(), pic.get());
144        }
145
146        return nullptr;
147    }
148
149    SkTArray<SkString> fSKPs;
150    int fCurrentSKP;
151};
152
153struct GPUTarget {
154    void setup() {
155        fGL->makeCurrent();
156        // Make sure we're done with whatever came before.
157        SK_GL(*fGL, Finish());
158    }
159
160    SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }
161
162    void endTiming(bool usePlatformSwapBuffers) {
163        if (fGL) {
164            SK_GL(*fGL, Flush());
165            if (usePlatformSwapBuffers) {
166                fGL->swapBuffers();
167            } else {
168                fGL->waitOnSyncOrSwap();
169            }
170        }
171    }
172    void finish() {
173        SK_GL(*fGL, Finish());
174    }
175
176    bool needsFrameTiming(int* maxFrameLag) const {
177        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
178            // Frame lag is unknown.
179            *maxFrameLag = FLAGS_gpuFrameLag;
180        }
181        return true;
182    }
183
184    bool init(Benchmark* bench, GrContextFactory* factory, bool useDfText,
185              GrContextFactory::GLContextType ctxType,
186              GrContextFactory::GLContextOptions ctxOptions, int numSamples) {
187        GrContext* context = factory->get(ctxType, ctxOptions);
188        int maxRTSize = context->caps()->maxRenderTargetSize();
189        SkImageInfo info = SkImageInfo::Make(SkTMin(bench->getSize().fX, maxRTSize),
190                                             SkTMin(bench->getSize().fY, maxRTSize),
191                                              kN32_SkColorType, kPremul_SkAlphaType);
192        uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
193                                                  0;
194        SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
195        fSurface.reset(SkSurface::NewRenderTarget(context,
196                                                  SkBudgeted::kNo, info,
197                                                  numSamples, &props));
198        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
199        if (!fSurface.get()) {
200            return false;
201        }
202
203        // Kilobench should only be used on platforms with fence sync support
204        SkASSERT(fGL->fenceSyncSupport());
205        return true;
206    }
207
208    SkCanvas* getCanvas() const {
209        if (!fSurface.get()) {
210            return nullptr;
211        }
212        return fSurface->getCanvas();
213    }
214
215    bool capturePixels(SkBitmap* bmp) {
216        SkCanvas* canvas = this->getCanvas();
217        if (!canvas) {
218            return false;
219        }
220        bmp->setInfo(canvas->imageInfo());
221        if (!canvas->readPixels(bmp, 0, 0)) {
222            SkDebugf("Can't read canvas pixels.\n");
223            return false;
224        }
225        return true;
226    }
227
228    SkGLContext* gl() { return fGL; }
229
230private:
231    SkGLContext* fGL;
232    SkAutoTDelete<SkSurface> fSurface;
233};
234
235static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
236
237    if (filename.isEmpty()) {
238        return false;
239    }
240    if (target->getCanvas() &&
241        kUnknown_SkColorType == target->getCanvas()->imageInfo().colorType()) {
242        return false;
243    }
244
245    SkBitmap bmp;
246
247    if (!target->capturePixels(&bmp)) {
248        return false;
249    }
250
251    SkString dir = SkOSPath::Dirname(filename.c_str());
252    if (!sk_mkdir(dir.c_str())) {
253        SkDebugf("Can't make dir %s.\n", dir.c_str());
254        return false;
255    }
256    SkFILEWStream stream(filename.c_str());
257    if (!stream.isValid()) {
258        SkDebugf("Can't write %s.\n", filename.c_str());
259        return false;
260    }
261    if (!SkImageEncoder::EncodeStream(&stream, bmp, SkImageEncoder::kPNG_Type, 100)) {
262        SkDebugf("Can't encode a PNG.\n");
263        return false;
264    }
265    return true;
266}
267
268static int detect_forever_loops(int loops) {
269    // look for a magic run-forever value
270    if (loops < 0) {
271        loops = SK_MaxS32;
272    }
273    return loops;
274}
275
276static int clamp_loops(int loops) {
277    if (loops < 1) {
278        SkDebugf("ERROR: clamping loops from %d to 1. "
279                 "There's probably something wrong with the bench.\n", loops);
280        return 1;
281    }
282    if (loops > FLAGS_maxLoops) {
283        SkDebugf("WARNING: clamping loops from %d to FLAGS_maxLoops, %d.\n", loops, FLAGS_maxLoops);
284        return FLAGS_maxLoops;
285    }
286    return loops;
287}
288
289static double now_ms() { return SkTime::GetNSecs() * 1e-6; }
290
291struct TimingThread {
292    TimingThread(SkGLContext* mainContext)
293        : fFenceSync(mainContext->fenceSync())
294        ,  fMainContext(mainContext)
295        ,  fDone(false) {}
296
297    static void Loop(void* data) {
298        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
299        timingThread->timingLoop();
300    }
301
302    // To ensure waiting for the sync actually does something, we check to make sure the we exceed
303    // some small value
304    const double kMinElapsed = 1e-6;
305    bool sanity(double start) const {
306        double elapsed = now_ms() - start;
307        return elapsed > kMinElapsed;
308    }
309
310    void waitFence(SkPlatformGpuFence sync) {
311        SkDEBUGCODE(double start = now_ms());
312        fFenceSync->waitFence(sync, false);
313        SkASSERT(sanity(start));
314    }
315
316    void timingLoop() {
317        // Create a context which shares display lists with the main thread
318        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
319                                                                       fMainContext));
320        glContext->makeCurrent();
321
322        // Basic timing methodology is:
323        // 1) Wait on semaphore until main thread indicates its time to start timing the frame
324        // 2) Wait on frame start sync, record time.  This is start of the frame.
325        // 3) Wait on semaphore until main thread indicates its time to finish timing the frame
326        // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
327        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
328        while (true) {
329            fSemaphore.wait();
330
331            // get start sync
332            SkPlatformGpuFence startSync = this->popStartSync();
333
334            // wait on sync
335            this->waitFence(startSync);
336            double start = kilobench::now_ms();
337
338            // do we want to sleep here?
339            // wait for end sync
340            fSemaphore.wait();
341
342            // get end sync
343            SkPlatformGpuFence endSync = this->popEndSync();
344
345            // wait on sync
346            this->waitFence(endSync);
347            double elapsed = kilobench::now_ms() - start;
348
349            // No mutex needed, client won't touch timings until we're done
350            fTimings.push_back(elapsed);
351
352            // clean up fences
353            fFenceSync->deleteFence(startSync);
354            fFenceSync->deleteFence(endSync);
355
356            fSemaphore.wait();
357            if (this->isDone()) {
358                break;
359            }
360        }
361    }
362
363    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }
364
365    SkPlatformGpuFence popStartSync() {
366        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
367    }
368
369    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
370
371    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
372
373    void setDone() {
374        SkAutoMutexAcquire done(fDoneMutex);
375        fDone = true;
376        fSemaphore.signal();
377    }
378
379    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;
380
381    void pushSync(SyncQueue* queue, SkMutex* mutex) {
382        SkAutoMutexAcquire am(mutex);
383        *queue->addToHead() = fFenceSync->insertFence();
384        fSemaphore.signal();
385    }
386
387    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
388        SkAutoMutexAcquire am(mutex);
389        SkPlatformGpuFence sync = *queue->head();
390        queue->popHead();
391        return sync;
392    }
393
394    bool isDone() {
395        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
396        SkAutoMutexAcquire done(fDoneMutex);
397        if (fDone && fFrameStartSyncs.isEmpty()) {
398            return true;
399        } else {
400            return false;
401        }
402    }
403
404    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }
405
406private:
407    SkGpuFenceSync* fFenceSync;
408    SkSemaphore fSemaphore;
409    SkMutex fFrameStartSyncsMutex;
410    SyncQueue fFrameStartSyncs;
411    SkMutex fFrameEndSyncsMutex;
412    SyncQueue fFrameEndSyncs;
413    SkTArray<double> fTimings;
414    SkMutex fDoneMutex;
415    SkGLContext* fMainContext;
416    bool fDone;
417};
418
419static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
420    SkCanvas* canvas = target->getCanvas();
421    canvas->clear(SK_ColorWHITE);
422    bench->preDraw(canvas);
423
424    if (timingThread) {
425        timingThread->pushStartSync();
426    }
427    double start = now_ms();
428    canvas = target->beginTiming(canvas);
429    bench->draw(loops, canvas);
430    canvas->flush();
431    target->endTiming(timingThread ? true : false);
432
433    double elapsed = now_ms() - start;
434    if (timingThread) {
435        timingThread->pushEndSync();
436        timingThread->setDone();
437    }
438    bench->postDraw(canvas);
439    return elapsed;
440}
441
442// TODO For now we don't use the background timing thread to tune loops
443static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
444    // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
445    int loops = bench->calculateLoops(FLAGS_loops);
446    if (kAutoTuneLoops == loops) {
447        loops = 1;
448        double elapsed = 0;
449        do {
450            if (1<<30 == loops) {
451                // We're about to wrap.  Something's wrong with the bench.
452                loops = 0;
453                break;
454            }
455            loops *= 2;
456            // If the GPU lets frames lag at all, we need to make sure we're timing
457            // _this_ round, not still timing last round.
458            for (int i = 0; i < maxGpuFrameLag; i++) {
459                elapsed = time(loops, bench, target, nullptr);
460            }
461        } while (elapsed < FLAGS_gpuMs);
462
463        // We've overshot at least a little.  Scale back linearly.
464        loops = (int)ceil(loops * FLAGS_gpuMs / elapsed);
465        loops = clamp_loops(loops);
466
467        // Make sure we're not still timing our calibration.
468        target->finish();
469    } else {
470        loops = detect_forever_loops(loops);
471    }
472
473    // Pretty much the same deal as the calibration: do some warmup to make
474    // sure we're timing steady-state pipelined frames.
475    for (int i = 0; i < maxGpuFrameLag - 1; i++) {
476        time(loops, bench, target, nullptr);
477    }
478
479    return loops;
480}
481
482struct AutoSetupContextBenchAndTarget {
483    AutoSetupContextBenchAndTarget(Benchmark* bench) : fBenchmark(bench) {
484        GrContextOptions grContextOpts;
485        fCtxFactory.reset(new GrContextFactory(grContextOpts));
486
487        SkAssertResult(fTarget.init(bench, fCtxFactory, false,
488                                    GrContextFactory::kNative_GLContextType,
489                                    GrContextFactory::kNone_GLContextOptions, 0));
490
491        fCanvas = fTarget.getCanvas();
492        fTarget.setup();
493
494        bench->perCanvasPreDraw(fCanvas);
495        fTarget.needsFrameTiming(&fMaxFrameLag);
496    }
497
498    int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); }
499
500    double timeSample(int loops, TimingThread* timingThread) {
501        for (int i = 0; i < fMaxFrameLag; i++) {
502            time(loops, fBenchmark, &fTarget, timingThread);
503        }
504
505        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
506    }
507
508    void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }
509
510    SkAutoTDelete<GrContextFactory> fCtxFactory;
511    GPUTarget fTarget;
512    SkCanvas* fCanvas;
513    Benchmark* fBenchmark;
514    int fMaxFrameLag;
515};
516
517int setup_loops(Benchmark* bench) {
518    AutoSetupContextBenchAndTarget ascbt(bench);
519    int loops = ascbt.getLoops();
520    ascbt.teardownBench();
521
522    if (!FLAGS_writePath.isEmpty() && FLAGS_writePath[0]) {
523        SkString pngFilename = SkOSPath::Join(FLAGS_writePath[0], "gpu");
524        pngFilename = SkOSPath::Join(pngFilename.c_str(), bench->getUniqueName());
525        pngFilename.append(".png");
526        write_canvas_png(&ascbt.fTarget, pngFilename);
527    }
528    return loops;
529}
530
531struct Sample {
532    double fCpu;
533    double fGpu;
534};
535
536Sample time_sample(Benchmark* bench, int loops) {
537    AutoSetupContextBenchAndTarget ascbt(bench);
538
539    Sample sample;
540    if (FLAGS_useBackgroundThread) {
541        TimingThread timingThread(ascbt.fTarget.gl());
542        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
543        nativeThread->start();
544        sample.fCpu = ascbt.timeSample(loops, &timingThread);
545        nativeThread->join();
546
547        // return the min
548        double min = SK_ScalarMax;
549        for (int i = 0; i < timingThread.timings().count(); i++) {
550            min = SkTMin(min, timingThread.timings()[i]);
551        }
552        sample.fGpu = min;
553    } else {
554        sample.fCpu = ascbt.timeSample(loops, nullptr);
555    }
556
557    ascbt.teardownBench();
558
559    return sample;
560}
561
562} // namespace kilobench
563
564static const int kOutResultSize = 1024;
565
566void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
567    SkString newName(name);
568    newName.appendf("_%s", mod);
569    Stats stats(samples);
570    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
571    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
572        , loops
573        , HUMANIZE(stats.min)
574        , HUMANIZE(stats.median)
575        , HUMANIZE(stats.mean)
576        , HUMANIZE(stats.max)
577        , stddev_percent
578        , stats.plot.c_str()
579        , "gpu"
580        , newName.c_str()
581    );
582}
583
584int kilobench_main() {
585    kilobench::BenchmarkStream benchStream;
586
587    SkDebugf("loops\tmin\tmedian\tmean\tmax\tstddev\t%-*s\tconfig\tbench\n",
588             FLAGS_samples, "samples");
589
590    int descriptors[2];
591    if (pipe(descriptors) != 0) {
592        SkFAIL("Failed to open a pipe\n");
593    }
594
595    while (Benchmark* b = benchStream.next()) {
596        SkAutoTDelete<Benchmark> bench(b);
597
598        int loops = 1;
599        SkTArray<double> cpuSamples;
600        SkTArray<double> gpuSamples;
601        for (int i = 0; i < FLAGS_samples + 1; i++) {
602            // We fork off a new process to setup the grcontext and run the test while we wait
603            if (FLAGS_useMultiProcess) {
604                int childPid = fork();
605                if (childPid > 0) {
606                    char result[kOutResultSize];
607                    if (read(descriptors[0], result, kOutResultSize) < 0) {
608                         SkFAIL("Failed to read from pipe\n");
609                    }
610
611                    // if samples == 0 then parse # of loops
612                    // else parse float
613                    if (i == 0) {
614                        sscanf(result, "%d", &loops);
615                    } else {
616                        sscanf(result, "%lf %lf", &cpuSamples.push_back(),
617                                                  &gpuSamples.push_back());
618                    }
619
620                    // wait until exit
621                    int status;
622                    waitpid(childPid, &status, 0);
623                } else if (0 == childPid) {
624                    char result[kOutResultSize];
625                    if (i == 0) {
626                        sprintf(result, "%d", kilobench::setup_loops(bench));
627                    } else {
628                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
629                        sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu);
630                    }
631
632                    // Make sure to write the null terminator
633                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
634                        SkFAIL("Failed to write to pipe\n");
635                    }
636                    return 0;
637                } else {
638                    SkFAIL("Fork failed\n");
639                }
640            } else {
641                if (i == 0) {
642                    loops = kilobench::setup_loops(bench);
643                } else {
644                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
645                    cpuSamples.push_back(sample.fCpu);
646                    gpuSamples.push_back(sample.fGpu);
647                }
648            }
649        }
650
651        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
652        if (FLAGS_useBackgroundThread) {
653            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
654        }
655    }
656    return 0;
657}
658
659#if !defined SK_BUILD_FOR_IOS
660int main(int argc, char** argv) {
661    SkCommandLineFlags::Parse(argc, argv);
662    return kilobench_main();
663}
664#endif
665