1/*
2 * Copyright 2014 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "Benchmark.h"
9#include "SkRandom.h"
10#include "SkTemplates.h"
11#include "SkUtils.h"
12
13template <typename Memcpy32>
14class Memcpy32Bench : public Benchmark {
15public:
16    explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
17        : fCount(count)
18        , fMemcpy32(memcpy32)
19        , fName(SkStringPrintf("%s_%d", name, count)) {}
20
21    virtual const char* onGetName() SK_OVERRIDE {
22        return fName.c_str();
23    }
24
25    virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
26        return backend == kNonRendering_Backend;
27    }
28
29    virtual void onPreDraw() SK_OVERRIDE {
30        fDst.reset(fCount);
31        fSrc.reset(fCount);
32
33        SkRandom rand;
34        for (int i = 0; i < fCount; i++) {
35            fSrc[i] = rand.nextU();
36        }
37    }
38
39    virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
40        for (int i = 0; i < loops; i++) {
41            fMemcpy32(fDst, fSrc, fCount);
42        }
43    }
44
45private:
46    SkAutoTMalloc<uint32_t> fDst, fSrc;
47
48    int fCount;
49    Memcpy32 fMemcpy32;
50    const SkString fName;
51};
52
53template <typename Memcpy32>
54static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
55    return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
56}
57#define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
58
59
// Let the libc developers do what they think is best.
//
// Copies count uint32_t values from src to dst using memcpy.  The buffers must not overlap.
// A count <= 0 copies nothing, matching the hand-written loops in this file; without the
// guard a negative count would convert to an enormous size_t byte length.
static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
    if (count <= 0) {
        return;
    }
    memcpy(dst, src, sizeof(uint32_t) * count);
}
// Define the libc-memcpy benchmark at five buffer sizes (count of uint32_t elements).
// BENCH stringifies the function name, so these are named "memcpy32_memcpy_<count>".
BENCH(memcpy32_memcpy, 10)
BENCH(memcpy32_memcpy, 100)
BENCH(memcpy32_memcpy, 1000)
BENCH(memcpy32_memcpy, 10000)
BENCH(memcpy32_memcpy, 100000)
69
// Let the compiler's autovectorizer do what it thinks is best.
//
// Plain element-by-element copy of count uint32_t values; a count <= 0 copies nothing.
static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// Define the plain-loop benchmark at five buffer sizes (count of uint32_t elements).
// BENCH stringifies the function name, so these are named "memcpy32_autovectorize_<count>".
BENCH(memcpy32_autovectorize, 10)
BENCH(memcpy32_autovectorize, 100)
BENCH(memcpy32_autovectorize, 1000)
BENCH(memcpy32_autovectorize, 10000)
BENCH(memcpy32_autovectorize, 100000)
81
82#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
83
// Align dst to 16 bytes, then use aligned stores.  src isn't aligned, so use unaligned loads.
//
// Copies count uint32_t values.  Inputs shorter than 16 elements skip the SSE2 path and are
// copied one element at a time.
static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
    if (count >= 16) {
        // Peel off scalar copies until dst sits on a 16-byte boundary.
        for (; (reinterpret_cast<uintptr_t>(dst) & 0xF) != 0; count--) {
            *dst++ = *src++;
        }

        // Each block is four 128-bit vectors, i.e. 16 uint32_t values.
        const int blocks = count / 16;
        const __m128i* in = reinterpret_cast<const __m128i*>(src);
        __m128i* out = reinterpret_cast<__m128i*>(dst);
        for (int block = 0; block < blocks; block++) {
            // Issue all four loads before the four stores.
            const __m128i v0 = _mm_loadu_si128(in + 0);
            const __m128i v1 = _mm_loadu_si128(in + 1);
            const __m128i v2 = _mm_loadu_si128(in + 2);
            const __m128i v3 = _mm_loadu_si128(in + 3);

            _mm_store_si128(out + 0, v0);
            _mm_store_si128(out + 1, v1);
            _mm_store_si128(out + 2, v2);
            _mm_store_si128(out + 3, v3);

            in += 4;
            out += 4;
        }

        // Step the scalar cursors past everything the vector loop handled.
        dst += 16 * blocks;
        src += 16 * blocks;
        count -= 16 * blocks;
    }

    // Copy whatever is left (fewer than 16 elements) one at a time.
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// Define the aligned-store SSE2 benchmark at five buffer sizes (count of uint32_t elements).
// BENCH stringifies the function name, so these are named "memcpy32_sse2_align_<count>".
BENCH(memcpy32_sse2_align, 10)
BENCH(memcpy32_sse2_align, 100)
BENCH(memcpy32_sse2_align, 1000)
BENCH(memcpy32_sse2_align, 10000)
BENCH(memcpy32_sse2_align, 100000)
120
// Leave both dst and src unaligned, and so use unaligned stores for dst and unaligned loads
// for src.
//
// Copies count uint32_t values: whole blocks of 16 go through SSE2, and the remainder is
// copied one element at a time.
static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    // Each block is four 128-bit vectors, i.e. 16 uint32_t values.
    const int blocks = count / 16;
    const __m128i* in = reinterpret_cast<const __m128i*>(src);
    __m128i* out = reinterpret_cast<__m128i*>(dst);
    for (int block = 0; block < blocks; block++) {
        // Issue all four loads before the four stores.
        const __m128i v0 = _mm_loadu_si128(in + 0);
        const __m128i v1 = _mm_loadu_si128(in + 1);
        const __m128i v2 = _mm_loadu_si128(in + 2);
        const __m128i v3 = _mm_loadu_si128(in + 3);

        _mm_storeu_si128(out + 0, v0);
        _mm_storeu_si128(out + 1, v1);
        _mm_storeu_si128(out + 2, v2);
        _mm_storeu_si128(out + 3, v3);

        in += 4;
        out += 4;
    }

    // Step the scalar cursors past everything the vector loop handled.
    dst += 16 * blocks;
    src += 16 * blocks;
    count -= 16 * blocks;

    // Copy whatever is left (fewer than 16 elements) one at a time.
    for (int i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
// Define the unaligned-store SSE2 benchmark at five buffer sizes (count of uint32_t elements).
// BENCH stringifies the function name, so these are named "memcpy32_sse2_unalign_<count>".
BENCH(memcpy32_sse2_unalign, 10)
BENCH(memcpy32_sse2_unalign, 100)
BENCH(memcpy32_sse2_unalign, 1000)
BENCH(memcpy32_sse2_unalign, 10000)
BENCH(memcpy32_sse2_unalign, 100000)
150
// Test our chosen best, from SkUtils.h, at the same five buffer sizes as the candidates
// above so the results can be compared directly.  These are named "sk_memcpy32_<count>".
BENCH(sk_memcpy32, 10)
BENCH(sk_memcpy32, 100)
BENCH(sk_memcpy32, 1000)
BENCH(sk_memcpy32, 10000)
BENCH(sk_memcpy32, 100000)
157
158#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
159
160#undef BENCH
161