1fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot/*
2fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Copyright 2016 Google Inc.
3fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *
4fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Use of this source code is governed by a BSD-style license that can be
5fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * found in the LICENSE file.
6fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot */
7fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
8fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include "Benchmark.h"
9fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include "SkTypes.h"
10fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
11fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot/**
12fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * There's a good variety of ways to pack from int down to uint16_t with SSE,
13fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * depending on the specific instructions available.
14fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *
15fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSE2 offers an int -> int16_t pack instruction.  We can use this in two ways:
16fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *    - subtract off 32768, int -> int16_t, add 32768 back                                  (sse2_a)
17fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *    - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b)
18fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSSE3 adds a byte shuffle, so we just put the bytes where we want them.                  (ssse3)
19fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSE41 added an int -> uint16_t pack instruction.                                         (sse41)
20fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *
21fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Findings so far:
 *   - ordered by running time (smaller is faster): sse41 < ssse3 <<< sse2_b < sse2_a;
23fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *   - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
24fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *   - the sse2_a is only slightly slower than the sse2_b version
25fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *   - the ssse3 and sse41 versions are about 3x faster than either sse2 version
26fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot *   - the sse41 version seems to cause some code generation trouble.
27fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot */
28fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
29fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
30fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
31fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include <immintrin.h>
32fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
33fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robottemplate <__m128i (kernel)(__m128i)>
34fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotclass pack_int_uint16_t_Bench : public Benchmark {
35fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotpublic:
36fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    pack_int_uint16_t_Bench(const char* impl) {
37fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        fName.append("pack_int_uint16_t_");
38fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        fName.append(impl);
39fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    }
40fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
41fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
42fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    const char* onGetName() override { return fName.c_str(); }
43fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
44fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    void onDraw(int loops, SkCanvas*) override {
45fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        __m128i x = _mm_set1_epi32(0x42424242);
46fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        while (loops --> 0) {
47fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot            x = kernel(x);
48fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        }
49fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
50fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        volatile int blackhole = 0;
51fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot        blackhole ^= _mm_cvtsi128_si32(x);
52fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    }
53fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
54fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot    SkString fName;
55fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot};
56fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
namespace {
    // SSE2 strategy "a": bias each 32-bit lane down by 32768, narrow with the signed
    // int -> int16_t pack, then add the bias back in 16-bit lanes.  The same input is
    // given to both halves of the pack, so the four results appear twice in the output.
    __m128i sse2_a(__m128i x) {
        const __m128i bias32 = _mm_set1_epi32(0x8000);
        const __m128i bias16 = _mm_set1_epi16(static_cast<short>(0x8000));

        const __m128i lowered  = _mm_sub_epi32(x, bias32);
        const __m128i narrowed = _mm_packs_epi32(lowered, lowered);
        return _mm_add_epi16(narrowed, bias16);
    }
}
// Builds the harness around the sse2_a kernel; its name becomes "pack_int_uint16_t_sse2_a".
// NOTE(review): DEF_BENCH is not defined in this file -- presumably it comes from Benchmark.h
// and registers this factory expression with the bench runner; confirm there.
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
64fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
namespace {
    // SSE2 strategy "b": sign-extend the low 16 bits of every 32-bit lane (shift left 16,
    // arithmetic shift right 16), then narrow with the signed int -> int16_t pack.  The
    // same input is given to both halves of the pack, so the four results appear twice.
    __m128i sse2_b(__m128i x) {
        const __m128i raised       = _mm_slli_epi32(x, 16);
        const __m128i signExtended = _mm_srai_epi32(raised, 16);
        return _mm_packs_epi32(signExtended, signExtended);
    }
}
// Builds the harness around the sse2_b kernel; its name becomes "pack_int_uint16_t_sse2_b".
// NOTE(review): DEF_BENCH is not defined in this file -- presumably it comes from Benchmark.h
// and registers this factory expression with the bench runner; confirm there.
DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )
72fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
73fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
namespace {
    // SSSE3 strategy: one byte shuffle moves the low two bytes of each 32-bit lane into the
    // low eight bytes of the result.  The remaining selector bytes are all-ones, which the
    // shuffle instruction turns into zero bytes in the output.
    __m128i ssse3(__m128i x) {
        // TODO: Can we force the bench to load the mask inside the loop?  Would be more realistic.
        const int zeroed = ~0;
        const __m128i selector = _mm_setr_epi8(0, 1,  4, 5,  8, 9,  12, 13,
                                               zeroed, zeroed, zeroed, zeroed,
                                               zeroed, zeroed, zeroed, zeroed);
        return _mm_shuffle_epi8(x, selector);
    }
}
// Builds the harness around the ssse3 kernel; its name becomes "pack_int_uint16_t_ssse3".
// NOTE(review): DEF_BENCH is not defined in this file -- presumably it comes from Benchmark.h
// and registers this factory expression with the bench runner; confirm there.
DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
82fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif
83fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
84fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
namespace {
    // SSE4.1 strategy: the dedicated int -> uint16_t pack instruction does the whole job.
    // The same input is given to both halves of the pack, so the four results appear twice.
    __m128i sse41(__m128i x) {
        const __m128i narrowed = _mm_packus_epi32(x, x);
        return narrowed;
    }
}
// Builds the harness around the sse41 kernel; its name becomes "pack_int_uint16_t_sse41".
// NOTE(review): DEF_BENCH is not defined in this file -- presumably it comes from Benchmark.h
// and registers this factory expression with the bench runner; confirm there.
DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
91fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif
92fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot
93fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif  // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
94