1fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot/* 2fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Copyright 2016 Google Inc. 3fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * 4fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Use of this source code is governed by a BSD-style license that can be 5fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * found in the LICENSE file. 6fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot */ 7fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 8fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include "Benchmark.h" 9fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include "SkTypes.h" 10fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 11fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot/** 12fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * There's a good variety of ways to pack from int down to uint16_t with SSE, 13fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * depending on the specific instructions available. 14fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * 15fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSE2 offers an int -> int16_t pack instruction. We can use this in two ways: 16fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - subtract off 32768, int -> int16_t, add 32768 back (sse2_a) 17fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b) 18fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3) 19fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * SSE41 added an int -> uint16_t pack instruction. (sse41) 20fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * 21fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * Findings so far: 22fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - sse41 < ssse3 <<< sse2_b < sse2_a; 23fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - the ssse3 version is only slightly slower than the sse41 version, maybe not at all 24fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - the sse2_a is only slightly slower than the sse2_b version 25fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - the ssse3 and sse41 versions are about 3x faster than either sse2 version 26fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot * - the sse41 version seems to cause some code generation trouble. 27fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot */ 28fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 29fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 30fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 31fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#include <immintrin.h> 32fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 33fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robottemplate <__m128i (kernel)(__m128i)> 34fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotclass pack_int_uint16_t_Bench : public Benchmark { 35fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotpublic: 36fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot pack_int_uint16_t_Bench(const char* impl) { 37fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot fName.append("pack_int_uint16_t_"); 38fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot fName.append(impl); 39fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 40fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 41fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } 42fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot const char* onGetName() override { return fName.c_str(); } 43fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 44fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot void onDraw(int loops, SkCanvas*) override { 45fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot __m128i x = _mm_set1_epi32(0x42424242); 46fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot while (loops --> 0) { 47fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot x = kernel(x); 48fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 49fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 50fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot volatile int blackhole = 0; 51fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot blackhole ^= _mm_cvtsi128_si32(x); 52fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 53fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 54fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot SkString fName; 55fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot}; 56fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 57fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotnamespace { 58fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot __m128i sse2_a(__m128i x) { 59fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000)); 60fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000)); 61fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 62fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot} 63fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team RobotDEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); ) 64fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 65fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotnamespace { 66fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot __m128i sse2_b(__m128i x) { 67fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16); 68fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot return _mm_packs_epi32(x,x); 69fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 70fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot} 71fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team RobotDEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); ) 72fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 73fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 74fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotnamespace { 75fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot __m128i ssse3(__m128i x) { 76fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic. 77fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot const int _ = ~0; 78fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); 79fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 80fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot} 81fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team RobotDEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); ) 82fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif 83fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 84fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 85fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robotnamespace { 86fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot __m128i sse41(__m128i x) { 87fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot return _mm_packus_epi32(x,x); 88fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot } 89fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot} 90fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team RobotDEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); ) 91fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif 92fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot 93fe17456d5e528078ce69b5f15cf7adf1fab963fandroid-build-team Robot#endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 94