1/*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBitmapFilter_opts_SSE2.h"
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBitmapProcState_opts_SSSE3.h"
11#include "SkBitmapScaler.h"
12#include "SkBlitMask.h"
13#include "SkBlitRow.h"
14#include "SkBlitRow_opts_SSE2.h"
15#include "SkBlitRow_opts_SSE4.h"
16#include "SkOncePtr.h"
17#include "SkRTConf.h"
18
19#if defined(_MSC_VER) && defined(_WIN64)
20#include <intrin.h>
21#endif
22
/* This file must *not* be compiled with -msse or any other optional SIMD
   extension; otherwise gcc may generate SIMD instructions even for scalar
   operations (and thus hit an invalid instruction on a Pentium 3 while
   running the code below).
   For example, only files named *_SSE2.cpp in this directory should be
   compiled with -msse2 or higher. */
28
29
/* Helper that runs CPUID to query the CPU's SSE level at run time; one variant per compiler/target. */
31#ifdef _MSC_VER
32static inline void getcpuid(int info_type, int info[4]) {
33#if defined(_WIN64)
34    __cpuid(info, info_type);
35#else
36    __asm {
37        mov    eax, [info_type]
38        cpuid
39        mov    edi, [info]
40        mov    [edi], eax
41        mov    [edi+4], ebx
42        mov    [edi+8], ecx
43        mov    [edi+12], edx
44    }
45#endif
46}
47#elif defined(__x86_64__)
/* GCC/Clang x86-64 flavour of getcpuid(): executes CPUID with `info_type` in
 * EAX and stores the resulting EAX, EBX, ECX and EDX values in info[0],
 * info[1], info[2] and info[3] respectively.
 */
static inline void getcpuid(int info_type, int info[4]) {
    int eax = 0;
    int ebx = 0;
    int ecx = 0;
    int edx = 0;

    asm volatile (
        "cpuid \n\t"
        : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
        : "a"(info_type)
    );

    info[0] = eax;
    info[1] = ebx;
    info[2] = ecx;
    info[3] = edx;
}
55#else
/* GCC/Clang 32-bit x86 flavour of getcpuid(): executes CPUID with `info_type`
 * in EAX and stores the resulting EAX, EBX, ECX and EDX values in info[0],
 * info[1], info[2] and info[3] respectively.
 *
 * EBX is deliberately kept out of the operand list so that this code stays
 * compatible with -fPIC. Its incoming value is parked in EDI across the CPUID
 * instruction and swapped back afterwards, which leaves CPUID's EBX result in
 * EDI. The EBX output is pinned to EDI ("=D") rather than left to the
 * register allocator ("=r"): a free choice could land on EBX itself, in which
 * case restoring EBX would also overwrite the value being returned. Using a
 * register instead of pushl/popl also keeps the stack pointer untouched inside
 * the asm statement.
 */
static inline void getcpuid(int info_type, int info[4]) {
    asm volatile (
        "movl %%ebx, %%edi   \n\t"   // save the incoming EBX in EDI
        "cpuid               \n\t"
        "xchgl %%ebx, %%edi  \n\t"   // EDI = CPUID's EBX, EBX = saved value
        : "=a"(info[0]), "=D"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type)
    );
}
67#endif
68
69////////////////////////////////////////////////////////////////////////////////
70
71/* Fetch the SIMD level directly from the CPU, at run-time.
72 * Only checks the levels needed by the optimizations in this file.
73 */
74static int* get_SIMD_level() {
75    int cpu_info[4] = { 0, 0, 0, 0 };
76    getcpuid(1, cpu_info);
77
78    int* level = new int;
79
80    if ((cpu_info[2] & (1<<20)) != 0) {
81        *level = SK_CPU_SSE_LEVEL_SSE42;
82    } else if ((cpu_info[2] & (1<<19)) != 0) {
83        *level = SK_CPU_SSE_LEVEL_SSE41;
84    } else if ((cpu_info[2] & (1<<9)) != 0) {
85        *level = SK_CPU_SSE_LEVEL_SSSE3;
86    } else if ((cpu_info[3] & (1<<26)) != 0) {
87        *level = SK_CPU_SSE_LEVEL_SSE2;
88    } else {
89        *level = 0;
90    }
91    return level;
92}
93
94SK_DECLARE_STATIC_ONCE_PTR(int, gSIMDLevel);
95
96/* Verify that the requested SIMD level is supported in the build.
97 * If not, check if the platform supports it.
98 */
99static inline bool supports_simd(int minLevel) {
100#if defined(SK_CPU_SSE_LEVEL)
101    if (minLevel <= SK_CPU_SSE_LEVEL) {
102        return true;
103    } else
104#endif
105    {
106#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
107        /* For the Android framework we should always know at compile time if the device
108         * we are building for supports SSSE3.  The one exception to this rule is on the
109         * emulator where we are compiled without the -mssse3 option (so we have no
110         * SSSE3 procs) but can be run on a host machine that supports SSSE3
111         * instructions. So for that particular case we disable our SSSE3 options.
112         */
113        return false;
114#else
115        return minLevel <= *gSIMDLevel.get(get_SIMD_level);
116#endif
117    }
118}
119
120////////////////////////////////////////////////////////////////////////////////
121
122void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
123    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
124        procs->fExtraHorizontalReads = 3;
125        procs->fConvolveVertically = &convolveVertically_SSE2;
126        procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
127        procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
128        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
129    }
130}
131
132////////////////////////////////////////////////////////////////////////////////
133
134void SkBitmapProcState::platformProcs() {
135    /* Every optimization in the function requires at least SSE2 */
136    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
137        return;
138    }
139    const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3);
140
141    /* Check fSampleProc32 */
142    if (fSampleProc32 == S32_opaque_D32_filter_DX) {
143        if (ssse3) {
144            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
145        } else {
146            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
147        }
148    } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
149        if (ssse3) {
150            fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
151        }
152    } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
153        if (ssse3) {
154            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
155        } else {
156            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
157        }
158    } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
159        if (ssse3) {
160            fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
161        }
162    }
163
164    /* Check fMatrixProc */
165    if (fMatrixProc == ClampX_ClampY_filter_scale) {
166        fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
167    } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
168        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
169    } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
170        fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
171    } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
172        fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
173    }
174}
175
176////////////////////////////////////////////////////////////////////////////////
177
178static const SkBlitRow::Proc16 platform_16_procs[] = {
179    S32_D565_Opaque_SSE2,               // S32_D565_Opaque
180    nullptr,                               // S32_D565_Blend
181    S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
182    nullptr,                               // S32A_D565_Blend
183    S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
184    nullptr,                               // S32_D565_Blend_Dither
185    S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
186    nullptr,                               // S32A_D565_Blend_Dither
187};
188
189SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
190    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
191        return platform_16_procs[flags];
192    } else {
193        return nullptr;
194    }
195}
196
197static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = {
198    Color32A_D565_SSE2,                 // Color32A_D565,
199    nullptr,                               // Color32A_D565_Dither
200};
201
202SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
203/* If you're thinking about writing an SSE4 version of this, do check it's
204 * actually faster on Atom. Our original SSE4 version was slower than this
205 * SSE2 version on Silvermont, and only marginally faster on a Core i7,
206 * mainly due to the MULLD timings.
207 */
208    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
209        return platform_565_colorprocs_SSE2[flags];
210    } else {
211        return nullptr;
212    }
213}
214
215static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
216    nullptr,                               // S32_Opaque,
217    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
218    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
219    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
220};
221
222static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
223    nullptr,                               // S32_Opaque,
224    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
225    S32A_Opaque_BlitRow32_SSE4,         // S32A_Opaque
226    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
227};
228
229SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
230    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
231        return platform_32_procs_SSE4[flags];
232    } else
233    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
234        return platform_32_procs_SSE2[flags];
235    } else {
236        return nullptr;
237    }
238}
239
240////////////////////////////////////////////////////////////////////////////////
241
242SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
243    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
244        if (isOpaque) {
245            return SkBlitLCD16OpaqueRow_SSE2;
246        } else {
247            return SkBlitLCD16Row_SSE2;
248        }
249    } else {
250        return nullptr;
251    }
252
253}
254
255SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
256    return nullptr;
257}
258