1/*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBitmapFilter_opts_SSE2.h"
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBitmapProcState_opts_SSSE3.h"
11#include "SkBlitMask.h"
12#include "SkBlitRect_opts_SSE2.h"
13#include "SkBlitRow.h"
14#include "SkBlitRow_opts_SSE2.h"
15#include "SkBlurImage_opts_SSE2.h"
16#include "SkMorphology_opts.h"
17#include "SkMorphology_opts_SSE2.h"
18#include "SkRTConf.h"
19#include "SkUtils.h"
20#include "SkUtils_opts_SSE2.h"
21#include "SkXfermode.h"
22#include "SkXfermode_proccoeff.h"
23
24#if defined(_MSC_VER) && defined(_WIN64)
25#include <intrin.h>
26#endif
27
/* This file must *not* be compiled with -msse or any other optional SIMD
   extension; otherwise gcc may generate SIMD instructions even for scalar
   operations, and the code below would then execute an invalid instruction
   on a CPU that lacks the extension (e.g. a Pentium3).
   For example, only files named *_SSE2.cpp in this directory should be
   compiled with -msse2 or higher. */
33
34
/* Executes the CPUID instruction at run time, with one variant per
 * compiler / target combination.
 *
 * info_type: the CPUID leaf, loaded into EAX before the instruction runs.
 * info:      receives the resulting EAX, EBX, ECX and EDX values in
 *            info[0], info[1], info[2] and info[3] respectively.
 *
 * Every variant selects sub-leaf 0 (ECX = 0) so they all behave the same for
 * leaves whose output depends on ECX.
 */
#ifdef _MSC_VER
static inline void getcpuid(int info_type, int info[4]) {
#if defined(_WIN64)
    // Use the compiler intrinsic declared in <intrin.h> on 64-bit targets.
    __cpuid(info, info_type);
#else
    __asm {
        mov    eax, [info_type]
        xor    ecx, ecx             // sub-leaf 0, matching the other variants
        cpuid
        mov    edi, [info]
        mov    [edi], eax
        mov    [edi+4], ebx
        mov    [edi+8], ecx
        mov    [edi+12], edx
    }
#endif
}
#elif defined(__x86_64__)
static inline void getcpuid(int info_type, int info[4]) {
    __asm__ __volatile__ (
        "cpuid \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#else
static inline void getcpuid(int info_type, int info[4]) {
    // We save and restore ebx, so this code can be compatible with -fPIC.
    // The EBX result is copied out through EDI ("=D") rather than through an
    // arbitrary register: with a plain "=r" the compiler may pick EBX itself
    // in non-PIC builds, and the popl below would then overwrite the value
    // we just captured.
    __asm__ __volatile__ (
        "pushl %%ebx      \n\t"
        "cpuid            \n\t"
        "movl %%ebx, %1   \n\t"
        "popl %%ebx       \n\t"
        : "=a"(info[0]), "=D"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#endif
73
74////////////////////////////////////////////////////////////////////////////////
75
76/* Fetch the SIMD level directly from the CPU, at run-time.
77 * Only checks the levels needed by the optimizations in this file.
78 */
79static int get_SIMD_level() {
80    int cpu_info[4] = { 0 };
81
82    getcpuid(1, cpu_info);
83    if ((cpu_info[2] & (1<<20)) != 0) {
84        return SK_CPU_SSE_LEVEL_SSE42;
85    } else if ((cpu_info[2] & (1<<9)) != 0) {
86        return SK_CPU_SSE_LEVEL_SSSE3;
87    } else if ((cpu_info[3] & (1<<26)) != 0) {
88        return SK_CPU_SSE_LEVEL_SSE2;
89    } else {
90        return 0;
91    }
92}
93
94/* Verify that the requested SIMD level is supported in the build.
95 * If not, check if the platform supports it.
96 */
97static inline bool supports_simd(int minLevel) {
98#if defined(SK_CPU_SSE_LEVEL)
99    if (minLevel <= SK_CPU_SSE_LEVEL) {
100        return true;
101    } else
102#endif
103    {
104#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
105        /* For the Android framework we should always know at compile time if the device
106         * we are building for supports SSSE3.  The one exception to this rule is on the
107         * emulator where we are compiled without the -mssse3 option (so we have no
108         * SSSE3 procs) but can be run on a host machine that supports SSSE3
109         * instructions. So for that particular case we disable our SSSE3 options.
110         */
111        return false;
112#else
113        static int gSIMDLevel = get_SIMD_level();
114        return (minLevel <= gSIMDLevel);
115#endif
116    }
117}
118
119////////////////////////////////////////////////////////////////////////////////
120
121SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", false, "Use SSE optimized version of high quality image filters");
122
123void SkBitmapProcState::platformConvolutionProcs(SkConvolutionProcs* procs) {
124    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
125        procs->fExtraHorizontalReads = 3;
126        procs->fConvolveVertically = &convolveVertically_SSE2;
127        procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
128        procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
129        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
130    }
131}
132
133////////////////////////////////////////////////////////////////////////////////
134
135void SkBitmapProcState::platformProcs() {
136    /* Every optimization in the function requires at least SSE2 */
137    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
138        return;
139    }
140
141    /* Check fSampleProc32 */
142    if (fSampleProc32 == S32_opaque_D32_filter_DX) {
143        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
144            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
145        } else {
146            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
147        }
148    } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
149        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
150            fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
151        }
152    } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
153        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
154            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
155        } else {
156            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
157        }
158    } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
159        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
160            fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
161        }
162    }
163
164    /* Check fSampleProc16 */
165    if (fSampleProc16 == S32_D16_filter_DX) {
166        fSampleProc16 = S32_D16_filter_DX_SSE2;
167    }
168
169    /* Check fMatrixProc */
170    if (fMatrixProc == ClampX_ClampY_filter_scale) {
171        fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
172    } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
173        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
174    } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
175        fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
176    } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
177        fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
178    }
179
180    /* Check fShaderProc32 */
181    if (c_hqfilter_sse) {
182        if (fShaderProc32 == highQualityFilter32) {
183            fShaderProc32 = highQualityFilter_SSE2;
184        }
185    }
186}
187
188////////////////////////////////////////////////////////////////////////////////
189
190static SkBlitRow::Proc platform_16_procs[] = {
191    S32_D565_Opaque_SSE2,               // S32_D565_Opaque
192    NULL,                               // S32_D565_Blend
193    S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
194    NULL,                               // S32A_D565_Blend
195    S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
196    NULL,                               // S32_D565_Blend_Dither
197    S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
198    NULL,                               // S32A_D565_Blend_Dither
199};
200
201SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
202    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
203        return platform_16_procs[flags];
204    } else {
205        return NULL;
206    }
207}
208
209static SkBlitRow::Proc32 platform_32_procs[] = {
210    NULL,                               // S32_Opaque,
211    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
212    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
213    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
214};
215
216SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
217    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
218        return platform_32_procs[flags];
219    } else {
220        return NULL;
221    }
222}
223
224SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
225    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
226        return Color32_SSE2;
227    } else {
228        return NULL;
229    }
230}
231
232SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
233
234SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
235/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
236    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
237        return ColorRect32_SSE2;
238    } else {
239        return NULL;
240    }
241*/
242    return NULL;
243}
244
245////////////////////////////////////////////////////////////////////////////////
246
247SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
248                                                     SkMask::Format maskFormat,
249                                                     SkColor color) {
250    if (SkMask::kA8_Format != maskFormat) {
251        return NULL;
252    }
253
254    ColorProc proc = NULL;
255    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
256        switch (dstCT) {
257            case kN32_SkColorType:
258                // The SSE2 version is not (yet) faster for black, so we check
259                // for that.
260                if (SK_ColorBLACK != color) {
261                    proc = SkARGB32_A8_BlitMask_SSE2;
262                }
263                break;
264            default:
265                break;
266        }
267    }
268    return proc;
269}
270
271SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
272    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
273        if (isOpaque) {
274            return SkBlitLCD16OpaqueRow_SSE2;
275        } else {
276            return SkBlitLCD16Row_SSE2;
277        }
278    } else {
279        return NULL;
280    }
281
282}
283
284SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
285    return NULL;
286}
287
288////////////////////////////////////////////////////////////////////////////////
289
290SkMemset16Proc SkMemset16GetPlatformProc() {
291    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
292        return sk_memset16_SSE2;
293    } else {
294        return NULL;
295    }
296}
297
298SkMemset32Proc SkMemset32GetPlatformProc() {
299    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
300        return sk_memset32_SSE2;
301    } else {
302        return NULL;
303    }
304}
305
306SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
307    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
308        return sk_memcpy32_SSE2;
309    } else {
310        return NULL;
311    }
312}
313
314////////////////////////////////////////////////////////////////////////////////
315
316SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
317    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
318        return NULL;
319    }
320    switch (type) {
321        case kDilateX_SkMorphologyProcType:
322            return SkDilateX_SSE2;
323        case kDilateY_SkMorphologyProcType:
324            return SkDilateY_SSE2;
325        case kErodeX_SkMorphologyProcType:
326            return SkErodeX_SSE2;
327        case kErodeY_SkMorphologyProcType:
328            return SkErodeY_SSE2;
329        default:
330            return NULL;
331    }
332}
333
334////////////////////////////////////////////////////////////////////////////////
335
336bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
337                               SkBoxBlurProc* boxBlurY,
338                               SkBoxBlurProc* boxBlurXY,
339                               SkBoxBlurProc* boxBlurYX) {
340#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
341    return false;
342#else
343    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
344        return false;
345    }
346    return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
347#endif
348}
349
350////////////////////////////////////////////////////////////////////////////////
351
352extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
353                                                                SkXfermode::Mode mode);
354
355SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
356                                                    SkXfermode::Mode mode);
357
358SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
359                                                    SkXfermode::Mode mode) {
360    return NULL;
361}
362
363SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
364                                               SkXfermode::Mode mode);
365
366SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
367                                               SkXfermode::Mode mode) {
368    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
369        return SkPlatformXfermodeFactory_impl_SSE2(rec, mode);
370    } else {
371        return SkPlatformXfermodeFactory_impl(rec, mode);
372    }
373}
374
375SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode);
376
377SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) {
378    return NULL;
379}
380