1/*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBitmapFilter_opts_SSE2.h"
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBitmapProcState_opts_SSSE3.h"
11#include "SkBitmapScaler.h"
12#include "SkBlitMask.h"
13#include "SkBlitRect_opts_SSE2.h"
14#include "SkBlitRow.h"
15#include "SkBlitRow_opts_SSE2.h"
16#include "SkBlitRow_opts_SSE4.h"
17#include "SkBlurImage_opts_SSE2.h"
18#include "SkBlurImage_opts_SSE4.h"
19#include "SkMorphology_opts.h"
20#include "SkMorphology_opts_SSE2.h"
21#include "SkRTConf.h"
22#include "SkUtils.h"
23#include "SkUtils_opts_SSE2.h"
24#include "SkXfermode.h"
25#include "SkXfermode_proccoeff.h"
26
27#if defined(_MSC_VER) && defined(_WIN64)
28#include <intrin.h>
29#endif
30
31/* This file must *not* be compiled with -msse or any other optional SIMD
32   extension, otherwise gcc may generate SIMD instructions even for scalar ops
33   (and thus give an invalid instruction on Pentium3 on the code below).
34   For example, only files named *_SSE2.cpp in this directory should be
35   compiled with -msse2 or higher. */
36
37
/* Execute the CPUID instruction at run time, for the supported compilers.
 *
 * info_type is loaded into EAX to select the CPUID leaf. On return,
 * info[0], info[1], info[2] and info[3] hold EAX, EBX, ECX and EDX.
 *
 * The GCC-style and 32-bit MSVC paths zero ECX before executing cpuid so that
 * leaves which take a sub-leaf index always query sub-leaf 0 instead of
 * whatever happened to be in the register. The Win64 path relies on the
 * __cpuid intrinsic.
 */
#ifdef _MSC_VER
static inline void getcpuid(int info_type, int info[4]) {
#if defined(_WIN64)
    __cpuid(info, info_type);
#else
    __asm {
        mov    eax, [info_type]
        xor    ecx, ecx
        cpuid
        mov    edi, [info]
        mov    [edi], eax
        mov    [edi+4], ebx
        mov    [edi+8], ecx
        mov    [edi+12], edx
    }
#endif
}
#elif defined(__x86_64__)
static inline void getcpuid(int info_type, int info[4]) {
    __asm__ __volatile__ (
        "cpuid \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#else
static inline void getcpuid(int info_type, int info[4]) {
    // EBX may be reserved (e.g. as the GOT pointer under -fPIC), so it is
    // never named as an operand. Instead it is swapped with a scratch register
    // around cpuid: the first xchg parks the caller's EBX in %1, the second
    // restores it and leaves cpuid's EBX result in %1. If the compiler happens
    // to pick EBX itself for %1, both swaps are no-ops and the result is still
    // correct. %1 is early-clobber because it is written before cpuid has
    // consumed its inputs.
    __asm__ __volatile__ (
        "xchgl %%ebx, %1  \n\t"
        "cpuid            \n\t"
        "xchgl %%ebx, %1  \n\t"
        : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#endif
76
77////////////////////////////////////////////////////////////////////////////////
78
79/* Fetch the SIMD level directly from the CPU, at run-time.
80 * Only checks the levels needed by the optimizations in this file.
81 */
82static int get_SIMD_level() {
83    int cpu_info[4] = { 0 };
84
85    getcpuid(1, cpu_info);
86    if ((cpu_info[2] & (1<<20)) != 0) {
87        return SK_CPU_SSE_LEVEL_SSE42;
88    } else if ((cpu_info[2] & (1<<19)) != 0) {
89        return SK_CPU_SSE_LEVEL_SSE41;
90    } else if ((cpu_info[2] & (1<<9)) != 0) {
91        return SK_CPU_SSE_LEVEL_SSSE3;
92    } else if ((cpu_info[3] & (1<<26)) != 0) {
93        return SK_CPU_SSE_LEVEL_SSE2;
94    } else {
95        return 0;
96    }
97}
98
99/* Verify that the requested SIMD level is supported in the build.
100 * If not, check if the platform supports it.
101 */
102static inline bool supports_simd(int minLevel) {
103#if defined(SK_CPU_SSE_LEVEL)
104    if (minLevel <= SK_CPU_SSE_LEVEL) {
105        return true;
106    } else
107#endif
108    {
109#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
110        /* For the Android framework we should always know at compile time if the device
111         * we are building for supports SSSE3.  The one exception to this rule is on the
112         * emulator where we are compiled without the -mssse3 option (so we have no
113         * SSSE3 procs) but can be run on a host machine that supports SSSE3
114         * instructions. So for that particular case we disable our SSSE3 options.
115         */
116        return false;
117#else
118        static int gSIMDLevel = get_SIMD_level();
119        return (minLevel <= gSIMDLevel);
120#endif
121    }
122}
123
124////////////////////////////////////////////////////////////////////////////////
125
126SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", true, "Use SSE optimized version of high quality image filters");
127
128void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
129    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
130        procs->fExtraHorizontalReads = 3;
131        procs->fConvolveVertically = &convolveVertically_SSE2;
132        procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
133        procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
134        procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
135    }
136}
137
138////////////////////////////////////////////////////////////////////////////////
139
140void SkBitmapProcState::platformProcs() {
141    /* Every optimization in the function requires at least SSE2 */
142    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
143        return;
144    }
145
146    /* Check fSampleProc32 */
147    if (fSampleProc32 == S32_opaque_D32_filter_DX) {
148        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
149            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
150        } else {
151            fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
152        }
153    } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
154        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
155            fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
156        }
157    } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
158        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
159            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
160        } else {
161            fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
162        }
163    } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
164        if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
165            fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
166        }
167    }
168
169    /* Check fSampleProc16 */
170    if (fSampleProc16 == S32_D16_filter_DX) {
171        fSampleProc16 = S32_D16_filter_DX_SSE2;
172    }
173
174    /* Check fMatrixProc */
175    if (fMatrixProc == ClampX_ClampY_filter_scale) {
176        fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
177    } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
178        fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
179    } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
180        fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
181    } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
182        fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
183    }
184
185    /* Check fShaderProc32 */
186    if (c_hqfilter_sse) {
187        if (fShaderProc32 == highQualityFilter32) {
188            fShaderProc32 = highQualityFilter_SSE2;
189        }
190    }
191}
192
193////////////////////////////////////////////////////////////////////////////////
194
195static SkBlitRow::Proc platform_16_procs[] = {
196    S32_D565_Opaque_SSE2,               // S32_D565_Opaque
197    NULL,                               // S32_D565_Blend
198    S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
199    NULL,                               // S32A_D565_Blend
200    S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
201    NULL,                               // S32_D565_Blend_Dither
202    S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
203    NULL,                               // S32A_D565_Blend_Dither
204};
205
206SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
207    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
208        return platform_16_procs[flags];
209    } else {
210        return NULL;
211    }
212}
213
214static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
215    NULL,                               // S32_Opaque,
216    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
217    S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
218    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
219};
220
221#if defined(SK_ATT_ASM_SUPPORTED)
222static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
223    NULL,                               // S32_Opaque,
224    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
225    S32A_Opaque_BlitRow32_SSE4_asm,     // S32A_Opaque
226    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
227};
228#endif
229
230SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
231#if defined(SK_ATT_ASM_SUPPORTED)
232    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
233        return platform_32_procs_SSE4[flags];
234    } else
235#endif
236    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
237        return platform_32_procs_SSE2[flags];
238    } else {
239        return NULL;
240    }
241}
242
243SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
244    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
245        return Color32_SSE2;
246    } else {
247        return NULL;
248    }
249}
250
251SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
252
253SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
254/* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
255    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
256        return ColorRect32_SSE2;
257    } else {
258        return NULL;
259    }
260*/
261    return NULL;
262}
263
264////////////////////////////////////////////////////////////////////////////////
265
266SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
267                                                     SkMask::Format maskFormat,
268                                                     SkColor color) {
269    if (SkMask::kA8_Format != maskFormat) {
270        return NULL;
271    }
272
273    ColorProc proc = NULL;
274    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
275        switch (dstCT) {
276            case kN32_SkColorType:
277                // The SSE2 version is not (yet) faster for black, so we check
278                // for that.
279                if (SK_ColorBLACK != color) {
280                    proc = SkARGB32_A8_BlitMask_SSE2;
281                }
282                break;
283            default:
284                break;
285        }
286    }
287    return proc;
288}
289
290SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
291    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
292        if (isOpaque) {
293            return SkBlitLCD16OpaqueRow_SSE2;
294        } else {
295            return SkBlitLCD16Row_SSE2;
296        }
297    } else {
298        return NULL;
299    }
300
301}
302
303SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
304    return NULL;
305}
306
307////////////////////////////////////////////////////////////////////////////////
308
309SkMemset16Proc SkMemset16GetPlatformProc() {
310    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
311        return sk_memset16_SSE2;
312    } else {
313        return NULL;
314    }
315}
316
317SkMemset32Proc SkMemset32GetPlatformProc() {
318    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
319        return sk_memset32_SSE2;
320    } else {
321        return NULL;
322    }
323}
324
325SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
326    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
327        return sk_memcpy32_SSE2;
328    } else {
329        return NULL;
330    }
331}
332
333////////////////////////////////////////////////////////////////////////////////
334
335SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
336    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
337        return NULL;
338    }
339    switch (type) {
340        case kDilateX_SkMorphologyProcType:
341            return SkDilateX_SSE2;
342        case kDilateY_SkMorphologyProcType:
343            return SkDilateY_SSE2;
344        case kErodeX_SkMorphologyProcType:
345            return SkErodeX_SSE2;
346        case kErodeY_SkMorphologyProcType:
347            return SkErodeY_SSE2;
348        default:
349            return NULL;
350    }
351}
352
353////////////////////////////////////////////////////////////////////////////////
354
355bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
356                               SkBoxBlurProc* boxBlurY,
357                               SkBoxBlurProc* boxBlurXY,
358                               SkBoxBlurProc* boxBlurYX) {
359#ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
360    return false;
361#else
362    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
363        return SkBoxBlurGetPlatformProcs_SSE4(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
364    }
365    else if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
366        return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
367    }
368    return false;
369#endif
370}
371
372////////////////////////////////////////////////////////////////////////////////
373
374extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
375                                                                SkXfermode::Mode mode);
376
377SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
378                                                    SkXfermode::Mode mode);
379
380SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
381                                                    SkXfermode::Mode mode) {
382    return NULL;
383}
384
385SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
386                                               SkXfermode::Mode mode);
387
388SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
389                                               SkXfermode::Mode mode) {
390    if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
391        return SkPlatformXfermodeFactory_impl_SSE2(rec, mode);
392    } else {
393        return SkPlatformXfermodeFactory_impl(rec, mode);
394    }
395}
396
397SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode);
398
399SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) {
400    return NULL;
401}
402