1/* 2 * Copyright 2009 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBitmapFilter_opts_SSE2.h" 9#include "SkBitmapProcState_opts_SSE2.h" 10#include "SkBitmapProcState_opts_SSSE3.h" 11#include "SkBitmapScaler.h" 12#include "SkBlitMask.h" 13#include "SkBlitRow.h" 14#include "SkBlitRow_opts_SSE2.h" 15#include "SkBlitRow_opts_SSE4.h" 16#include "SkOncePtr.h" 17#include "SkRTConf.h" 18 19#if defined(_MSC_VER) && defined(_WIN64) 20#include <intrin.h> 21#endif 22 23/* This file must *not* be compiled with -msse or any other optional SIMD 24 extension, otherwise gcc may generate SIMD instructions even for scalar ops 25 (and thus give an invalid instruction on Pentium3 on the code below). 26 For example, only files named *_SSE2.cpp in this directory should be 27 compiled with -msse2 or higher. */ 28 29 30/* Function to get the CPU SSE-level in runtime, for different compilers. */ 31#ifdef _MSC_VER 32static inline void getcpuid(int info_type, int info[4]) { 33#if defined(_WIN64) 34 __cpuid(info, info_type); 35#else 36 __asm { 37 mov eax, [info_type] 38 cpuid 39 mov edi, [info] 40 mov [edi], eax 41 mov [edi+4], ebx 42 mov [edi+8], ecx 43 mov [edi+12], edx 44 } 45#endif 46} 47#elif defined(__x86_64__) 48static inline void getcpuid(int info_type, int info[4]) { 49 asm volatile ( 50 "cpuid \n\t" 51 : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) 52 : "a"(info_type) 53 ); 54} 55#else 56static inline void getcpuid(int info_type, int info[4]) { 57 // We save and restore ebx, so this code can be compatible with -fPIC 58 asm volatile ( 59 "pushl %%ebx \n\t" 60 "cpuid \n\t" 61 "movl %%ebx, %1 \n\t" 62 "popl %%ebx \n\t" 63 : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3]) 64 : "a"(info_type) 65 ); 66} 67#endif 68 69//////////////////////////////////////////////////////////////////////////////// 70 71/* Fetch the SIMD level directly from the CPU, at run-time. 72 * Only checks the levels needed by the optimizations in this file. 73 */ 74static int* get_SIMD_level() { 75 int cpu_info[4] = { 0, 0, 0, 0 }; 76 getcpuid(1, cpu_info); 77 78 int* level = new int; 79 80 if ((cpu_info[2] & (1<<20)) != 0) { 81 *level = SK_CPU_SSE_LEVEL_SSE42; 82 } else if ((cpu_info[2] & (1<<19)) != 0) { 83 *level = SK_CPU_SSE_LEVEL_SSE41; 84 } else if ((cpu_info[2] & (1<<9)) != 0) { 85 *level = SK_CPU_SSE_LEVEL_SSSE3; 86 } else if ((cpu_info[3] & (1<<26)) != 0) { 87 *level = SK_CPU_SSE_LEVEL_SSE2; 88 } else { 89 *level = 0; 90 } 91 return level; 92} 93 94SK_DECLARE_STATIC_ONCE_PTR(int, gSIMDLevel); 95 96/* Verify that the requested SIMD level is supported in the build. 97 * If not, check if the platform supports it. 98 */ 99static inline bool supports_simd(int minLevel) { 100#if defined(SK_CPU_SSE_LEVEL) 101 if (minLevel <= SK_CPU_SSE_LEVEL) { 102 return true; 103 } else 104#endif 105 { 106#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) 107 /* For the Android framework we should always know at compile time if the device 108 * we are building for supports SSSE3. The one exception to this rule is on the 109 * emulator where we are compiled without the -mssse3 option (so we have no 110 * SSSE3 procs) but can be run on a host machine that supports SSSE3 111 * instructions. So for that particular case we disable our SSSE3 options. 112 */ 113 return false; 114#else 115 return minLevel <= *gSIMDLevel.get(get_SIMD_level); 116#endif 117 } 118} 119 120//////////////////////////////////////////////////////////////////////////////// 121 122void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) { 123 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 124 procs->fExtraHorizontalReads = 3; 125 procs->fConvolveVertically = &convolveVertically_SSE2; 126 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; 127 procs->fConvolveHorizontally = &convolveHorizontally_SSE2; 128 procs->fApplySIMDPadding = &applySIMDPadding_SSE2; 129 } 130} 131 132//////////////////////////////////////////////////////////////////////////////// 133 134void SkBitmapProcState::platformProcs() { 135 /* Every optimization in the function requires at least SSE2 */ 136 if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 137 return; 138 } 139 const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3); 140 141 /* Check fSampleProc32 */ 142 if (fSampleProc32 == S32_opaque_D32_filter_DX) { 143 if (ssse3) { 144 fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3; 145 } else { 146 fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; 147 } 148 } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) { 149 if (ssse3) { 150 fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3; 151 } 152 } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { 153 if (ssse3) { 154 fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3; 155 } else { 156 fSampleProc32 = S32_alpha_D32_filter_DX_SSE2; 157 } 158 } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) { 159 if (ssse3) { 160 fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3; 161 } 162 } 163 164 /* Check fMatrixProc */ 165 if (fMatrixProc == ClampX_ClampY_filter_scale) { 166 fMatrixProc = ClampX_ClampY_filter_scale_SSE2; 167 } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) { 168 fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2; 169 } else if (fMatrixProc == ClampX_ClampY_filter_affine) { 170 fMatrixProc = ClampX_ClampY_filter_affine_SSE2; 171 } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) { 172 fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2; 173 } 174} 175 176//////////////////////////////////////////////////////////////////////////////// 177 178static const SkBlitRow::Proc16 platform_16_procs[] = { 179 S32_D565_Opaque_SSE2, // S32_D565_Opaque 180 nullptr, // S32_D565_Blend 181 S32A_D565_Opaque_SSE2, // S32A_D565_Opaque 182 nullptr, // S32A_D565_Blend 183 S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither 184 nullptr, // S32_D565_Blend_Dither 185 S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither 186 nullptr, // S32A_D565_Blend_Dither 187}; 188 189SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { 190 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 191 return platform_16_procs[flags]; 192 } else { 193 return nullptr; 194 } 195} 196 197static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = { 198 Color32A_D565_SSE2, // Color32A_D565, 199 nullptr, // Color32A_D565_Dither 200}; 201 202SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { 203/* If you're thinking about writing an SSE4 version of this, do check it's 204 * actually faster on Atom. Our original SSE4 version was slower than this 205 * SSE2 version on Silvermont, and only marginally faster on a Core i7, 206 * mainly due to the MULLD timings. 207 */ 208 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 209 return platform_565_colorprocs_SSE2[flags]; 210 } else { 211 return nullptr; 212 } 213} 214 215static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = { 216 nullptr, // S32_Opaque, 217 S32_Blend_BlitRow32_SSE2, // S32_Blend, 218 S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque 219 S32A_Blend_BlitRow32_SSE2, // S32A_Blend, 220}; 221 222static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = { 223 nullptr, // S32_Opaque, 224 S32_Blend_BlitRow32_SSE2, // S32_Blend, 225 S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque 226 S32A_Blend_BlitRow32_SSE2, // S32A_Blend, 227}; 228 229SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { 230 if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) { 231 return platform_32_procs_SSE4[flags]; 232 } else 233 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 234 return platform_32_procs_SSE2[flags]; 235 } else { 236 return nullptr; 237 } 238} 239 240//////////////////////////////////////////////////////////////////////////////// 241 242SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { 243 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 244 if (isOpaque) { 245 return SkBlitLCD16OpaqueRow_SSE2; 246 } else { 247 return SkBlitLCD16Row_SSE2; 248 } 249 } else { 250 return nullptr; 251 } 252 253} 254 255SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) { 256 return nullptr; 257} 258