// SkBlurMask.cpp — revision 91f489a65d436d36c7fe580af2775cd0cd13c8d2
1c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 2c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)/* 3c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * Copyright 2006 The Android Open Source Project 4c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * 590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) * Use of this source code is governed by a BSD-style license that can be 690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) * found in the LICENSE file. 790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) */ 8c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 9c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 10c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "SkBlurMask.h" 11c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "SkMath.h" 120de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#include "SkTemplates.h" 13c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "SkEndian.h" 14c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 150de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#define UNROLL_SEPARABLE_LOOPS 16c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 17c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)/** 18c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * This function performs a box blur in X, of the given radius. If the 19c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * "transpose" parameter is true, it will transpose the pixels on write, 20c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * such that X and Y are swapped. Reads are always performed from contiguous 21c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * memory in X, for speed. 
The destination buffer (dst) must be at least 22c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) * (width + leftRadius + rightRadius) * height bytes in size. 23c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) */ 24c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)static int boxBlur(const uint8_t* src, int src_y_stride, uint8_t* dst, 25c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int leftRadius, int rightRadius, int width, int height, 26c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) bool transpose) 27c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles){ 28c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int diameter = leftRadius + rightRadius; 29c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int kernelSize = diameter + 1; 30c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int border = SkMin32(width, diameter); 31c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) uint32_t scale = (1 << 24) / kernelSize; 32c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int new_width = width + SkMax32(leftRadius, rightRadius) * 2; 33c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int dst_x_stride = transpose ? height : 1; 34c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int dst_y_stride = transpose ? 
1 : new_width; 35c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (int y = 0; y < height; ++y) { 36c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) int sum = 0; 37c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) uint8_t* dptr = dst + y * dst_y_stride; 38c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const uint8_t* right = src + y * src_y_stride; 39c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const uint8_t* left = right; 40c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (int x = 0; x < rightRadius - leftRadius; x++) { 41c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) *dptr = 0; 42c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) dptr += dst_x_stride; 430de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) } 44c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#define LEFT_BORDER_ITER \ 45c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) sum += *right++; \ 46c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) *dptr = (sum * scale) >> 24; \ 47c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) dptr += dst_x_stride; 480de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) 490de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) int x = 0; 500de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#ifdef UNROLL_SEPARABLE_LOOPS 510de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) for (; x < border - 16; x += 16) { 520de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 530de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 540de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 550de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 560de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 570de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) 
LEFT_BORDER_ITER 580de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 590de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 600de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 610de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) LEFT_BORDER_ITER 62c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 63c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 64c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 65c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 66c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 67c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 68c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } 69c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#endif 70c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (; x < border; ++x) { 71c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) LEFT_BORDER_ITER 72c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } 73c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#undef LEFT_BORDER_ITER 74c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#define TRIVIAL_ITER \ 75c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) *dptr = (sum * scale) >> 24; \ 76c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) dptr += dst_x_stride; 77c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) x = width; 78c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#ifdef UNROLL_SEPARABLE_LOOPS 79c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (; x < diameter - 16; x += 16) { 80c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 81c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 
82c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 83c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 84c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 85c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 86c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 87c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 88c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 89c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 90c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 91c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 92c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 93c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 94c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 95c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) TRIVIAL_ITER 96c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } 970de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#endif 980de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) for (; x < diameter; ++x) { 990de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) TRIVIAL_ITER 1000de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) } 1010de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#undef TRIVIAL_ITER 1020de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles)#define CENTER_ITER \ 1030de6073388f4e2780db8536178b129cd8f6ab386Torne (Richard Coles) sum += *right++; \ 104c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) *dptr = (sum * scale) >> 24; \ 105c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) sum -= *left++; \ 106c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) dptr += dst_x_stride; 
107c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 108c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) x = diameter; 109c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#ifdef UNROLL_SEPARABLE_LOOPS 110c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (; x < width - 16; x += 16) { 111c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) CENTER_ITER 112c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) CENTER_ITER 113c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) CENTER_ITER 114c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) CENTER_ITER 115 CENTER_ITER 116 CENTER_ITER 117 CENTER_ITER 118 CENTER_ITER 119 CENTER_ITER 120 CENTER_ITER 121 CENTER_ITER 122 CENTER_ITER 123 CENTER_ITER 124 CENTER_ITER 125 CENTER_ITER 126 CENTER_ITER 127 } 128#endif 129 for (; x < width; ++x) { 130 CENTER_ITER 131 } 132#undef CENTER_ITER 133#define RIGHT_BORDER_ITER \ 134 *dptr = (sum * scale) >> 24; \ 135 sum -= *left++; \ 136 dptr += dst_x_stride; 137 138 x = 0; 139#ifdef UNROLL_SEPARABLE_LOOPS 140 for (; x < border - 16; x += 16) { 141 RIGHT_BORDER_ITER 142 RIGHT_BORDER_ITER 143 RIGHT_BORDER_ITER 144 RIGHT_BORDER_ITER 145 RIGHT_BORDER_ITER 146 RIGHT_BORDER_ITER 147 RIGHT_BORDER_ITER 148 RIGHT_BORDER_ITER 149 RIGHT_BORDER_ITER 150 RIGHT_BORDER_ITER 151 RIGHT_BORDER_ITER 152 RIGHT_BORDER_ITER 153 RIGHT_BORDER_ITER 154 RIGHT_BORDER_ITER 155 RIGHT_BORDER_ITER 156 RIGHT_BORDER_ITER 157 } 158#endif 159 for (; x < border; ++x) { 160 RIGHT_BORDER_ITER 161 } 162#undef RIGHT_BORDER_ITER 163 for (int x = 0; x < leftRadius - rightRadius; x++) { 164 *dptr = 0; 165 dptr += dst_x_stride; 166 } 167 SkASSERT(sum == 0); 168 } 169 return new_width; 170} 171 172/** 173 * This variant of the box blur handles blurring of non-integer radii. It 174 * keeps two running sums: an outer sum for the rounded-up kernel radius, and 175 * an inner sum for the rounded-down kernel radius. 
For each pixel, it linearly 176 * interpolates between them. In float this would be: 177 * outer_weight * outer_sum / kernelSize + 178 * (1.0 - outer_weight) * innerSum / (kernelSize - 2) 179 */ 180static int boxBlurInterp(const uint8_t* src, int src_y_stride, uint8_t* dst, 181 int radius, int width, int height, 182 bool transpose, uint8_t outer_weight) 183{ 184 int diameter = radius * 2; 185 int kernelSize = diameter + 1; 186 int border = SkMin32(width, diameter); 187 int inner_weight = 255 - outer_weight; 188 outer_weight += outer_weight >> 7; 189 inner_weight += inner_weight >> 7; 190 uint32_t outer_scale = (outer_weight << 16) / kernelSize; 191 uint32_t inner_scale = (inner_weight << 16) / (kernelSize - 2); 192 int new_width = width + diameter; 193 int dst_x_stride = transpose ? height : 1; 194 int dst_y_stride = transpose ? 1 : new_width; 195 for (int y = 0; y < height; ++y) { 196 int outer_sum = 0, inner_sum = 0; 197 uint8_t* dptr = dst + y * dst_y_stride; 198 const uint8_t* right = src + y * src_y_stride; 199 const uint8_t* left = right; 200 int x = 0; 201 202#define LEFT_BORDER_ITER \ 203 inner_sum = outer_sum; \ 204 outer_sum += *right++; \ 205 *dptr = (outer_sum * outer_scale + inner_sum * inner_scale) >> 24; \ 206 dptr += dst_x_stride; 207 208#ifdef UNROLL_SEPARABLE_LOOPS 209 for (;x < border - 16; x += 16) { 210 LEFT_BORDER_ITER 211 LEFT_BORDER_ITER 212 LEFT_BORDER_ITER 213 LEFT_BORDER_ITER 214 LEFT_BORDER_ITER 215 LEFT_BORDER_ITER 216 LEFT_BORDER_ITER 217 LEFT_BORDER_ITER 218 LEFT_BORDER_ITER 219 LEFT_BORDER_ITER 220 LEFT_BORDER_ITER 221 LEFT_BORDER_ITER 222 LEFT_BORDER_ITER 223 LEFT_BORDER_ITER 224 LEFT_BORDER_ITER 225 LEFT_BORDER_ITER 226 } 227#endif 228 229 for (;x < border; x++) { 230 LEFT_BORDER_ITER 231 } 232#undef LEFT_BORDER_ITER 233 for (int x = width; x < diameter; ++x) { 234 *dptr = (outer_sum * outer_scale + inner_sum * inner_scale) >> 24; 235 dptr += dst_x_stride; 236 } 237 x = diameter; 238 239#define CENTER_ITER \ 240 inner_sum = 
outer_sum - *left; \ 241 outer_sum += *right++; \ 242 *dptr = (outer_sum * outer_scale + inner_sum * inner_scale) >> 24; \ 243 dptr += dst_x_stride; \ 244 outer_sum -= *left++; 245 246#ifdef UNROLL_SEPARABLE_LOOPS 247 for (; x < width - 16; x += 16) { 248 CENTER_ITER 249 CENTER_ITER 250 CENTER_ITER 251 CENTER_ITER 252 CENTER_ITER 253 CENTER_ITER 254 CENTER_ITER 255 CENTER_ITER 256 CENTER_ITER 257 CENTER_ITER 258 CENTER_ITER 259 CENTER_ITER 260 CENTER_ITER 261 CENTER_ITER 262 CENTER_ITER 263 CENTER_ITER 264 } 265#endif 266 for (; x < width; ++x) { 267 CENTER_ITER 268 } 269#undef CENTER_ITER 270 271 #define RIGHT_BORDER_ITER \ 272 inner_sum = outer_sum - *left++; \ 273 *dptr = (outer_sum * outer_scale + inner_sum * inner_scale) >> 24; \ 274 dptr += dst_x_stride; \ 275 outer_sum = inner_sum; 276 277 x = 0; 278#ifdef UNROLL_SEPARABLE_LOOPS 279 for (; x < border - 16; x += 16) { 280 RIGHT_BORDER_ITER 281 RIGHT_BORDER_ITER 282 RIGHT_BORDER_ITER 283 RIGHT_BORDER_ITER 284 RIGHT_BORDER_ITER 285 RIGHT_BORDER_ITER 286 RIGHT_BORDER_ITER 287 RIGHT_BORDER_ITER 288 RIGHT_BORDER_ITER 289 RIGHT_BORDER_ITER 290 RIGHT_BORDER_ITER 291 RIGHT_BORDER_ITER 292 RIGHT_BORDER_ITER 293 RIGHT_BORDER_ITER 294 RIGHT_BORDER_ITER 295 RIGHT_BORDER_ITER 296 } 297#endif 298 for (; x < border; x++) { 299 RIGHT_BORDER_ITER 300 } 301#undef RIGHT_BORDER_ITER 302 SkASSERT(outer_sum == 0 && inner_sum == 0); 303 } 304 return new_width; 305} 306 307static void get_adjusted_radii(SkScalar passRadius, int *loRadius, int *hiRadius) 308{ 309 *loRadius = *hiRadius = SkScalarCeil(passRadius); 310 if (SkIntToScalar(*hiRadius) - passRadius > SkFloatToScalar(0.5f)) { 311 *loRadius = *hiRadius - 1; 312 } 313} 314 315// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows, 316// breakeven on Mac, and ~15% slowdown on Linux. 317// Reading a word at a time when bulding the sum buffer seems to give 318// us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux. 
319#if defined(SK_BUILD_FOR_WIN32) 320#define UNROLL_KERNEL_LOOP 1 321#endif 322 323/** The sum buffer is an array of u32 to hold the accumulated sum of all of the 324 src values at their position, plus all values above and to the left. 325 When we sample into this buffer, we need an initial row and column of 0s, 326 so we have an index correspondence as follows: 327 328 src[i, j] == sum[i+1, j+1] 329 sum[0, j] == sum[i, 0] == 0 330 331 We assume that the sum buffer's stride == its width 332 */ 333static void build_sum_buffer(uint32_t sum[], int srcW, int srcH, 334 const uint8_t src[], int srcRB) { 335 int sumW = srcW + 1; 336 337 SkASSERT(srcRB >= srcW); 338 // mod srcRB so we can apply it after each row 339 srcRB -= srcW; 340 341 int x, y; 342 343 // zero out the top row and column 344 memset(sum, 0, sumW * sizeof(sum[0])); 345 sum += sumW; 346 347 // special case first row 348 uint32_t X = 0; 349 *sum++ = 0; // initialze the first column to 0 350 for (x = srcW - 1; x >= 0; --x) { 351 X = *src++ + X; 352 *sum++ = X; 353 } 354 src += srcRB; 355 356 // now do the rest of the rows 357 for (y = srcH - 1; y > 0; --y) { 358 uint32_t L = 0; 359 uint32_t C = 0; 360 *sum++ = 0; // initialze the first column to 0 361 362 for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) { 363 uint32_t T = sum[-sumW]; 364 X = *src++ + L + T - C; 365 *sum++ = X; 366 L = X; 367 C = T; 368 } 369 370 for (; x >= 4; x-=4) { 371 uint32_t T = sum[-sumW]; 372 X = *src++ + L + T - C; 373 *sum++ = X; 374 L = X; 375 C = T; 376 T = sum[-sumW]; 377 X = *src++ + L + T - C; 378 *sum++ = X; 379 L = X; 380 C = T; 381 T = sum[-sumW]; 382 X = *src++ + L + T - C; 383 *sum++ = X; 384 L = X; 385 C = T; 386 T = sum[-sumW]; 387 X = *src++ + L + T - C; 388 *sum++ = X; 389 L = X; 390 C = T; 391 } 392 393 for (; x >= 0; --x) { 394 uint32_t T = sum[-sumW]; 395 X = *src++ + L + T - C; 396 *sum++ = X; 397 L = X; 398 C = T; 399 } 400 src += srcRB; 401 } 402} 403 404/** 405 * This is the path for 
apply_kernel() to be taken when the kernel 406 * is wider than the source image. 407 */ 408static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[], 409 int sw, int sh) { 410 SkASSERT(2*rx > sw); 411 412 uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1)); 413 414 int sumStride = sw + 1; 415 416 int dw = sw + 2*rx; 417 int dh = sh + 2*ry; 418 419 int prev_y = -2*ry; 420 int next_y = 1; 421 422 for (int y = 0; y < dh; y++) { 423 int py = SkClampPos(prev_y) * sumStride; 424 int ny = SkFastMin32(next_y, sh) * sumStride; 425 426 int prev_x = -2*rx; 427 int next_x = 1; 428 429 for (int x = 0; x < dw; x++) { 430 int px = SkClampPos(prev_x); 431 int nx = SkFastMin32(next_x, sw); 432 433 uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; 434 *dst++ = SkToU8(tmp * scale >> 24); 435 436 prev_x += 1; 437 next_x += 1; 438 } 439 440 prev_y += 1; 441 next_y += 1; 442 } 443} 444/** 445 * sw and sh are the width and height of the src. Since the sum buffer 446 * matches that, but has an extra row and col at the beginning (with zeros), 447 * we can just use sw and sh as our "max" values for pinning coordinates 448 * when sampling into sum[][] 449 * 450 * The inner loop is conceptually simple; we break it into several sections 451 * to improve performance. Here's the original version: 452 for (int x = 0; x < dw; x++) { 453 int px = SkClampPos(prev_x); 454 int nx = SkFastMin32(next_x, sw); 455 456 uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; 457 *dst++ = SkToU8(tmp * scale >> 24); 458 459 prev_x += 1; 460 next_x += 1; 461 } 462 * The sections are: 463 * left-hand section, where prev_x is clamped to 0 464 * center section, where neither prev_x nor next_x is clamped 465 * right-hand section, where next_x is clamped to sw 466 * On some operating systems, the center section is unrolled for additional 467 * speedup. 
468*/ 469static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[], 470 int sw, int sh) { 471 if (2*rx > sw) { 472 kernel_clamped(dst, rx, ry, sum, sw, sh); 473 return; 474 } 475 476 uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1)); 477 478 int sumStride = sw + 1; 479 480 int dw = sw + 2*rx; 481 int dh = sh + 2*ry; 482 483 int prev_y = -2*ry; 484 int next_y = 1; 485 486 SkASSERT(2*rx <= dw - 2*rx); 487 488 for (int y = 0; y < dh; y++) { 489 int py = SkClampPos(prev_y) * sumStride; 490 int ny = SkFastMin32(next_y, sh) * sumStride; 491 492 int prev_x = -2*rx; 493 int next_x = 1; 494 int x = 0; 495 496 for (; x < 2*rx; x++) { 497 SkASSERT(prev_x <= 0); 498 SkASSERT(next_x <= sw); 499 500 int px = 0; 501 int nx = next_x; 502 503 uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; 504 *dst++ = SkToU8(tmp * scale >> 24); 505 506 prev_x += 1; 507 next_x += 1; 508 } 509 510 int i0 = prev_x + py; 511 int i1 = next_x + ny; 512 int i2 = next_x + py; 513 int i3 = prev_x + ny; 514 515#if UNROLL_KERNEL_LOOP 516 for (; x < dw - 2*rx - 4; x += 4) { 517 SkASSERT(prev_x >= 0); 518 SkASSERT(next_x <= sw); 519 520 uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 521 *dst++ = SkToU8(tmp * scale >> 24); 522 tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 523 *dst++ = SkToU8(tmp * scale >> 24); 524 tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 525 *dst++ = SkToU8(tmp * scale >> 24); 526 tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 527 *dst++ = SkToU8(tmp * scale >> 24); 528 529 prev_x += 4; 530 next_x += 4; 531 } 532#endif 533 534 for (; x < dw - 2*rx; x++) { 535 SkASSERT(prev_x >= 0); 536 SkASSERT(next_x <= sw); 537 538 uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 539 *dst++ = SkToU8(tmp * scale >> 24); 540 541 prev_x += 1; 542 next_x += 1; 543 } 544 545 for (; x < dw; x++) { 546 SkASSERT(prev_x >= 0); 547 SkASSERT(next_x > sw); 548 549 int px = prev_x; 550 int nx = sw; 551 552 uint32_t tmp = 
sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny]; 553 *dst++ = SkToU8(tmp * scale >> 24); 554 555 prev_x += 1; 556 next_x += 1; 557 } 558 559 prev_y += 1; 560 next_y += 1; 561 } 562} 563 564/** 565 * This is the path for apply_kernel_interp() to be taken when the kernel 566 * is wider than the source image. 567 */ 568static void kernel_interp_clamped(uint8_t dst[], int rx, int ry, 569 const uint32_t sum[], int sw, int sh, U8CPU outer_weight) { 570 SkASSERT(2*rx > sw); 571 572 int inner_weight = 255 - outer_weight; 573 574 // round these guys up if they're bigger than 127 575 outer_weight += outer_weight >> 7; 576 inner_weight += inner_weight >> 7; 577 578 uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1)); 579 uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1)); 580 581 int sumStride = sw + 1; 582 583 int dw = sw + 2*rx; 584 int dh = sh + 2*ry; 585 586 int prev_y = -2*ry; 587 int next_y = 1; 588 589 for (int y = 0; y < dh; y++) { 590 int py = SkClampPos(prev_y) * sumStride; 591 int ny = SkFastMin32(next_y, sh) * sumStride; 592 593 int ipy = SkClampPos(prev_y + 1) * sumStride; 594 int iny = SkClampMax(next_y - 1, sh) * sumStride; 595 596 int prev_x = -2*rx; 597 int next_x = 1; 598 599 for (int x = 0; x < dw; x++) { 600 int px = SkClampPos(prev_x); 601 int nx = SkFastMin32(next_x, sw); 602 603 int ipx = SkClampPos(prev_x + 1); 604 int inx = SkClampMax(next_x - 1, sw); 605 606 uint32_t outer_sum = sum[px+py] + sum[nx+ny] 607 - sum[nx+py] - sum[px+ny]; 608 uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny] 609 - sum[inx+ipy] - sum[ipx+iny]; 610 *dst++ = SkToU8((outer_sum * outer_scale 611 + inner_sum * inner_scale) >> 24); 612 613 prev_x += 1; 614 next_x += 1; 615 } 616 prev_y += 1; 617 next_y += 1; 618 } 619} 620 621/** 622 * sw and sh are the width and height of the src. 
Since the sum buffer 623 * matches that, but has an extra row and col at the beginning (with zeros), 624 * we can just use sw and sh as our "max" values for pinning coordinates 625 * when sampling into sum[][] 626 * 627 * The inner loop is conceptually simple; we break it into several variants 628 * to improve performance. Here's the original version: 629 for (int x = 0; x < dw; x++) { 630 int px = SkClampPos(prev_x); 631 int nx = SkFastMin32(next_x, sw); 632 633 int ipx = SkClampPos(prev_x + 1); 634 int inx = SkClampMax(next_x - 1, sw); 635 636 uint32_t outer_sum = sum[px+py] + sum[nx+ny] 637 - sum[nx+py] - sum[px+ny]; 638 uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny] 639 - sum[inx+ipy] - sum[ipx+iny]; 640 *dst++ = SkToU8((outer_sum * outer_scale 641 + inner_sum * inner_scale) >> 24); 642 643 prev_x += 1; 644 next_x += 1; 645 } 646 * The sections are: 647 * left-hand section, where prev_x is clamped to 0 648 * center section, where neither prev_x nor next_x is clamped 649 * right-hand section, where next_x is clamped to sw 650 * On some operating systems, the center section is unrolled for additional 651 * speedup. 
652*/ 653static void apply_kernel_interp(uint8_t dst[], int rx, int ry, 654 const uint32_t sum[], int sw, int sh, U8CPU outer_weight) { 655 SkASSERT(rx > 0 && ry > 0); 656 SkASSERT(outer_weight <= 255); 657 658 if (2*rx > sw) { 659 kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outer_weight); 660 return; 661 } 662 663 int inner_weight = 255 - outer_weight; 664 665 // round these guys up if they're bigger than 127 666 outer_weight += outer_weight >> 7; 667 inner_weight += inner_weight >> 7; 668 669 uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1)); 670 uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1)); 671 672 int sumStride = sw + 1; 673 674 int dw = sw + 2*rx; 675 int dh = sh + 2*ry; 676 677 int prev_y = -2*ry; 678 int next_y = 1; 679 680 SkASSERT(2*rx <= dw - 2*rx); 681 682 for (int y = 0; y < dh; y++) { 683 int py = SkClampPos(prev_y) * sumStride; 684 int ny = SkFastMin32(next_y, sh) * sumStride; 685 686 int ipy = SkClampPos(prev_y + 1) * sumStride; 687 int iny = SkClampMax(next_y - 1, sh) * sumStride; 688 689 int prev_x = -2*rx; 690 int next_x = 1; 691 int x = 0; 692 693 for (; x < 2*rx; x++) { 694 SkASSERT(prev_x < 0); 695 SkASSERT(next_x <= sw); 696 697 int px = 0; 698 int nx = next_x; 699 700 int ipx = 0; 701 int inx = next_x - 1; 702 703 uint32_t outer_sum = sum[px+py] + sum[nx+ny] 704 - sum[nx+py] - sum[px+ny]; 705 uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny] 706 - sum[inx+ipy] - sum[ipx+iny]; 707 *dst++ = SkToU8((outer_sum * outer_scale 708 + inner_sum * inner_scale) >> 24); 709 710 prev_x += 1; 711 next_x += 1; 712 } 713 714 int i0 = prev_x + py; 715 int i1 = next_x + ny; 716 int i2 = next_x + py; 717 int i3 = prev_x + ny; 718 int i4 = prev_x + 1 + ipy; 719 int i5 = next_x - 1 + iny; 720 int i6 = next_x - 1 + ipy; 721 int i7 = prev_x + 1 + iny; 722 723#if UNROLL_KERNEL_LOOP 724 for (; x < dw - 2*rx - 4; x += 4) { 725 SkASSERT(prev_x >= 0); 726 SkASSERT(next_x <= sw); 727 728 uint32_t outer_sum = sum[i0++] + 
sum[i1++] - sum[i2++] - sum[i3++]; 729 uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; 730 *dst++ = SkToU8((outer_sum * outer_scale 731 + inner_sum * inner_scale) >> 24); 732 outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 733 inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; 734 *dst++ = SkToU8((outer_sum * outer_scale 735 + inner_sum * inner_scale) >> 24); 736 outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 737 inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; 738 *dst++ = SkToU8((outer_sum * outer_scale 739 + inner_sum * inner_scale) >> 24); 740 outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 741 inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; 742 *dst++ = SkToU8((outer_sum * outer_scale 743 + inner_sum * inner_scale) >> 24); 744 745 prev_x += 4; 746 next_x += 4; 747 } 748#endif 749 750 for (; x < dw - 2*rx; x++) { 751 SkASSERT(prev_x >= 0); 752 SkASSERT(next_x <= sw); 753 754 uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++]; 755 uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++]; 756 *dst++ = SkToU8((outer_sum * outer_scale 757 + inner_sum * inner_scale) >> 24); 758 759 prev_x += 1; 760 next_x += 1; 761 } 762 763 for (; x < dw; x++) { 764 SkASSERT(prev_x >= 0); 765 SkASSERT(next_x > sw); 766 767 int px = prev_x; 768 int nx = sw; 769 770 int ipx = prev_x + 1; 771 int inx = sw; 772 773 uint32_t outer_sum = sum[px+py] + sum[nx+ny] 774 - sum[nx+py] - sum[px+ny]; 775 uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny] 776 - sum[inx+ipy] - sum[ipx+iny]; 777 *dst++ = SkToU8((outer_sum * outer_scale 778 + inner_sum * inner_scale) >> 24); 779 780 prev_x += 1; 781 next_x += 1; 782 } 783 784 prev_y += 1; 785 next_y += 1; 786 } 787} 788 789#include "SkColorPriv.h" 790 791static void merge_src_with_blur(uint8_t dst[], int dstRB, 792 const uint8_t src[], int srcRB, 793 const uint8_t blur[], int blurRB, 794 int sw, int sh) { 795 dstRB -= sw; 796 srcRB -= sw; 797 
blurRB -= sw; 798 while (--sh >= 0) { 799 for (int x = sw - 1; x >= 0; --x) { 800 *dst = SkToU8(SkAlphaMul(*blur, SkAlpha255To256(*src))); 801 dst += 1; 802 src += 1; 803 blur += 1; 804 } 805 dst += dstRB; 806 src += srcRB; 807 blur += blurRB; 808 } 809} 810 811static void clamp_with_orig(uint8_t dst[], int dstRowBytes, 812 const uint8_t src[], int srcRowBytes, 813 int sw, int sh, 814 SkBlurMask::Style style) { 815 int x; 816 while (--sh >= 0) { 817 switch (style) { 818 case SkBlurMask::kSolid_Style: 819 for (x = sw - 1; x >= 0; --x) { 820 int s = *src; 821 int d = *dst; 822 *dst = SkToU8(s + d - SkMulDiv255Round(s, d)); 823 dst += 1; 824 src += 1; 825 } 826 break; 827 case SkBlurMask::kOuter_Style: 828 for (x = sw - 1; x >= 0; --x) { 829 if (*src) { 830 *dst = SkToU8(SkAlphaMul(*dst, SkAlpha255To256(255 - *src))); 831 } 832 dst += 1; 833 src += 1; 834 } 835 break; 836 default: 837 SkDEBUGFAIL("Unexpected blur style here"); 838 break; 839 } 840 dst += dstRowBytes - sw; 841 src += srcRowBytes - sw; 842 } 843} 844 845/////////////////////////////////////////////////////////////////////////////// 846 847// we use a local funciton to wrap the class static method to work around 848// a bug in gcc98 849void SkMask_FreeImage(uint8_t* image); 850void SkMask_FreeImage(uint8_t* image) { 851 SkMask::FreeImage(image); 852} 853 854bool SkBlurMask::Blur(SkMask* dst, const SkMask& src, 855 SkScalar radius, Style style, Quality quality, 856 SkIPoint* margin, bool separable) 857{ 858 if (src.fFormat != SkMask::kA8_Format) { 859 return false; 860 } 861 862 // Force high quality off for small radii (performance) 863 if (radius < SkIntToScalar(3)) { 864 quality = kLow_Quality; 865 } 866 867 // highQuality: use three box blur passes as a cheap way to approximate a Gaussian blur 868 int passCount = (kHigh_Quality == quality) ? 
3 : 1; 869 SkScalar passRadius = SkScalarDiv(radius, SkScalarSqrt(SkIntToScalar(passCount))); 870 871 int rx = SkScalarCeil(passRadius); 872 int outer_weight = 255 - SkScalarRound((SkIntToScalar(rx) - passRadius) * 255); 873 874 SkASSERT(rx >= 0); 875 SkASSERT((unsigned)outer_weight <= 255); 876 if (rx <= 0) { 877 return false; 878 } 879 880 int ry = rx; // only do square blur for now 881 882 int padx = passCount * rx; 883 int pady = passCount * ry; 884 if (margin) { 885 margin->set(padx, pady); 886 } 887 dst->fBounds.set(src.fBounds.fLeft - padx, src.fBounds.fTop - pady, 888 src.fBounds.fRight + padx, src.fBounds.fBottom + pady); 889 dst->fRowBytes = dst->fBounds.width(); 890 dst->fFormat = SkMask::kA8_Format; 891 dst->fImage = NULL; 892 893 if (src.fImage) { 894 size_t dstSize = dst->computeImageSize(); 895 if (0 == dstSize) { 896 return false; // too big to allocate, abort 897 } 898 899 int sw = src.fBounds.width(); 900 int sh = src.fBounds.height(); 901 const uint8_t* sp = src.fImage; 902 uint8_t* dp = SkMask::AllocImage(dstSize); 903 904 SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dp); 905 906 // build the blurry destination 907 if (separable) { 908 SkAutoTMalloc<uint8_t> tmpBuffer(dstSize); 909 uint8_t* tp = tmpBuffer.get(); 910 int w = sw, h = sh; 911 912 if (outer_weight == 255) { 913 int loRadius, hiRadius; 914 get_adjusted_radii(passRadius, &loRadius, &hiRadius); 915 if (kHigh_Quality == quality) { 916 // Do three X blurs, with a transpose on the final one. 917 w = boxBlur(sp, src.fRowBytes, tp, loRadius, hiRadius, w, h, false); 918 w = boxBlur(tp, w, dp, hiRadius, loRadius, w, h, false); 919 w = boxBlur(dp, w, tp, hiRadius, hiRadius, w, h, true); 920 // Do three Y blurs, with a transpose on the final one. 
921 h = boxBlur(tp, h, dp, loRadius, hiRadius, h, w, false); 922 h = boxBlur(dp, h, tp, hiRadius, loRadius, h, w, false); 923 h = boxBlur(tp, h, dp, hiRadius, hiRadius, h, w, true); 924 } else { 925 w = boxBlur(sp, src.fRowBytes, tp, rx, rx, w, h, true); 926 h = boxBlur(tp, h, dp, ry, ry, h, w, true); 927 } 928 } else { 929 if (kHigh_Quality == quality) { 930 // Do three X blurs, with a transpose on the final one. 931 w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, false, outer_weight); 932 w = boxBlurInterp(tp, w, dp, rx, w, h, false, outer_weight); 933 w = boxBlurInterp(dp, w, tp, rx, w, h, true, outer_weight); 934 // Do three Y blurs, with a transpose on the final one. 935 h = boxBlurInterp(tp, h, dp, ry, h, w, false, outer_weight); 936 h = boxBlurInterp(dp, h, tp, ry, h, w, false, outer_weight); 937 h = boxBlurInterp(tp, h, dp, ry, h, w, true, outer_weight); 938 } else { 939 w = boxBlurInterp(sp, src.fRowBytes, tp, rx, w, h, true, outer_weight); 940 h = boxBlurInterp(tp, h, dp, ry, h, w, true, outer_weight); 941 } 942 } 943 } else { 944 const size_t storageW = sw + 2 * (passCount - 1) * rx + 1; 945 const size_t storageH = sh + 2 * (passCount - 1) * ry + 1; 946 SkAutoTMalloc<uint32_t> storage(storageW * storageH); 947 uint32_t* sumBuffer = storage.get(); 948 949 //pass1: sp is source, dp is destination 950 build_sum_buffer(sumBuffer, sw, sh, sp, src.fRowBytes); 951 if (outer_weight == 255) { 952 apply_kernel(dp, rx, ry, sumBuffer, sw, sh); 953 } else { 954 apply_kernel_interp(dp, rx, ry, sumBuffer, sw, sh, outer_weight); 955 } 956 957 if (kHigh_Quality == quality) { 958 //pass2: dp is source, tmpBuffer is destination 959 int tmp_sw = sw + 2 * rx; 960 int tmp_sh = sh + 2 * ry; 961 SkAutoTMalloc<uint8_t> tmpBuffer(dstSize); 962 build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, dp, tmp_sw); 963 if (outer_weight == 255) 964 apply_kernel(tmpBuffer.get(), rx, ry, sumBuffer, tmp_sw, tmp_sh); 965 else 966 apply_kernel_interp(tmpBuffer.get(), rx, ry, sumBuffer, 967 tmp_sw, 
tmp_sh, outer_weight); 968 969 //pass3: tmpBuffer is source, dp is destination 970 tmp_sw += 2 * rx; 971 tmp_sh += 2 * ry; 972 build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, tmpBuffer.get(), tmp_sw); 973 if (outer_weight == 255) 974 apply_kernel(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh); 975 else 976 apply_kernel_interp(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh, 977 outer_weight); 978 } 979 } 980 981 dst->fImage = dp; 982 // if need be, alloc the "real" dst (same size as src) and copy/merge 983 // the blur into it (applying the src) 984 if (style == kInner_Style) { 985 // now we allocate the "real" dst, mirror the size of src 986 size_t srcSize = src.computeImageSize(); 987 if (0 == srcSize) { 988 return false; // too big to allocate, abort 989 } 990 dst->fImage = SkMask::AllocImage(srcSize); 991 merge_src_with_blur(dst->fImage, src.fRowBytes, 992 sp, src.fRowBytes, 993 dp + passCount * (rx + ry * dst->fRowBytes), 994 dst->fRowBytes, sw, sh); 995 SkMask::FreeImage(dp); 996 } else if (style != kNormal_Style) { 997 clamp_with_orig(dp + passCount * (rx + ry * dst->fRowBytes), 998 dst->fRowBytes, sp, src.fRowBytes, sw, sh, style); 999 } 1000 (void)autoCall.detach(); 1001 } 1002 1003 if (style == kInner_Style) { 1004 dst->fBounds = src.fBounds; // restore trimmed bounds 1005 dst->fRowBytes = src.fRowBytes; 1006 } 1007 1008 return true; 1009} 1010 1011bool SkBlurMask::BlurSeparable(SkMask* dst, const SkMask& src, 1012 SkScalar radius, Style style, Quality quality, 1013 SkIPoint* margin) 1014{ 1015 return SkBlurMask::Blur(dst, src, radius, style, quality, margin, true); 1016} 1017 1018bool SkBlurMask::Blur(SkMask* dst, const SkMask& src, 1019 SkScalar radius, Style style, Quality quality, 1020 SkIPoint* margin) 1021{ 1022 return SkBlurMask::Blur(dst, src, radius, style, quality, margin, false); 1023} 1024