/*
 * Copyright 2006 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include "SkBlurMask.h"
#include "SkMath.h"
#include "SkTemplates.h"
#include "SkEndian.h"

// Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
// breakeven on Mac, and ~15% slowdown on Linux.
// Reading a word at a time when building the sum buffer seems to give
// us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux.
#if defined(SK_BUILD_FOR_WIN32)
#define UNROLL_KERNEL_LOOP 1
#endif

/** The sum buffer is an array of u32 to hold the accumulated sum of all of the
    src values at their position, plus all values above and to the left.
    When we sample into this buffer, we need an initial row and column of 0s,
    so we have an index correspondence as follows:

    src[i, j] == sum[i+1, j+1]
    sum[0, j] == sum[i, 0] == 0

    We assume that the sum buffer's stride == its width
 */
static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
                             const uint8_t src[], int srcRB) {
    // sum is (srcW + 1) x (srcH + 1); each entry follows the integral-image
    // recurrence  S(x, y) = src(x, y) + S(x-1, y) + S(x, y-1) - S(x-1, y-1).
    int sumW = srcW + 1;

    SkASSERT(srcRB >= srcW);
    // mod srcRB so we can apply it after each row
    srcRB -= srcW;

    int x, y;

    // zero out the top row and column
    memset(sum, 0, sumW * sizeof(sum[0]));
    sum += sumW;

    // special case first row: the row above is all zeros, so the recurrence
    // collapses to a running sum across the src row.
    uint32_t X = 0;
    *sum++ = 0; // initialize the first column to 0
    for (x = srcW - 1; x >= 0; --x) {
        X = *src++ + X;
        *sum++ = X;
    }
    src += srcRB;

    // now do the rest of the rows
    for (y = srcH - 1; y > 0; --y) {
        // L tracks S(x-1, y) (left), C tracks S(x-1, y-1) (above-left);
        // each step fetches T = S(x, y-1) (above) via sum[-sumW].
        uint32_t L = 0;
        uint32_t C = 0;
        *sum++ = 0; // initialize the first column to 0

        // process single bytes until src is 4-byte aligned (or the row ends);
        // note x counts remaining-pixels-minus-one throughout this row
        for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }

        // main body: four pixels per iteration
        for (; x >= 4; x-=4) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }

        // tail: remaining pixels, one at a time
        for (; x >= 0; --x) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }
        src += srcRB;
    }
}

/**
 * This is the path for apply_kernel() to be taken when the kernel
 * is wider than the source image.
 */
static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
                           int sw, int sh) {
    SkASSERT(2*rx > sw);

    // 8.24 fixed-point reciprocal of the kernel area, so each box average
    // is (windowSum * scale) >> 24
    uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));

    int sumStride = sw + 1;

    // dst is the src bounds expanded by the kernel radius on every side
    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    // prev/next bracket the (2r + 1)-wide sampling window in sum coordinates
    int prev_y = -2*ry;
    int next_y = 1;

    for (int y = 0; y < dh; y++) {
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;

        for (int x = 0; x < dw; x++) {
            // pin both edges of the window into the sum buffer
            int px = SkClampPos(prev_x);
            int nx = SkFastMin32(next_x, sw);

            // standard 4-corner summed-area-table window sum
            uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        prev_y += 1;
        next_y += 1;
    }
}
/**
 * sw and sh are the width and height of the src. Since the sum buffer
 * matches that, but has an extra row and col at the beginning (with zeros),
 * we can just use sw and sh as our "max" values for pinning coordinates
 * when sampling into sum[][]
 *
 * The inner loop is conceptually simple; we break it into several sections
 * to improve performance.
 Here's the original version:
    for (int x = 0; x < dw; x++) {
        int px = SkClampPos(prev_x);
        int nx = SkFastMin32(next_x, sw);

        uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
        *dst++ = SkToU8(tmp * scale >> 24);

        prev_x += 1;
        next_x += 1;
    }
 * The sections are:
 * left-hand section, where prev_x is clamped to 0
 * center section, where neither prev_x nor next_x is clamped
 * right-hand section, where next_x is clamped to sw
 * On some operating systems, the center section is unrolled for additional
 * speedup.
*/
static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
                         int sw, int sh) {
    // The sectioned fast path below requires the kernel to fit within the
    // source width; otherwise fall back to the fully-clamped variant.
    if (2*rx > sw) {
        kernel_clamped(dst, rx, ry, sum, sw, sh);
        return;
    }

    // 8.24 fixed-point reciprocal of the kernel area, so each box average
    // is (windowSum * scale) >> 24
    uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));

    int sumStride = sw + 1;

    // dst is the src bounds expanded by the kernel radius on every side
    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    // prev/next bracket the (2r + 1)-wide sampling window in sum coordinates
    int prev_y = -2*ry;
    int next_y = 1;

    // guarantees the center section is non-empty (equivalent to 2*rx <= sw)
    SkASSERT(2*rx <= dw - 2*rx);

    for (int y = 0; y < dh; y++) {
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;
        int x = 0;

        // left-hand section: prev_x is known to be <= 0, so px pins to 0
        for (; x < 2*rx; x++) {
            SkASSERT(prev_x <= 0);
            SkASSERT(next_x <= sw);

            int px = 0;
            int nx = next_x;

            uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // center section: no clamping needed, so the four window corners can
        // be maintained as flat indices that advance in lockstep
        int i0 = prev_x + py;
        int i1 = next_x + ny;
        int i2 = next_x + py;
        int i3 = prev_x + ny;

#if UNROLL_KERNEL_LOOP
        // unrolled x4 center section (see the note at the top of the file)
        for (; x < dw - 2*rx - 4; x += 4) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 4;
            next_x += 4;
        }
#endif

        // center section remainder (or the whole center when not unrolling)
        for (; x < dw - 2*rx; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // right-hand section: next_x is known to exceed sw, so nx pins to sw
        for (; x < dw; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x > sw);

            int px = prev_x;
            int nx = sw;

            uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        prev_y += 1;
        next_y += 1;
    }
}

/**
 * This is the path for apply_kernel_interp() to be taken when the kernel
 * is wider than the source image.
 */
static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
                const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
    SkASSERT(2*rx > sw);

    // outer_weight/inner_weight (summing to 255) blend a (2rx+1)-wide box
    // with a (2rx-1)-wide box; the caller derives outer_weight from the
    // fractional part of the blur radius.
    int inner_weight = 255 - outer_weight;

    // round these guys up if they're bigger than 127, so a weight of 255
    // becomes a full 256 (exact power of two)
    outer_weight += outer_weight >> 7;
    inner_weight += inner_weight >> 7;

    // per-box scale: (weight / 256) * (1 << 24) / boxArea, in fixed point,
    // so the blended result is consumed with a >> 24
    uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1));
    uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1));

    int sumStride = sw + 1;

    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    int prev_y = -2*ry;
    int next_y = 1;

    for (int y = 0; y < dh; y++) {
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        // the inner (radius - 1) window is inset one row/col on each side
        int ipy = SkClampPos(prev_y + 1) * sumStride;
        int iny = SkClampMax(next_y - 1, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;

        for (int x = 0; x < dw; x++) {
            int px = SkClampPos(prev_x);
            int nx = SkFastMin32(next_x, sw);

            int ipx = SkClampPos(prev_x + 1);
            int inx = SkClampMax(next_x - 1, sw);

            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }
        prev_y += 1;
        next_y += 1;
    }
}

/**
 * sw and sh are the width and height of the src. Since the sum buffer
 * matches that, but has an extra row and col at the beginning (with zeros),
 * we can just use sw and sh as our "max" values for pinning coordinates
 * when sampling into sum[][]
 *
 * The inner loop is conceptually simple; we break it into several variants
 * to improve performance. Here's the original version:
    for (int x = 0; x < dw; x++) {
        int px = SkClampPos(prev_x);
        int nx = SkFastMin32(next_x, sw);

        int ipx = SkClampPos(prev_x + 1);
        int inx = SkClampMax(next_x - 1, sw);

        uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                           - sum[nx+py] - sum[px+ny];
        uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                           - sum[inx+ipy] - sum[ipx+iny];
        *dst++ = SkToU8((outer_sum * outer_scale
                       + inner_sum * inner_scale) >> 24);

        prev_x += 1;
        next_x += 1;
    }
 * The sections are:
 * left-hand section, where prev_x is clamped to 0
 * center section, where neither prev_x nor next_x is clamped
 * right-hand section, where next_x is clamped to sw
 * On some operating systems, the center section is unrolled for additional
 * speedup.
*/
static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
                const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
    SkASSERT(rx > 0 && ry > 0);
    SkASSERT(outer_weight <= 255);

    // The sectioned fast path below requires the kernel to fit within the
    // source width; otherwise fall back to the fully-clamped variant.
    if (2*rx > sw) {
        kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outer_weight);
        return;
    }

    // outer/inner weights (summing to 255) blend a (2rx+1)-wide box with a
    // (2rx-1)-wide box to approximate a fractional blur radius
    int inner_weight = 255 - outer_weight;

    // round these guys up if they're bigger than 127, so a weight of 255
    // becomes a full 256 (exact power of two)
    outer_weight += outer_weight >> 7;
    inner_weight += inner_weight >> 7;

    // per-box scale: (weight / 256) * (1 << 24) / boxArea, in fixed point,
    // so the blended result is consumed with a >> 24
    uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1));
    uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1));

    int sumStride = sw + 1;

    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    int prev_y = -2*ry;
    int next_y = 1;

    // guarantees the center section is non-empty (equivalent to 2*rx <= sw)
    SkASSERT(2*rx <= dw - 2*rx);

    for (int y = 0; y < dh; y++) {
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        // the inner (radius - 1) window is inset one row/col on each side
        int ipy = SkClampPos(prev_y + 1) * sumStride;
        int iny = SkClampMax(next_y - 1, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;
        int x = 0;

        // left-hand section: prev_x is negative, so px/ipx pin to 0
        for (; x < 2*rx; x++) {
            SkASSERT(prev_x < 0);
            SkASSERT(next_x <= sw);

            int px = 0;
            int nx = next_x;

            int ipx = 0;
            int inx = next_x - 1;

            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // center section: no clamping needed, so all eight window corners
        // are maintained as flat indices advancing in lockstep
        int i0 = prev_x + py;
        int i1 = next_x + ny;
        int i2 = next_x + py;
        int i3 = prev_x + ny;
        int i4 = prev_x + 1 + ipy;
        int i5 = next_x - 1 + iny;
        int i6 = next_x - 1 + ipy;
        int i7 = prev_x + 1 + iny;

#if UNROLL_KERNEL_LOOP
        // unrolled x4 center section (see the note at the top of the file)
        for (; x < dw - 2*rx - 4; x += 4) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 4;
            next_x += 4;
        }
#endif

        // center section remainder (or the whole center when not unrolling)
        for (; x < dw - 2*rx; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // right-hand section: next_x exceeds sw, so nx/inx pin to sw
        for (; x < dw; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x > sw);

            int px = prev_x;
            int nx = sw;

            int ipx = prev_x + 1;
            int inx = sw;

            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        prev_y += 1;
        next_y += 1;
    }
}

#include "SkColorPriv.h"

// Modulates each blurred value by the corresponding src alpha, writing the
// result into dst. All three planes are sw x sh; the *RB arguments are each
// plane's rowBytes (stride).
static void merge_src_with_blur(uint8_t dst[], int dstRB,
                                const uint8_t src[], int srcRB,
                                const uint8_t blur[], int blurRB,
                                int sw, int sh) {
    // convert rowBytes into end-of-row skip amounts
    dstRB -= sw;
    srcRB -= sw;
    blurRB -= sw;
    while (--sh >= 0) {
        for (int x = sw - 1; x >= 0; --x) {
            *dst = SkToU8(SkAlphaMul(*blur, SkAlpha255To256(*src)));
            dst += 1;
            src += 1;
            blur += 1;
        }
        dst += dstRB;
        src += srcRB;
        blur += blurRB;
    }
}

// Folds the original (unblurred) src mask back into the blurred dst for the
// Solid and Outer styles: Solid composites the src coverage over the blur,
// Outer attenuates the blur where the src had coverage.
static void clamp_with_orig(uint8_t dst[], int dstRowBytes,
                            const uint8_t src[], int srcRowBytes,
                            int sw, int sh,
                            SkBlurMask::Style style) {
    int x;
    while (--sh >= 0) {
        switch (style) {
        case SkBlurMask::kSolid_Style:
            for (x = sw - 1; x >= 0; --x) {
                int s = *src;
                int d = *dst;
                // srcover-style combine: s + d - s*d/255
                *dst = SkToU8(s + d - SkMulDiv255Round(s, d));
                dst += 1;
                src += 1;
            }
            break;
        case SkBlurMask::kOuter_Style:
            for (x = sw - 1; x >= 0; --x) {
                if (*src) {
                    // scale the blur by the inverse of the src coverage
                    *dst = SkToU8(SkAlphaMul(*dst, SkAlpha255To256(255 - *src)));
                }
                dst += 1;
                src += 1;
            }
            break;
        default:
            SkDEBUGFAIL("Unexpected blur style here");
            break;
        }
        dst += dstRowBytes - sw;
        src += srcRowBytes - sw;
    }
}

///////////////////////////////////////////////////////////////////////////////

// we use a local function to wrap the class static method to work around
// a bug in gcc98
void SkMask_FreeImage(uint8_t* image);
void SkMask_FreeImage(uint8_t* image) {
    SkMask::FreeImage(image);
}

// Produces in dst a blurred version of the A8 src mask. kHigh_Quality runs
// three box-blur passes (a cheap Gaussian approximation); otherwise one pass.
// dst's bounds are src's bounds outset by passCount * radius per side, which
// is also reported via margin (when non-NULL). Returns false if src is not
// A8, the rounded radius is zero, or an intermediate image is too big to
// allocate.
bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
                      SkScalar radius, Style style, Quality quality,
                      SkIPoint* margin)
{
    if (src.fFormat != SkMask::kA8_Format) {
        return false;
    }

    // Force high quality off for small radii (performance)
    if (radius < SkIntToScalar(3)) quality = kLow_Quality;

    // highQuality: use three box blur passes as a cheap way to approximate a Gaussian blur
    int passCount = (quality == kHigh_Quality) ? 3 : 1;
    // shrink the per-pass radius so the combined passes match the requested
    // radius (box variances add across passes)
    SkScalar passRadius = SkScalarDiv(radius, SkScalarSqrt(SkIntToScalar(passCount)));

    // integer radius, plus a 0..255 weight encoding the fractional part,
    // used to interpolate between boxes of radius rx and rx - 1
    int rx = SkScalarCeil(passRadius);
    int outer_weight = 255 - SkScalarRound((SkIntToScalar(rx) - passRadius) * 255);

    SkASSERT(rx >= 0);
    SkASSERT((unsigned)outer_weight <= 255);
    if (rx <= 0) {
        return false;
    }

    int ry = rx;    // only do square blur for now

    int padx = passCount * rx;
    int pady = passCount * ry;
    if (margin) {
        margin->set(padx, pady);
    }
    dst->fBounds.set(src.fBounds.fLeft - padx, src.fBounds.fTop - pady,
                     src.fBounds.fRight + padx, src.fBounds.fBottom + pady);
    dst->fRowBytes = dst->fBounds.width();
    dst->fFormat = SkMask::kA8_Format;
    dst->fImage = NULL;

    if (src.fImage) {
        size_t dstSize = dst->computeImageSize();
        if (0 == dstSize) {
            return false;   // too big to allocate, abort
        }

        int sw = src.fBounds.width();
        int sh = src.fBounds.height();
        const uint8_t* sp = src.fImage;
        // NOTE(review): AllocImage's result is used unchecked here and below
        // — presumably it cannot return NULL for a nonzero size; confirm.
        uint8_t* dp = SkMask::AllocImage(dstSize);

        // frees dp if we bail out early; detached once ownership transfers
        SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dp);

        // build the blurry destination
        {
            // size the sum buffer for the largest (final) pass, whose source
            // has grown by 2*rx/2*ry per preceding pass
            const size_t storageW = sw + 2 * (passCount - 1) * rx + 1;
            const size_t storageH = sh + 2 * (passCount - 1) * ry + 1;
            SkAutoTMalloc<uint32_t> storage(storageW * storageH);
            uint32_t* sumBuffer = storage.get();

            //pass1: sp is source, dp is destination
            build_sum_buffer(sumBuffer, sw, sh, sp, src.fRowBytes);
            if (outer_weight == 255) {
                // radius is an exact integer; no interpolation needed
                apply_kernel(dp, rx, ry, sumBuffer, sw, sh);
            } else {
                apply_kernel_interp(dp, rx, ry, sumBuffer, sw, sh, outer_weight);
            }

            if (quality == kHigh_Quality) {
                //pass2: dp is source, tmpBuffer is destination
                int tmp_sw = sw + 2 * rx;
                int tmp_sh = sh + 2 * ry;
                SkAutoTMalloc<uint8_t> tmpBuffer(dstSize);
                build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, dp, tmp_sw);
                if (outer_weight == 255)
                    apply_kernel(tmpBuffer.get(), rx, ry, sumBuffer, tmp_sw, tmp_sh);
                else
                    apply_kernel_interp(tmpBuffer.get(), rx, ry, sumBuffer,
                                        tmp_sw, tmp_sh, outer_weight);

                //pass3: tmpBuffer is source, dp is destination
                tmp_sw += 2 * rx;
                tmp_sh += 2 * ry;
                build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, tmpBuffer.get(), tmp_sw);
                if (outer_weight == 255)
                    apply_kernel(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh);
                else
                    apply_kernel_interp(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh,
                                        outer_weight);
            }
        }

        dst->fImage = dp;
        // if need be, alloc the "real" dst (same size as src) and copy/merge
        // the blur into it (applying the src)
        if (style == kInner_Style) {
            // now we allocate the "real" dst, mirror the size of src
            size_t srcSize = src.computeImageSize();
            if (0 == srcSize) {
                return false;   // too big to allocate, abort
            }
            dst->fImage = SkMask::AllocImage(srcSize);
            // the dp offset skips the blur padding so the blur pixel that
            // aligns with src's top-left is read first
            merge_src_with_blur(dst->fImage, src.fRowBytes,
                                sp, src.fRowBytes,
                                dp + passCount * (rx + ry * dst->fRowBytes),
                                dst->fRowBytes, sw, sh);
            SkMask::FreeImage(dp);
        } else if (style != kNormal_Style) {
            // combine the original mask into the (padded) blur in place
            clamp_with_orig(dp + passCount * (rx + ry * dst->fRowBytes),
                            dst->fRowBytes, sp, src.fRowBytes, sw, sh, style);
        }
        (void)autoCall.detach();
    }

    if (style == kInner_Style) {
        dst->fBounds = src.fBounds; // restore trimmed bounds
        dst->fRowBytes = src.fRowBytes;
    }

    return true;
}