SkConvolver.cpp revision 6950de6c4166fabb35e6c756fc009e0cf1c47819
1// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "SkConvolver.h" 6#include "SkMath.h" 7#include "SkSize.h" 8#include "SkTypes.h" 9 10namespace { 11 12 // Converts the argument to an 8-bit unsigned value by clamping to the range 13 // 0-255. 14 inline unsigned char ClampTo8(int a) { 15 if (static_cast<unsigned>(a) < 256) { 16 return a; // Avoid the extra check in the common case. 17 } 18 if (a < 0) { 19 return 0; 20 } 21 return 255; 22 } 23 24 // Stores a list of rows in a circular buffer. The usage is you write into it 25 // by calling AdvanceRow. It will keep track of which row in the buffer it 26 // should use next, and the total number of rows added. 27 class CircularRowBuffer { 28 public: 29 // The number of pixels in each row is given in |sourceRowPixelWidth|. 30 // The maximum number of rows needed in the buffer is |maxYFilterSize| 31 // (we only need to store enough rows for the biggest filter). 32 // 33 // We use the |firstInputRow| to compute the coordinates of all of the 34 // following rows returned by Advance(). 35 CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize, 36 int firstInputRow) 37 : fRowByteWidth(destRowPixelWidth * 4), 38 fNumRows(maxYFilterSize), 39 fNextRow(0), 40 fNextRowCoordinate(firstInputRow) { 41 fBuffer.reset(fRowByteWidth * maxYFilterSize); 42 fRowAddresses.reset(fNumRows); 43 } 44 45 // Moves to the next row in the buffer, returning a pointer to the beginning 46 // of it. 47 unsigned char* advanceRow() { 48 unsigned char* row = &fBuffer[fNextRow * fRowByteWidth]; 49 fNextRowCoordinate++; 50 51 // Set the pointer to the next row to use, wrapping around if necessary. 52 fNextRow++; 53 if (fNextRow == fNumRows) { 54 fNextRow = 0; 55 } 56 return row; 57 } 58 59 // Returns a pointer to an "unrolled" array of rows. These rows will start 60 // at the y coordinate placed into |*firstRowIndex| and will continue in 61 // order for the maximum number of rows in this circular buffer. 62 // 63 // The |firstRowIndex_| may be negative. This means the circular buffer 64 // starts before the top of the image (it hasn't been filled yet). 65 unsigned char* const* GetRowAddresses(int* firstRowIndex) { 66 // Example for a 4-element circular buffer holding coords 6-9. 67 // Row 0 Coord 8 68 // Row 1 Coord 9 69 // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10. 70 // Row 3 Coord 7 71 // 72 // The "next" row is also the first (lowest) coordinate. This computation 73 // may yield a negative value, but that's OK, the math will work out 74 // since the user of this buffer will compute the offset relative 75 // to the firstRowIndex and the negative rows will never be used. 76 *firstRowIndex = fNextRowCoordinate - fNumRows; 77 78 int curRow = fNextRow; 79 for (int i = 0; i < fNumRows; i++) { 80 fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth]; 81 82 // Advance to the next row, wrapping if necessary. 83 curRow++; 84 if (curRow == fNumRows) { 85 curRow = 0; 86 } 87 } 88 return &fRowAddresses[0]; 89 } 90 91 private: 92 // The buffer storing the rows. They are packed, each one fRowByteWidth. 93 SkTArray<unsigned char> fBuffer; 94 95 // Number of bytes per row in the |buffer|. 96 int fRowByteWidth; 97 98 // The number of rows available in the buffer. 99 int fNumRows; 100 101 // The next row index we should write into. This wraps around as the 102 // circular buffer is used. 103 int fNextRow; 104 105 // The y coordinate of the |fNextRow|. This is incremented each time a 106 // new row is appended and does not wrap. 107 int fNextRowCoordinate; 108 109 // Buffer used by GetRowAddresses(). 110 SkTArray<unsigned char*> fRowAddresses; 111 }; 112 113// Convolves horizontally along a single row. The row data is given in 114// |srcData| and continues for the numValues() of the filter. 115template<bool hasAlpha> 116 void ConvolveHorizontally(const unsigned char* srcData, 117 const SkConvolutionFilter1D& filter, 118 unsigned char* outRow) { 119 // Loop over each pixel on this row in the output image. 120 int numValues = filter.numValues(); 121 for (int outX = 0; outX < numValues; outX++) { 122 // Get the filter that determines the current output pixel. 123 int filterOffset, filterLength; 124 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 125 filter.FilterForValue(outX, &filterOffset, &filterLength); 126 127 // Compute the first pixel in this row that the filter affects. It will 128 // touch |filterLength| pixels (4 bytes each) after this. 129 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; 130 131 // Apply the filter to the row to get the destination pixel in |accum|. 132 int accum[4] = {0}; 133 for (int filterX = 0; filterX < filterLength; filterX++) { 134 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; 135 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; 136 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; 137 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; 138 if (hasAlpha) { 139 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; 140 } 141 } 142 143 // Bring this value back in range. All of the filter scaling factors 144 // are in fixed point with kShiftBits bits of fractional part. 145 accum[0] >>= SkConvolutionFilter1D::kShiftBits; 146 accum[1] >>= SkConvolutionFilter1D::kShiftBits; 147 accum[2] >>= SkConvolutionFilter1D::kShiftBits; 148 if (hasAlpha) { 149 accum[3] >>= SkConvolutionFilter1D::kShiftBits; 150 } 151 152 // Store the new pixel. 153 outRow[outX * 4 + 0] = ClampTo8(accum[0]); 154 outRow[outX * 4 + 1] = ClampTo8(accum[1]); 155 outRow[outX * 4 + 2] = ClampTo8(accum[2]); 156 if (hasAlpha) { 157 outRow[outX * 4 + 3] = ClampTo8(accum[3]); 158 } 159 } 160 } 161 162 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally 163 // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles 164 // suffer here too. 165 // 166 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575 167 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) 168 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline)) 169 #else 170 #define SK_MAYBE_DISABLE_VECTORIZATION 171 #endif 172 173 SK_MAYBE_DISABLE_VECTORIZATION 174 static void ConvolveHorizontallyAlpha(const unsigned char* srcData, 175 const SkConvolutionFilter1D& filter, 176 unsigned char* outRow) { 177 return ConvolveHorizontally<true>(srcData, filter, outRow); 178 } 179 180 SK_MAYBE_DISABLE_VECTORIZATION 181 static void ConvolveHorizontallyNoAlpha(const unsigned char* srcData, 182 const SkConvolutionFilter1D& filter, 183 unsigned char* outRow) { 184 return ConvolveHorizontally<false>(srcData, filter, outRow); 185 } 186 187 #undef SK_MAYBE_DISABLE_VECTORIZATION 188 189 190// Does vertical convolution to produce one output row. The filter values and 191// length are given in the first two parameters. These are applied to each 192// of the rows pointed to in the |sourceDataRows| array, with each row 193// being |pixelWidth| wide. 194// 195// The output must have room for |pixelWidth * 4| bytes. 196template<bool hasAlpha> 197 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 198 int filterLength, 199 unsigned char* const* sourceDataRows, 200 int pixelWidth, 201 unsigned char* outRow) { 202 // We go through each column in the output and do a vertical convolution, 203 // generating one output pixel each time. 204 for (int outX = 0; outX < pixelWidth; outX++) { 205 // Compute the number of bytes over in each row that the current column 206 // we're convolving starts at. The pixel will cover the next 4 bytes. 207 int byteOffset = outX * 4; 208 209 // Apply the filter to one column of pixels. 210 int accum[4] = {0}; 211 for (int filterY = 0; filterY < filterLength; filterY++) { 212 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; 213 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; 214 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; 215 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; 216 if (hasAlpha) { 217 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; 218 } 219 } 220 221 // Bring this value back in range. All of the filter scaling factors 222 // are in fixed point with kShiftBits bits of precision. 223 accum[0] >>= SkConvolutionFilter1D::kShiftBits; 224 accum[1] >>= SkConvolutionFilter1D::kShiftBits; 225 accum[2] >>= SkConvolutionFilter1D::kShiftBits; 226 if (hasAlpha) { 227 accum[3] >>= SkConvolutionFilter1D::kShiftBits; 228 } 229 230 // Store the new pixel. 231 outRow[byteOffset + 0] = ClampTo8(accum[0]); 232 outRow[byteOffset + 1] = ClampTo8(accum[1]); 233 outRow[byteOffset + 2] = ClampTo8(accum[2]); 234 if (hasAlpha) { 235 unsigned char alpha = ClampTo8(accum[3]); 236 237 // Make sure the alpha channel doesn't come out smaller than any of the 238 // color channels. We use premultipled alpha channels, so this should 239 // never happen, but rounding errors will cause this from time to time. 240 // These "impossible" colors will cause overflows (and hence random pixel 241 // values) when the resulting bitmap is drawn to the screen. 242 // 243 // We only need to do this when generating the final output row (here). 244 int maxColorChannel = SkTMax(outRow[byteOffset + 0], 245 SkTMax(outRow[byteOffset + 1], 246 outRow[byteOffset + 2])); 247 if (alpha < maxColorChannel) { 248 outRow[byteOffset + 3] = maxColorChannel; 249 } else { 250 outRow[byteOffset + 3] = alpha; 251 } 252 } else { 253 // No alpha channel, the image is opaque. 254 outRow[byteOffset + 3] = 0xff; 255 } 256 } 257 } 258 259 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 260 int filterLength, 261 unsigned char* const* sourceDataRows, 262 int pixelWidth, 263 unsigned char* outRow, 264 bool sourceHasAlpha) { 265 if (sourceHasAlpha) { 266 ConvolveVertically<true>(filterValues, filterLength, 267 sourceDataRows, pixelWidth, 268 outRow); 269 } else { 270 ConvolveVertically<false>(filterValues, filterLength, 271 sourceDataRows, pixelWidth, 272 outRow); 273 } 274 } 275 276} // namespace 277 278// SkConvolutionFilter1D --------------------------------------------------------- 279 280SkConvolutionFilter1D::SkConvolutionFilter1D() 281: fMaxFilter(0) { 282} 283 284SkConvolutionFilter1D::~SkConvolutionFilter1D() { 285} 286 287void SkConvolutionFilter1D::AddFilter(int filterOffset, 288 const float* filterValues, 289 int filterLength) { 290 SkASSERT(filterLength > 0); 291 292 SkTArray<ConvolutionFixed> fixedValues; 293 fixedValues.reset(filterLength); 294 295 for (int i = 0; i < filterLength; ++i) { 296 fixedValues.push_back(FloatToFixed(filterValues[i])); 297 } 298 299 AddFilter(filterOffset, &fixedValues[0], filterLength); 300} 301 302void SkConvolutionFilter1D::AddFilter(int filterOffset, 303 const ConvolutionFixed* filterValues, 304 int filterLength) { 305 // It is common for leading/trailing filter values to be zeros. In such 306 // cases it is beneficial to only store the central factors. 307 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on 308 // a 1080p image this optimization gives a ~10% speed improvement. 309 int filterSize = filterLength; 310 int firstNonZero = 0; 311 while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) { 312 firstNonZero++; 313 } 314 315 if (firstNonZero < filterLength) { 316 // Here we have at least one non-zero factor. 317 int lastNonZero = filterLength - 1; 318 while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) { 319 lastNonZero--; 320 } 321 322 filterOffset += firstNonZero; 323 filterLength = lastNonZero + 1 - firstNonZero; 324 SkASSERT(filterLength > 0); 325 326 for (int i = firstNonZero; i <= lastNonZero; i++) { 327 fFilterValues.push_back(filterValues[i]); 328 } 329 } else { 330 // Here all the factors were zeroes. 331 filterLength = 0; 332 } 333 334 FilterInstance instance; 335 336 // We pushed filterLength elements onto fFilterValues 337 instance.fDataLocation = (static_cast<int>(fFilterValues.count()) - 338 filterLength); 339 instance.fOffset = filterOffset; 340 instance.fTrimmedLength = filterLength; 341 instance.fLength = filterSize; 342 fFilters.push_back(instance); 343 344 fMaxFilter = SkTMax(fMaxFilter, filterLength); 345} 346 347const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter( 348 int* specifiedFilterlength, 349 int* filterOffset, 350 int* filterLength) const { 351 const FilterInstance& filter = fFilters[0]; 352 *filterOffset = filter.fOffset; 353 *filterLength = filter.fTrimmedLength; 354 *specifiedFilterlength = filter.fLength; 355 if (filter.fTrimmedLength == 0) { 356 return nullptr; 357 } 358 359 return &fFilterValues[filter.fDataLocation]; 360} 361 362bool BGRAConvolve2D(const unsigned char* sourceData, 363 int sourceByteRowStride, 364 bool sourceHasAlpha, 365 const SkConvolutionFilter1D& filterX, 366 const SkConvolutionFilter1D& filterY, 367 int outputByteRowStride, 368 unsigned char* output, 369 const SkConvolutionProcs& convolveProcs, 370 bool useSimdIfPossible) { 371 372 int maxYFilterSize = filterY.maxFilter(); 373 374 // The next row in the input that we will generate a horizontally 375 // convolved row for. If the filter doesn't start at the beginning of the 376 // image (this is the case when we are only resizing a subset), then we 377 // don't want to generate any output rows before that. Compute the starting 378 // row for convolution as the first pixel for the first vertical filter. 379 int filterOffset, filterLength; 380 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 381 filterY.FilterForValue(0, &filterOffset, &filterLength); 382 int nextXRow = filterOffset; 383 384 // We loop over each row in the input doing a horizontal convolution. This 385 // will result in a horizontally convolved image. We write the results into 386 // a circular buffer of convolved rows and do vertical convolution as rows 387 // are available. This prevents us from having to store the entire 388 // intermediate image and helps cache coherency. 389 // We will need four extra rows to allow horizontal convolution could be done 390 // simultaneously. We also pad each row in row buffer to be aligned-up to 391 // 16 bytes. 392 // TODO(jiesun): We do not use aligned load from row buffer in vertical 393 // convolution pass yet. Somehow Windows does not like it. 394 int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; 395 int rowBufferHeight = maxYFilterSize + 396 (convolveProcs.fConvolve4RowsHorizontally ? 4 : 0); 397 398 // check for too-big allocation requests : crbug.com/528628 399 { 400 int64_t size = sk_64_mul(rowBufferWidth, rowBufferHeight); 401 // need some limit, to avoid over-committing success from malloc, but then 402 // crashing when we try to actually use the memory. 403 // 100meg seems big enough to allow "normal" zoom factors and image sizes through 404 // while avoiding the crash seen by the bug (crbug.com/528628) 405 if (size > 100 * 1024 * 1024) { 406// SkDebugf("BGRAConvolve2D: tmp allocation [%lld] too big\n", size); 407 return false; 408 } 409 } 410 411 CircularRowBuffer rowBuffer(rowBufferWidth, 412 rowBufferHeight, 413 filterOffset); 414 415 // Loop over every possible output row, processing just enough horizontal 416 // convolutions to run each subsequent vertical convolution. 417 SkASSERT(outputByteRowStride >= filterX.numValues() * 4); 418 int numOutputRows = filterY.numValues(); 419 420 // We need to check which is the last line to convolve before we advance 4 421 // lines in one iteration. 422 int lastFilterOffset, lastFilterLength; 423 424 // SSE2 can access up to 3 extra pixels past the end of the 425 // buffer. At the bottom of the image, we have to be careful 426 // not to access data past the end of the buffer. Normally 427 // we fall back to the C++ implementation for the last row. 428 // If the last row is less than 3 pixels wide, we may have to fall 429 // back to the C++ version for more rows. Compute how many 430 // rows we need to avoid the SSE implementation for here. 431 filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset, 432 &lastFilterLength); 433 int avoidSimdRows = 1 + convolveProcs.fExtraHorizontalReads / 434 (lastFilterOffset + lastFilterLength); 435 436 filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, 437 &lastFilterLength); 438 439 for (int outY = 0; outY < numOutputRows; outY++) { 440 filterValues = filterY.FilterForValue(outY, 441 &filterOffset, &filterLength); 442 443 // Generate output rows until we have enough to run the current filter. 444 while (nextXRow < filterOffset + filterLength) { 445 if (convolveProcs.fConvolve4RowsHorizontally && 446 nextXRow + 3 < lastFilterOffset + lastFilterLength - 447 avoidSimdRows) { 448 const unsigned char* src[4]; 449 unsigned char* outRow[4]; 450 for (int i = 0; i < 4; ++i) { 451 src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride]; 452 outRow[i] = rowBuffer.advanceRow(); 453 } 454 convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth); 455 nextXRow += 4; 456 } else { 457 // Check if we need to avoid SSE2 for this row. 458 if (convolveProcs.fConvolveHorizontally && 459 nextXRow < lastFilterOffset + lastFilterLength - 460 avoidSimdRows) { 461 convolveProcs.fConvolveHorizontally( 462 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 463 filterX, rowBuffer.advanceRow(), sourceHasAlpha); 464 } else { 465 if (sourceHasAlpha) { 466 ConvolveHorizontallyAlpha( 467 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 468 filterX, rowBuffer.advanceRow()); 469 } else { 470 ConvolveHorizontallyNoAlpha( 471 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 472 filterX, rowBuffer.advanceRow()); 473 } 474 } 475 nextXRow++; 476 } 477 } 478 479 // Compute where in the output image this row of final data will go. 480 unsigned char* curOutputRow = &output[(uint64_t)outY * outputByteRowStride]; 481 482 // Get the list of rows that the circular buffer has, in order. 483 int firstRowInCircularBuffer; 484 unsigned char* const* rowsToConvolve = 485 rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); 486 487 // Now compute the start of the subset of those rows that the filter 488 // needs. 489 unsigned char* const* firstRowForFilter = 490 &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; 491 492 if (convolveProcs.fConvolveVertically) { 493 convolveProcs.fConvolveVertically(filterValues, filterLength, 494 firstRowForFilter, 495 filterX.numValues(), curOutputRow, 496 sourceHasAlpha); 497 } else { 498 ConvolveVertically(filterValues, filterLength, 499 firstRowForFilter, 500 filterX.numValues(), curOutputRow, 501 sourceHasAlpha); 502 } 503 } 504 return true; 505} 506