1/* 2 * Copyright 2011 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include <emmintrin.h> 9#include "SkBlitRect_opts_SSE2.h" 10#include "SkBlitRow.h" 11#include "SkColorPriv.h" 12 13/* Simple blitting of opaque rectangles less than 31 pixels wide: 14 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 15 */ 16static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, 17 int width, int height, 18 size_t rowBytes, uint32_t color) { 19 SkASSERT(255 == SkGetPackedA32(color)); 20 SkASSERT(width > 0); 21 SkASSERT(width < 31); 22 23 while (--height >= 0) { 24 SkPMColor* dst = destination; 25 int count = width; 26 27 while (count > 4) { 28 *dst++ = color; 29 *dst++ = color; 30 *dst++ = color; 31 *dst++ = color; 32 count -= 4; 33 } 34 35 while (count > 0) { 36 *dst++ = color; 37 --count; 38 } 39 40 destination = (uint32_t*)((char*)destination + rowBytes); 41 } 42} 43 44/* 45 * Fast blitting of opaque rectangles at least 31 pixels wide: 46 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 47 * A 31 pixel rectangle is guaranteed to have at least one 48 * 16-pixel aligned span that can take advantage of mm_store. 49 */ 50static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, 51 int width, int height, 52 size_t rowBytes, uint32_t color) { 53 SkASSERT(255 == SkGetPackedA32(color)); 54 SkASSERT(width >= 31); 55 56 __m128i color_wide = _mm_set1_epi32(color); 57 while (--height >= 0) { 58 // Prefetching one row ahead to L1 cache can equal hardware 59 // performance for large/tall rects, but never *beats* 60 // hardware performance. 61 SkPMColor* dst = destination; 62 int count = width; 63 64 while (((size_t)dst) & 0x0F) { 65 *dst++ = color; 66 --count; 67 } 68 __m128i *d = reinterpret_cast<__m128i*>(dst); 69 70 // Googling suggests _mm_stream is only going to beat _mm_store 71 // for things that wouldn't fit in L2 cache anyway, typically 72 // >500kB, and precisely fill cache lines. For us, with 73 // arrays > 100k elements _mm_stream is still 100%+ slower than 74 // mm_store. 75 76 // Unrolling to count >= 64 is a break-even for most 77 // input patterns; we seem to be saturating the bus and having 78 // low enough overhead at 32. 79 80 while (count >= 32) { 81 _mm_store_si128(d++, color_wide); 82 _mm_store_si128(d++, color_wide); 83 _mm_store_si128(d++, color_wide); 84 _mm_store_si128(d++, color_wide); 85 _mm_store_si128(d++, color_wide); 86 _mm_store_si128(d++, color_wide); 87 _mm_store_si128(d++, color_wide); 88 _mm_store_si128(d++, color_wide); 89 count -= 32; 90 } 91 if (count >= 16) { 92 _mm_store_si128(d++, color_wide); 93 _mm_store_si128(d++, color_wide); 94 _mm_store_si128(d++, color_wide); 95 _mm_store_si128(d++, color_wide); 96 count -= 16; 97 } 98 dst = reinterpret_cast<uint32_t*>(d); 99 100 // Unrolling the loop in the Narrow code is a significant performance 101 // gain, but unrolling this loop appears to make no difference in 102 // benchmarks with either mm_store_si128 or individual sets. 103 104 while (count > 0) { 105 *dst++ = color; 106 --count; 107 } 108 109 destination = (uint32_t*)((char*)destination + rowBytes); 110 } 111} 112 113void ColorRect32_SSE2(SkPMColor* destination, 114 int width, int height, 115 size_t rowBytes, uint32_t color) { 116 if (0 == height || 0 == width || 0 == color) { 117 return; 118 } 119 unsigned colorA = SkGetPackedA32(color); 120 colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423). 121 if (255 == colorA) { 122 if (width < 31) { 123 BlitRect32_OpaqueNarrow_SSE2(destination, width, height, 124 rowBytes, color); 125 } else { 126 BlitRect32_OpaqueWide_SSE2(destination, width, height, 127 rowBytes, color); 128 } 129 } else { 130 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); 131 } 132} 133