SkBlitRect_opts_SSE2.cpp revision 8dd90a926a8660da2bacc7af149f4ac5b2e7c64c
18dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com/* 28dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com * Copyright 2011 Google Inc. 38dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com * 48dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com * Use of this source code is governed by a BSD-style license that can be 58dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com * found in the LICENSE file. 68dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com */ 78dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 88dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkBlitRect_opts_SSE2.h" 98dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkBlitRow.h" 108dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkColorPriv.h" 118dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 128dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include <emmintrin.h> 138dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 148dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com/** Simple blitting of opaque rectangles less than 31 pixels wide: 158dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 168dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com*/ 178dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, 188dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com int width, int height, 198dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com size_t rowBytes, uint32_t color) { 208dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkASSERT(255 == SkGetPackedA32(color)); 218dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkASSERT(width > 0); 228dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkASSERT(width < 31); 238dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 248dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (--height >= 0) { 258dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkPMColor* dst = destination; 268dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com int count = width; 278dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 288dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (count > 4) { 298dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 308dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 318dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 328dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 338dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com count -= 4; 348dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 358dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 368dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (count > 0) { 378dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 388dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com --count; 398dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 408dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 418dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com destination = (uint32_t*)((char*)destination + rowBytes); 428dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 438dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com} 448dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 458dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com/** 468dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com Fast blitting of opaque rectangles at least 31 pixels wide: 478dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 488dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com A 31 pixel rectangle is guaranteed to have at least one 498dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 16-pixel aligned span that can take advantage of mm_store. 508dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com*/ 518dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, 528dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com int width, int height, 538dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com size_t rowBytes, uint32_t color) { 548dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkASSERT(255 == SkGetPackedA32(color)); 558dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkASSERT(width >= 31); 568dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 578dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com __m128i color_wide = _mm_set1_epi32(color); 588dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (--height >= 0) { 598dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // Prefetching one row ahead to L1 cache can equal hardware 608dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // performance for large/tall rects, but never *beats* 618dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // hardware performance. 628dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkPMColor* dst = destination; 638dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com int count = width; 648dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 658dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (((size_t)dst) & 0x0F) { 668dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 678dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com --count; 688dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 698dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 708dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 718dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // Googling suggests _mm_stream is only going to beat _mm_store 728dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // for things that wouldn't fit in L2 cache anyway, typically 738dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // >500kB, and precisely fill cache lines. For us, with 748dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // arrays > 100k elements _mm_stream is still 100%+ slower than 758dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // mm_store. 768dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 778dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // Unrolling to count >= 64 is a break-even for most 788dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // input patterns; we seem to be saturating the bus and having 798dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // low enough overhead at 32. 808dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 818dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (count >= 32) { 828dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 838dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 848dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 858dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 868dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 878dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 888dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 898dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 908dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com count -= 32; 918dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 928dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com if (count >= 16) { 938dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 948dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 958dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 968dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com _mm_store_si128(d++, color_wide); 978dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com count -= 16; 988dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 998dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com dst = reinterpret_cast<uint32_t*>(d); 1008dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 1018dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // Unrolling the loop in the Narrow code is a significant performance 1028dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // gain, but unrolling this loop appears to make no difference in 1038dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com // benchmarks with either mm_store_si128 or individual sets. 1048dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 1058dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com while (count > 0) { 1068dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com *dst++ = color; 1078dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com --count; 1088dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 1098dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 1108dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com destination = (uint32_t*)((char*)destination + rowBytes); 1118dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 1128dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com} 1138dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 1148dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid ColorRect32_SSE2(SkPMColor* destination, 1158dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com int width, int height, 1168dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com size_t rowBytes, uint32_t color) { 1178dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com if (0 == height || 0 == width || 0 == color) { 1188dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com return; 1198dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com } 1208dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com unsigned colorA = SkGetPackedA32(color); 1218dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //if (255 == colorA) { 1228dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //if (width < 31) { 1238dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //BlitRect32_OpaqueNarrow_SSE2(destination, width, height, 1248dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //rowBytes, color); 1258dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //} else { 1268dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //BlitRect32_OpaqueWide_SSE2(destination, width, height, 1278dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //rowBytes, color); 1288dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //} 1298dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //} else { 1308dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); 1318dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com //} 1328dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com} 1338dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com 134