SkBlitRect_opts_SSE2.cpp revision 8dd90a926a8660da2bacc7af149f4ac5b2e7c64c
/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
78dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
88dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkBlitRect_opts_SSE2.h"
98dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkBlitRow.h"
108dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include "SkColorPriv.h"
118dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
128dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com#include <emmintrin.h>
138dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
148dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com/** Simple blitting of opaque rectangles less than 31 pixels wide:
158dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
168dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com*/
178dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
188dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                  int width, int height,
198dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                  size_t rowBytes, uint32_t color) {
208dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    SkASSERT(255 == SkGetPackedA32(color));
218dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    SkASSERT(width > 0);
228dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    SkASSERT(width < 31);
238dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
248dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    while (--height >= 0) {
258dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        SkPMColor* dst = destination;
268dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        int count = width;
278dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
288dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        while (count > 4) {
298dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
308dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
318dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
328dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
338dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            count -= 4;
348dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
358dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
368dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        while (count > 0) {
378dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
388dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            --count;
398dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
408dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
418dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        destination = (uint32_t*)((char*)destination + rowBytes);
428dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    }
438dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com}
448dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
458dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com/**
468dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com  Fast blitting of opaque rectangles at least 31 pixels wide:
478dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com  inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
488dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com  A 31 pixel rectangle is guaranteed to have at least one
498dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com  16-pixel aligned span that can take advantage of mm_store.
508dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com*/
518dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
528dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                int width, int height,
538dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                size_t rowBytes, uint32_t color) {
548dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    SkASSERT(255 == SkGetPackedA32(color));
558dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    SkASSERT(width >= 31);
568dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
578dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    __m128i color_wide = _mm_set1_epi32(color);
588dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    while (--height >= 0) {
598dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // Prefetching one row ahead to L1 cache can equal hardware
608dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // performance for large/tall rects, but never *beats*
618dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // hardware performance.
628dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        SkPMColor* dst = destination;
638dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        int count = width;
648dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
658dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        while (((size_t)dst) & 0x0F) {
668dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
678dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            --count;
688dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
698dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
708dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
718dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // Googling suggests _mm_stream is only going to beat _mm_store
728dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // for things that wouldn't fit in L2 cache anyway, typically
738dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // >500kB, and precisely fill cache lines.  For us, with
748dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // arrays > 100k elements _mm_stream is still 100%+ slower than
758dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // mm_store.
768dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
778dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // Unrolling to count >= 64 is a break-even for most
788dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // input patterns; we seem to be saturating the bus and having
798dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // low enough overhead at 32.
808dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
818dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        while (count >= 32) {
828dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
838dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
848dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
858dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
868dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
878dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
888dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
898dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
908dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            count -= 32;
918dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
928dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        if (count >= 16) {
938dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
948dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
958dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
968dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            _mm_store_si128(d++, color_wide);
978dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            count -= 16;
988dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
998dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        dst = reinterpret_cast<uint32_t*>(d);
1008dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
1018dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // Unrolling the loop in the Narrow code is a significant performance
1028dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // gain, but unrolling this loop appears to make no difference in
1038dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        // benchmarks with either mm_store_si128 or individual sets.
1048dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
1058dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        while (count > 0) {
1068dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            *dst++ = color;
1078dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            --count;
1088dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        }
1098dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
1108dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        destination = (uint32_t*)((char*)destination + rowBytes);
1118dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    }
1128dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com}
1138dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
1148dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.comvoid ColorRect32_SSE2(SkPMColor* destination,
1158dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                      int width, int height,
1168dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                      size_t rowBytes, uint32_t color) {
1178dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    if (0 == height || 0 == width || 0 == color) {
1188dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        return;
1198dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    }
1208dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    unsigned colorA = SkGetPackedA32(color);
1218dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    //if (255 == colorA) {
1228dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        //if (width < 31) {
1238dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            //BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
1248dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                         //rowBytes, color);
1258dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        //} else {
1268dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com            //BlitRect32_OpaqueWide_SSE2(destination, width, height,
1278dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com                                       //rowBytes, color);
1288dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        //}
1298dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    //} else {
1308dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com        SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
1318dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com    //}
1328dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com}
1338dd90a926a8660da2bacc7af149f4ac5b2e7c64ctomhudson@google.com
134