1/*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBlitRect_opts_SSE2.h"
10#include "SkBlitRow.h"
11#include "SkColorPriv.h"
12
13/* Simple blitting of opaque rectangles less than 31 pixels wide:
14 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
15 */
16static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
17                                  int width, int height,
18                                  size_t rowBytes, uint32_t color) {
19    SkASSERT(255 == SkGetPackedA32(color));
20    SkASSERT(width > 0);
21    SkASSERT(width < 31);
22
23    while (--height >= 0) {
24        SkPMColor* dst = destination;
25        int count = width;
26
27        while (count > 4) {
28            *dst++ = color;
29            *dst++ = color;
30            *dst++ = color;
31            *dst++ = color;
32            count -= 4;
33        }
34
35        while (count > 0) {
36            *dst++ = color;
37            --count;
38        }
39
40        destination = (uint32_t*)((char*)destination + rowBytes);
41    }
42}
43
44/*
45 * Fast blitting of opaque rectangles at least 31 pixels wide:
46 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
47 * A 31 pixel rectangle is guaranteed to have at least one
48 * 16-pixel aligned span that can take advantage of mm_store.
49 */
50static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
51                                int width, int height,
52                                size_t rowBytes, uint32_t color) {
53    SkASSERT(255 == SkGetPackedA32(color));
54    SkASSERT(width >= 31);
55
56    __m128i color_wide = _mm_set1_epi32(color);
57    while (--height >= 0) {
58        // Prefetching one row ahead to L1 cache can equal hardware
59        // performance for large/tall rects, but never *beats*
60        // hardware performance.
61        SkPMColor* dst = destination;
62        int count = width;
63
64        while (((size_t)dst) & 0x0F) {
65            *dst++ = color;
66            --count;
67        }
68        __m128i *d = reinterpret_cast<__m128i*>(dst);
69
70        // Googling suggests _mm_stream is only going to beat _mm_store
71        // for things that wouldn't fit in L2 cache anyway, typically
72        // >500kB, and precisely fill cache lines.  For us, with
73        // arrays > 100k elements _mm_stream is still 100%+ slower than
74        // mm_store.
75
76        // Unrolling to count >= 64 is a break-even for most
77        // input patterns; we seem to be saturating the bus and having
78        // low enough overhead at 32.
79
80        while (count >= 32) {
81            _mm_store_si128(d++, color_wide);
82            _mm_store_si128(d++, color_wide);
83            _mm_store_si128(d++, color_wide);
84            _mm_store_si128(d++, color_wide);
85            _mm_store_si128(d++, color_wide);
86            _mm_store_si128(d++, color_wide);
87            _mm_store_si128(d++, color_wide);
88            _mm_store_si128(d++, color_wide);
89            count -= 32;
90        }
91        if (count >= 16) {
92            _mm_store_si128(d++, color_wide);
93            _mm_store_si128(d++, color_wide);
94            _mm_store_si128(d++, color_wide);
95            _mm_store_si128(d++, color_wide);
96            count -= 16;
97        }
98        dst = reinterpret_cast<uint32_t*>(d);
99
100        // Unrolling the loop in the Narrow code is a significant performance
101        // gain, but unrolling this loop appears to make no difference in
102        // benchmarks with either mm_store_si128 or individual sets.
103
104        while (count > 0) {
105            *dst++ = color;
106            --count;
107        }
108
109        destination = (uint32_t*)((char*)destination + rowBytes);
110    }
111}
112
113void ColorRect32_SSE2(SkPMColor* destination,
114                      int width, int height,
115                      size_t rowBytes, uint32_t color) {
116    if (0 == height || 0 == width || 0 == color) {
117        return;
118    }
119    unsigned colorA = SkGetPackedA32(color);
120    colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
121    if (255 == colorA) {
122        if (width < 31) {
123            BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
124                                         rowBytes, color);
125        } else {
126            BlitRect32_OpaqueWide_SSE2(destination, width, height,
127                                       rowBytes, color);
128        }
129    } else {
130        SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
131    }
132}
133