1dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org/*
2ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Copyright 2009 The Android Open Source Project
3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com *
4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be
5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file.
6dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org */
7dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
8dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include <emmintrin.h>
9dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include "SkBitmapProcState_opts_SSE2.h"
104b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org#include "SkColorPriv.h"
119cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com#include "SkPaint.h"
12dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include "SkUtils.h"
13dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
14dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.orgvoid S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
15dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org                                   const uint32_t* xy,
16dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org                                   int count, uint32_t* colors) {
17dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    SkASSERT(count > 0 && colors != NULL);
189cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
19c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
20aa4f0c682d4eced5c0c3aa711f76d440eae60588senorblanco@chromium.org    SkASSERT(s.fAlphaScale == 256);
21dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
22dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
23e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com    size_t rb = s.fBitmap->rowBytes();
24dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    uint32_t XY = *xy++;
25dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    unsigned y0 = XY >> 14;
26dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
27dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
28dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    unsigned subY = y0 & 0xF;
29dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0, 16)
31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    __m128i sixteen = _mm_cvtsi32_si128(16);
32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0, 16, 16, 16, 16)
34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    sixteen = _mm_shufflelo_epi16(sixteen, 0);
35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0,  y)
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    __m128i allY = _mm_cvtsi32_si128(subY);
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0,  y,  y,  y,  y)
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    allY = _mm_shufflelo_epi16(allY, 0);
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    __m128i negY = _mm_sub_epi16(sixteen, allY);
44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    allY = _mm_unpacklo_epi64(allY, negY);
47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // (16, 16, 16, 16, 16, 16, 16, 16 )
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    sixteen = _mm_shuffle_epi32(sixteen, 0);
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0,  0)
52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    __m128i zero = _mm_setzero_si128();
53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    do {
54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        unsigned x0 = XX >> 18;
56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        unsigned x1 = XX & 0x3FFF;
57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (0, 0, 0, 0, 0, 0, 0, x)
59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
60fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (0, 0, 0, 0, x, x, x, x)
62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        allX = _mm_shufflelo_epi16(allX, 0);
63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (x, x, x, x, x, x, x, x)
65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        allX = _mm_shuffle_epi32(allX, 0);
66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i negX = _mm_sub_epi16(sixteen, allX);
69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Load 4 samples (pixels).
71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (0, 0, a00, a10)
77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Expand to 16 bits per component.
80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
82dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // ((a00 * (16-y)), (a10 * y)).
83dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a00a10 = _mm_mullo_epi16(a00a10, allY);
84dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
85dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
86dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a00a10 = _mm_mullo_epi16(a00a10, negX);
87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (0, 0, a01, a10)
89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Expand to 16 bits per component.
92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (a01 * (16-y)), (a11 * y)
95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a01a11 = _mm_mullo_epi16(a01a11, allY);
96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (a01 * (16-y) * x), (a11 * y * x)
98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        a01a11 = _mm_mullo_epi16(a01a11, allX);
99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i sum = _mm_add_epi16(a00a10, a01a11);
102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (DC, a00*w00 + a01*w01)
104dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
105dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        sum = _mm_add_epi16(sum, shifted);
108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Divide each 16 bit component by 256.
110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        sum = _mm_srli_epi16(sum, 8);
111dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Pack lower 4 16 bit values of sum into lower 4 bytes.
113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        sum = _mm_packus_epi16(sum, zero);
114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        // Extract low int and store.
116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        *colors++ = _mm_cvtsi128_si32(sum);
117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    } while (--count > 0);
118dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org}
119f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
120f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.orgvoid S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
121f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org                                  const uint32_t* xy,
122f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org                                  int count, uint32_t* colors) {
123f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    SkASSERT(count > 0 && colors != NULL);
1249cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
125c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
126f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    SkASSERT(s.fAlphaScale < 256);
127f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
128f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
129e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com    size_t rb = s.fBitmap->rowBytes();
130f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    uint32_t XY = *xy++;
131f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    unsigned y0 = XY >> 14;
132f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
133f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
134f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    unsigned subY = y0 & 0xF;
135f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
136f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0, 16)
137f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    __m128i sixteen = _mm_cvtsi32_si128(16);
138f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
139f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0, 16, 16, 16, 16)
140f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    sixteen = _mm_shufflelo_epi16(sixteen, 0);
141f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
142f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0,  y)
143f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    __m128i allY = _mm_cvtsi32_si128(subY);
144f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
145f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0,  y,  y,  y,  y)
146f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    allY = _mm_shufflelo_epi16(allY, 0);
147f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
148f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
149f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    __m128i negY = _mm_sub_epi16(sixteen, allY);
150f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
151f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
152f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    allY = _mm_unpacklo_epi64(allY, negY);
153f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
154f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // (16, 16, 16, 16, 16, 16, 16, 16 )
155f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    sixteen = _mm_shuffle_epi32(sixteen, 0);
156f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
157f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( 0,  0,  0,  0,  0,  0,  0,  0)
158f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    __m128i zero = _mm_setzero_si128();
159f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
160f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
161f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
162f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
163f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    do {
164f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
165f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        unsigned x0 = XX >> 18;
166f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        unsigned x1 = XX & 0x3FFF;
167f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
168f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (0, 0, 0, 0, 0, 0, 0, x)
169f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
170fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
171f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (0, 0, 0, 0, x, x, x, x)
172f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        allX = _mm_shufflelo_epi16(allX, 0);
173f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
174f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (x, x, x, x, x, x, x, x)
175f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        allX = _mm_shuffle_epi32(allX, 0);
176f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
177f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
178f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i negX = _mm_sub_epi16(sixteen, allX);
179f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
180f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Load 4 samples (pixels).
181f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
182f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
183f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
184f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
185f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
186f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (0, 0, a00, a10)
187f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
188f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
189f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Expand to 16 bits per component.
190f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // ((a00 * (16-y)), (a10 * y)).
193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a00a10 = _mm_mullo_epi16(a00a10, allY);
194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a00a10 = _mm_mullo_epi16(a00a10, negX);
197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (0, 0, a01, a10)
199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
200f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
201f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Expand to 16 bits per component.
202f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
203f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
204f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (a01 * (16-y)), (a11 * y)
205f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a01a11 = _mm_mullo_epi16(a01a11, allY);
206f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
207f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (a01 * (16-y) * x), (a11 * y * x)
208f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        a01a11 = _mm_mullo_epi16(a01a11, allX);
209f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
210f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
211f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i sum = _mm_add_epi16(a00a10, a01a11);
212f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
213f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (DC, a00*w00 + a01*w01)
214f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
215f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
216f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
217f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        sum = _mm_add_epi16(sum, shifted);
218f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
219f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Divide each 16 bit component by 256.
220f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        sum = _mm_srli_epi16(sum, 8);
221f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
222f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Multiply by alpha.
223f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        sum = _mm_mullo_epi16(sum, alpha);
224f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
225f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Divide each 16 bit component by 256.
226f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        sum = _mm_srli_epi16(sum, 8);
227f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
228f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Pack lower 4 16 bit values of sum into lower 4 bytes.
229f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        sum = _mm_packus_epi16(sum, zero);
230f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
231f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        // Extract low int and store.
232f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org        *colors++ = _mm_cvtsi128_si32(sum);
233f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org    } while (--count > 0);
234f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org}
23506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
23606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comstatic inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
23706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                                 SkFixed one) {
23806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    unsigned i = SkClampMax(f >> 16, max);
23906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    i = (i << 4) | ((f >> 12) & 0xF);
24006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    return (i << 14) | SkClampMax((f + one) >> 16, max);
24106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com}
24206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
24306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com/*  SSE version of ClampX_ClampY_filter_scale()
24406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *  portable version is in core/SkBitmapProcState_matrix.h
24506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com */
24606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comvoid ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
24706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                     int count, int x, int y) {
24806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
24906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                             SkMatrix::kScale_Mask)) == 0);
25006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkASSERT(s.fInvKy == 0);
251fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
25206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const unsigned maxX = s.fBitmap->width() - 1;
25306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const SkFixed one = s.fFilterOneX;
25406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const SkFixed dx = s.fInvSx;
25506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkFixed fx;
25606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
25706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkPoint pt;
2589c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
2599c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
26006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
26106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const unsigned maxY = s.fBitmap->height() - 1;
26206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    // compute our two Y values up front
26306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
26406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    // now initialize fx
26506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    fx = SkScalarToFixed(pt.fX) - (one >> 1);
26606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
26706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    // test if we don't need to apply the tile proc
26806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
26906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
27006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        if (count >= 4) {
27106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            // SSE version of decal_filter_scale
27206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while ((size_t(xy) & 0x0F) != 0) {
27306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                SkASSERT((fx >> (16 + 14)) == 0);
27406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
27506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += dx;
27606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count--;
27706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
27806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
27906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_1    = _mm_set1_epi32(1);
28006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
28106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
28206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                              fx + dx, fx);
28306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
28406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (count >= 4) {
285fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                __m128i wide_out;
286fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
28706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
28806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
289fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                                        _mm_srai_epi32(wide_fx, 16), wide_1));
290fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
29106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
292fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
29306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                xy += 4;
29406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += dx * 4;
29506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
29606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 4;
29706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            } // while count >= 4
29806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        } // if count >= 4
29906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
30006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        while (count-- > 0) {
30106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            SkASSERT((fx >> (16 + 14)) == 0);
30206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
30306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            fx += dx;
30406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        }
30506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    } else {
        // SSE2 only provides 16-bit integer max & min, so only take this path
        // when maxX is no larger than 0xFFFF. maxX is the bitmap's width - 1;
        // bitmaps wider than that should be rare in the real world.
31006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        if ((count >= 4) && (maxX <= 0xFFFF)) {
31106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (((size_t)xy & 0x0F) != 0) {
31206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
31306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += dx;
31406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count--;
31506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
316fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
31706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
31806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                              fx + dx, fx);
31906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
32006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_one  = _mm_set1_epi32(one);
321fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            __m128i wide_maxX = _mm_set1_epi32(maxX);
32206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_mask = _mm_set1_epi32(0xF);
32306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
32406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com             while (count >= 4) {
32506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_i;
32606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_lo;
32706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_fx1;
32806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
32906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                // i = SkClampMax(f>>16,maxX)
330fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
33106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                       _mm_setzero_si128());
33206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_i = _mm_min_epi16(wide_i, wide_maxX);
333fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
33406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                // i<<4 | TILEX_LOW_BITS(fx)
33506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_lo = _mm_srli_epi32(wide_fx, 12);
33606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_lo = _mm_and_si128(wide_lo, wide_mask);
337fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                wide_i  = _mm_slli_epi32(wide_i, 4);
338fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                wide_i  = _mm_or_si128(wide_i, wide_lo);
339fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
34006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                // i<<14
34106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_i = _mm_slli_epi32(wide_i, 14);
342fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
34306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                // SkClampMax(((f+one))>>16,max)
34406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
345fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
34606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                                        _mm_setzero_si128());
34706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
348fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
34906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                // final combination
35006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_i = _mm_or_si128(wide_i, wide_fx1);
351fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
352fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
35306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
354fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                fx += dx * 4;
35506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                xy += 4;
35606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 4;
35706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            } // while count >= 4
35806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        } // if count >= 4
35906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
36006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        while (count-- > 0) {
36106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
36206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            fx += dx;
36306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        }
36406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    }
36506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com}
36606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
36706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com/*  SSE version of ClampX_ClampY_nofilter_scale()
36806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *  portable version is in core/SkBitmapProcState_matrix.h
36906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com */
37006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comvoid ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
37106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                    uint32_t xy[], int count, int x, int y) {
37206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
37306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                             SkMatrix::kScale_Mask)) == 0);
37406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
37506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    // we store y, x, x, x, x, x
37606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const unsigned maxX = s.fBitmap->width() - 1;
37706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkFixed fx;
37806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    SkPoint pt;
3799c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
3809c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
38106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    fx = SkScalarToFixed(pt.fY);
38206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const unsigned maxY = s.fBitmap->height() - 1;
38306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    *xy++ = SkClampMax(fx >> 16, maxY);
38406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    fx = SkScalarToFixed(pt.fX);
385fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
38606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    if (0 == maxX) {
38706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        // all of the following X values must be 0
38806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        memset(xy, 0, count * sizeof(uint16_t));
38906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        return;
39006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    }
39106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
39206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    const SkFixed dx = s.fInvSx;
39306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
39406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    // test if we don't need to apply the tile proc
39506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    if ((unsigned)(fx >> 16) <= maxX &&
39606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
39706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        // SSE version of decal_nofilter_scale
39806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        if (count >= 8) {
39906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (((size_t)xy & 0x0F) != 0) {
40006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
40106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += 2 * dx;
40206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 2;
40306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
40406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
40506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
40606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
40706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
40806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
40906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                             fx + dx, fx);
41006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
41106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
41206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (count >= 8) {
41306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
41406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
41506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
41606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_result = _mm_packs_epi32(wide_out_low,
41706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                                      wide_out_high);
41806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
419fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
42006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_low = _mm_add_epi32(wide_low, wide_dx8);
42106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_high = _mm_add_epi32(wide_high, wide_dx8);
42206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
42306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                xy += 4;
42406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += dx * 8;
42506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 8;
42606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
42706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        } // if count >= 8
42806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
42906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
43006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        while (count-- > 0) {
43106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            *xx++ = SkToU16(fx >> 16);
43206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            fx += dx;
43306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        }
43406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    } else {
43506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        // SSE2 only support 16bit interger max & min, so only process the case
43606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        // maxX less than the max 16bit interger. Actually maxX is the bitmap's
437fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        // height, there should be rare bitmap whose height will be greater
43806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        // than max 16bit interger in the real world.
43906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        if ((count >= 8) && (maxX <= 0xFFFF)) {
44006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (((size_t)xy & 0x0F) != 0) {
441602f2272be351cee0e5d6723b57c4256d473bd2bmike@reedtribe.org                *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
442602f2272be351cee0e5d6723b57c4256d473bd2bmike@reedtribe.org                                        SkClampMax(fx >> 16, maxX));
44306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += 2 * dx;
44406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 2;
44506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
44606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
44706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
44806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
44906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
45006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
45106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                             fx + dx, fx);
45206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
45306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            __m128i wide_maxX = _mm_set1_epi32(maxX);
45406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
45506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            while (count >= 8) {
45606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
45706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
45806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
459fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                wide_out_low  = _mm_max_epi16(wide_out_low,
46006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                              _mm_setzero_si128());
46106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
46206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_out_high = _mm_max_epi16(wide_out_high,
46306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                              _mm_setzero_si128());
46406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
46506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
46606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                __m128i wide_result = _mm_packs_epi32(wide_out_low,
46706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                                                      wide_out_high);
46806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
46906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
47006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
47106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                wide_high = _mm_add_epi32(wide_high, wide_dx8);
47206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
47306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                xy += 4;
47406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                fx += dx * 8;
47506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com                count -= 8;
47606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            }
47706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        } // if count >= 8
47806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com
47906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
48006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        while (count-- > 0) {
48106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            *xx++ = SkClampMax(fx >> 16, maxX);
48206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com            fx += dx;
48306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com        }
48406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com    }
48506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com}
4865efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
4875efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com/*  SSE version of ClampX_ClampY_filter_affine()
4885efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com *  portable version is in core/SkBitmapProcState_matrix.h
4895efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com */
4905efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.comvoid ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
4915efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                      uint32_t xy[], int count, int x, int y) {
4925efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkPoint srcPt;
4939c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com    s.fInvProc(s.fInvMatrix,
4945efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com               SkIntToScalar(x) + SK_ScalarHalf,
4955efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
496fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
4975efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed oneX = s.fFilterOneX;
4985efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed oneY = s.fFilterOneY;
4995efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
5005efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
5015efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed dx = s.fInvSx;
5025efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed dy = s.fInvKy;
5035efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    unsigned maxX = s.fBitmap->width() - 1;
5045efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    unsigned maxY = s.fBitmap->height() - 1;
5055efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5065efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    if (count >= 2 && (maxX <= 0xFFFF)) {
5075efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        SkFixed dx2 = dx + dx;
5085efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        SkFixed dy2 = dy + dy;
5095efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5105efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
5115efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
5125efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
513fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
5145efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_mask = _mm_set1_epi32(0xF);
5155efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5165efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        while (count >= 2) {
5175efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // i = SkClampMax(f>>16,maxX)
518fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
5195efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                           _mm_setzero_si128());
5205efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_i = _mm_min_epi16(wide_i, wide_max);
521fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5225efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // i<<4 | TILEX_LOW_BITS(f)
5235efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
5245efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_lo = _mm_and_si128(wide_lo, wide_mask);
525fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            wide_i  = _mm_slli_epi32(wide_i, 4);
526fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            wide_i  = _mm_or_si128(wide_i, wide_lo);
527fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5285efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // i<<14
5295efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_i = _mm_slli_epi32(wide_i, 14);
530fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5315efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // SkClampMax(((f+one))>>16,max)
5325efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
533fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
5345efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                                   _mm_setzero_si128());
5355efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_f1 = _mm_min_epi16(wide_f1, wide_max);
536fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5375efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // final combination
5385efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_i = _mm_or_si128(wide_i, wide_f1);
539fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
540fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5415efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_f = _mm_add_epi32(wide_f, wide_d2);
5425efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
543fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            fx += dx2;
5445efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            fy += dy2;
5455efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            xy += 4;
5465efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            count -= 2;
5475efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        } // while count >= 2
5485efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    } // if count >= 2
5495efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5505efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    while (count-- > 0) {
5515efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
5525efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        fy += dy;
5535efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
554fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        fx += dx;
5555efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    }
5565efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com}
5575efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5585efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com/*  SSE version of ClampX_ClampY_nofilter_affine()
5595efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com *  portable version is in core/SkBitmapProcState_matrix.h
5605efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com */
5615efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.comvoid ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
5625efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                      uint32_t xy[], int count, int x, int y) {
5635efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
5645efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
5655efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                             SkMatrix::kScale_Mask |
5665efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                             SkMatrix::kAffine_Mask)) == 0);
5675efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5685efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkPoint srcPt;
5699c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com    s.fInvProc(s.fInvMatrix,
5705efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com               SkIntToScalar(x) + SK_ScalarHalf,
5715efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
572fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
5735efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed fx = SkScalarToFixed(srcPt.fX);
5745efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed fy = SkScalarToFixed(srcPt.fY);
5755efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed dx = s.fInvSx;
5765efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    SkFixed dy = s.fInvKy;
5775efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    int maxX = s.fBitmap->width() - 1;
5785efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    int maxY = s.fBitmap->height() - 1;
5795efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5805efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    if (count >= 4 && (maxX <= 0xFFFF)) {
5815efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        while (((size_t)xy & 0x0F) != 0) {
582fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
5835efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                  SkClampMax(fx >> 16, maxX);
5845efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            fx += dx;
5855efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            fy += dy;
5865efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            count--;
5875efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        }
5885efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5895efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        SkFixed dx4 = dx * 4;
5905efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        SkFixed dy4 = dy * 4;
5915efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
5925efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
5935efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                          fx + dx, fx);
5945efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
5955efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                          fy + dy, fy);
5965efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_dx4  = _mm_set1_epi32(dx4);
5975efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        __m128i wide_dy4  = _mm_set1_epi32(dy4);
5985efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
599fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        __m128i wide_maxX = _mm_set1_epi32(maxX);
600fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        __m128i wide_maxY = _mm_set1_epi32(maxY);
6015efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
6025efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        while (count >= 4) {
6035efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // SkClampMax(fx>>16,maxX)
604fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
6055efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                            _mm_setzero_si128());
6065efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
607fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6085efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // SkClampMax(fy>>16,maxY)
609fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
6105efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                            _mm_setzero_si128());
6115efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
612fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6135efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            // final combination
6145efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
6155efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                                          wide_lo);
616fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
617fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6185efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
6195efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
6205efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
621fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            fx += dx4;
6225efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            fy += dy4;
6235efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            xy += 4;
6245efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com            count -= 4;
6255efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        } // while count >= 4
6265efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    } // if count >= 4
6275efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com
6285efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    while (count-- > 0) {
6295efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
6305efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com                              SkClampMax(fx >> 16, maxX);
6315efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com        fx += dx;
632fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        fy += dy;
6335efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com    }
6345efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com}
6357866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6367866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com/*  SSE version of S32_D16_filter_DX_SSE2
6377866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com *  Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
6387866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
6397866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com */
6407866228f06e402d37f8fcab70a688e1f34c1d27breed@google.comvoid S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
6414b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org                            const uint32_t* xy,
6424b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org                            int count, uint16_t* colors) {
6437866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    SkASSERT(count > 0 && colors != NULL);
6449cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
645c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
6467866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    SkASSERT(s.fBitmap->isOpaque());
6477866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
648fc91dc70042dcb6d2868e8822fbab15aa4402375robertphillips@google.com    SkPMColor dstColor;
6497866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
650e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com    size_t rb = s.fBitmap->rowBytes();
6517866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    uint32_t XY = *xy++;
6527866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    unsigned y0 = XY >> 14;
6537866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
6547866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
6557866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    unsigned subY = y0 & 0xF;
6567866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6577866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0,  0,  0,  0, 16)
6587866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    __m128i sixteen = _mm_cvtsi32_si128(16);
6597866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6607866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0, 16, 16, 16, 16)
6617866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    sixteen = _mm_shufflelo_epi16(sixteen, 0);
6627866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6637866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0,  0,  0,  0,  y)
6647866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    __m128i allY = _mm_cvtsi32_si128(subY);
6657866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6667866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0,  y,  y,  y,  y)
6677866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    allY = _mm_shufflelo_epi16(allY, 0);
6687866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6697866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
6707866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    __m128i negY = _mm_sub_epi16(sixteen, allY);
6717866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6727866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
6737866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    allY = _mm_unpacklo_epi64(allY, negY);
6747866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6757866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // (16, 16, 16, 16, 16, 16, 16, 16 )
6767866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    sixteen = _mm_shuffle_epi32(sixteen, 0);
6777866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6787866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    // ( 0,  0,  0,  0,  0,  0,  0,  0)
6797866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    __m128i zero = _mm_setzero_si128();
6807866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6817866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    do {
6827866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
6837866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        unsigned x0 = XX >> 18;
6847866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        unsigned x1 = XX & 0x3FFF;
6857866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6867866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (0, 0, 0, 0, 0, 0, 0, x)
6877866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
6887866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6897866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (0, 0, 0, 0, x, x, x, x)
6907866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        allX = _mm_shufflelo_epi16(allX, 0);
6917866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6927866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (x, x, x, x, x, x, x, x)
6937866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        allX = _mm_shuffle_epi32(allX, 0);
6947866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6957866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
6967866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i negX = _mm_sub_epi16(sixteen, allX);
6977866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
6987866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Load 4 samples (pixels).
6997866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
7007866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
7017866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
7027866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
7037866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7047866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (0, 0, a00, a10)
7057866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
7067866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7077866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Expand to 16 bits per component.
7087866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
7097866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7107866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // ((a00 * (16-y)), (a10 * y)).
7117866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a00a10 = _mm_mullo_epi16(a00a10, allY);
7127866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7137866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
7147866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a00a10 = _mm_mullo_epi16(a00a10, negX);
7157866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7167866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (0, 0, a01, a10)
7177866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
7187866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7197866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Expand to 16 bits per component.
7207866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
7217866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7227866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (a01 * (16-y)), (a11 * y)
7237866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a01a11 = _mm_mullo_epi16(a01a11, allY);
7247866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7257866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (a01 * (16-y) * x), (a11 * y * x)
7267866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        a01a11 = _mm_mullo_epi16(a01a11, allX);
7277866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7287866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
7297866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i sum = _mm_add_epi16(a00a10, a01a11);
7307866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7317866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (DC, a00*w00 + a01*w01)
7327866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
7337866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7347866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
7357866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        sum = _mm_add_epi16(sum, shifted);
7367866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7377866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Divide each 16 bit component by 256.
7387866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        sum = _mm_srli_epi16(sum, 8);
7397866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7407866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Pack lower 4 16 bit values of sum into lower 4 bytes.
7417866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        sum = _mm_packus_epi16(sum, zero);
7427866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7437866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        // Extract low int and store.
7447866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com        dstColor = _mm_cvtsi128_si32(sum);
7457866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com
7464b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org        *colors++ = SkPixel32ToPixel16(dstColor);
7477866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com    } while (--count > 0);
7487866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com}
749