1dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org/* 2ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Copyright 2009 The Android Open Source Project 3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * 4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be 5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file. 6dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org */ 7dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 8dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include <emmintrin.h> 9dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include "SkBitmapProcState_opts_SSE2.h" 104b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org#include "SkColorPriv.h" 119cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com#include "SkPaint.h" 12dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org#include "SkUtils.h" 13dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 14dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.orgvoid S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, 15dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const uint32_t* xy, 16dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org int count, uint32_t* colors) { 17dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(count > 0 && colors != NULL); 189cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 19c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); 20aa4f0c682d4eced5c0c3aa711f76d440eae60588senorblanco@chromium.org SkASSERT(s.fAlphaScale == 256); 21dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 22dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 23e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com size_t rb = s.fBitmap->rowBytes(); 24dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t XY = *xy++; 25dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org unsigned y0 = XY >> 14; 26dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 27dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 28dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org unsigned subY = y0 & 0xF; 29dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, 16) 31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i sixteen = _mm_cvtsi32_si128(16); 32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, 16, 16, 16, 16) 34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org sixteen = _mm_shufflelo_epi16(sixteen, 0); 35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, y) 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i allY = _mm_cvtsi32_si128(subY); 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, y, y, y, y) 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org allY = _mm_shufflelo_epi16(allY, 0); 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i negY = _mm_sub_epi16(sixteen, allY); 44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org allY = _mm_unpacklo_epi64(allY, negY); 47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (16, 16, 16, 16, 16, 16, 16, 16 ) 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org sixteen = _mm_shuffle_epi32(sixteen, 0); 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, 0) 52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i zero = _mm_setzero_si128(); 53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org do { 54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t XX = *xy++; // x0:14 | 4 | x1:14 55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org unsigned x0 = XX >> 18; 56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org unsigned x1 = XX & 0x3FFF; 57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (0, 0, 0, 0, 0, 0, 0, x) 59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 60fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (0, 0, 0, 0, x, x, x, x) 62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org allX = _mm_shufflelo_epi16(allX, 0); 63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (x, x, x, x, x, x, x, x) 65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org allX = _mm_shuffle_epi32(allX, 0); 66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i negX = _mm_sub_epi16(sixteen, allX); 69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 samples (pixels). 71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (0, 0, a00, a10) 77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Expand to 16 bits per component. 80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a00a10 = _mm_unpacklo_epi8(a00a10, zero); 81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 82dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // ((a00 * (16-y)), (a10 * y)). 83dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a00a10 = _mm_mullo_epi16(a00a10, allY); 84dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 85dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 86dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a00a10 = _mm_mullo_epi16(a00a10, negX); 87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (0, 0, a01, a10) 89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Expand to 16 bits per component. 92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a01a11 = _mm_unpacklo_epi8(a01a11, zero); 93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (a01 * (16-y)), (a11 * y) 95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a01a11 = _mm_mullo_epi16(a01a11, allY); 96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (a01 * (16-y) * x), (a11 * y * x) 98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org a01a11 = _mm_mullo_epi16(a01a11, allX); 99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (a00*w00 + a01*w01, a10*w10 + a11*w11) 101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i sum = _mm_add_epi16(a00a10, a01a11); 102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (DC, a00*w00 + a01*w01) 104dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 105dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org sum = _mm_add_epi16(sum, shifted); 108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide each 16 bit component by 256. 110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org sum = _mm_srli_epi16(sum, 8); 111dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Pack lower 4 16 bit values of sum into lower 4 bytes. 113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org sum = _mm_packus_epi16(sum, zero); 114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Extract low int and store. 116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *colors++ = _mm_cvtsi128_si32(sum); 117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } while (--count > 0); 118dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org} 119f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 120f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.orgvoid S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, 121f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org const uint32_t* xy, 122f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org int count, uint32_t* colors) { 123f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org SkASSERT(count > 0 && colors != NULL); 1249cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 125c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); 126f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org SkASSERT(s.fAlphaScale < 256); 127f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 128f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 129e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com size_t rb = s.fBitmap->rowBytes(); 130f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org uint32_t XY = *xy++; 131f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org unsigned y0 = XY >> 14; 132f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 133f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 134f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org unsigned subY = y0 & 0xF; 135f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 136f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, 16) 137f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i sixteen = _mm_cvtsi32_si128(16); 138f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 139f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, 16, 16, 16, 16) 140f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sixteen = _mm_shufflelo_epi16(sixteen, 0); 141f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 142f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, y) 143f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i allY = _mm_cvtsi32_si128(subY); 144f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 145f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, y, y, y, y) 146f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org allY = _mm_shufflelo_epi16(allY, 0); 147f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 148f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 149f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i negY = _mm_sub_epi16(sixteen, allY); 150f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 151f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 152f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org allY = _mm_unpacklo_epi64(allY, negY); 153f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 154f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (16, 16, 16, 16, 16, 16, 16, 16 ) 155f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sixteen = _mm_shuffle_epi32(sixteen, 0); 156f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 157f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( 0, 0, 0, 0, 0, 0, 0, 0) 158f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i zero = _mm_setzero_si128(); 159f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 160f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha ) 161f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i alpha = _mm_set1_epi16(s.fAlphaScale); 162f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 163f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org do { 164f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org uint32_t XX = *xy++; // x0:14 | 4 | x1:14 165f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org unsigned x0 = XX >> 18; 166f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org unsigned x1 = XX & 0x3FFF; 167f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 168f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (0, 0, 0, 0, 0, 0, 0, x) 169f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 170fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 171f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (0, 0, 0, 0, x, x, x, x) 172f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org allX = _mm_shufflelo_epi16(allX, 0); 173f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 174f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (x, x, x, x, x, x, x, x) 175f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org allX = _mm_shuffle_epi32(allX, 0); 176f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 177f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 178f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i negX = _mm_sub_epi16(sixteen, allX); 179f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 180f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Load 4 samples (pixels). 181f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 182f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 183f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 184f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 185f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 186f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (0, 0, a00, a10) 187f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 188f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 189f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Expand to 16 bits per component. 190f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a00a10 = _mm_unpacklo_epi8(a00a10, zero); 191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // ((a00 * (16-y)), (a10 * y)). 193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a00a10 = _mm_mullo_epi16(a00a10, allY); 194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a00a10 = _mm_mullo_epi16(a00a10, negX); 197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (0, 0, a01, a10) 199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 200f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 201f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Expand to 16 bits per component. 202f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a01a11 = _mm_unpacklo_epi8(a01a11, zero); 203f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 204f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a01 * (16-y)), (a11 * y) 205f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a01a11 = _mm_mullo_epi16(a01a11, allY); 206f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 207f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a01 * (16-y) * x), (a11 * y * x) 208f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org a01a11 = _mm_mullo_epi16(a01a11, allX); 209f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 210f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a00*w00 + a01*w01, a10*w10 + a11*w11) 211f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i sum = _mm_add_epi16(a00a10, a01a11); 212f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 213f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (DC, a00*w00 + a01*w01) 214f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 215f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 216f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 217f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sum = _mm_add_epi16(sum, shifted); 218f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 219f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Divide each 16 bit component by 256. 220f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sum = _mm_srli_epi16(sum, 8); 221f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 222f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Multiply by alpha. 223f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sum = _mm_mullo_epi16(sum, alpha); 224f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 225f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Divide each 16 bit component by 256. 226f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sum = _mm_srli_epi16(sum, 8); 227f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 228f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Pack lower 4 16 bit values of sum into lower 4 bytes. 229f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org sum = _mm_packus_epi16(sum, zero); 230f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 231f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // Extract low int and store. 232f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org *colors++ = _mm_cvtsi128_si32(sum); 233f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org } while (--count > 0); 234f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org} 23506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 23606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comstatic inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max, 23706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkFixed one) { 23806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com unsigned i = SkClampMax(f >> 16, max); 23906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com i = (i << 4) | ((f >> 12) & 0xF); 24006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com return (i << 14) | SkClampMax((f + one) >> 16, max); 24106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com} 24206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 24306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com/* SSE version of ClampX_ClampY_filter_scale() 24406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com * portable version is in core/SkBitmapProcState_matrix.h 24506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com */ 24606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comvoid ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[], 24706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com int count, int x, int y) { 24806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 24906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkMatrix::kScale_Mask)) == 0); 25006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkASSERT(s.fInvKy == 0); 251fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 25206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const unsigned maxX = s.fBitmap->width() - 1; 25306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const SkFixed one = s.fFilterOneX; 25406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const SkFixed dx = s.fInvSx; 25506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkFixed fx; 25606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 25706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkPoint pt; 2589c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 2599c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com SkIntToScalar(y) + SK_ScalarHalf, &pt); 26006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 26106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const unsigned maxY = s.fBitmap->height() - 1; 26206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // compute our two Y values up front 26306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY); 26406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // now initialize fx 26506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx = SkScalarToFixed(pt.fX) - (one >> 1); 26606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 26706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // test if we don't need to apply the tile proc 26806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if (dx > 0 && (unsigned)(fx >> 16) <= maxX && 26906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 27006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if (count >= 4) { 27106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // SSE version of decal_filter_scale 27206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while ((size_t(xy) & 0x0F) != 0) { 27306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkASSERT((fx >> (16 + 14)) == 0); 27406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 27506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 27606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count--; 27706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 27806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 27906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_1 = _mm_set1_epi32(1); 28006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 28106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 28206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx + dx, fx); 28306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 28406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count >= 4) { 285fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_out; 286fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 28706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14); 28806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out = _mm_or_si128(wide_out, _mm_add_epi32( 289fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com _mm_srai_epi32(wide_fx, 16), wide_1)); 290fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 29106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out); 292fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 29306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com xy += 4; 29406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx * 4; 29506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 29606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 4; 29706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // while count >= 4 29806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // if count >= 4 29906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 30006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count-- > 0) { 30106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkASSERT((fx >> (16 + 14)) == 0); 30206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 30306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 30406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 30506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } else { 30606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // SSE2 only support 16bit interger max & min, so only process the case 30706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // maxX less than the max 16bit interger. Actually maxX is the bitmap's 308fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // height, there should be rare bitmap whose height will be greater 30906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // than max 16bit interger in the real world. 31006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if ((count >= 4) && (maxX <= 0xFFFF)) { 31106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (((size_t)xy & 0x0F) != 0) { 31206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 31306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 31406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count--; 31506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 316fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 31706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 31806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx + dx, fx); 31906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 32006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_one = _mm_set1_epi32(one); 321fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_maxX = _mm_set1_epi32(maxX); 32206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_mask = _mm_set1_epi32(0xF); 32306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 32406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count >= 4) { 32506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_i; 32606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_lo; 32706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_fx1; 32806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 32906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // i = SkClampMax(f>>16,maxX) 330fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 33106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_setzero_si128()); 33206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_i = _mm_min_epi16(wide_i, wide_maxX); 333fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 33406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // i<<4 | TILEX_LOW_BITS(fx) 33506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_lo = _mm_srli_epi32(wide_fx, 12); 33606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_lo = _mm_and_si128(wide_lo, wide_mask); 337fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_i = _mm_slli_epi32(wide_i, 4); 338fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_i = _mm_or_si128(wide_i, wide_lo); 339fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 34006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // i<<14 34106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_i = _mm_slli_epi32(wide_i, 14); 342fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 34306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // SkClampMax(((f+one))>>16,max) 34406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_fx1 = _mm_add_epi32(wide_fx, wide_one); 345fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16), 34606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_setzero_si128()); 34706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX); 348fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 34906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // final combination 35006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_i = _mm_or_si128(wide_i, wide_fx1); 351fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 352fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 35306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 354fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com fx += dx * 4; 35506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com xy += 4; 35606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 4; 35706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // while count >= 4 35806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // if count >= 4 35906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 36006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count-- > 0) { 36106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 36206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 36306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 36406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 36506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com} 36606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 36706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com/* SSE version of ClampX_ClampY_nofilter_scale() 36806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com * portable version is in core/SkBitmapProcState_matrix.h 36906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com */ 37006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.comvoid ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s, 37106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com uint32_t xy[], int count, int x, int y) { 37206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 37306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkMatrix::kScale_Mask)) == 0); 37406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 37506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // we store y, x, x, x, x, x 37606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const unsigned maxX = s.fBitmap->width() - 1; 37706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkFixed fx; 37806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com SkPoint pt; 3799c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 3809c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com SkIntToScalar(y) + SK_ScalarHalf, &pt); 38106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx = SkScalarToFixed(pt.fY); 38206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const unsigned maxY = s.fBitmap->height() - 1; 38306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = SkClampMax(fx >> 16, maxY); 38406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx = SkScalarToFixed(pt.fX); 385fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 38606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if (0 == maxX) { 38706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // all of the following X values must be 0 38806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com memset(xy, 0, count * sizeof(uint16_t)); 38906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com return; 39006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 39106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 39206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com const SkFixed dx = s.fInvSx; 39306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 39406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // test if we don't need to apply the tile proc 39506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if ((unsigned)(fx >> 16) <= maxX && 39606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 39706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // SSE version of decal_nofilter_scale 39806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if (count >= 8) { 39906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (((size_t)xy & 0x0F) != 0) { 40006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 40106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += 2 * dx; 40206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 2; 40306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 40406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 40506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 40606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 40706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 40806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 40906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx + dx, fx); 41006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 41106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 41206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count >= 8) { 41306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 41406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 41506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 41606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_result = _mm_packs_epi32(wide_out_low, 41706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out_high); 41806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 419fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 42006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_low = _mm_add_epi32(wide_low, wide_dx8); 42106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_high = _mm_add_epi32(wide_high, wide_dx8); 42206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 42306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com xy += 4; 42406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx * 8; 42506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 8; 42606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 42706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // if count >= 8 42806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 42906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 43006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count-- > 0) { 43106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xx++ = SkToU16(fx >> 16); 43206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 43306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 43406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } else { 43506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // SSE2 only support 16bit interger max & min, so only process the case 43606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // maxX less than the max 16bit interger. Actually maxX is the bitmap's 437fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // height, there should be rare bitmap whose height will be greater 43806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com // than max 16bit interger in the real world. 43906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com if ((count >= 8) && (maxX <= 0xFFFF)) { 44006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (((size_t)xy & 0x0F) != 0) { 441602f2272be351cee0e5d6723b57c4256d473bd2bmike@reedtribe.org *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX), 442602f2272be351cee0e5d6723b57c4256d473bd2bmike@reedtribe.org SkClampMax(fx >> 16, maxX)); 44306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += 2 * dx; 44406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 2; 44506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 44606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 44706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 44806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 44906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 45006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 45106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx + dx, fx); 45206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 45306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_maxX = _mm_set1_epi32(maxX); 45406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 45506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count >= 8) { 45606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 45706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 45806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 459fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_out_low = _mm_max_epi16(wide_out_low, 46006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_setzero_si128()); 46106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX); 46206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out_high = _mm_max_epi16(wide_out_high, 46306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_setzero_si128()); 46406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX); 46506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 46606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com __m128i wide_result = _mm_packs_epi32(wide_out_low, 46706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_out_high); 46806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 46906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 47006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_low = _mm_add_epi32(wide_low, wide_dx8); 47106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com wide_high = _mm_add_epi32(wide_high, wide_dx8); 47206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 47306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com xy += 4; 47406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx * 8; 47506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com count -= 8; 47606a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 47706a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } // if count >= 8 47806a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com 47906a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 48006a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com while (count-- > 0) { 48106a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com *xx++ = SkClampMax(fx >> 16, maxX); 48206a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com fx += dx; 48306a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 48406a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com } 48506a7313430728b18f2ed92f14b189f3320fb8d13tomhudson@google.com} 4865efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 4875efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com/* SSE version of ClampX_ClampY_filter_affine() 4885efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com * portable version is in core/SkBitmapProcState_matrix.h 4895efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com */ 4905efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.comvoid ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s, 4915efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com uint32_t xy[], int count, int x, int y) { 4925efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkPoint srcPt; 4939c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com s.fInvProc(s.fInvMatrix, 4945efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkIntToScalar(x) + SK_ScalarHalf, 4955efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 496fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 4975efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed oneX = s.fFilterOneX; 4985efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed oneY = s.fFilterOneY; 4995efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 5005efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 5015efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dx = s.fInvSx; 5025efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dy = s.fInvKy; 5035efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com unsigned maxX = s.fBitmap->width() - 1; 5045efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com unsigned maxY = s.fBitmap->height() - 1; 5055efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5065efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com if (count >= 2 && (maxX <= 0xFFFF)) { 5075efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dx2 = dx + dx; 5085efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dy2 = dy + dy; 5095efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5105efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy); 5115efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2); 5125efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY); 513fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY); 5145efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_mask = _mm_set1_epi32(0xF); 5155efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5165efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com while (count >= 2) { 5175efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // i = SkClampMax(f>>16,maxX) 518fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16), 5195efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com _mm_setzero_si128()); 5205efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_i = _mm_min_epi16(wide_i, wide_max); 521fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5225efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // i<<4 | TILEX_LOW_BITS(f) 5235efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_lo = _mm_srli_epi32(wide_f, 12); 5245efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_lo = _mm_and_si128(wide_lo, wide_mask); 525fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_i = _mm_slli_epi32(wide_i, 4); 526fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_i = _mm_or_si128(wide_i, wide_lo); 527fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5285efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // i<<14 5295efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_i = _mm_slli_epi32(wide_i, 14); 530fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5315efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // SkClampMax(((f+one))>>16,max) 5325efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one); 533fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16), 5345efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com _mm_setzero_si128()); 5355efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_f1 = _mm_min_epi16(wide_f1, wide_max); 536fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5375efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // final combination 5385efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_i = _mm_or_si128(wide_i, wide_f1); 539fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i); 540fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5415efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_f = _mm_add_epi32(wide_f, wide_d2); 5425efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 543fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com fx += dx2; 5445efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fy += dy2; 5455efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com xy += 4; 5465efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com count -= 2; 5475efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } // while count >= 2 5485efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } // if count >= 2 5495efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5505efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com while (count-- > 0) { 5515efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY); 5525efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fy += dy; 5535efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX); 554fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com fx += dx; 5555efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } 5565efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com} 5575efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5585efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com/* SSE version of ClampX_ClampY_nofilter_affine() 5595efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com * portable version is in core/SkBitmapProcState_matrix.h 5605efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com */ 5615efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.comvoid ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s, 5625efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com uint32_t xy[], int count, int x, int y) { 5635efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 5645efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 5655efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkMatrix::kScale_Mask | 5665efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkMatrix::kAffine_Mask)) == 0); 5675efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5685efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkPoint srcPt; 5699c96d4b5ffdbf8c82f55b2058a2fea7225fe11d6humper@google.com s.fInvProc(s.fInvMatrix, 5705efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkIntToScalar(x) + SK_ScalarHalf, 5715efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 572fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 5735efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed fx = SkScalarToFixed(srcPt.fX); 5745efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed fy = SkScalarToFixed(srcPt.fY); 5755efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dx = s.fInvSx; 5765efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dy = s.fInvKy; 5775efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com int maxX = s.fBitmap->width() - 1; 5785efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com int maxY = s.fBitmap->height() - 1; 5795efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5805efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com if (count >= 4 && (maxX <= 0xFFFF)) { 5815efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com while (((size_t)xy & 0x0F) != 0) { 582fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 5835efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkClampMax(fx >> 16, maxX); 5845efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fx += dx; 5855efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fy += dy; 5865efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com count--; 5875efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } 5885efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5895efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dx4 = dx * 4; 5905efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkFixed dy4 = dy * 4; 5915efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 5925efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 5935efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fx + dx, fx); 5945efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2, 5955efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fy + dy, fy); 5965efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_dx4 = _mm_set1_epi32(dx4); 5975efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_dy4 = _mm_set1_epi32(dy4); 5985efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 599fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_maxX = _mm_set1_epi32(maxX); 600fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_maxY = _mm_set1_epi32(maxY); 6015efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 6025efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com while (count >= 4) { 6035efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // SkClampMax(fx>>16,maxX) 604fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 6055efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com _mm_setzero_si128()); 6065efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_lo = _mm_min_epi16(wide_lo, wide_maxX); 607fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 6085efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // SkClampMax(fy>>16,maxY) 609fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16), 6105efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com _mm_setzero_si128()); 6115efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_hi = _mm_min_epi16(wide_hi, wide_maxY); 612fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 6135efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com // final combination 6145efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16), 6155efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_lo); 616fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 617fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 6185efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 6195efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com wide_fy = _mm_add_epi32(wide_fy, wide_dy4); 6205efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 621fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com fx += dx4; 6225efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fy += dy4; 6235efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com xy += 4; 6245efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com count -= 4; 6255efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } // while count >= 4 6265efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } // if count >= 4 6275efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com 6285efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com while (count-- > 0) { 6295efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 6305efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com SkClampMax(fx >> 16, maxX); 6315efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com fx += dx; 632fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com fy += dy; 6335efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com } 6345efaf268931d01498f4f1af63c556d811e5d5797tomhudson@google.com} 6357866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6367866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com/* SSE version of S32_D16_filter_DX_SSE2 6377866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com * Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp 6387866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com * It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16 6397866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com */ 6407866228f06e402d37f8fcab70a688e1f34c1d27breed@google.comvoid S32_D16_filter_DX_SSE2(const SkBitmapProcState& s, 6414b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org const uint32_t* xy, 6424b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org int count, uint16_t* colors) { 6437866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com SkASSERT(count > 0 && colors != NULL); 6449cfc83cc8ac2ee50a7ce889e65a707941f48bdeareed@google.com SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel); 645c77392ed58ec78ab19fa0e3ff99fb8110854fba2reed SkASSERT(kN32_SkColorType == s.fBitmap->colorType()); 6467866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com SkASSERT(s.fBitmap->isOpaque()); 6477866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 648fc91dc70042dcb6d2868e8822fbab15aa4402375robertphillips@google.com SkPMColor dstColor; 6497866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 650e5f48243bdbed2662be7a31be0888abc273b09e8scroggo@google.com size_t rb = s.fBitmap->rowBytes(); 6517866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com uint32_t XY = *xy++; 6527866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com unsigned y0 = XY >> 14; 6537866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 6547866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 6557866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com unsigned subY = y0 & 0xF; 6567866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6577866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, 0, 0, 0, 16) 6587866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i sixteen = _mm_cvtsi32_si128(16); 6597866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6607866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, 16, 16, 16, 16) 6617866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com sixteen = _mm_shufflelo_epi16(sixteen, 0); 6627866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6637866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, 0, 0, 0, y) 6647866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i allY = _mm_cvtsi32_si128(subY); 6657866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6667866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, y, y, y, y) 6677866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com allY = _mm_shufflelo_epi16(allY, 0); 6687866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6697866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 6707866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i negY = _mm_sub_epi16(sixteen, allY); 6717866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6727866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 6737866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com allY = _mm_unpacklo_epi64(allY, negY); 6747866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6757866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (16, 16, 16, 16, 16, 16, 16, 16 ) 6767866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com sixteen = _mm_shuffle_epi32(sixteen, 0); 6777866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6787866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ( 0, 0, 0, 0, 0, 0, 0, 0) 6797866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i zero = _mm_setzero_si128(); 6807866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6817866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com do { 6827866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com uint32_t XX = *xy++; // x0:14 | 4 | x1:14 6837866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com unsigned x0 = XX >> 18; 6847866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com unsigned x1 = XX & 0x3FFF; 6857866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6867866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (0, 0, 0, 0, 0, 0, 0, x) 6877866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 6887866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6897866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (0, 0, 0, 0, x, x, x, x) 6907866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com allX = _mm_shufflelo_epi16(allX, 0); 6917866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6927866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (x, x, x, x, x, x, x, x) 6937866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com allX = _mm_shuffle_epi32(allX, 0); 6947866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6957866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 6967866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i negX = _mm_sub_epi16(sixteen, allX); 6977866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 6987866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Load 4 samples (pixels). 6997866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 7007866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 7017866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 7027866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 7037866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7047866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (0, 0, a00, a10) 7057866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 7067866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7077866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Expand to 16 bits per component. 7087866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a00a10 = _mm_unpacklo_epi8(a00a10, zero); 7097866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7107866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // ((a00 * (16-y)), (a10 * y)). 7117866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a00a10 = _mm_mullo_epi16(a00a10, allY); 7127866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7137866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 7147866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a00a10 = _mm_mullo_epi16(a00a10, negX); 7157866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7167866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (0, 0, a01, a10) 7177866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 7187866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7197866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Expand to 16 bits per component. 7207866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a01a11 = _mm_unpacklo_epi8(a01a11, zero); 7217866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7227866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (a01 * (16-y)), (a11 * y) 7237866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a01a11 = _mm_mullo_epi16(a01a11, allY); 7247866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7257866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (a01 * (16-y) * x), (a11 * y * x) 7267866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com a01a11 = _mm_mullo_epi16(a01a11, allX); 7277866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7287866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (a00*w00 + a01*w01, a10*w10 + a11*w11) 7297866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i sum = _mm_add_epi16(a00a10, a01a11); 7307866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7317866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (DC, a00*w00 + a01*w01) 7327866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 7337866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7347866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 7357866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com sum = _mm_add_epi16(sum, shifted); 7367866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7377866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Divide each 16 bit component by 256. 7387866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com sum = _mm_srli_epi16(sum, 8); 7397866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7407866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Pack lower 4 16 bit values of sum into lower 4 bytes. 7417866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com sum = _mm_packus_epi16(sum, zero); 7427866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7437866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com // Extract low int and store. 7447866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com dstColor = _mm_cvtsi128_si32(sum); 7457866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com 7464b9b4562bfa0c5ee30313229026ba4f81a8e2705commit-bot@chromium.org *colors++ = SkPixel32ToPixel16(dstColor); 7477866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com } while (--count > 0); 7487866228f06e402d37f8fcab70a688e1f34c1d27breed@google.com} 749