11cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger 240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger/* 31cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * Copyright 2009 The Android Open Source Project 41cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * 51cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * Use of this source code is governed by a BSD-style license that can be 61cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * found in the LICENSE file. 740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger */ 840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 91cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger 1040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger#include <emmintrin.h> 1140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger#include "SkBitmapProcState_opts_SSE2.h" 1240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger#include "SkUtils.h" 1340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 1440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergervoid S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, 1540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* xy, 1640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger int count, uint32_t* colors) { 1740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(count > 0 && colors != NULL); 1840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fDoFilter); 1940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 2040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fAlphaScale == 256); 2140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 2240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 2340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned rb = s.fBitmap->rowBytes(); 2440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger uint32_t XY = *xy++; 2540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned y0 = XY >> 14; 2640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 2740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 2840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned subY = y0 & 0xF; 2940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 3040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, 16) 3140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i sixteen = _mm_cvtsi32_si128(16); 3240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 3340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 16, 16, 16, 16) 3440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sixteen = _mm_shufflelo_epi16(sixteen, 0); 3540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 3640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, y) 3740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i allY = _mm_cvtsi32_si128(subY); 3840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 3940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, y, y, y, y) 4040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allY = _mm_shufflelo_epi16(allY, 0); 4140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 4240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 4340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i negY = _mm_sub_epi16(sixteen, allY); 4440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 4540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 4640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allY = _mm_unpacklo_epi64(allY, negY); 4740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 4840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16, 16, 16, 16, 16, 16, 16, 16 ) 4940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sixteen = _mm_shuffle_epi32(sixteen, 0); 5040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 5140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, 0) 5240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i zero = _mm_setzero_si128(); 5340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger do { 5440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger uint32_t XX = *xy++; // x0:14 | 4 | x1:14 5540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned x0 = XX >> 18; 5640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned x1 = XX & 0x3FFF; 5740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 5840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, 0, 0, 0, 0, 0, x) 5940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 6040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 6140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, 0, 0, x, x, x, x) 6240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allX = _mm_shufflelo_epi16(allX, 0); 6340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 6440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (x, x, x, x, x, x, x, x) 6540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allX = _mm_shuffle_epi32(allX, 0); 6640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 6740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 6840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i negX = _mm_sub_epi16(sixteen, allX); 6940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 7040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Load 4 samples (pixels). 7140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 7240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 7340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 7440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 7540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 7640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, a00, a10) 7740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 7840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 7940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Expand to 16 bits per component. 8040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_unpacklo_epi8(a00a10, zero); 8140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 8240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ((a00 * (16-y)), (a10 * y)). 8340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_mullo_epi16(a00a10, allY); 8440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 8540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 8640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_mullo_epi16(a00a10, negX); 8740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 8840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, a01, a10) 8940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 9040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 9140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Expand to 16 bits per component. 9240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_unpacklo_epi8(a01a11, zero); 9340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 9440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a01 * (16-y)), (a11 * y) 9540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_mullo_epi16(a01a11, allY); 9640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 9740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a01 * (16-y) * x), (a11 * y * x) 9840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_mullo_epi16(a01a11, allX); 9940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 10040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a00*w00 + a01*w01, a10*w10 + a11*w11) 10140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i sum = _mm_add_epi16(a00a10, a01a11); 10240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 10340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (DC, a00*w00 + a01*w01) 10440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 10540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 10640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 10740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_add_epi16(sum, shifted); 10840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 10940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Divide each 16 bit component by 256. 11040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_srli_epi16(sum, 8); 11140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 11240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Pack lower 4 16 bit values of sum into lower 4 bytes. 11340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_packus_epi16(sum, zero); 11440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 11540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Extract low int and store. 11640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger *colors++ = _mm_cvtsi128_si32(sum); 11740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger } while (--count > 0); 11840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger} 11940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 12040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergervoid S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, 12140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* xy, 12240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger int count, uint32_t* colors) { 12340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(count > 0 && colors != NULL); 12440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fDoFilter); 12540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 12640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger SkASSERT(s.fAlphaScale < 256); 12740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 12840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 12940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned rb = s.fBitmap->rowBytes(); 13040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger uint32_t XY = *xy++; 13140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned y0 = XY >> 14; 13240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 13340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 13440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned subY = y0 & 0xF; 13540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 13640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, 16) 13740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i sixteen = _mm_cvtsi32_si128(16); 13840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 13940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 16, 16, 16, 16) 14040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sixteen = _mm_shufflelo_epi16(sixteen, 0); 14140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 14240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, y) 14340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i allY = _mm_cvtsi32_si128(subY); 14440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 14540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, y, y, y, y) 14640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allY = _mm_shufflelo_epi16(allY, 0); 14740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 14840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 14940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i negY = _mm_sub_epi16(sixteen, allY); 15040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 15140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 15240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allY = _mm_unpacklo_epi64(allY, negY); 15340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 15440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16, 16, 16, 16, 16, 16, 16, 16 ) 15540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sixteen = _mm_shuffle_epi32(sixteen, 0); 15640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 15740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( 0, 0, 0, 0, 0, 0, 0, 0) 15840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i zero = _mm_setzero_si128(); 15940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 16040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha ) 16140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i alpha = _mm_set1_epi16(s.fAlphaScale); 16240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 16340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger do { 16440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger uint32_t XX = *xy++; // x0:14 | 4 | x1:14 16540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned x0 = XX >> 18; 16640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger unsigned x1 = XX & 0x3FFF; 16740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 16840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, 0, 0, 0, 0, 0, x) 16940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 17040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 17140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, 0, 0, x, x, x, x) 17240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allX = _mm_shufflelo_epi16(allX, 0); 17340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 17440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (x, x, x, x, x, x, x, x) 17540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger allX = _mm_shuffle_epi32(allX, 0); 17640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 17740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 17840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i negX = _mm_sub_epi16(sixteen, allX); 17940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 18040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Load 4 samples (pixels). 18140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 18240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 18340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 18440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 18540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 18640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, a00, a10) 18740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 18840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 18940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Expand to 16 bits per component. 19040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_unpacklo_epi8(a00a10, zero); 19140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 19240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // ((a00 * (16-y)), (a10 * y)). 19340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_mullo_epi16(a00a10, allY); 19440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 19540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 19640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a00a10 = _mm_mullo_epi16(a00a10, negX); 19740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 19840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (0, 0, a01, a10) 19940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 20040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 20140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Expand to 16 bits per component. 20240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_unpacklo_epi8(a01a11, zero); 20340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 20440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a01 * (16-y)), (a11 * y) 20540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_mullo_epi16(a01a11, allY); 20640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 20740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a01 * (16-y) * x), (a11 * y * x) 20840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger a01a11 = _mm_mullo_epi16(a01a11, allX); 20940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 21040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (a00*w00 + a01*w01, a10*w10 + a11*w11) 21140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i sum = _mm_add_epi16(a00a10, a01a11); 21240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 21340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (DC, a00*w00 + a01*w01) 21440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 21540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 21640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 21740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_add_epi16(sum, shifted); 21840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 21940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Divide each 16 bit component by 256. 22040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_srli_epi16(sum, 8); 22140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 22240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Multiply by alpha. 22340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_mullo_epi16(sum, alpha); 22440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 22540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Divide each 16 bit component by 256. 22640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_srli_epi16(sum, 8); 22740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 22840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Pack lower 4 16 bit values of sum into lower 4 bytes. 22940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger sum = _mm_packus_epi16(sum, zero); 23040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger 23140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger // Extract low int and store. 23240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger *colors++ = _mm_cvtsi128_si32(sum); 23340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger } while (--count > 0); 23440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger} 2354f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2364f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenbergerstatic inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max, 2374f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed one) { 2384f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger unsigned i = SkClampMax(f >> 16, max); 2394f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger i = (i << 4) | ((f >> 12) & 0xF); 2404f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger return (i << 14) | SkClampMax((f + one) >> 16, max); 2414f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 2424f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2434f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger/* SSE version of ClampX_ClampY_filter_scale() 2444f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger * portable version is in core/SkBitmapProcState_matrix.h 2454f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger */ 2464f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenbergervoid ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[], 2474f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger int count, int x, int y) { 2484f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 2494f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkMatrix::kScale_Mask)) == 0); 2504f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT(s.fInvKy == 0); 2514f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2524f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const unsigned maxX = s.fBitmap->width() - 1; 2534f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const SkFixed one = s.fFilterOneX; 2544f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const SkFixed dx = s.fInvSx; 2554f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fx; 2564f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2574f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkPoint pt; 2584f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 2594f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &pt); 2604f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 2614f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const unsigned maxY = s.fBitmap->height() - 1; 2624f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // compute our two Y values up front 2634f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY); 2644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // now initialize fx 2654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx = SkScalarToFixed(pt.fX) - (one >> 1); 2664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2674f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // test if we don't need to apply the tile proc 2684f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (dx > 0 && (unsigned)(fx >> 16) <= maxX && 2694f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 2704f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (count >= 4) { 2714f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SSE version of decal_filter_scale 2724f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while ((size_t(xy) & 0x0F) != 0) { 2734f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT((fx >> (16 + 14)) == 0); 2744f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 2754f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 2764f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count--; 2774f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 2784f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2794f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_1 = _mm_set1_epi32(1); 2804f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 2814f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 2824f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx + dx, fx); 2834f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2844f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 4) { 2854f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_out; 2864f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2874f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14); 2884f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out = _mm_or_si128(wide_out, _mm_add_epi32( 2894f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_srai_epi32(wide_fx, 16), wide_1)); 2904f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2914f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out); 2924f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 2934f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 2944f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx * 4; 2954f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 2964f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 4; 2974f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // while count >= 4 2984f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 4 2994f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3004f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 3014f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT((fx >> (16 + 14)) == 0); 3024f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 3034f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 3044f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 3054f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } else { 3064f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SSE2 only support 16bit interger max & min, so only process the case 3074f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // maxX less than the max 16bit interger. Actually maxX is the bitmap's 3084f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // height, there should be rare bitmap whose height will be greater 3094f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // than max 16bit interger in the real world. 3104f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if ((count >= 4) && (maxX <= 0xFFFF)) { 3114f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (((size_t)xy & 0x0F) != 0) { 3124f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 3134f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 3144f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count--; 3154f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 3164f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3174f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 3184f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx + dx, fx); 3194f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 3204f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_one = _mm_set1_epi32(one); 3214f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_maxX = _mm_set1_epi32(maxX); 3224f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_mask = _mm_set1_epi32(0xF); 3234f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3244f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 4) { 3254f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_i; 3264f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_lo; 3274f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_fx1; 3284f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3294f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i = SkClampMax(f>>16,maxX) 3304f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 3314f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 3324f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_min_epi16(wide_i, wide_maxX); 3334f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3344f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i<<4 | TILEX_LOW_BITS(fx) 3354f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_lo = _mm_srli_epi32(wide_fx, 12); 3364f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_lo = _mm_and_si128(wide_lo, wide_mask); 3374f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_slli_epi32(wide_i, 4); 3384f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_or_si128(wide_i, wide_lo); 3394f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3404f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i<<14 3414f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_slli_epi32(wide_i, 14); 3424f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3434f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SkClampMax(((f+one))>>16,max) 3444f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx1 = _mm_add_epi32(wide_fx, wide_one); 3454f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16), 3464f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 3474f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX); 3484f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3494f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // final combination 3504f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_or_si128(wide_i, wide_fx1); 3514f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 3524f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3534f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 3544f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx * 4; 3554f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 3564f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 4; 3574f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // while count >= 4 3584f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 4 3594f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3604f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 3614f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one); 3624f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 3634f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 3644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 3654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 3664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3674f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger/* SSE version of ClampX_ClampY_nofilter_scale() 3684f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger * portable version is in core/SkBitmapProcState_matrix.h 3694f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger */ 3704f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenbergervoid ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s, 3714f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger uint32_t xy[], int count, int x, int y) { 3724f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 3734f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkMatrix::kScale_Mask)) == 0); 3744f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3754f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // we store y, x, x, x, x, x 3764f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const unsigned maxX = s.fBitmap->width() - 1; 3774f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fx; 3784f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkPoint pt; 3794f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 3804f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &pt); 3814f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx = SkScalarToFixed(pt.fY); 3824f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const unsigned maxY = s.fBitmap->height() - 1; 3834f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = SkClampMax(fx >> 16, maxY); 3844f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx = SkScalarToFixed(pt.fX); 3854f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3864f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (0 == maxX) { 3874f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // all of the following X values must be 0 3884f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger memset(xy, 0, count * sizeof(uint16_t)); 3894f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger return; 3904f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 3914f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3924f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger const SkFixed dx = s.fInvSx; 3934f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 3944f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // test if we don't need to apply the tile proc 3954f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if ((unsigned)(fx >> 16) <= maxX && 3964f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 3974f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SSE version of decal_nofilter_scale 3984f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (count >= 8) { 3994f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (((size_t)xy & 0x0F) != 0) { 4004f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 4014f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += 2 * dx; 4024f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 2; 4034f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4044f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4054f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 4064f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 4074f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4084f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 4094f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx + dx, fx); 4104f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 4114f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4124f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 8) { 4134f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 4144f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 4154f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4164f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_result = _mm_packs_epi32(wide_out_low, 4174f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_high); 4184f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 4194f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4204f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_low = _mm_add_epi32(wide_low, wide_dx8); 4214f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_high = _mm_add_epi32(wide_high, wide_dx8); 4224f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4234f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 4244f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx * 8; 4254f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 8; 4264f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4274f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 8 4284f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4294f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 4304f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 4314f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xx++ = SkToU16(fx >> 16); 4324f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 4334f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4344f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } else { 4354f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SSE2 only support 16bit interger max & min, so only process the case 4364f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // maxX less than the max 16bit interger. Actually maxX is the bitmap's 4374f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // height, there should be rare bitmap whose height will be greater 4384f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // than max 16bit interger in the real world. 4394f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if ((count >= 8) && (maxX <= 0xFFFF)) { 4404f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (((size_t)xy & 0x0F) != 0) { 4414f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = SkClampMax((fx + dx) >> 16, maxX) | 4424f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkClampMax(fx >> 16, maxX); 4434f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += 2 * dx; 4444f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 2; 4454f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4464f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4474f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx4 = _mm_set1_epi32(dx * 4); 4484f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4); 4494f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4504f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 4514f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx + dx, fx); 4524f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4); 4534f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_maxX = _mm_set1_epi32(maxX); 4544f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4554f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 8) { 4564f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_out_low = _mm_srli_epi32(wide_low, 16); 4574f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_out_high = _mm_srli_epi32(wide_high, 16); 4584f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4594f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_low = _mm_max_epi16(wide_out_low, 4604f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 4614f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX); 4624f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_high = _mm_max_epi16(wide_out_high, 4634f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 4644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX); 4654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_result = _mm_packs_epi32(wide_out_low, 4674f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_out_high); 4684f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result); 4694f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4704f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_low = _mm_add_epi32(wide_low, wide_dx8); 4714f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_high = _mm_add_epi32(wide_high, wide_dx8); 4724f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4734f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 4744f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx * 8; 4754f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 8; 4764f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4774f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 8 4784f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4794f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger uint16_t* xx = reinterpret_cast<uint16_t*>(xy); 4804f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 4814f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xx++ = SkClampMax(fx >> 16, maxX); 4824f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 4834f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4844f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 4854f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 4864f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4874f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger/* SSE version of ClampX_ClampY_filter_affine() 4884f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger * portable version is in core/SkBitmapProcState_matrix.h 4894f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger */ 4904f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenbergervoid ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s, 4914f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger uint32_t xy[], int count, int x, int y) { 4924f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkPoint srcPt; 4934f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger s.fInvProc(*s.fInvMatrix, 4944f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(x) + SK_ScalarHalf, 4954f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 4964f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 4974f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed oneX = s.fFilterOneX; 4984f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed oneY = s.fFilterOneY; 4994f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 5004f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 5014f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dx = s.fInvSx; 5024f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dy = s.fInvKy; 5034f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger unsigned maxX = s.fBitmap->width() - 1; 5044f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger unsigned maxY = s.fBitmap->height() - 1; 5054f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5064f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (count >= 2 && (maxX <= 0xFFFF)) { 5074f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dx2 = dx + dx; 5084f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dy2 = dy + dy; 5094f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5104f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy); 5114f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2); 5124f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY); 5134f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY); 5144f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_mask = _mm_set1_epi32(0xF); 5154f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5164f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 2) { 5174f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i = SkClampMax(f>>16,maxX) 5184f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16), 5194f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 5204f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_min_epi16(wide_i, wide_max); 5214f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5224f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i<<4 | TILEX_LOW_BITS(f) 5234f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_lo = _mm_srli_epi32(wide_f, 12); 5244f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_lo = _mm_and_si128(wide_lo, wide_mask); 5254f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_slli_epi32(wide_i, 4); 5264f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_or_si128(wide_i, wide_lo); 5274f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5284f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // i<<14 5294f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_slli_epi32(wide_i, 14); 5304f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5314f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SkClampMax(((f+one))>>16,max) 5324f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one); 5334f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16), 5344f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 5354f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_f1 = _mm_min_epi16(wide_f1, wide_max); 5364f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5374f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // final combination 5384f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_i = _mm_or_si128(wide_i, wide_f1); 5394f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i); 5404f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5414f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_f = _mm_add_epi32(wide_f, wide_d2); 5424f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5434f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx2; 5444f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy += dy2; 5454f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 5464f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 2; 5474f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // while count >= 2 5484f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 2 5494f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5504f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 5514f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY); 5524f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy += dy; 5534f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX); 5544f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 5554f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 5564f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 5574f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5584f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger/* SSE version of ClampX_ClampY_nofilter_affine() 5594f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger * portable version is in core/SkBitmapProcState_matrix.h 5604f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger */ 5614f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenbergervoid ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s, 5624f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger uint32_t xy[], int count, int x, int y) { 5634f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 5644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 5654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkMatrix::kScale_Mask | 5664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkMatrix::kAffine_Mask)) == 0); 5674f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5684f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkPoint srcPt; 5694f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger s.fInvProc(*s.fInvMatrix, 5704f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(x) + SK_ScalarHalf, 5714f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 5724f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5734f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fx = SkScalarToFixed(srcPt.fX); 5744f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed fy = SkScalarToFixed(srcPt.fY); 5754f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dx = s.fInvSx; 5764f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dy = s.fInvKy; 5774f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger int maxX = s.fBitmap->width() - 1; 5784f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger int maxY = s.fBitmap->height() - 1; 5794f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5804f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger if (count >= 4 && (maxX <= 0xFFFF)) { 5814f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (((size_t)xy & 0x0F) != 0) { 5824f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 5834f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkClampMax(fx >> 16, maxX); 5844f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 5854f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy += dy; 5864f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count--; 5874f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 5884f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5894f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dx4 = dx * 4; 5904f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkFixed dy4 = dy * 4; 5914f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5924f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2, 5934f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx + dx, fx); 5944f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2, 5954f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy + dy, fy); 5964f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dx4 = _mm_set1_epi32(dx4); 5974f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_dy4 = _mm_set1_epi32(dy4); 5984f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 5994f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_maxX = _mm_set1_epi32(maxX); 6004f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_maxY = _mm_set1_epi32(maxY); 6014f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6024f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count >= 4) { 6034f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SkClampMax(fx>>16,maxX) 6044f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 6054f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 6064f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_lo = _mm_min_epi16(wide_lo, wide_maxX); 6074f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6084f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // SkClampMax(fy>>16,maxY) 6094f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16), 6104f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_setzero_si128()); 6114f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_hi = _mm_min_epi16(wide_hi, wide_maxY); 6124f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6134f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger // final combination 6144f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16), 6154f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_lo); 6164f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 6174f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6184f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fx = _mm_add_epi32(wide_fx, wide_dx4); 6194f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger wide_fy = _mm_add_epi32(wide_fy, wide_dy4); 6204f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6214f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx4; 6224f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy += dy4; 6234f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger xy += 4; 6244f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger count -= 4; 6254f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // while count >= 4 6264f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } // if count >= 4 6274f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 6284f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger while (count-- > 0) { 6294f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 6304f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger SkClampMax(fx >> 16, maxX); 6314f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fx += dx; 6324f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger fy += dy; 6334f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger } 6344f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 635