SkBlitRow_opts_SSE2.cpp revision 40528743dbb9ce7f39f093e0cdc47849ac8887cf
1/* 2 ** 3 ** Copyright 2009, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18#include "SkBlitRow_opts_SSE2.h" 19#include "SkColorPriv.h" 20#include "SkUtils.h" 21 22#include <emmintrin.h> 23 24/* SSE2 version of S32_Blend_BlitRow32() 25 * portable version is in core/SkBlitRow_D32.cpp 26 */ 27void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 28 const SkPMColor* SK_RESTRICT src, 29 int count, U8CPU alpha) { 30 SkASSERT(alpha <= 255); 31 if (count <= 0) { 32 return; 33 } 34 35 uint32_t src_scale = SkAlpha255To256(alpha); 36 uint32_t dst_scale = 256 - src_scale; 37 38 if (count >= 4) { 39 SkASSERT(((size_t)dst & 0x03) == 0); 40 while (((size_t)dst & 0x0F) != 0) { 41 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 42 src++; 43 dst++; 44 count--; 45 } 46 47 const __m128i *s = reinterpret_cast<const __m128i*>(src); 48 __m128i *d = reinterpret_cast<__m128i*>(dst); 49 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 50 __m128i src_scale_wide = _mm_set1_epi16(src_scale); 51 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); 52 while (count >= 4) { 53 // Load 4 pixels each of src and dest. 54 __m128i src_pixel = _mm_loadu_si128(s); 55 __m128i dst_pixel = _mm_load_si128(d); 56 57 // Get red and blue pixels into lower byte of each word. 58 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 59 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 60 61 // Get alpha and green into lower byte of each word. 62 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 63 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 64 65 // Multiply by scale. 66 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 67 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 68 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); 69 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); 70 71 // Divide by 256. 72 src_rb = _mm_srli_epi16(src_rb, 8); 73 dst_rb = _mm_srli_epi16(dst_rb, 8); 74 src_ag = _mm_andnot_si128(rb_mask, src_ag); 75 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 76 77 // Combine back into RGBA. 78 src_pixel = _mm_or_si128(src_rb, src_ag); 79 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 80 81 // Add result 82 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 83 _mm_store_si128(d, result); 84 s++; 85 d++; 86 count -= 4; 87 } 88 src = reinterpret_cast<const SkPMColor*>(s); 89 dst = reinterpret_cast<SkPMColor*>(d); 90 } 91 92 while (count > 0) { 93 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 94 src++; 95 dst++; 96 count--; 97 } 98} 99 100void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 101 const SkPMColor* SK_RESTRICT src, 102 int count, U8CPU alpha) { 103 SkASSERT(alpha == 255); 104 if (count <= 0) { 105 return; 106 } 107 108 if (count >= 4) { 109 SkASSERT(((size_t)dst & 0x03) == 0); 110 while (((size_t)dst & 0x0F) != 0) { 111 *dst = SkPMSrcOver(*src, *dst); 112 src++; 113 dst++; 114 count--; 115 } 116 117 const __m128i *s = reinterpret_cast<const __m128i*>(src); 118 __m128i *d = reinterpret_cast<__m128i*>(dst); 119#ifdef SK_USE_ACCURATE_BLENDING 120 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 121 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 122 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 123 while (count >= 4) { 124 // Load 4 pixels 125 __m128i src_pixel = _mm_loadu_si128(s); 126 __m128i dst_pixel = _mm_load_si128(d); 127 128 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 129 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 130 // Shift alphas down to lower 8 bits of each quad. 131 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 132 133 // Copy alpha to upper 3rd byte of each quad 134 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 135 136 // Subtract alphas from 255, to get 0..255 137 alpha = _mm_sub_epi16(c_255, alpha); 138 139 // Multiply by red and blue by src alpha. 140 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 141 // Multiply by alpha and green by src alpha. 142 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 143 144 // dst_rb_low = (dst_rb >> 8) 145 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 146 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 147 148 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 149 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 150 dst_rb = _mm_add_epi16(dst_rb, c_128); 151 dst_rb = _mm_srli_epi16(dst_rb, 8); 152 153 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 154 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 155 dst_ag = _mm_add_epi16(dst_ag, c_128); 156 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 157 158 // Combine back into RGBA. 159 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 160 161 // Add result 162 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 163 _mm_store_si128(d, result); 164 s++; 165 d++; 166 count -= 4; 167 } 168 #else 169 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 170 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 171 while (count >= 4) { 172 // Load 4 pixels 173 __m128i src_pixel = _mm_loadu_si128(s); 174 __m128i dst_pixel = _mm_load_si128(d); 175 176 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 177 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 178 179 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 180 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 181 182 // (a0, a0, a1, a1, a2, g2, a3, g3) 183 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 184 185 // (a0, a0, a1, a1, a2, a2, a3, a3) 186 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 187 188 // Subtract alphas from 256, to get 1..256 189 alpha = _mm_sub_epi16(c_256, alpha); 190 191 // Multiply by red and blue by src alpha. 192 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 193 // Multiply by alpha and green by src alpha. 194 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 195 196 // Divide by 256. 197 dst_rb = _mm_srli_epi16(dst_rb, 8); 198 199 // Mask out high bits (already in the right place) 200 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 201 202 // Combine back into RGBA. 203 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 204 205 // Add result 206 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 207 _mm_store_si128(d, result); 208 s++; 209 d++; 210 count -= 4; 211 } 212#endif 213 src = reinterpret_cast<const SkPMColor*>(s); 214 dst = reinterpret_cast<SkPMColor*>(d); 215 } 216 217 while (count > 0) { 218 *dst = SkPMSrcOver(*src, *dst); 219 src++; 220 dst++; 221 count--; 222 } 223} 224 225void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 226 const SkPMColor* SK_RESTRICT src, 227 int count, U8CPU alpha) { 228 SkASSERT(alpha <= 255); 229 if (count <= 0) { 230 return; 231 } 232 233 if (count >= 4) { 234 while (((size_t)dst & 0x0F) != 0) { 235 *dst = SkBlendARGB32(*src, *dst, alpha); 236 src++; 237 dst++; 238 count--; 239 } 240 241 uint32_t src_scale = SkAlpha255To256(alpha); 242 243 const __m128i *s = reinterpret_cast<const __m128i*>(src); 244 __m128i *d = reinterpret_cast<__m128i*>(dst); 245 __m128i src_scale_wide = _mm_set1_epi16(src_scale); 246 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 247 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 248 while (count >= 4) { 249 // Load 4 pixels each of src and dest. 250 __m128i src_pixel = _mm_loadu_si128(s); 251 __m128i dst_pixel = _mm_load_si128(d); 252 253 // Get red and blue pixels into lower byte of each word. 254 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 255 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 256 257 // Get alpha and green into lower byte of each word. 258 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 259 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 260 261 // Put per-pixel alpha in low byte of each word. 262 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 263 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 264 265 // dst_alpha = dst_alpha * src_scale 266 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 267 268 // Divide by 256. 269 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 270 271 // Subtract alphas from 256, to get 1..256 272 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 273 274 // Multiply red and blue by dst pixel alpha. 275 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 276 // Multiply alpha and green by dst pixel alpha. 277 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 278 279 // Multiply red and blue by global alpha. 280 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 281 // Multiply alpha and green by global alpha. 282 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 283 284 // Divide by 256. 285 dst_rb = _mm_srli_epi16(dst_rb, 8); 286 src_rb = _mm_srli_epi16(src_rb, 8); 287 288 // Mask out low bits (goodies already in the right place; no need to divide) 289 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 290 src_ag = _mm_andnot_si128(rb_mask, src_ag); 291 292 // Combine back into RGBA. 293 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 294 src_pixel = _mm_or_si128(src_rb, src_ag); 295 296 // Add two pixels into result. 297 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 298 _mm_store_si128(d, result); 299 s++; 300 d++; 301 count -= 4; 302 } 303 src = reinterpret_cast<const SkPMColor*>(s); 304 dst = reinterpret_cast<SkPMColor*>(d); 305 } 306 307 while (count > 0) { 308 *dst = SkBlendARGB32(*src, *dst, alpha); 309 src++; 310 dst++; 311 count--; 312 } 313} 314 315/* SSE2 version of Color32() 316 * portable version is in core/SkBlitRow_D32.cpp 317 */ 318void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 319 SkPMColor color) { 320 321 if (count <= 0) { 322 return; 323 } 324 325 if (0 == color) { 326 if (src != dst) { 327 memcpy(dst, src, count * sizeof(SkPMColor)); 328 } 329 } 330 331 unsigned colorA = SkGetPackedA32(color); 332 if (255 == colorA) { 333 sk_memset32(dst, color, count); 334 } else { 335 unsigned scale = 256 - SkAlpha255To256(colorA); 336 337 if (count >= 4) { 338 SkASSERT(((size_t)dst & 0x03) == 0); 339 while (((size_t)dst & 0x0F) != 0) { 340 *dst = color + SkAlphaMulQ(*src, scale); 341 src++; 342 dst++; 343 count--; 344 } 345 346 const __m128i *s = reinterpret_cast<const __m128i*>(src); 347 __m128i *d = reinterpret_cast<__m128i*>(dst); 348 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 349 __m128i src_scale_wide = _mm_set1_epi16(scale); 350 __m128i color_wide = _mm_set1_epi32(color); 351 while (count >= 4) { 352 // Load 4 pixels each of src and dest. 353 __m128i src_pixel = _mm_loadu_si128(s); 354 355 // Get red and blue pixels into lower byte of each word. 356 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 357 358 // Get alpha and green into lower byte of each word. 359 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 360 361 // Multiply by scale. 362 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 363 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 364 365 // Divide by 256. 366 src_rb = _mm_srli_epi16(src_rb, 8); 367 src_ag = _mm_andnot_si128(rb_mask, src_ag); 368 369 // Combine back into RGBA. 370 src_pixel = _mm_or_si128(src_rb, src_ag); 371 372 // Add color to result. 373 __m128i result = _mm_add_epi8(color_wide, src_pixel); 374 375 // Store result. 376 _mm_store_si128(d, result); 377 s++; 378 d++; 379 count -= 4; 380 } 381 src = reinterpret_cast<const SkPMColor*>(s); 382 dst = reinterpret_cast<SkPMColor*>(d); 383 } 384 385 while (count > 0) { 386 *dst = color + SkAlphaMulQ(*src, scale); 387 src += 1; 388 dst += 1; 389 count--; 390 } 391 } 392} 393