15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(_MSC_VER) 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <intrin.h> 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <mmintrin.h> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <emmintrin.h> 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "media/base/simd/filter_yuv.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace media { 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FilterYUVRows_SSE2(uint8* dest, 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* src0, 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* src1, 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int width, 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int fraction) { 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int pixel = 0; 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Process the unaligned bytes first. 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int unaligned_width = 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (16 - (reinterpret_cast<uintptr_t>(dest) & 15)) & 15; 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (pixel < width && pixel < unaligned_width) { 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dest[pixel] = (src0[pixel] * (256 - fraction) + 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src1[pixel] * fraction) >> 8; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++pixel; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i zero = _mm_setzero_si128(); 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src1_fraction = _mm_set1_epi16(fraction); 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src0_fraction = _mm_set1_epi16(256 - fraction); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const __m128i* src0_128 = 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<const __m128i*>(src0 + pixel); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const __m128i* src1_128 = 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<const __m128i*>(src1 + pixel); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i* dest128 = reinterpret_cast<__m128i*>(dest + pixel); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i* end128 = reinterpret_cast<__m128i*>( 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<uintptr_t>(dest + width) & ~15); 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (dest128 < end128) { 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src0 = _mm_loadu_si128(src0_128); 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src1 = _mm_loadu_si128(src1_128); 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src2 = _mm_unpackhi_epi8(src0, zero); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __m128i src3 = _mm_unpackhi_epi8(src1, zero); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src0 = _mm_unpacklo_epi8(src0, zero); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src1 = _mm_unpacklo_epi8(src1, zero); 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src0 = _mm_mullo_epi16(src0, src0_fraction); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src1 = _mm_mullo_epi16(src1, src1_fraction); 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src2 = _mm_mullo_epi16(src2, src0_fraction); 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src3 = _mm_mullo_epi16(src3, src1_fraction); 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src0 = _mm_add_epi16(src0, src1); 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src2 = _mm_add_epi16(src2, src3); 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src0 = _mm_srli_epi16(src0, 8); 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src2 = _mm_srli_epi16(src2, 8); 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src0 = _mm_packus_epi16(src0, src2); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *dest128++ = src0; 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++src0_128; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++src1_128; 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pixel += 16; 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (pixel < width) { 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dest[pixel] = (src0[pixel] * (256 - fraction) + 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) src1[pixel] * fraction) >> 8; 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++pixel; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace media 73