1dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com/* 2dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * Copyright 2013 The LibYuv Project Authors. All rights reserved. 3dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * 4dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * Use of this source code is governed by a BSD-style license 5dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * that can be found in the LICENSE file in the root of the source 6dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * tree. An additional intellectual property rights grant can be found 7dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * in the file PATENTS. All contributing project authors may 8dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com * be found in the AUTHORS file in the root of the source tree. 9dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com */ 10dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 11dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#include "libyuv/row.h" 12dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 13dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#ifdef __cplusplus 14dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comnamespace libyuv { 15dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comextern "C" { 16dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif 17dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 18dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// This module is for Visual C x86. 19dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 20dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 21dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 0 to 9 22dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf0 = 23dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; 24dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 25dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. 26dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf1 = 27dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; 28dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 29dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 30dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf2 = 31dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; 32dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 33dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 0 to 10 34dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf01 = 35dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; 36dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 37dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 38dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf11 = 39dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; 40dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 41dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 42dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf21 = 43dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; 44dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 45dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 0 to 10 46dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd01 = 47dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; 48dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 49dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 10 to 21 50dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd11 = 51dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; 52dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 53dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 21 to 31 54dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd21 = 55dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; 56dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 57dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 21 to 31 58dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic vec16 kRound34 = 59dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 2, 2, 2, 2, 2, 2, 2, 2 }; 60dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 61dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf38a = 62dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 63dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 64dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf38b = 65dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; 66dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 67dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange words 0,3,6 into 0,1,2 68dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAc = 69dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 70dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 71dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange words 0,3,6 into 3,4,5 72dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAc3 = 73dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; 74dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 75dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scaling values for boxes of 3x3 and 2x3 76dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec16 kScaleAc33 = 77dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; 78dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 79dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange first value for pixels 0,1,2,3,4,5 80dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb0 = 81dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; 82dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 83dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange second value for pixels 0,1,2,3,4,5 84dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb1 = 85dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; 86dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 87dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange third value for pixels 0,1,2,3,4,5 88dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb2 = 89dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; 90dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 91dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scaling values for boxes of 3x2 and 2x2 92dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec16 kScaleAb2 = 93dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; 94dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 95dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 32 pixels, throws half away and writes 16 pixels. 96dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 97dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 98dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 99dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 100dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 101dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 102dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 103dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 104dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 105dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 106c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 107dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 108dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 109dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 110dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 111dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 // isolate odd pixels. 112dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 113dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 114dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 115dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 116dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 117dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 118dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 119dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 120dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 121dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 122dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 123dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x1 rectangle to 16x1. 124dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 125dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 126dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 127dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 128dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 129dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 130dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride 131dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 132dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 133dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 134dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm5, 8 135dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 136c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 137dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 138dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 139dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 140dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 141dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 142dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 143dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 144dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm1 145dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 146dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm5 147dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm3, xmm5 148dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 149dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm1, xmm3 150dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 151dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 152dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 153dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 154dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 155dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 156dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 157dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 158dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 159dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 160dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 161dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 16x1. 162dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 163dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 164dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 165dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 166dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 167dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 168dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 169dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 170dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 171dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 172dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 173dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm5, 8 174dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 175c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 176dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 177dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 178dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 179dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, [eax + esi] 180dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, [eax + esi + 16] 181dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 182dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 // average rows 183dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 184dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 185dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 186dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 187dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm1 188dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 189dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm5 190dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm3, xmm5 191dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 192dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm1, xmm3 193dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 194dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 195dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 196dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 197dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 198dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 199dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 200dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 201dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 202dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 203dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 204dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 205dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 32 pixels, throws half away and writes 16 pixels. 206dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 207dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 208dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, 209dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 210dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 211dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 212dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 213dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 214dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 215dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 216dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 217c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 218dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 219dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm0, [eax] 220dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm1, [eax + 16] 221dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 222dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 // isolate odd pixels. 223dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 224dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 225dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 226dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu [edx], xmm0 227dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 228dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 229dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 230dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 231dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 232dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 233dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 234dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x1 rectangle to 16x1. 235dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 236dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 2371f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.comvoid ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, 2381f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com ptrdiff_t src_stride, 239dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 240dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 241dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 242dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride 243dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 244dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 245dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 246dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm5, 8 247dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 248c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 249dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 250dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm0, [eax] 251dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm1, [eax + 16] 252dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 253dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 254dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 255dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 256dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm1 257dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 258dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm5 259dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm3, xmm5 260dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 261dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm1, xmm3 262dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 263dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 264dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 265dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu [edx], xmm0 266dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 267dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 268dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 269dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 270dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 271dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 272dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 273dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 16x1. 274dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 275dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 276dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, 277dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 278dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 279dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 280dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 281dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 282dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 283dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 284dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 285dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 286dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm5, 8 287dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 288c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 289dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 290dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm0, [eax] 291dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm1, [eax + 16] 292dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm2, [eax + esi] 293dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm3, [eax + esi + 16] 294dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 295dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 // average rows 296dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 297dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 298dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 299dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 300dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm1 301dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 302dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm5 303dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm3, xmm5 304dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 305dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm1, xmm3 306dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 307dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 308dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 309dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu [edx], xmm0 310dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 311dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 312dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 313dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 314dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 315dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 316dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 317dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 318dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Point samples 32 pixels to 8 pixels. 319dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 320dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 321dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 322dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 323dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 324dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 325dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 326dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 327dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 328dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 329dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrld xmm5, 24 330dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pslld xmm5, 16 331dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 332c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 333dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 334dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 335dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 336dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 337dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm0, xmm5 338dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm1, xmm5 339dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 340dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 341dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 342dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 8 343dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 344dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 8] 345dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 346dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 347dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 348dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 349dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 350dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 351dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x4 rectangle to 8x1. 352dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 353dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 354dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 355dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 356dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 357dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 358dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 359dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 8 + 4] // src_ptr 360dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 8 + 8] // src_stride 361dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 8 + 12] // dst_ptr 362dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 8 + 16] // dst_width 363dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [esi + esi * 2] // src_stride * 3 364dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 365dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm7, 8 366dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 367c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 368dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 369dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 370dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 371dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, [eax + esi] 372dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, [eax + esi + 16] 373dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 // average rows 374dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 375dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, [eax + esi * 2] 376dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, [eax + esi * 2 + 16] 377dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, [eax + edi] 378dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, [eax + edi + 16] 379dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 380dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm2, xmm4 381dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm3, xmm5 382dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 383dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 384dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 385dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 386dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 387dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm1 388dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 8 389dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm7 390dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm3, xmm7 391dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 392dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm1, xmm3 393dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm1 394dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 395dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (16 to 8 pixels) 396dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 8 397dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pand xmm2, xmm7 398dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgw xmm0, xmm2 399dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 400dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 401dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 8 402dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 403dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 8] 404dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 405dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 406dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 407dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 408dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 409dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 410dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 411dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 412dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Point samples 32 pixels to 24 pixels. 413dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 414dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Then shuffled to do the scaling. 415dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 416dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palign may be better than movdqu. 417dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 418dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 419dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 420dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 421dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 422dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 423dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 424dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 425dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 426dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, kShuf0 427dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShuf1 428dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kShuf2 429dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 430c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 431dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 432dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 433dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 434dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 435dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm1 436dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com palignr xmm1, xmm0, 8 437dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm3 438dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm1, xmm4 439dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm2, xmm5 440dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 441dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 8], xmm1 442dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 16], xmm2 443dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 24] 444dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 24 445dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 446dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 447dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 448dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 449dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 450dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 451dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 24x1 452dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 453dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Then shuffled to do the scaling. 454dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 455dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Register usage: 456dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm0 src_row 0 457dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm1 src_row 1 458dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm2 shuf 0 459dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm3 shuf 1 460dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm4 shuf 2 461dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm5 madd 0 462dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm6 madd 1 463dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm7 kRound34 464dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 465dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palign may be better than movdqu. 466dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 467dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 468dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, 469dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 470dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 471dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 472dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 473dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 474dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 475dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 476dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 477dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, kShuf01 478dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, kShuf11 479dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShuf21 480dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kMadd01 481dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, kMadd11 482dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm7, kRound34 483dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 484c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 485dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 486dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] // pixels 0..7 487dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + esi] 488dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 489dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm2 490dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm5 491dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 492dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 493dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 494dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 495dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm0, [eax + 8] // pixels 8..15 496dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm1, [eax + esi + 8] 497dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 498dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm3 499dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm6 500dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 501dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 502dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 503dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 8], xmm0 504dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax + 16] // pixels 16..23 505dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + esi + 16] 506dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 507dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 508dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 509dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, kMadd21 510dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm1 511dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 512dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 513dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 514dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 24 515dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 16], xmm0 516dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 24] 517dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 518dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 519dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 520dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 521dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 522dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 523dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 524dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palign may be better than movdqu. 525dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 526dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 527dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, 528dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 529dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 530dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 531dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 532dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 533dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 534dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 535dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 536dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, kShuf01 537dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, kShuf11 538dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShuf21 539dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kMadd01 540dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, kMadd11 541dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm7, kRound34 542dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 543c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 544dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 545dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] // pixels 0..7 546dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + esi] 547dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm0 548dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 549dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm2 550dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm5 551dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 552dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 553dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 554dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 555dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm0, [eax + 8] // pixels 8..15 556dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu xmm1, [eax + esi + 8] 557dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm0 558dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 559dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm3 560dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm6 561dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 562dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 563dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 564dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 8], xmm0 565dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax + 16] // pixels 16..23 566dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + esi + 16] 567dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 568dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm0 569dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm1 570dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 571dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, kMadd21 572dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm1 573dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddsw xmm0, xmm7 574dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 2 575dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 576dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 24 577dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx + 16], xmm0 578dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx+24] 579dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 580dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 581dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 582dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 583dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 584dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 585dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 586dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// 3/8 point sampler 587dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 588dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 32 pixels to 12 589dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 590dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 591dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 592dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 593dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_ptr 594dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 595dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_ptr 596dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 597dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShuf38a 598dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kShuf38b 599dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 600c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 601dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop: 602dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 603dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 604dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 605dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 606dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm1, xmm5 607dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusb xmm0, xmm1 608dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 609dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 12 610dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edx], xmm0 // write 12 pixels 611dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhlps xmm1, xmm0 612dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edx + 8], xmm1 613dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 12] 614dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg xloop 615dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 616dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 617dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 618dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 619dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 620dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 16x3 pixels to 6x1 with interpolation 621dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 622dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, 623dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 624dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 625dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 626dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 627dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 628dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 629dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 630dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 631dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, kShufAc 632dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, kShufAc3 633dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kScaleAc33 634dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm5, xmm5 635dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 636c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 637dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop: 638dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 639dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, [eax + esi] 640dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhlps xmm1, xmm0 641dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhlps xmm7, xmm6 642dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm0, xmm5 643dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm1, xmm5 644dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm6, xmm5 645dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm7, xmm5 646dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm0, xmm6 647dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm1, xmm7 648dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, [eax + esi * 2] 649dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 16] 650dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhlps xmm7, xmm6 651dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm6, xmm5 652dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm7, xmm5 653dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm0, xmm6 654dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm1, xmm7 655dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 656dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 657dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrldq xmm0, 2 658dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm6, xmm0 659dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrldq xmm0, 2 660dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm6, xmm0 661dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm6, xmm2 662dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 663dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 664dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrldq xmm1, 2 665dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm7, xmm1 666dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrldq xmm1, 2 667dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm7, xmm1 668dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm7, xmm3 669dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm6, xmm7 670dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 671dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 672dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm6, xmm6 673dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 674dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 6 675dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edx], xmm6 // write 6 pixels 676dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlq xmm6, 16 677dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edx + 2], xmm6 678dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 6] 679dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg xloop 680dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 681dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 682dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 683dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 684dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 685dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 686dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 16x2 pixels to 6x1 with interpolation 687dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 688dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, 689dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 690dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_ptr, int dst_width) { 691dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 692dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 693dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_ptr 694dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 695dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_ptr 696dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 697dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, kShufAb0 698dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, kShufAb1 699dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShufAb2 700dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kScaleAb2 701dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 702c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 703dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop: 704dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] // average 2 rows into xmm0 705dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, [eax + esi] 706dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 16] 707dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 708dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 709dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm1, xmm2 710dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm6, xmm0 711dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm6, xmm3 712dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm1, xmm6 713dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 714dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm1, xmm0 715dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 716dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 717dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm1, xmm1 718dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 719dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 6 720dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edx], xmm1 // write 6 pixels 721dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlq xmm1, 16 722dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edx + 2], xmm1 723dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 6] 724dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg xloop 725dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 726dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 727dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 728dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 729dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 730dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 731dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 16xN bytes and produces 16 shorts at a time. 732dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. 733dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 734dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 735dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint16* dst_ptr, int src_width, 736dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int src_height) { 737dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 738dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 739dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 740dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push ebx 741dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push ebp 742dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 16 + 4] // src_ptr 743dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 16 + 8] // src_stride 744dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edi, [esp + 16 + 12] // dst_ptr 745dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16 + 16] // dst_width 746dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ebx, [esp + 16 + 20] // height 747dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm4, xmm4 748dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com dec ebx 749dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 750c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 751dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop: 752dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // first row 753dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [esi] 754dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [esi + edx] 755dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm0 756dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm0, xmm4 757dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckhbw xmm1, xmm4 758dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea esi, [esi + 16] 759dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ebp, ebx 760dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com test ebp, ebp 761dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com je ydone 762dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 763dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // sum remaining rows 764c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 765dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com yloop: 766dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, [eax] // read 16 pixels 767dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + edx] // advance to next row 768dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, xmm2 769dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm2, xmm4 770dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckhbw xmm3, xmm4 771dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm0, xmm2 // sum 16 words 772dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddusw xmm1, xmm3 773dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ebp, 1 774dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg yloop 775dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 776c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 777dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ydone: 778dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edi], xmm0 779dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edi + 16], xmm1 780dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [edi + 32] 781dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 782dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 16 783dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg xloop 784dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 785dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop ebp 786dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop ebx 787dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 788dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 789dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 790dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 791dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 792dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 793dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Bilinear column filtering. SSSE3 version. 794dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Port to Neon 79548e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// TODO(fbarchard): Switch the following: 79648e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// xor ebx, ebx 79748e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// mov bx, word ptr [esi + eax] // 2 source x0 pixels 79848e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// To 79948e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 80048e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// when drmemory bug fixed. 80148e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// https://code.google.com/p/drmemory/issues/detail?id=1396 802dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 803dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 804dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 805dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int dst_width, int x, int dx) { 806dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 807dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push ebx 808dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 809dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 810dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edi, [esp + 12 + 4] // dst_ptr 811dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 12 + 8] // src_ptr 812dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 12 + 12] // dst_width 813dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm2, [esp + 12 + 16] // x 814dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm3, [esp + 12 + 20] // dx 815dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, 0x04040000 // shuffle to line up fractions with pixel. 816dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm5, eax 817dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 818dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm6, 9 819dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. preroll 820dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 2 821dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jl xloop29 822dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 823dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, xmm2 // x1 = x0 + dx 824dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm0, xmm3 825dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm2, xmm0 // x0 x1 826dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm3, xmm3 // dx dx 827dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm3, xmm3 // dx * 2, dx * 2 828dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. preroll 829dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 830dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 2 Pixel loop. 831dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 832dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop2: 833dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm2 // x0, x1 fractions. 834dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm2, xmm3 // x += dx 8350db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 836dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, ebx 837dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 9 // 7 bit fractions. 8380db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com movzx ebx, word ptr [esi + edx] // 2 source x1 pixels 839dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm4, ebx 840dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm1, xmm5 // 0011 841dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklwd xmm0, xmm4 842dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm1, xmm6 // 0..7f and 7f..0 843dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. 844dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. next iteration. 845dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. next iteration. 846dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 847dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 // 8 bits, 2 pixels. 848dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd ebx, xmm0 849dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov [edi], bx 850dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [edi + 2] 851dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 2 // 2 pixels 852dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jge xloop2 853dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 854dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 855dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29: 856dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 857dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com add ecx, 2 - 1 858dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jl xloop99 859dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 860dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 1 pixel remainder 8610db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 862dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, ebx 863dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm2, 9 // 7 bit fractions. 864dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm2, xmm5 // 0011 865dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm2, xmm6 // 0..7f and 7f..0 866dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm2 // 16 bit 867dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 868dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 // 8 bits 869dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd ebx, xmm0 870dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov [edi], bl 871dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 872c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 873dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99: 874dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 875dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 876dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 877dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop ebx 878dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 879dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 880dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 881dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 882dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 16 pixels, duplicates them and writes 32 pixels. 883dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 884dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 885dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, 8861f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com int dst_width, int x, int dx) { 887dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 888dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4] // dst_ptr 889dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 8] // src_ptr 890dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 12] // dst_width 891dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 892c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 893dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 894dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 895dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 16] 896dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm0 897dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklbw xmm0, xmm0 898dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckhbw xmm1, xmm1 899dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 32 900dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 901dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx + 16], xmm1 902dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 32] 903dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 904dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 905dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 906dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 907dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 908dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 909dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) 910dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 911dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 912dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2_SSE2(const uint8* src_argb, 9131f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com ptrdiff_t src_stride, 914dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_argb, int dst_width) { 915dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 916dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_argb 917dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 918dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_argb 919dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 920dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 921c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 922dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 923dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 924dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 925dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 926dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm0, xmm1, 0xdd 927dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 928dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 929dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 930dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 931dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 932dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 933dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 934dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 935dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 936dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 8x1 rectangle to 4x1. 937dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 938dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 939dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, 9401f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com ptrdiff_t src_stride, 941dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_argb, int dst_width) { 942dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 943dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4] // src_argb 944dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 945dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12] // dst_argb 946dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 16] // dst_width 947dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 948c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 949dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 950dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 951dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 952dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 953dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 954dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm0, xmm1, 0x88 // even pixels 955dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm2, xmm1, 0xdd // odd pixels 956dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 957dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 958dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 959dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 960dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 961dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 962dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 963dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 964dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 965dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 966dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 8x2 rectangle to 4x1. 967dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 968dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 969dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, 970dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 971dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_argb, int dst_width) { 972dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 973dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 974dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 4 + 4] // src_argb 975dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 4 + 8] // src_stride 976dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4 + 12] // dst_argb 977dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 4 + 16] // dst_width 978dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 979c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 980dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 981dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 982dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, [eax + 16] 983dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, [eax + esi] 984dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm3, [eax + esi + 16] 985dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 32] 986dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 // average rows 987dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 988dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 989dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm0, xmm1, 0x88 // even pixels 990dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm2, xmm1, 0xdd // odd pixels 991dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 992dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 993dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 994dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 995dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 996dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 997dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 998dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 999dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1000dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1001dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1002dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 4 pixels at a time. 1003dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: dst_argb 16 byte aligned. 1004dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 1005dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, 1006dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int src_stepx, 1007dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_argb, int dst_width) { 1008dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 1009dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push ebx 1010dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 1011dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 8 + 4] // src_argb 1012dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // src_stride ignored 1013dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ebx, [esp + 8 + 12] // src_stepx 1014dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 8 + 16] // dst_argb 1015dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 8 + 20] // dst_width 1016dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea ebx, [ebx * 4] 1017dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [ebx + ebx * 2] 1018dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1019c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 1020dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 1021dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, [eax] 1022dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm1, [eax + ebx] 1023dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm0, xmm1 1024dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm2, [eax + ebx * 2] 1025dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm3, [eax + edi] 1026dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + ebx * 4] 1027dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm2, xmm3 1028dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklqdq xmm0, xmm2 1029dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 1030dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 1031dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 1032dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 1033dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1034dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 1035dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop ebx 1036dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 1037dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1038dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1039dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1040dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends four 2x2 to 4x1. 1041dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: dst_argb 16 byte aligned. 1042dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 1043dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, 1044dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ptrdiff_t src_stride, 1045dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int src_stepx, 1046dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com uint8* dst_argb, int dst_width) { 1047dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 1048dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push ebx 1049dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 1050dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 1051dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 12 + 4] // src_argb 1052dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 12 + 8] // src_stride 1053dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ebx, [esp + 12 + 12] // src_stepx 1054dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 12 + 16] // dst_argb 1055dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 12 + 20] // dst_width 1056dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea esi, [eax + esi] // row1 pointer 1057dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea ebx, [ebx * 4] 1058dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [ebx + ebx * 2] 1059dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1060c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 1061dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 1062dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm0, qword ptr [eax] // row0 4 pairs 1063dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhps xmm0, qword ptr [eax + ebx] 1064dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm1, qword ptr [eax + ebx * 2] 1065dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhps xmm1, qword ptr [eax + edi] 1066dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + ebx * 4] 1067dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm2, qword ptr [esi] // row1 4 pairs 1068dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhps xmm2, qword ptr [esi + ebx] 1069dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm3, qword ptr [esi + ebx * 2] 1070dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhps xmm3, qword ptr [esi + edi] 1071dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea esi, [esi + ebx * 4] 1072dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 // average rows 1073dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm1, xmm3 1074dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 1075dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm0, xmm1, 0x88 // even pixels 1076dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com shufps xmm2, xmm1, 0xdd // odd pixels 1077dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pavgb xmm0, xmm2 1078dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 1079dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 1080dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 16] 1081dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 1082dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1083dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 1084dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 1085dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop ebx 1086dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 1087dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1088dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1089dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1090dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Column scaling unfiltered. SSE2 version. 1091dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 1092dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, 1093dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int dst_width, int x, int dx) { 1094dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 1095dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 1096dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 1097dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edi, [esp + 8 + 4] // dst_argb 1098dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 8 + 8] // src_argb 1099dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 8 + 12] // dst_width 1100dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm2, [esp + 8 + 16] // x 1101dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm3, [esp + 8 + 20] // dx 1102dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1103dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 1104dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 1105dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm2, xmm0 1106dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm3, xmm3 // 0, 0, 0, dx * 2 1107dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 1108dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm2, xmm0 // x3 x2 x1 x0 1109dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm3, xmm3 // 0, 0, 0, dx * 4 1110dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 1111dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1112dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. 1113dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. 1114dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1115dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com cmp ecx, 0 1116dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jle xloop99 1117dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 1118dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jl xloop49 1119dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1120dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 4 Pixel loop. 1121dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 1122dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop4: 1123dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1124dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1125dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 5 // get x2 integer. 1126dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 7 // get x3 integer. 1127dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm2, xmm3 // x += dx 1128dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm0, xmm1 // x0 x1 1129dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1130dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm1, [esi + eax * 4] // 1 source x2 pixels 1131dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm4, [esi + edx * 4] // 1 source x3 pixels 1132dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1133dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. next iteration. 1134dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm1, xmm4 // x2 x3 1135dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 1136dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 4 // 4 pixels 1137dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqu [edi], xmm0 1138dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [edi + 16] 1139dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jge xloop4 1140dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1141dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 1142dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop49: 1143dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com test ecx, 2 1144dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com je xloop29 1145dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1146dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 2 Pixels. 1147dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1148dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1149dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 5 // get x2 integer. 1150dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm0, xmm1 // x0 x1 1151dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1152dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edi], xmm0 1153dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [edi + 8] 1154dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1155dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29: 1156dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com test ecx, 1 1157dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com je xloop99 1158dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1159dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 1 Pixels. 1160dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm0, [esi + eax * 4] // 1 source x2 pixels 1161dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd dword ptr [edi], xmm0 1162dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 1163dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99: 1164dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1165dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 1166dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 1167dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 1168dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1169dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1170dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1171dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 1172dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Port to Neon 1173dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1174dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Shuffle table for arranging 2 pixels into pairs for pmaddubsw 1175dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuffleColARGB = { 1176dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 1177dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel 1178dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}; 1179dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1180dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Shuffle table for duplicating 2 fractions into 8 bytes each 1181dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuffleFractions = { 1182dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 1183dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}; 1184dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1185dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 1186dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, 1187dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com int dst_width, int x, int dx) { 1188dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 1189dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push esi 1190dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com push edi 1191dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edi, [esp + 8 + 4] // dst_argb 1192dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov esi, [esp + 8 + 8] // src_argb 1193dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 8 + 12] // dst_width 1194dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm2, [esp + 8 + 16] // x 1195dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd xmm3, [esp + 8 + 20] // dx 1196dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm4, kShuffleColARGB 1197dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm5, kShuffleFractions 1198dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 1199dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm6, 9 1200dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. preroll 1201dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 2 1202dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jl xloop29 1203dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1204dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, xmm2 // x1 = x0 + dx 1205dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm0, xmm3 1206dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm2, xmm0 // x0 x1 1207dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm3, xmm3 // dx dx 1208dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm3, xmm3 // dx * 2, dx * 2 1209dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. preroll 1210dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1211dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 2 Pixel loop. 1212dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 1213dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop2: 1214dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm2 // x0, x1 fractions. 1215dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com paddd xmm2, xmm3 // x += dx 1216dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1217dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm1, 9 // 7 bit fractions. 1218dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels 1219dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm1, xmm5 // 0000000011111111 1220dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 // arrange pixels into pairs 1221dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm1, xmm6 // 0..7f and 7f..0 1222dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. 1223dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1224dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pextrw edx, xmm2, 3 // get x1 integer. next iteration. 1225dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. 1226dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 1227dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq qword ptr [edi], xmm0 1228dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edi, [edi + 8] 1229dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 2 // 2 pixels 1230dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jge xloop2 1231dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1232dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com align 4 1233dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29: 1234dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1235dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com add ecx, 2 - 1 1236dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jl xloop99 1237dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1238dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com // 1 pixel remainder 1239dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm2, 9 // 7 bit fractions. 1240dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1241dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm2, xmm5 // 00000000 1242dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pshufb xmm0, xmm4 // arrange pixels into pairs 1243dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pxor xmm2, xmm6 // 0..7f and 7f..0 1244dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. 1245dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com psrlw xmm0, 7 1246dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. 1247dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movd [edi], xmm0 1248dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1249c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 1250dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99: 1251dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1252dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop edi 1253dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com pop esi 1254dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 1255dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1256dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1257dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1258dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 4 pixels, duplicates them and writes 8 pixels. 1259dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 1260dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16)) 1261dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, 12621f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com int dst_width, int x, int dx) { 1263dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com __asm { 1264dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov edx, [esp + 4] // dst_argb 1265dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov eax, [esp + 8] // src_argb 1266dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com mov ecx, [esp + 12] // dst_width 1267dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1268c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com align 4 1269dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com wloop: 1270dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm0, [eax] 1271dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea eax, [eax + 16] 1272dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa xmm1, xmm0 1273dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckldq xmm0, xmm0 1274dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com punpckhdq xmm1, xmm1 1275dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com sub ecx, 8 1276dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx], xmm0 1277dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com movdqa [edx + 16], xmm1 1278dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com lea edx, [edx + 32] 1279dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com jg wloop 1280dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1281dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com ret 1282dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com } 1283dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} 1284dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 12855dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com// Divide num by div and return as 16.16 fixed point result. 12865dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com__declspec(naked) __declspec(align(16)) 12875dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.comint FixedDiv_X86(int num, int div) { 12885dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com __asm { 12895dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com mov eax, [esp + 4] // num 12905dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com cdq // extend num to 64 bits 12915dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com shld edx, eax, 16 // 32.16 12925dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com shl eax, 16 12935dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com idiv dword ptr [esp + 8] 12945dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com ret 12955dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com } 12965dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com} 12975dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com 12985dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com// Divide num by div and return as 16.16 fixed point result. 12995dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com__declspec(naked) __declspec(align(16)) 13005dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.comint FixedDiv1_X86(int num, int div) { 13015dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com __asm { 13025dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com mov eax, [esp + 4] // num 13035dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com mov ecx, [esp + 8] // denom 13045dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com cdq // extend num to 64 bits 13055dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com shld edx, eax, 16 // 32.16 13065dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com shl eax, 16 13075dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com sub eax, 0x00010001 13085dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com sbb edx, 0 13095dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com sub ecx, 1 13105dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com idiv ecx 13115dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com ret 13125dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com } 13135dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com} 13145dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com 1315dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 1316dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com 1317dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#ifdef __cplusplus 1318dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} // extern "C" 1319dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com} // namespace libyuv 1320dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif 1321